From df71837d5024e2524cd51c93621e558aa7dd9f3f Mon Sep 17 00:00:00 2001
From: Trent Jaeger <tjaeger@cse.psu.edu>
Date: Tue, 13 Dec 2005 23:12:27 -0800
Subject: [LSM-IPSec]: Security association restriction.

This patch series implements per packet access control via the
extension of the Linux Security Modules (LSM) interface by hooks in
the XFRM and pfkey subsystems that leverage IPSec security
associations to label packets.  Extensions to the SELinux LSM are
included that leverage the patch for this purpose.

This patch implements the changes necessary to the XFRM subsystem,
pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a
socket to use only authorized security associations (or no security
association) to send/receive network packets.

Patch purpose:

The patch is designed to enable access control per packets based on
the strongly authenticated IPSec security association.  Such access
controls augment the existing ones based on network interface and IP
address.  The former are very coarse-grained, and the latter can be
spoofed.  By using IPSec, the system can control access to remote
hosts based on cryptographic keys generated using the IPSec mechanism.
This enables access control on a per-machine basis or per-application
if the remote machine is running the same mechanism and trusted to
enforce the access control policy.

Patch design approach:

The overall approach is that policy (xfrm_policy) entries set by
user-level programs (e.g., setkey for ipsec-tools) are extended with a
security context that is used at policy selection time in the XFRM
subsystem to restrict the sockets that can send/receive packets via
security associations (xfrm_states) that are built from those
policies.

A presentation available at
www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf
from the SELinux symposium describes the overall approach.

Patch implementation details:

On output, the policy retrieved (via xfrm_policy_lookup or
xfrm_sk_policy_lookup) must be authorized for the security context of
the socket and the same security context is required for resultant
security association (retrieved or negotiated via racoon in
ipsec-tools).  This is enforced in xfrm_state_find.

On input, the policy retrieved must also be authorized for the socket
(at __xfrm_policy_check), and the security context of the policy must
also match the security association being used.

The patch has virtually no impact on packets that do not use IPSec.
The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as
before.

Also, if IPSec is used without security contexts, the impact is
minimal.  The LSM must allow such policies to be selected for the
combination of socket and remote machine, but subsequent IPSec
processing proceeds as in the original case.

Testing:

The pfkey interface is tested using the ipsec-tools.  ipsec-tools have
been modified (a separate ipsec-tools patch is available for version
0.5) that supports assignment of xfrm_policy entries and security
associations with security contexts via setkey and the negotiation
using the security contexts via racoon.

The xfrm_user interface is tested via ad hoc programs that set
security contexts.  These programs are also available from me, and
contain programs for setting, getting, and deleting policy for testing
this interface.  Testing of sa functions was done by tracing kernel
behavior.

Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pfkeyv2.h  |  13 +++-
 include/linux/security.h | 132 +++++++++++++++++++++++++++++++
 include/linux/xfrm.h     |  29 +++++++
 include/net/flow.h       |   7 +-
 include/net/xfrm.h       |  27 ++++++-
 net/core/flow.c          |   8 +-
 net/key/af_key.c         | 197 +++++++++++++++++++++++++++++++++++++++++++++--
 net/xfrm/xfrm_policy.c   |  88 +++++++++++++--------
 net/xfrm/xfrm_state.c    |   9 ++-
 net/xfrm/xfrm_user.c     | 148 +++++++++++++++++++++++++++++++++--
 security/Kconfig         |  13 ++++
 security/dummy.c         |  45 ++++++++++-
 12 files changed, 655 insertions(+), 61 deletions(-)

diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
index 724066778af..6351c4055ac 100644
--- a/include/linux/pfkeyv2.h
+++ b/include/linux/pfkeyv2.h
@@ -216,6 +216,16 @@ struct sadb_x_nat_t_port {
 } __attribute__((packed));
 /* sizeof(struct sadb_x_nat_t_port) == 8 */
 
+/* Generic LSM security context */
+struct sadb_x_sec_ctx {
+	uint16_t	sadb_x_sec_len;
+	uint16_t	sadb_x_sec_exttype;
+	uint8_t		sadb_x_ctx_alg;  /* LSMs: e.g., selinux == 1 */
+	uint8_t		sadb_x_ctx_doi;
+	uint16_t	sadb_x_ctx_len;
+} __attribute__((packed));
+/* sizeof(struct sadb_sec_ctx) = 8 */
+
 /* Message types */
 #define SADB_RESERVED		0
 #define SADB_GETSPI		1
@@ -325,7 +335,8 @@ struct sadb_x_nat_t_port {
 #define SADB_X_EXT_NAT_T_SPORT		21
 #define SADB_X_EXT_NAT_T_DPORT		22
 #define SADB_X_EXT_NAT_T_OA		23
-#define SADB_EXT_MAX			23
+#define SADB_X_EXT_SEC_CTX		24
+#define SADB_EXT_MAX			24
 
 /* Identity Extension values */
 #define SADB_IDENTTYPE_RESERVED	0
diff --git a/include/linux/security.h b/include/linux/security.h
index f7e0ae01871..ef753654daa 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -59,6 +59,12 @@ struct sk_buff;
 struct sock;
 struct sockaddr;
 struct socket;
+struct flowi;
+struct dst_entry;
+struct xfrm_selector;
+struct xfrm_policy;
+struct xfrm_state;
+struct xfrm_user_sec_ctx;
 
 extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
 extern int cap_netlink_recv(struct sk_buff *skb);
@@ -788,6 +794,52 @@ struct swap_info_struct;
  *      which is used to copy security attributes between local stream sockets.
  * @sk_free_security:
  *	Deallocate security structure.
+ * @sk_getsid:
+ *	Retrieve the LSM-specific sid for the sock to enable caching of network
+ *	authorizations.
+ *
+ * Security hooks for XFRM operations.
+ *
+ * @xfrm_policy_alloc_security:
+ *	@xp contains the xfrm_policy being added to Security Policy Database
+ *	used by the XFRM system.
+ *	@sec_ctx contains the security context information being provided by
+ *	the user-level policy update program (e.g., setkey).
+ *	Allocate a security structure to the xp->selector.security field.
+ *	The security field is initialized to NULL when the xfrm_policy is
+ *	allocated.
+ *	Return 0 if operation was successful (memory to allocate, legal context)
+ * @xfrm_policy_clone_security:
+ *	@old contains an existing xfrm_policy in the SPD.
+ *	@new contains a new xfrm_policy being cloned from old.
+ *	Allocate a security structure to the new->selector.security field
+ *	that contains the information from the old->selector.security field.
+ *	Return 0 if operation was successful (memory to allocate).
+ * @xfrm_policy_free_security:
+ *	@xp contains the xfrm_policy
+ *	Deallocate xp->selector.security.
+ * @xfrm_state_alloc_security:
+ *	@x contains the xfrm_state being added to the Security Association
+ *	Database by the XFRM system.
+ *	@sec_ctx contains the security context information being provided by
+ *	the user-level SA generation program (e.g., setkey or racoon).
+ *	Allocate a security structure to the x->sel.security field.  The
+ *	security field is initialized to NULL when the xfrm_state is
+ *	allocated.
+ *	Return 0 if operation was successful (memory to allocate, legal context).
+ * @xfrm_state_free_security:
+ *	@x contains the xfrm_state.
+ *	Deallocate x>sel.security.
+ * @xfrm_policy_lookup:
+ *	@xp contains the xfrm_policy for which the access control is being
+ *	checked.
+ *	@sk_sid contains the sock security label that is used to authorize
+ *	access to the policy xp.
+ *	@dir contains the direction of the flow (input or output).
+ *	Check permission when a sock selects a xfrm_policy for processing
+ *	XFRMs on a packet.  The hook is called when selecting either a
+ *	per-socket policy or a generic xfrm policy.
+ *	Return 0 if permission is granted.
  *
  * Security hooks affecting all Key Management operations
  *
@@ -1237,8 +1289,18 @@ struct security_operations {
 	int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len);
 	int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
 	void (*sk_free_security) (struct sock *sk);
+	unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir);
 #endif	/* CONFIG_SECURITY_NETWORK */
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+	int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
+	int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
+	void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
+	int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
+	void (*xfrm_state_free_security) (struct xfrm_state *x);
+	int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
+
 	/* key management security hooks */
 #ifdef CONFIG_KEYS
 	int (*key_alloc)(struct key *key);
@@ -2679,6 +2741,11 @@ static inline void security_sk_free(struct sock *sk)
 {
 	return security_ops->sk_free_security(sk);
 }
+
+static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+	return security_ops->sk_getsid(sk, fl, dir);
+}
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct socket * sock,
 					       struct socket * other, 
@@ -2795,8 +2862,73 @@ static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
 static inline void security_sk_free(struct sock *sk)
 {
 }
+
+static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+	return 0;
+}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return security_ops->xfrm_policy_alloc_security(xp, sec_ctx);
+}
+
+static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+	return security_ops->xfrm_policy_clone_security(old, new);
+}
+
+static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
+{
+	security_ops->xfrm_policy_free_security(xp);
+}
+
+static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return security_ops->xfrm_state_alloc_security(x, sec_ctx);
+}
+
+static inline void security_xfrm_state_free(struct xfrm_state *x)
+{
+	security_ops->xfrm_state_free_security(x);
+}
+
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+	return security_ops->xfrm_policy_lookup(xp, sk_sid, dir);
+}
+#else	/* CONFIG_SECURITY_NETWORK_XFRM */
+static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+	return 0;
+}
+
+static inline void security_xfrm_policy_free(struct xfrm_policy *xp)
+{
+}
+
+static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static inline void security_xfrm_state_free(struct xfrm_state *x)
+{
+}
+
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+	return 0;
+}
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
+
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
 static inline int security_key_alloc(struct key *key)
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 0fb077d6844..82fbb758e28 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -27,6 +27,22 @@ struct xfrm_id
 	__u8		proto;
 };
 
+struct xfrm_sec_ctx {
+	__u8	ctx_doi;
+	__u8	ctx_alg;
+	__u16	ctx_len;
+	__u32	ctx_sid;
+	char	ctx_str[0];
+};
+
+/* Security Context Domains of Interpretation */
+#define XFRM_SC_DOI_RESERVED 0
+#define XFRM_SC_DOI_LSM 1
+
+/* Security Context Algorithms */
+#define XFRM_SC_ALG_RESERVED 0
+#define XFRM_SC_ALG_SELINUX 1
+
 /* Selector, used as selector both on policy rules (SPD) and SAs. */
 
 struct xfrm_selector
@@ -146,6 +162,18 @@ enum {
 
 #define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)
 
+/*
+ * Generic LSM security context for comunicating to user space
+ * NOTE: Same format as sadb_x_sec_ctx
+ */
+struct xfrm_user_sec_ctx {
+	__u16			len;
+	__u16			exttype;
+	__u8			ctx_alg;  /* LSMs: e.g., selinux == 1 */
+	__u8			ctx_doi;
+	__u16			ctx_len;
+};
+
 struct xfrm_user_tmpl {
 	struct xfrm_id		id;
 	__u16			family;
@@ -176,6 +204,7 @@ enum xfrm_attr_type_t {
 	XFRMA_TMPL,		/* 1 or more struct xfrm_user_tmpl */
 	XFRMA_SA,
 	XFRMA_POLICY,
+	XFRMA_SEC_CTX,		/* struct xfrm_sec_ctx */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/flow.h b/include/net/flow.h
index 9a5c94b1a0e..ec7eb86eb20 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -84,11 +84,12 @@ struct flowi {
 #define FLOW_DIR_OUT	1
 #define FLOW_DIR_FWD	2
 
-typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir,
+struct sock;
+typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
 			       void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
-			       flow_resolve_t resolver);
+extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
+	 		       flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 1cdb8791213..487abca3ca6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -144,6 +144,9 @@ struct xfrm_state
 	 * transformer. */
 	struct xfrm_type	*type;
 
+	/* Security context */
+	struct xfrm_sec_ctx	*security;
+
 	/* Private data of this transformer, format is opaque,
 	 * interpreted by xfrm_type methods. */
 	void			*data;
@@ -298,6 +301,7 @@ struct xfrm_policy
 	__u8			flags;
 	__u8			dead;
 	__u8			xfrm_nr;
+	struct xfrm_sec_ctx	*security;
 	struct xfrm_tmpl       	xfrm_vec[XFRM_MAX_DEPTH];
 };
 
@@ -510,6 +514,25 @@ xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl,
 	return 0;
 }
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+/*	If neither has a context --> match
+ * 	Otherwise, both must have a context and the sids, doi, alg must match
+ */
+static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
+{
+	return ((!s1 && !s2) ||
+		(s1 && s2 &&
+		 (s1->ctx_sid == s2->ctx_sid) &&
+		 (s1->ctx_doi == s2->ctx_doi) &&
+		 (s1->ctx_alg == s2->ctx_alg)));
+}
+#else
+static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
+{
+	return 1;
+}
+#endif
+
 /* A struct encoding bundle of transformations to apply to some set of flow.
  *
  * dst->child points to the next element of bundle.
@@ -878,8 +901,8 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig
 struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
 extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
-struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
-				      int delete);
+struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+					  struct xfrm_sec_ctx *ctx, int delete);
 struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete);
 void xfrm_policy_flush(void);
 u32 xfrm_get_acqseq(void);
diff --git a/net/core/flow.c b/net/core/flow.c
index 7e95b39de9f..c4f25385029 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -23,6 +23,7 @@
 #include <net/flow.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
+#include <linux/security.h>
 
 struct flow_cache_entry {
 	struct flow_cache_entry	*next;
@@ -30,6 +31,7 @@ struct flow_cache_entry {
 	u8			dir;
 	struct flowi		key;
 	u32			genid;
+	u32			sk_sid;
 	void			*object;
 	atomic_t		*object_ref;
 };
@@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
+void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
 			flow_resolve_t resolver)
 {
 	struct flow_cache_entry *fle, **head;
@@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
+		    fle->sk_sid == sk_sid &&
 		    flow_key_compare(key, &fle->key) == 0) {
 			if (fle->genid == atomic_read(&flow_cache_genid)) {
 				void *ret = fle->object;
@@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
+			fle->sk_sid = sk_sid;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
 			flow_count(cpu)++;
@@ -221,7 +225,7 @@ nocache:
 		void *obj;
 		atomic_t *obj_ref;
 
-		resolver(key, family, dir, &obj, &obj_ref);
+		resolver(key, sk_sid, family, dir, &obj, &obj_ref);
 
 		if (fle) {
 			fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 39031684b65..d32f7791f1e 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = {
 	[SADB_X_EXT_NAT_T_SPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
 	[SADB_X_EXT_NAT_T_DPORT]	= (u8) sizeof(struct sadb_x_nat_t_port),
 	[SADB_X_EXT_NAT_T_OA]		= (u8) sizeof(struct sadb_address),
+	[SADB_X_EXT_SEC_CTX]		= (u8) sizeof(struct sadb_x_sec_ctx),
 };
 
 /* Verify sadb_address_{len,prefixlen} against sa_family.  */
@@ -383,6 +384,55 @@ static int verify_address_len(void *p)
 	return 0;
 }
 
+static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx)
+{
+	int len = 0;
+
+	len += sizeof(struct sadb_x_sec_ctx);
+	len += sec_ctx->sadb_x_ctx_len;
+	len += sizeof(uint64_t) - 1;
+	len /= sizeof(uint64_t);
+
+	return len;
+}
+
+static inline int verify_sec_ctx_len(void *p)
+{
+	struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p;
+	int len;
+
+	if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE)
+		return -EINVAL;
+
+	len = pfkey_sec_ctx_len(sec_ctx);
+
+	if (sec_ctx->sadb_x_sec_len != len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx)
+{
+	struct xfrm_user_sec_ctx *uctx = NULL;
+	int ctx_size = sec_ctx->sadb_x_ctx_len;
+
+	uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL);
+
+	if (!uctx)
+		return NULL;
+
+	uctx->len = pfkey_sec_ctx_len(sec_ctx);
+	uctx->exttype = sec_ctx->sadb_x_sec_exttype;
+	uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi;
+	uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg;
+	uctx->ctx_len = sec_ctx->sadb_x_ctx_len;
+	memcpy(uctx + 1, sec_ctx + 1,
+	       uctx->ctx_len);
+
+	return uctx;
+}
+
 static int present_and_same_family(struct sadb_address *src,
 				   struct sadb_address *dst)
 {
@@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h
 				if (verify_address_len(p))
 					return -EINVAL;
 			}				
+			if (ext_type == SADB_X_EXT_SEC_CTX) {
+				if (verify_sec_ctx_len(p))
+					return -EINVAL;
+			}
 			ext_hdrs[ext_type-1] = p;
 		}
 		p   += ext_len;
@@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 	struct sadb_key *key;
 	struct sadb_x_sa2 *sa2;
 	struct sockaddr_in *sin;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *xfrm_ctx;
+	int ctx_size = 0;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct sockaddr_in6 *sin6;
 #endif
@@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 			sizeof(struct sadb_address)*2 + 
 				sockaddr_size*2 +
 					sizeof(struct sadb_x_sa2);
+
+	if ((xfrm_ctx = x->security)) {
+		ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
+		size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
+	}
+
 	/* identity & sensitivity */
 
 	if ((x->props.family == AF_INET &&
@@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 		n_port->sadb_x_nat_t_port_reserved = 0;
 	}
 
+	/* security context */
+	if (xfrm_ctx) {
+		sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb,
+				sizeof(struct sadb_x_sec_ctx) + ctx_size);
+		sec_ctx->sadb_x_sec_len =
+		  (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t);
+		sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+		sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+		sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+		sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+		memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+		       xfrm_ctx->ctx_len);
+	}
+
 	return skb;
 }
 
@@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
 	struct sadb_lifetime *lifetime;
 	struct sadb_sa *sa;
 	struct sadb_key *key;
+	struct sadb_x_sec_ctx *sec_ctx;
 	uint16_t proto;
 	int err;
 	
@@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
 		x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime;
 		x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime;
 	}
+
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx)
+			goto out;
+
+		err = security_xfrm_state_alloc(x, uctx);
+		kfree(uctx);
+
+		if (err)
+			goto out;
+	}
+
 	key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1];
 	if (sa->sadb_sa_auth) {
 		int keysize = 0;
@@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
 	return 0;
 }
 
+static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp)
+{
+  struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+
+	if (xfrm_ctx) {
+		int len = sizeof(struct sadb_x_sec_ctx);
+		len += xfrm_ctx->ctx_len;
+		return PFKEY_ALIGN8(len);
+	}
+	return 0;
+}
+
 static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
 {
 	int sockaddr_size = pfkey_sockaddr_size(xp->family);
@@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp)
 		(sockaddr_size * 2) +
 		sizeof(struct sadb_x_policy) +
 		(xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) +
-				(socklen * 2)));
+				(socklen * 2))) +
+		pfkey_xfrm_policy2sec_ctx_size(xp);
 }
 
 static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp)
@@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
 	struct sadb_lifetime *lifetime;
 	struct sadb_x_policy *pol;
 	struct sockaddr_in   *sin;
+	struct sadb_x_sec_ctx *sec_ctx;
+	struct xfrm_sec_ctx *xfrm_ctx;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	struct sockaddr_in6  *sin6;
 #endif
@@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
 			}
 		}
 	}
+
+	/* security context */
+	if ((xfrm_ctx = xp->security)) {
+		int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp);
+
+		sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size);
+		sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t);
+		sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX;
+		sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi;
+		sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg;
+		sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len;
+		memcpy(sec_ctx + 1, xfrm_ctx->ctx_str,
+		       xfrm_ctx->ctx_len);
+	}
+
 	hdr->sadb_msg_len = size / sizeof(uint64_t);
 	hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
 }
@@ -1976,12 +2099,13 @@ out:
 
 static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
 {
-	int err;
+	int err = 0;
 	struct sadb_lifetime *lifetime;
 	struct sadb_address *sa;
 	struct sadb_x_policy *pol;
 	struct xfrm_policy *xp;
 	struct km_event c;
+	struct sadb_x_sec_ctx *sec_ctx;
 
 	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
 				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	if (xp->selector.dport)
 		xp->selector.dport_mask = ~0;
 
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx) {
+			err = -ENOBUFS;
+			goto out;
+		}
+
+		err = security_xfrm_policy_alloc(xp, uctx);
+		kfree(uctx);
+
+		if (err)
+			goto out;
+	}
+
 	xp->lft.soft_byte_limit = XFRM_INF;
 	xp->lft.hard_byte_limit = XFRM_INF;
 	xp->lft.soft_packet_limit = XFRM_INF;
@@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 
 	err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
 				 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
-	if (err) {
-		kfree(xp);
-		return err;
-	}
+
+	if (err)
+		goto out;
 
 	if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
 		c.event = XFRM_MSG_UPDPOLICY;
@@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	return 0;
 
 out:
+	security_xfrm_policy_free(xp);
 	kfree(xp);
 	return err;
 }
@@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	int err;
 	struct sadb_address *sa;
 	struct sadb_x_policy *pol;
-	struct xfrm_policy *xp;
+	struct xfrm_policy *xp, tmp;
 	struct xfrm_selector sel;
 	struct km_event c;
+	struct sadb_x_sec_ctx *sec_ctx;
 
 	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
 				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	if (sel.dport)
 		sel.dport_mask = ~0;
 
-	xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1);
+	sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1];
+	memset(&tmp, 0, sizeof(struct xfrm_policy));
+
+	if (sec_ctx != NULL) {
+		struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+
+		if (!uctx)
+			return -ENOMEM;
+
+		err = security_xfrm_policy_alloc(&tmp, uctx);
+		kfree(uctx);
+
+		if (err)
+			return err;
+	}
+
+	xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1);
+	security_xfrm_policy_free(&tmp);
 	if (xp == NULL)
 		return -ENOENT;
 
@@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
 {
 	struct xfrm_policy *xp;
 	struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
+	struct sadb_x_sec_ctx *sec_ctx;
 
 	switch (family) {
 	case AF_INET:
@@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
 	    (*dir = parse_ipsecrequests(xp, pol)) < 0)
 		goto out;
 
+	/* security context too */
+	if (len >= (pol->sadb_x_policy_len*8 +
+	    sizeof(struct sadb_x_sec_ctx))) {
+		char *p = (char *)pol;
+		struct xfrm_user_sec_ctx *uctx;
+
+		p += pol->sadb_x_policy_len*8;
+		sec_ctx = (struct sadb_x_sec_ctx *)p;
+		if (len < pol->sadb_x_policy_len*8 +
+		    sec_ctx->sadb_x_sec_len)
+			goto out;
+		if ((*dir = verify_sec_ctx_len(p)))
+			goto out;
+		uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
+		*dir = security_xfrm_policy_alloc(xp, uctx);
+		kfree(uctx);
+
+		if (*dir)
+			goto out;
+	}
+
 	*dir = pol->sadb_x_policy_dir-1;
 	return xp;
 
 out:
+	security_xfrm_policy_free(xp);
 	kfree(xp);
 	return NULL;
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d19e274b9c4..64a447375fd 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -10,7 +10,7 @@
  * 	YOSHIFUJI Hideaki
  * 		Split up af-specific portion
  *	Derek Atkins <derek@ihtfp.com>		Add the post_input processor
- * 	
+ *
  */
 
 #include <asm/bug.h>
@@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		BUG();
 
+	security_xfrm_policy_free(policy);
 	kfree(policy);
 }
 EXPORT_SYMBOL(__xfrm_policy_destroy);
@@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 
 	write_lock_bh(&xfrm_policy_lock);
 	for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
-		if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) {
+		if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
+		    xfrm_sec_ctx_match(pol->security, policy->security)) {
 			if (excl) {
 				write_unlock_bh(&xfrm_policy_lock);
 				return -EEXIST;
@@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
 
-struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
-				      int delete)
+struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+					  struct xfrm_sec_ctx *ctx, int delete)
 {
 	struct xfrm_policy *pol, **p;
 
 	write_lock_bh(&xfrm_policy_lock);
 	for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
-		if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) {
+		if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
+		    (xfrm_sec_ctx_match(ctx, pol->security))) {
 			xfrm_pol_hold(pol);
 			if (delete)
 				*p = pol->next;
@@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
 	}
 	return pol;
 }
-EXPORT_SYMBOL(xfrm_policy_bysel);
+EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
 
 struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
 {
@@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk);
 
 /* Find policy to apply to this flow. */
 
-static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
+static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
 			       void **objp, atomic_t **obj_refp)
 {
 	struct xfrm_policy *pol;
@@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
 			continue;
 
 		match = xfrm_selector_match(sel, fl, family);
+
 		if (match) {
-			xfrm_pol_hold(pol);
-			break;
+ 			if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) {
+				xfrm_pol_hold(pol);
+				break;
+			}
 		}
 	}
 	read_unlock_bh(&xfrm_policy_lock);
@@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
 		*obj_refp = &pol->refcnt;
 }
 
-static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
+static inline int policy_to_flow_dir(int dir)
+{
+	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
+ 	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
+ 	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
+ 		return dir;
+ 	switch (dir) {
+ 	default:
+ 	case XFRM_POLICY_IN:
+ 		return FLOW_DIR_IN;
+ 	case XFRM_POLICY_OUT:
+ 		return FLOW_DIR_OUT;
+ 	case XFRM_POLICY_FWD:
+ 		return FLOW_DIR_FWD;
+	};
+}
+
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid)
 {
 	struct xfrm_policy *pol;
 
 	read_lock_bh(&xfrm_policy_lock);
 	if ((pol = sk->sk_policy[dir]) != NULL) {
-		int match = xfrm_selector_match(&pol->selector, fl,
+ 		int match = xfrm_selector_match(&pol->selector, fl,
 						sk->sk_family);
+ 		int err = 0;
+
 		if (match)
+		  err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir));
+
+ 		if (match && !err)
 			xfrm_pol_hold(pol);
 		else
 			pol = NULL;
@@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
 
 	if (newp) {
 		newp->selector = old->selector;
+		if (security_xfrm_policy_clone(old, newp)) {
+			kfree(newp);
+			return NULL;  /* ENOMEM */
+		}
 		newp->lft = old->lft;
 		newp->curlft = old->curlft;
 		newp->action = old->action;
@@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
 	return err;
 }
 
-static inline int policy_to_flow_dir(int dir)
-{
-	if (XFRM_POLICY_IN == FLOW_DIR_IN &&
-	    XFRM_POLICY_OUT == FLOW_DIR_OUT &&
-	    XFRM_POLICY_FWD == FLOW_DIR_FWD)
-		return dir;
-	switch (dir) {
-	default:
-	case XFRM_POLICY_IN:
-		return FLOW_DIR_IN;
-	case XFRM_POLICY_OUT:
-		return FLOW_DIR_OUT;
-	case XFRM_POLICY_FWD:
-		return FLOW_DIR_FWD;
-	};
-}
 
 static int stale_bundle(struct dst_entry *dst);
 
@@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 	int err;
 	u32 genid;
 	u16 family = dst_orig->ops->family;
+	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+	u32 sk_sid = security_sk_sid(sk, fl, dir);
 restart:
 	genid = atomic_read(&flow_cache_genid);
 	policy = NULL;
 	if (sk && sk->sk_policy[1])
-		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid);
 
 	if (!policy) {
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
 			return 0;
 
-		policy = flow_cache_lookup(fl, family,
-					   policy_to_flow_dir(XFRM_POLICY_OUT),
+		policy = flow_cache_lookup(fl, sk_sid, family, dir,
 					   xfrm_policy_lookup);
 	}
 
@@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 {
 	struct xfrm_policy *pol;
 	struct flowi fl;
+	u8 fl_dir = policy_to_flow_dir(dir);
+	u32 sk_sid;
 
 	if (_decode_session(skb, &fl, family) < 0)
 		return 0;
 
+	sk_sid = security_sk_sid(sk, &fl, fl_dir);
+
 	/* First, check used SA against their selectors. */
 	if (skb->sp) {
 		int i;
 
 		for (i=skb->sp->len-1; i>=0; i--) {
-		  struct sec_decap_state *xvec = &(skb->sp->x[i]);
+			struct sec_decap_state *xvec = &(skb->sp->x[i]);
 			if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family))
 				return 0;
 
@@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 	pol = NULL;
 	if (sk && sk->sk_policy[dir])
-		pol = xfrm_sk_policy_lookup(sk, dir, &fl);
+		pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
 
 	if (!pol)
-		pol = flow_cache_lookup(&fl, family,
-					policy_to_flow_dir(dir),
+		pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
 					xfrm_policy_lookup);
 
 	if (!pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 479effc9766..e12d0be5f97 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -10,7 +10,7 @@
  * 		Split up af-specific functions
  *	Derek Atkins <derek@ihtfp.com>
  *		Add UDP Encapsulation
- * 	
+ *
  */
 
 #include <linux/workqueue.h>
@@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
 	}
+	security_xfrm_state_free(x);
 	kfree(x);
 }
 
@@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 			      selector.
 			 */
 			if (x->km.state == XFRM_STATE_VALID) {
-				if (!xfrm_selector_match(&x->sel, fl, family))
+				if (!xfrm_selector_match(&x->sel, fl, family) ||
+				    !xfrm_sec_ctx_match(pol->security, x->security))
 					continue;
 				if (!best ||
 				    best->km.dying > x->km.dying ||
@@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 				acquire_in_progress = 1;
 			} else if (x->km.state == XFRM_STATE_ERROR ||
 				   x->km.state == XFRM_STATE_EXPIRED) {
-				if (xfrm_selector_match(&x->sel, fl, family))
+ 				if (xfrm_selector_match(&x->sel, fl, family) &&
+				    xfrm_sec_ctx_match(pol->security, x->security))
 					error = -ESRCH;
 			}
 		}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0cdd9a07e04..92e2b804c60 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -7,7 +7,7 @@
  * 	Kazunori MIYAZAWA @USAGI
  * 	Kunihiro Ishiguro <kunihiro@ipinfusion.com>
  * 		IPv6 support
- * 	
+ *
  */
 
 #include <linux/module.h>
@@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma)
 	return 0;
 }
 
+
+static inline int verify_sec_ctx_len(struct rtattr **xfrma)
+{
+	struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1];
+	struct xfrm_user_sec_ctx *uctx;
+	int len = 0;
+
+	if (!rt)
+		return 0;
+
+	if (rt->rta_len < sizeof(*uctx))
+		return -EINVAL;
+
+	uctx = RTA_DATA(rt);
+
+	if (uctx->ctx_len > PAGE_SIZE)
+		return -EINVAL;
+
+	len += sizeof(struct xfrm_user_sec_ctx);
+	len += uctx->ctx_len;
+
+	if (uctx->len != len)
+		return -EINVAL;
+
+	return 0;
+}
+
+
 static int verify_newsa_info(struct xfrm_usersa_info *p,
 			     struct rtattr **xfrma)
 {
@@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	if ((err = verify_encap_tmpl(xfrma)))
 		goto out;
+	if ((err = verify_sec_ctx_len(xfrma)))
+		goto out;
 
 	err = -EINVAL;
 	switch (p->mode) {
@@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a
 	return 0;
 }
 
+
+static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp)
+{
+	struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+	int len = 0;
+
+	if (xfrm_ctx) {
+		len += sizeof(struct xfrm_user_sec_ctx);
+		len += xfrm_ctx->ctx_len;
+	}
+	return len;
+}
+
+static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg)
+{
+	struct xfrm_user_sec_ctx *uctx;
+
+	if (!u_arg)
+		return 0;
+
+	uctx = RTA_DATA(u_arg);
+	return security_xfrm_state_alloc(x, uctx);
+}
+
 static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
 {
 	memcpy(&x->id, &p->id, sizeof(x->id));
@@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
 	if (err)
 		goto error;
 
+	if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1])))
+		goto error;
+
 	x->km.seq = p->seq;
 
 	return x;
@@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	int err;
 	struct km_event c;
 
-	err = verify_newsa_info(p, (struct rtattr **) xfrma);
+	err = verify_newsa_info(p, (struct rtattr **)xfrma);
 	if (err)
 		return err;
 
-	x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err);
+	x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err);
 	if (!x)
 		return err;
 
@@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
 	if (x->encap)
 		RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
 
+	if (x->security) {
+		int ctx_size = sizeof(struct xfrm_sec_ctx) +
+				x->security->ctx_len;
+		struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+		struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+		uctx->exttype = XFRMA_SEC_CTX;
+		uctx->len = ctx_size;
+		uctx->ctx_doi = x->security->ctx_doi;
+		uctx->ctx_alg = x->security->ctx_alg;
+		uctx->ctx_len = x->security->ctx_len;
+		memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len);
+	}
 	nlh->nlmsg_len = skb->tail - b;
 out:
 	sp->this_idx++;
@@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
 	return verify_policy_dir(p->dir);
 }
 
+static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma)
+{
+	struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1];
+	struct xfrm_user_sec_ctx *uctx;
+
+	if (!rt)
+		return 0;
+
+	uctx = RTA_DATA(rt);
+	return security_xfrm_policy_alloc(pol, uctx);
+}
+
 static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
 			   int nr)
 {
@@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
 	}
 
 	copy_from_user_policy(xp, p);
-	err = copy_from_user_tmpl(xp, xfrma);
+
+	if (!(err = copy_from_user_tmpl(xp, xfrma)))
+		err = copy_from_user_sec_ctx(xp, xfrma);
+
 	if (err) {
 		*errp = err;
 		kfree(xp);
@@ -700,10 +785,13 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	int excl;
 
 	err = verify_newpolicy_info(p);
+	if (err)
+		return err;
+	err = verify_sec_ctx_len((struct rtattr **)xfrma);
 	if (err)
 		return err;
 
-	xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err);
+	xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err);
 	if (!xp)
 		return err;
 
@@ -761,6 +849,27 @@ rtattr_failure:
 	return -1;
 }
 
+static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+	if (xp->security) {
+		int ctx_size = sizeof(struct xfrm_sec_ctx) +
+				xp->security->ctx_len;
+		struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size);
+		struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+		uctx->exttype = XFRMA_SEC_CTX;
+		uctx->len = ctx_size;
+		uctx->ctx_doi = xp->security->ctx_doi;
+		uctx->ctx_alg = xp->security->ctx_alg;
+		uctx->ctx_len = xp->security->ctx_len;
+		memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len);
+	}
+	return 0;
+
+ rtattr_failure:
+	return -1;
+}
+
 static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
 {
 	struct xfrm_dump_info *sp = ptr;
@@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
 	copy_to_user_policy(xp, p, dir);
 	if (copy_to_user_tmpl(xp, skb) < 0)
 		goto nlmsg_failure;
+	if (copy_to_user_sec_ctx(xp, skb))
+		goto nlmsg_failure;
 
 	nlh->nlmsg_len = skb->tail - b;
 out:
@@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 
 	if (p->index)
 		xp = xfrm_policy_byid(p->dir, p->index, delete);
-	else
-		xp = xfrm_policy_bysel(p->dir, &p->sel, delete);
+	else {
+		struct rtattr **rtattrs = (struct rtattr **)xfrma;
+		struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
+		struct xfrm_policy tmp;
+
+		err = verify_sec_ctx_len(rtattrs);
+		if (err)
+			return err;
+
+		memset(&tmp, 0, sizeof(struct xfrm_policy));
+		if (rt) {
+			struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt);
+
+			if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
+				return err;
+		}
+		xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete);
+		security_xfrm_policy_free(&tmp);
+	}
 	if (xp == NULL)
 		return -ENOENT;
 
@@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 
 	if (copy_to_user_tmpl(xp, skb) < 0)
 		goto nlmsg_failure;
+	if (copy_to_user_sec_ctx(xp, skb))
+		goto nlmsg_failure;
 
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
@@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
 
 	len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
 	len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
+	len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
 	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
@@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
 	copy_to_user_policy(xp, &upe->pol, dir);
 	if (copy_to_user_tmpl(xp, skb) < 0)
 		goto nlmsg_failure;
+	if (copy_to_user_sec_ctx(xp, skb))
+		goto nlmsg_failure;
 	upe->hard = !!hard;
 
 	nlh->nlmsg_len = skb->tail - b;
@@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
 
 	len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
 	len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
+	len += RTA_SPACE(xfrm_user_sec_ctx_size(xp));
 	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
diff --git a/security/Kconfig b/security/Kconfig
index 64d3f1e9ca8..34f593410d5 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -54,6 +54,19 @@ config SECURITY_NETWORK
 	  implement socket and networking access controls.
 	  If you are unsure how to answer this question, answer N.
 
+config SECURITY_NETWORK_XFRM
+	bool "XFRM (IPSec) Networking Security Hooks"
+	depends on XFRM && SECURITY_NETWORK
+	help
+	  This enables the XFRM (IPSec) networking security hooks.
+	  If enabled, a security module can use these hooks to
+	  implement per-packet access controls based on labels
+	  derived from IPSec policy.  Non-IPSec communications are
+	  designated as unlabelled, and only sockets authorized
+	  to communicate unlabelled data can send without using
+	  IPSec.
+	  If you are unsure how to answer this question, answer N.
+
 config SECURITY_CAPABILITIES
 	tristate "Default Linux Capabilities"
 	depends on SECURITY
diff --git a/security/dummy.c b/security/dummy.c
index 3ca5f2b828a..a15c54709fd 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -776,8 +776,42 @@ static inline int dummy_sk_alloc_security (struct sock *sk, int family, gfp_t pr
 static inline void dummy_sk_free_security (struct sock *sk)
 {
 }
+
+static unsigned int dummy_sk_getsid(struct sock *sk, struct flowi *fl, u8 dir)
+{
+	return 0;
+}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static inline int dummy_xfrm_policy_clone_security(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+	return 0;
+}
+
+static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp)
+{
+}
+
+static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static void dummy_xfrm_state_free_security(struct xfrm_state *x)
+{
+}
+
+static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+	return 0;
+}
+#endif /* CONFIG_SECURITY_NETWORK_XFRM */
 static int dummy_register_security (const char *name, struct security_operations *ops)
 {
 	return -EINVAL;
@@ -970,7 +1004,16 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, socket_getpeersec);
 	set_to_dummy_if_null(ops, sk_alloc_security);
 	set_to_dummy_if_null(ops, sk_free_security);
-#endif	/* CONFIG_SECURITY_NETWORK */
+	set_to_dummy_if_null(ops, sk_getsid);
+ #endif	/* CONFIG_SECURITY_NETWORK */
+#ifdef  CONFIG_SECURITY_NETWORK_XFRM
+	set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
+	set_to_dummy_if_null(ops, xfrm_policy_clone_security);
+	set_to_dummy_if_null(ops, xfrm_policy_free_security);
+	set_to_dummy_if_null(ops, xfrm_state_alloc_security);
+	set_to_dummy_if_null(ops, xfrm_state_free_security);
+	set_to_dummy_if_null(ops, xfrm_policy_lookup);
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 #ifdef CONFIG_KEYS
 	set_to_dummy_if_null(ops, key_alloc);
 	set_to_dummy_if_null(ops, key_free);
-- 
cgit v1.2.3


From d28d1e080132f28ab773291f10ad6acca4c8bba2 Mon Sep 17 00:00:00 2001
From: Trent Jaeger <tjaeger@cse.psu.edu>
Date: Tue, 13 Dec 2005 23:12:40 -0800
Subject: [LSM-IPSec]: Per-packet access control.

This patch series implements per packet access control via the
extension of the Linux Security Modules (LSM) interface by hooks in
the XFRM and pfkey subsystems that leverage IPSec security
associations to label packets.  Extensions to the SELinux LSM are
included that leverage the patch for this purpose.

This patch implements the changes necessary to the SELinux LSM to
create, deallocate, and use security contexts for policies
(xfrm_policy) and security associations (xfrm_state) that enable
control of a socket's ability to send and receive packets.

Patch purpose:

The patch is designed to enable the SELinux LSM to implement access
control on individual packets based on the strongly authenticated
IPSec security association.  Such access controls augment the existing
ones in SELinux based on network interface and IP address.  The former
are very coarse-grained, and the latter can be spoofed.  By using
IPSec, the SELinux can control access to remote hosts based on
cryptographic keys generated using the IPSec mechanism.  This enables
access control on a per-machine basis or per-application if the remote
machine is running the same mechanism and trusted to enforce the
access control policy.

Patch design approach:

The patch's main function is to authorize a socket's access to a IPSec
policy based on their security contexts.  Since the communication is
implemented by a security association, the patch ensures that the
security association's negotiated and used have the same security
context.  The patch enables allocation and deallocation of such
security contexts for policies and security associations.  It also
enables copying of the security context when policies are cloned.
Lastly, the patch ensures that packets that are sent without using a
IPSec security assocation with a security context are allowed to be
sent in that manner.

A presentation available at
www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf
from the SELinux symposium describes the overall approach.

Patch implementation details:

The function which authorizes a socket to perform a requested
operation (send/receive) on a IPSec policy (xfrm_policy) is
selinux_xfrm_policy_lookup.  The Netfilter and rcv_skb hooks ensure
that if a IPSec SA with a securit y association has not been used,
then the socket is allowed to send or receive the packet,
respectively.

The patch implements SELinux function for allocating security contexts
when policies (xfrm_policy) are created via the pfkey or xfrm_user
interfaces via selinux_xfrm_policy_alloc.  When a security association
is built, SELinux allocates the security context designated by the
XFRM subsystem which is based on that of the authorized policy via
selinux_xfrm_state_alloc.

When a xfrm_policy is cloned, the security context of that policy, if
any, is copied to the clone via selinux_xfrm_policy_clone.

When a xfrm_policy or xfrm_state is freed, its security context, if
any is also freed at selinux_xfrm_policy_free or
selinux_xfrm_state_free.

Testing:

The SELinux authorization function is tested using ipsec-tools.  We
created policies and security associations with particular security
contexts and added SELinux access control policy entries to verify the
authorization decision.  We also made sure that packets for which no
security context was supplied (which either did or did not use
security associations) were authorized using an unlabelled context.

Signed-off-by: Trent Jaeger <tjaeger@cse.psu.edu>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 security/selinux/Makefile                    |   2 +
 security/selinux/hooks.c                     |  39 ++++
 security/selinux/include/av_perm_to_string.h |   2 +
 security/selinux/include/av_permissions.h    |   2 +
 security/selinux/include/xfrm.h              |  54 +++++
 security/selinux/xfrm.c                      | 311 +++++++++++++++++++++++++++
 6 files changed, 410 insertions(+)
 create mode 100644 security/selinux/include/xfrm.h
 create mode 100644 security/selinux/xfrm.c

diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index b038cd0fae2..06d54d9d20a 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o
 
 selinux-$(CONFIG_SECURITY_NETWORK) += netif.o
 
+selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
+
 EXTRA_CFLAGS += -Isecurity/selinux/include
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index fc774436a26..3d496eae1b4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -73,6 +73,7 @@
 #include "avc.h"
 #include "objsec.h"
 #include "netif.h"
+#include "xfrm.h"
 
 #define XATTR_SELINUX_SUFFIX "selinux"
 #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
@@ -3349,6 +3350,10 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		err = avc_has_perm(sock_sid, port_sid,
 				   sock_class, recv_perm, &ad);
 	}
+
+	if (!err)
+		err = selinux_xfrm_sock_rcv_skb(sock_sid, skb);
+
 out:	
 	return err;
 }
@@ -3401,6 +3406,24 @@ static void selinux_sk_free_security(struct sock *sk)
 	sk_free_security(sk);
 }
 
+static unsigned int selinux_sk_getsid_security(struct sock *sk, struct flowi *fl, u8 dir)
+{
+	struct inode_security_struct *isec;
+	u32 sock_sid = SECINITSID_ANY_SOCKET;
+
+	if (!sk)
+		return selinux_no_sk_sid(fl);
+
+	read_lock_bh(&sk->sk_callback_lock);
+	isec = get_sock_isec(sk);
+
+	if (isec)
+		sock_sid = isec->sid;
+
+	read_unlock_bh(&sk->sk_callback_lock);
+	return sock_sid;
+}
+
 static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb)
 {
 	int err = 0;
@@ -3536,6 +3559,11 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
 		                   send_perm, &ad) ? NF_DROP : NF_ACCEPT;
 	}
 
+	if (err != NF_ACCEPT)
+		goto out;
+
+	err = selinux_xfrm_postroute_last(isec->sid, skb);
+
 out:
 	return err;
 }
@@ -4380,6 +4408,16 @@ static struct security_operations selinux_ops = {
 	.socket_getpeersec =		selinux_socket_getpeersec,
 	.sk_alloc_security =		selinux_sk_alloc_security,
 	.sk_free_security =		selinux_sk_free_security,
+	.sk_getsid = 			selinux_sk_getsid_security,
+#endif
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+	.xfrm_policy_alloc_security =	selinux_xfrm_policy_alloc,
+	.xfrm_policy_clone_security =	selinux_xfrm_policy_clone,
+	.xfrm_policy_free_security =	selinux_xfrm_policy_free,
+	.xfrm_state_alloc_security =	selinux_xfrm_state_alloc,
+	.xfrm_state_free_security =	selinux_xfrm_state_free,
+	.xfrm_policy_lookup = 		selinux_xfrm_policy_lookup,
 #endif
 };
 
@@ -4491,6 +4529,7 @@ static int __init selinux_nf_ip_init(void)
 		panic("SELinux: nf_register_hook for IPv6: error %d\n", err);
 
 #endif	/* IPV6 */
+
 out:
 	return err;
 }
diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h
index 1deb59e1b76..71aeb12f07c 100644
--- a/security/selinux/include/av_perm_to_string.h
+++ b/security/selinux/include/av_perm_to_string.h
@@ -238,3 +238,5 @@
    S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost")
    S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto")
    S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom")
+   S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELFROM, "relabelfrom")
+   S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELTO, "relabelto")
diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h
index a78b5d59c9f..d1d0996049e 100644
--- a/security/selinux/include/av_permissions.h
+++ b/security/selinux/include/av_permissions.h
@@ -908,6 +908,8 @@
 
 #define ASSOCIATION__SENDTO                       0x00000001UL
 #define ASSOCIATION__RECVFROM                     0x00000002UL
+#define ASSOCIATION__RELABELFROM                  0x00000004UL
+#define ASSOCIATION__RELABELTO                    0x00000008UL
 
 #define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL      0x00000001UL
 #define NETLINK_KOBJECT_UEVENT_SOCKET__READ       0x00000002UL
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
new file mode 100644
index 00000000000..8e87996c6dd
--- /dev/null
+++ b/security/selinux/include/xfrm.h
@@ -0,0 +1,54 @@
+/*
+ * SELinux support for the XFRM LSM hooks
+ *
+ * Author : Trent Jaeger, <jaegert@us.ibm.com>
+ */
+#ifndef _SELINUX_XFRM_H_
+#define _SELINUX_XFRM_H_
+
+int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
+int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new);
+void selinux_xfrm_policy_free(struct xfrm_policy *xp);
+int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
+void selinux_xfrm_state_free(struct xfrm_state *x);
+int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
+
+/*
+ * Extract the security blob from the sock (it's actually on the socket)
+ */
+static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
+{
+	if (!sk->sk_socket)
+		return NULL;
+
+	return SOCK_INODE(sk->sk_socket)->i_security;
+}
+
+
+static inline u32 selinux_no_sk_sid(struct flowi *fl)
+{
+	/* NOTE: no sock occurs on ICMP reply, forwards, ... */
+	/* icmp_reply: authorize as kernel packet */
+	if (fl && fl->proto == IPPROTO_ICMP) {
+		return SECINITSID_KERNEL;
+	}
+
+	return SECINITSID_ANY_SOCKET;
+}
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb);
+int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb);
+#else
+static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
+{
+	return NF_ACCEPT;
+}
+#endif
+
+#endif /* _SELINUX_XFRM_H_ */
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
new file mode 100644
index 00000000000..c4d87d4dca7
--- /dev/null
+++ b/security/selinux/xfrm.c
@@ -0,0 +1,311 @@
+/*
+ *  NSA Security-Enhanced Linux (SELinux) security module
+ *
+ *  This file contains the SELinux XFRM hook function implementations.
+ *
+ *  Authors:  Serge Hallyn <sergeh@us.ibm.com>
+ *	      Trent Jaeger <jaegert@us.ibm.com>
+ *
+ *  Copyright (C) 2005 International Business Machines Corporation
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License version 2,
+ *	as published by the Free Software Foundation.
+ */
+
+/*
+ * USAGE:
+ * NOTES:
+ *   1. Make sure to enable the following options in your kernel config:
+ *	CONFIG_SECURITY=y
+ *	CONFIG_SECURITY_NETWORK=y
+ *	CONFIG_SECURITY_NETWORK_XFRM=y
+ *	CONFIG_SECURITY_SELINUX=m/y
+ * ISSUES:
+ *   1. Caching packets, so they are not dropped during negotiation
+ *   2. Emulating a reasonable SO_PEERSEC across machines
+ *   3. Testing addition of sk_policy's with security context via setsockopt
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <linux/xfrm.h>
+#include <net/xfrm.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <asm/semaphore.h>
+
+#include "avc.h"
+#include "objsec.h"
+#include "xfrm.h"
+
+
+/*
+ * Returns true if an LSM/SELinux context
+ */
+static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx)
+{
+	return (ctx &&
+		(ctx->ctx_doi == XFRM_SC_DOI_LSM) &&
+		(ctx->ctx_alg == XFRM_SC_ALG_SELINUX));
+}
+
+/*
+ * Returns true if the xfrm contains a security blob for SELinux
+ */
+static inline int selinux_authorizable_xfrm(struct xfrm_state *x)
+{
+	return selinux_authorizable_ctx(x->security);
+}
+
+/*
+ * LSM hook implementation that authorizes that a socket can be used
+ * with the corresponding xfrm_sec_ctx and direction.
+ */
+int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+{
+	int rc = 0;
+	u32 sel_sid = SECINITSID_UNLABELED;
+	struct xfrm_sec_ctx *ctx;
+
+	/* Context sid is either set to label or ANY_ASSOC */
+	if ((ctx = xp->security)) {
+		if (!selinux_authorizable_ctx(ctx))
+			return -EINVAL;
+
+		sel_sid = ctx->ctx_sid;
+	}
+
+	rc = avc_has_perm(sk_sid, sel_sid, SECCLASS_ASSOCIATION,
+			  ((dir == FLOW_DIR_IN) ? ASSOCIATION__RECVFROM :
+			   ((dir == FLOW_DIR_OUT) ?  ASSOCIATION__SENDTO :
+			    (ASSOCIATION__SENDTO | ASSOCIATION__RECVFROM))),
+			  NULL);
+
+	return rc;
+}
+
+/*
+ * Security blob allocation for xfrm_policy and xfrm_state
+ * CTX does not have a meaningful value on input
+ */
+static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx)
+{
+	int rc = 0;
+	struct task_security_struct *tsec = current->security;
+	struct xfrm_sec_ctx *ctx;
+
+	BUG_ON(!uctx);
+	BUG_ON(uctx->ctx_doi != XFRM_SC_ALG_SELINUX);
+
+	if (uctx->ctx_len >= PAGE_SIZE)
+		return -ENOMEM;
+
+	*ctxp = ctx = kmalloc(sizeof(*ctx) +
+			      uctx->ctx_len,
+			      GFP_KERNEL);
+
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->ctx_doi = uctx->ctx_doi;
+	ctx->ctx_len = uctx->ctx_len;
+	ctx->ctx_alg = uctx->ctx_alg;
+
+	memcpy(ctx->ctx_str,
+	       uctx+1,
+	       ctx->ctx_len);
+	rc = security_context_to_sid(ctx->ctx_str,
+				     ctx->ctx_len,
+				     &ctx->ctx_sid);
+
+	if (rc)
+		goto out;
+
+	/*
+	 * Does the subject have permission to set security or permission to
+	 * do the relabel?
+	 * Must be permitted to relabel from default socket type (process type)
+	 * to specified context
+	 */
+	rc = avc_has_perm(tsec->sid, tsec->sid,
+			  SECCLASS_ASSOCIATION,
+			  ASSOCIATION__RELABELFROM, NULL);
+	if (rc)
+		goto out;
+
+	rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
+			  SECCLASS_ASSOCIATION,
+			  ASSOCIATION__RELABELTO, NULL);
+	if (rc)
+		goto out;
+
+	return rc;
+
+out:
+	*ctxp = 0;
+	kfree(ctx);
+	return rc;
+}
+
+/*
+ * LSM hook implementation that allocs and transfers uctx spec to
+ * xfrm_policy.
+ */
+int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *uctx)
+{
+	int err;
+
+	BUG_ON(!xp);
+
+	err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx);
+	return err;
+}
+
+
+/*
+ * LSM hook implementation that copies security data structure from old to
+ * new for policy cloning.
+ */
+int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
+{
+	struct xfrm_sec_ctx *old_ctx, *new_ctx;
+
+	old_ctx = old->security;
+
+	if (old_ctx) {
+		new_ctx = new->security = kmalloc(sizeof(*new_ctx) +
+						  old_ctx->ctx_len,
+						  GFP_KERNEL);
+
+		if (!new_ctx)
+			return -ENOMEM;
+
+		memcpy(new_ctx, old_ctx, sizeof(*new_ctx));
+		memcpy(new_ctx->ctx_str, old_ctx->ctx_str, new_ctx->ctx_len);
+	}
+	return 0;
+}
+
+/*
+ * LSM hook implementation that frees xfrm_policy security information.
+ */
+void selinux_xfrm_policy_free(struct xfrm_policy *xp)
+{
+	struct xfrm_sec_ctx *ctx = xp->security;
+	if (ctx)
+		kfree(ctx);
+}
+
+/*
+ * LSM hook implementation that allocs and transfers sec_ctx spec to
+ * xfrm_state.
+ */
+int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx)
+{
+	int err;
+
+	BUG_ON(!x);
+
+	err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx);
+	return err;
+}
+
+/*
+ * LSM hook implementation that frees xfrm_state security information.
+ */
+void selinux_xfrm_state_free(struct xfrm_state *x)
+{
+	struct xfrm_sec_ctx *ctx = x->security;
+	if (ctx)
+		kfree(ctx);
+}
+
+/*
+ * LSM hook that controls access to unlabelled packets.  If
+ * a xfrm_state is authorizable (defined by macro) then it was
+ * already authorized by the IPSec process.  If not, then
+ * we need to check for unlabelled access since this may not have
+ * gone thru the IPSec process.
+ */
+int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
+{
+	int i, rc = 0;
+	struct sec_path *sp;
+
+	sp = skb->sp;
+
+	if (sp) {
+		/*
+		 * __xfrm_policy_check does not approve unless xfrm_policy_ok
+		 * says that spi's match for policy and the socket.
+		 *
+		 *  Only need to verify the existence of an authorizable sp.
+		 */
+		for (i = 0; i < sp->len; i++) {
+			struct xfrm_state *x = sp->x[i].xvec;
+
+			if (x && selinux_authorizable_xfrm(x))
+				goto accept;
+		}
+	}
+
+	/* check SELinux sock for unlabelled access */
+	rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
+			  ASSOCIATION__RECVFROM, NULL);
+	if (rc)
+		goto drop;
+
+accept:
+	return 0;
+
+drop:
+	return rc;
+}
+
+/*
+ * POSTROUTE_LAST hook's XFRM processing:
+ * If we have no security association, then we need to determine
+ * whether the socket is allowed to send to an unlabelled destination.
+ * If we do have a authorizable security association, then it has already been
+ * checked in xfrm_policy_lookup hook.
+ */
+int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
+{
+	struct dst_entry *dst;
+	int rc = 0;
+
+	dst = skb->dst;
+
+	if (dst) {
+		struct dst_entry *dst_test;
+
+		for (dst_test = dst; dst_test != 0;
+		     dst_test = dst_test->child) {
+			struct xfrm_state *x = dst_test->xfrm;
+
+			if (x && selinux_authorizable_xfrm(x))
+				goto accept;
+		}
+	}
+
+	rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
+			  ASSOCIATION__SENDTO, NULL);
+	if (rc)
+		goto drop;
+
+accept:
+	return NF_ACCEPT;
+
+drop:
+	return NF_DROP;
+}
-- 
cgit v1.2.3


From 018da8f44c066d5fc390011b8c953135f973b3a9 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 13 Dec 2005 23:13:00 -0800
Subject: [TCP] BIC: remove low utilization code.

The latest BICTCP patch at:
http://www.csc.ncsu.edu:8080/faculty/rhee/export/bitcp/index_files/Page546.htm

disables the low_utilization feature of BICTCP because it doesn't work
in some cases. This patch removes it.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bic.c | 81 +-----------------------------------------------------
 1 file changed, 1 insertion(+), 80 deletions(-)

diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 1d0cd86621b..cf8c75f08ef 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -30,8 +30,6 @@ static int fast_convergence = 1;
 static int max_increment = 16;
 static int low_window = 14;
 static int beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
-static int low_utilization_threshold = 153;
-static int low_utilization_period = 2;
 static int initial_ssthresh = 100;
 static int smooth_part = 20;
 
@@ -43,10 +41,6 @@ module_param(low_window, int, 0644);
 MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
 module_param(beta, int, 0644);
 MODULE_PARM_DESC(beta, "beta for multiplicative increase");
-module_param(low_utilization_threshold, int, 0644);
-MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
-module_param(low_utilization_period, int, 0644);
-MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
 module_param(initial_ssthresh, int, 0644);
 MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
 module_param(smooth_part, int, 0644);
@@ -60,11 +54,6 @@ struct bictcp {
 	u32	loss_cwnd;	/* congestion window at last loss */
 	u32	last_cwnd;	/* the last snd_cwnd */
 	u32	last_time;	/* time when updated last_cwnd */
-	u32	delay_min;	/* min delay */
-	u32	delay_max;	/* max delay */
-	u32	last_delay;
-	u8	low_utilization;/* 0: high; 1: low */
-	u32	low_utilization_start;	/* starting time of low utilization detection*/
 	u32	epoch_start;	/* beginning of an epoch */
 #define ACK_RATIO_SHIFT	4
 	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
@@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca)
 	ca->loss_cwnd = 0;
 	ca->last_cwnd = 0;
 	ca->last_time = 0;
-	ca->delay_min = 0;
-	ca->delay_max = 0;
-	ca->last_delay = 0;
-	ca->low_utilization = 0;
-	ca->low_utilization_start = 0;
 	ca->epoch_start = 0;
 	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
 }
@@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 	}
 
 	/* if in slow start or link utilization is very low */
-	if ( ca->loss_cwnd == 0 ||
-	     (cwnd > ca->loss_cwnd && ca->low_utilization)) {
+	if (ca->loss_cwnd == 0) {
 		if (ca->cnt > 20) /* increase cwnd 5% per RTT */
 			ca->cnt = 20;
 	}
@@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 		ca->cnt = 1;
 }
 
-
-/* Detect low utilization in congestion avoidance */
-static inline void bictcp_low_utilization(struct sock *sk, int flag)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	struct bictcp *ca = inet_csk_ca(sk);
-	u32 dist, delay;
-
-	/* No time stamp */
-	if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
-	     /* Discard delay samples right after fast recovery */
-	     tcp_time_stamp < ca->epoch_start + HZ ||
-	     /* this delay samples may not be accurate */
-	     flag == 0) {
-		ca->last_delay = 0;
-		goto notlow;
-	}
-
-	delay = ca->last_delay<<3;	/* use the same scale as tp->srtt*/
-	ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	if (delay == 0) 		/* no previous delay sample */
-		goto notlow;
-
-	/* first time call or link delay decreases */
-	if (ca->delay_min == 0 || ca->delay_min > delay) {
-		ca->delay_min = ca->delay_max = delay;
-		goto notlow;
-	}
-
-	if (ca->delay_max < delay)
-		ca->delay_max = delay;
-
-	/* utilization is low, if avg delay < dist*threshold
-	   for checking_period time */
-	dist = ca->delay_max - ca->delay_min;
-	if (dist <= ca->delay_min>>6 ||
-	    tp->srtt - ca->delay_min >=  (dist*low_utilization_threshold)>>10)
-		goto notlow;
-
-	if (ca->low_utilization_start == 0) {
-		ca->low_utilization = 0;
-		ca->low_utilization_start = tcp_time_stamp;
-	} else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
-			> low_utilization_period*HZ) {
-		ca->low_utilization = 1;
-	}
-
-	return;
-
- notlow:
-	ca->low_utilization = 0;
-	ca->low_utilization_start = 0;
-
-}
-
 static void bictcp_cong_avoid(struct sock *sk, u32 ack,
 			      u32 seq_rtt, u32 in_flight, int data_acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 
-	bictcp_low_utilization(sk, data_acked);
-
 	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
@@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
 
 	ca->epoch_start = 0;	/* end of epoch */
 
-	/* in case of wrong delay_max*/
-	if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
-		ca->delay_max = ca->delay_min
-			+ ((ca->delay_max - ca->delay_min)* 90) / 100;
-
 	/* Wmax and fast convergence */
 	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
 		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
-- 
cgit v1.2.3


From 05d054503a9c4652212b8730150608787547ecc3 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 13 Dec 2005 23:13:13 -0800
Subject: [TCP] BIC: spelling and whitespace

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index cf8c75f08ef..035f2092d73 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -210,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state)
 		bictcp_reset(inet_csk_ca(sk));
 }
 
-/* Track delayed acknowledgement ratio using sliding window
+/* Track delayed acknowledgment ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
 static void bictcp_acked(struct sock *sk, u32 cnt)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
-	if (cnt > 0 && 	icsk->icsk_ca_state == TCP_CA_Open) {
+	if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
 		struct bictcp *ca = inet_csk_ca(sk);
 		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
 		ca->delayed_ack += cnt;
-- 
cgit v1.2.3


From df3271f3361b61ce02da0026b4a53e63bc2720cb Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 13 Dec 2005 23:13:28 -0800
Subject: [TCP] BIC: CUBIC window growth (2.0)

Replace existing BIC version 1.1 with new version 2.0.
The main change is to replace the window growth function
with a cubic function as described in:
  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig     |   8 +
 net/ipv4/Makefile    |   1 +
 net/ipv4/tcp_cubic.c | 445 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 454 insertions(+)
 create mode 100644 net/ipv4/tcp_cubic.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e55136ae09f..011cca7ae02 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -456,6 +456,14 @@ config TCP_CONG_BIC
 	increase provides TCP friendliness.
 	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
 
+config TCP_CONG_CUBIC
+	tristate "CUBIC TCP"
+	default m
+	---help---
+	This is version 2.0 of BIC-TCP which uses a cubic growth function
+	among other techniques.
+	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
 config TCP_CONG_WESTWOOD
 	tristate "TCP Westwood+"
 	default m
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f0435d00db6..c54edd76de0 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
new file mode 100644
index 00000000000..bb5dc4bfb6b
--- /dev/null
+++ b/net/ipv4/tcp_cubic.c
@@ -0,0 +1,445 @@
+/*
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
+ *
+ * This is from the implementation of CUBIC TCP in
+ * Injong Rhee, Lisong Xu.
+ *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant
+ *  in PFLDnet 2005
+ * Available from:
+ *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ *
+ * Unless CUBIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
+					 * max_cwnd = snd_cwnd * beta
+					 */
+#define BICTCP_B		4	 /*
+					  * In binary search,
+					  * go to point (max+min)/N
+					  */
+#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */
+
+static int fast_convergence = 1;
+static int max_increment = 16;
+static int beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh = 100;
+static int bic_scale = 41;
+static int tcp_friendliness = 1;
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0644);
+MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, int, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+
+
+/* BIC TCP Parameters */
+struct bictcp {
+	u32	cnt;		/* increase cwnd by 1 after ACKs */
+	u32 	last_max_cwnd;	/* last maximum snd_cwnd */
+	u32	loss_cwnd;	/* congestion window at last loss */
+	u32	last_cwnd;	/* the last snd_cwnd */
+	u32	last_time;	/* time when updated last_cwnd */
+	u32	bic_origin_point;/* origin point of bic function */
+	u32	bic_K;		/* time to origin point from the beginning of the current epoch */
+	u32	delay_min;	/* min delay */
+	u32	epoch_start;	/* beginning of an epoch */
+	u32	ack_cnt;	/* number of acks */
+	u32	tcp_cwnd;	/* estimated tcp cwnd */
+#define ACK_RATIO_SHIFT	4
+	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+	ca->cnt = 0;
+	ca->last_max_cwnd = 0;
+	ca->loss_cwnd = 0;
+	ca->last_cwnd = 0;
+	ca->last_time = 0;
+	ca->bic_origin_point = 0;
+	ca->bic_K = 0;
+	ca->delay_min = 0;
+	ca->epoch_start = 0;
+	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+	ca->ack_cnt = 0;
+	ca->tcp_cwnd = 0;
+}
+
+static void bictcp_init(struct sock *sk)
+{
+	bictcp_reset(inet_csk_ca(sk));
+	if (initial_ssthresh)
+		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/* 65536 times the cubic root */
+static const u64 cubic_table[8]
+	= {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};
+
+/*
+ * calculate the cubic root of x
+ * the basic idea is that x can be expressed as i*8^j
+ * so cubic_root(x) = cubic_root(i)*2^j
+ *  in the following code, x is i, and y is 2^j
+ *  because of integer calculation, there are errors in calculation
+ *  so finally use binary search to find out the exact solution
+ */
+static u32 cubic_root(u64 x)
+{
+        u64 y, app, target, start, end, mid, start_diff, end_diff;
+
+        if (x == 0)
+                return 0;
+
+        target = x;
+
+        /* first estimate lower and upper bound */
+        y = 1;
+        while (x >= 8){
+                x = (x >> 3);
+                y = (y << 1);
+        }
+        start = (y*cubic_table[x])>>16;
+        if (x==7)
+                end = (y<<1);
+        else
+                end = (y*cubic_table[x+1]+65535)>>16;
+
+        /* binary search for more accurate one */
+        while (start < end-1) {
+                mid = (start+end) >> 1;
+                app = mid*mid*mid;
+                if (app < target)
+                        start = mid;
+                else if (app > target)
+                        end = mid;
+                else
+                        return mid;
+        }
+
+        /* find the most accurate one from start and end */
+        app = start*start*start;
+        if (app < target)
+                start_diff = target - app;
+        else
+                start_diff = app - target;
+        app = end*end*end;
+        if (app < target)
+                end_diff = target - app;
+        else
+                end_diff = app - target;
+
+        if (start_diff < end_diff)
+                return (u32)start;
+        else
+                return (u32)end;
+}
+
+static inline u32 bictcp_K(u32 dist, u32 srtt)
+{
+        u64 d64;
+        u32 d32;
+        u32 count;
+        u32 result;
+
+        /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+           so K = cubic_root( (wmax-cwnd)*rtt/c )
+           the unit of K is bictcp_HZ=2^10, not HZ
+
+           c = bic_scale >> 10
+           rtt = (tp->srtt >> 3 ) / HZ
+
+           the following code has been designed and tested for
+           cwnd < 1 million packets
+           RTT < 100 seconds
+           HZ < 1,000,00  (corresponding to 10 nano-second)
+
+        */
+
+        /* 1/c * 2^2*bictcp_HZ */
+        d32 = (1 << (10+2*BICTCP_HZ)) / bic_scale;
+        d64 = (__u64)d32;
+
+        /* srtt * 2^count / HZ
+           1) to get a better accuracy of the following d32,
+           the larger the "count", the better the accuracy
+           2) and avoid overflow of the following d64
+           the larger the "count", the high possibility of overflow
+           3) so find a "count" between bictcp_hz-3 and bictcp_hz
+           "count" may be less than bictcp_HZ,
+           then d64 becomes 0. that is OK
+        */
+        d32 = srtt;
+        count = 0;
+        while (((d32 & 0x80000000)==0) && (count < BICTCP_HZ)){
+                d32 = d32 << 1;
+                count++;
+        }
+        d32 = d32 / HZ;
+
+        /* (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)  */
+        d64 = (d64 * dist * d32) >> (count+3-BICTCP_HZ);
+
+        /* cubic root */
+        d64 = cubic_root(d64);
+
+        result = (u32)d64;
+        return result;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+	u64 d64;
+	u32 d32, t, srtt, bic_target, min_cnt, max_cnt;
+
+	ca->ack_cnt++;	/* count the number of ACKs */
+
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+		return;
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_time_stamp;
+
+	srtt = (HZ << 3)/10;	/* use real time-based growth function */
+
+	if (ca->epoch_start == 0) {
+		ca->epoch_start = tcp_time_stamp;	/* record the beginning of an epoch */
+		ca->ack_cnt = 1;			/* start counting */
+		ca->tcp_cwnd = cwnd;			/* syn with cubic */
+
+		if (ca->last_max_cwnd <= cwnd) {
+			ca->bic_K = 0;
+			ca->bic_origin_point = cwnd;
+		} else {
+			ca->bic_K = bictcp_K(ca->last_max_cwnd-cwnd, srtt);
+			ca->bic_origin_point = ca->last_max_cwnd;
+		}
+	}
+
+        /* cubic function - calc*/
+        /* calculate c * time^3 / rtt,
+         *  while considering overflow in calculation of time^3
+	 * (so time^3 is done by using d64)
+	 * and without the support of division of 64bit numbers
+	 * (so all divisions are done by using d32)
+         *  also NOTE the unit of those veriables
+         *	  time  = (t - K) / 2^bictcp_HZ
+         *	  c = bic_scale >> 10
+	 * rtt  = (srtt >> 3) / HZ
+	 * !!! The following code does not have overflow problems,
+	 * if the cwnd < 1 million packets !!!
+         */
+
+	/* change the unit from HZ to bictcp_HZ */
+        t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
+	     << BICTCP_HZ) / HZ;
+
+        if (t < ca->bic_K)		/* t - K */
+                d32 = ca->bic_K - t;
+        else
+                d32 = t - ca->bic_K;
+
+        d64 = (u64)d32;
+        d32 = (bic_scale << 3) * HZ / srtt;			/* 1024*c/rtt */
+        d64 = (d32 * d64 * d64 * d64) >> (10+3*BICTCP_HZ);	/* c/rtt * (t-K)^3 */
+        d32 = (u32)d64;
+        if (t < ca->bic_K)                                	/* below origin*/
+                bic_target = ca->bic_origin_point - d32;
+        else                                                	/* above origin*/
+                bic_target = ca->bic_origin_point + d32;
+
+        /* cubic function - calc bictcp_cnt*/
+        if (bic_target > cwnd) {
+		ca->cnt = cwnd / (bic_target - cwnd);
+        } else {
+                ca->cnt = 100 * cwnd;              /* very small increment*/
+        }
+
+	if (ca->delay_min > 0) {
+		/* max increment = Smax * rtt / 0.1  */
+		min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
+		if (ca->cnt < min_cnt)
+			ca->cnt = min_cnt;
+	}
+
+        /* slow start and low utilization  */
+	if (ca->loss_cwnd == 0)		/* could be aggressive in slow start */
+		ca->cnt = 50;
+
+	/* TCP Friendly */
+	if (tcp_friendliness) {
+		u32 scale = 8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta);
+		d32 = (cwnd * scale) >> 3;
+	        while (ca->ack_cnt > d32) {		/* update tcp cwnd */
+	                ca->ack_cnt -= d32;
+        	        ca->tcp_cwnd++;
+		}
+
+		if (ca->tcp_cwnd > cwnd){	/* if bic is slower than tcp */
+			d32 = ca->tcp_cwnd - cwnd;
+			max_cnt = cwnd / d32;
+			if (ca->cnt > max_cnt)
+				ca->cnt = max_cnt;
+		}
+        }
+
+	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+	if (ca->cnt == 0)			/* cannot be zero */
+		ca->cnt = 1;
+}
+
+
+/* Keep track of minimum rtt */
+static inline void measure_delay(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+	u32 delay;
+
+	/* No time stamp */
+	if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
+	     /* Discard delay samples right after fast recovery */
+	    (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
+		return;
+
+	delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+	if (delay == 0)
+		delay = 1;
+
+	/* first time call or link delay decreases */
+	if (ca->delay_min == 0 || ca->delay_min > delay)
+		ca->delay_min = delay;
+}
+
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
+			      u32 seq_rtt, u32 in_flight, int data_acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+
+	if (data_acked)
+		measure_delay(sk);
+
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
+		bictcp_update(ca, tp->snd_cwnd);
+
+		/* In dangerous area, increase slowly.
+		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+		 */
+		if (tp->snd_cwnd_cnt >= ca->cnt) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		} else
+			tp->snd_cwnd_cnt++;
+	}
+
+}
+
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
+
+	ca->epoch_start = 0;	/* end of epoch */
+
+	/* Wmax and fast convergence */
+	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+			/ (2 * BICTCP_BETA_SCALE);
+	else
+		ca->last_max_cwnd = tp->snd_cwnd;
+
+	ca->loss_cwnd = tp->snd_cwnd;
+
+	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct sock *sk)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+
+	return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
+}
+
+static u32 bictcp_min_cwnd(struct sock *sk)
+{
+	return tcp_sk(sk)->snd_ssthresh;
+}
+
+static void bictcp_state(struct sock *sk, u8 new_state)
+{
+	if (new_state == TCP_CA_Loss)
+		bictcp_reset(inet_csk_ca(sk));
+}
+
+/* Track delayed acknowledgment ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct sock *sk, u32 cnt)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
+		struct bictcp *ca = inet_csk_ca(sk);
+		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+		ca->delayed_ack += cnt;
+	}
+}
+
+
+static struct tcp_congestion_ops cubictcp = {
+	.init		= bictcp_init,
+	.ssthresh	= bictcp_recalc_ssthresh,
+	.cong_avoid	= bictcp_cong_avoid,
+	.set_state	= bictcp_state,
+	.undo_cwnd	= bictcp_undo_cwnd,
+	.min_cwnd	= bictcp_min_cwnd,
+	.pkts_acked     = bictcp_acked,
+	.owner		= THIS_MODULE,
+	.name		= "cubic",
+};
+
+static int __init cubictcp_register(void)
+{
+	BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&cubictcp);
+}
+
+static void __exit cubictcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&cubictcp);
+}
+
+module_init(cubictcp_register);
+module_exit(cubictcp_unregister);
+
+MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CUBIC TCP");
+MODULE_VERSION("2.0");
-- 
cgit v1.2.3


From 318360646941d6f3d4c6e4ee99107392728a4079 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 13 Dec 2005 23:13:48 -0800
Subject: [NETFILTER] ip_tables: NUMA-aware allocation

Part of a performance problem with ip_tables is that memory allocation
is not NUMA aware, but 'only' SMP aware (ie each CPU normally touch
separate cache lines)

Even with small iptables rules, the cost of this misplacement can be
high on common workloads.  Instead of using one vmalloc() area
(located in the node of the iptables process), we now allocate an area
for each possible CPU, using vmalloc_node() so that memory should be
allocated in the CPU's node if possible.

Port to arp_tables and ip6_tables by Harald Welte.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/arp_tables.c | 175 ++++++++++++++++++++++++-----------
 net/ipv4/netfilter/ip_tables.c  | 199 +++++++++++++++++++++++++++-------------
 net/ipv6/netfilter/ip6_tables.c | 190 +++++++++++++++++++++++++-------------
 3 files changed, 382 insertions(+), 182 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3c2e9639bba..bba15630469 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -68,19 +68,14 @@ struct arpt_table_info {
 	unsigned int initial_entries;
 	unsigned int hook_entry[NF_ARP_NUMHOOKS];
 	unsigned int underflow[NF_ARP_NUMHOOKS];
-	char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
+	void *entries[NR_CPUS];
 };
 
 static LIST_HEAD(arpt_target);
 static LIST_HEAD(arpt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
 static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
 				      char *hdr_addr, int len)
 {
@@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 	outdev = out ? out->name : nulldevname;
 
 	read_lock_bh(&table->lock);
-	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private,
-			       smp_processor_id());
+	table_base = (void *)table->private->entries[smp_processor_id()];
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 	back = get_entry(table_base, table->private->underflow[hook]);
 
@@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp)
 /* Figures out from what hook each rule can be called: returns 0 if
  * there are loops.  Puts hook bitmask in comefrom.
  */
-static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
+static int mark_source_chains(struct arpt_table_info *newinfo,
+			      unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
 
@@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
 	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct arpt_entry *e
-			= (struct arpt_entry *)(newinfo->entries + pos);
+			= (struct arpt_entry *)(entry0 + pos);
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
@@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
 						goto next;
 
 					e = (struct arpt_entry *)
-						(newinfo->entries + pos);
+						(entry0 + pos);
 				} while (oldpos == pos + e->next_offset);
 
 				/* Move along one */
 				size = e->next_offset;
 				e = (struct arpt_entry *)
-					(newinfo->entries + pos + size);
+					(entry0 + pos + size);
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali
 					newpos = pos + e->next_offset;
 				}
 				e = (struct arpt_entry *)
-					(newinfo->entries + newpos);
+					(entry0 + newpos);
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
@@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
 static int translate_table(const char *name,
 			   unsigned int valid_hooks,
 			   struct arpt_table_info *newinfo,
+			   void *entry0,
 			   unsigned int size,
 			   unsigned int number,
 			   const unsigned int *hook_entries,
@@ -710,11 +705,11 @@ static int translate_table(const char *name,
 	i = 0;
 
 	/* Walk through entries, checking offsets. */
-	ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
 				 check_entry_size_and_hooks,
 				 newinfo,
-				 newinfo->entries,
-				 newinfo->entries + size,
+				 entry0,
+				 entry0 + size,
 				 hook_entries, underflows, &i);
 	duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
 	if (ret != 0)
@@ -743,29 +738,26 @@ static int translate_table(const char *name,
 		}
 	}
 
-	if (!mark_source_chains(newinfo, valid_hooks)) {
+	if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
 		duprintf("Looping hook\n");
 		return -ELOOP;
 	}
 
 	/* Finally, each sanity check must pass */
 	i = 0;
-	ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
 				 check_entry, name, size, &i);
 
 	if (ret != 0) {
-		ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+		ARPT_ENTRY_ITERATE(entry0, newinfo->size,
 				   cleanup_entry, &i);
 		return ret;
 	}
 
 	/* And one copy for every other CPU */
 	for_each_cpu(i) {
-		if (i == 0)
-			continue;
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
-		       newinfo->entries,
-		       SMP_ALIGN(newinfo->size));
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
 	}
 
 	return ret;
@@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e,
 	return 0;
 }
 
+static inline int set_entry_to_counter(const struct arpt_entry *e,
+				       struct arpt_counters total[],
+				       unsigned int *i)
+{
+	SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static void get_counters(const struct arpt_table_info *t,
 			 struct arpt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
+	unsigned int curcpu;
+
+	/* Instead of clearing (by a previous call to memset())
+	 * the counters and using adds, we set the counters
+	 * with data used by 'current' CPU
+	 * We dont care about preemption here.
+	 */
+	curcpu = raw_smp_processor_id();
+
+	i = 0;
+	ARPT_ENTRY_ITERATE(t->entries[curcpu],
+			   t->size,
+			   set_entry_to_counter,
+			   counters,
+			   &i);
 
 	for_each_cpu(cpu) {
+		if (cpu == curcpu)
+			continue;
 		i = 0;
-		ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
@@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	struct arpt_entry *e;
 	struct arpt_counters *counters;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size,
 		return -ENOMEM;
 
 	/* First, sum counters... */
-	memset(counters, 0, countersize);
 	write_lock_bh(&table->lock);
 	get_counters(table->private, counters);
 	write_unlock_bh(&table->lock);
 
-	/* ... then copy entire thing from CPU 0... */
-	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	/* ... then copy entire thing ... */
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
 	}
@@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
 		struct arpt_entry_target *t;
 
-		e = (struct arpt_entry *)(table->private->entries + off);
+		e = (struct arpt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
 				 + offsetof(struct arpt_entry, counters),
 				 &counters[num],
@@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries,
 	return ret;
 }
 
+static void free_table_info(struct arpt_table_info *info)
+{
+	int cpu;
+	for_each_cpu(cpu) {
+		if (info->size <= PAGE_SIZE)
+			kfree(info->entries[cpu]);
+		else
+			vfree(info->entries[cpu]);
+	}
+	kfree(info);
+}
+
+static struct arpt_table_info *alloc_table_info(unsigned int size)
+{
+	struct arpt_table_info *newinfo;
+	int cpu;
+	
+	newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+
+	newinfo->size = size;
+
+	for_each_cpu(cpu) {
+		if (size <= PAGE_SIZE)
+			newinfo->entries[cpu] = kmalloc_node(size,
+							GFP_KERNEL,
+							cpu_to_node(cpu));
+		else
+			newinfo->entries[cpu] = vmalloc_node(size,
+							     cpu_to_node(cpu));
+
+		if (newinfo->entries[cpu] == NULL) {
+			free_table_info(newinfo);
+			return NULL;
+		}
+	}
+
+	return newinfo;
+}
+
 static int do_replace(void __user *user, unsigned int len)
 {
 	int ret;
@@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len)
 	struct arpt_table *t;
 	struct arpt_table_info *newinfo, *oldinfo;
 	struct arpt_counters *counters;
+	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len)
 	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
 		return -ENOMEM;
 
-	newinfo = vmalloc(sizeof(struct arpt_table_info)
-			  + SMP_ALIGN(tmp.size) *
-			  		(highest_possible_processor_id()+1));
+	newinfo = alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
 
-	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+	/* choose the copy that is on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
@@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len)
 		ret = -ENOMEM;
 		goto free_newinfo;
 	}
-	memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
 
 	ret = translate_table(tmp.name, tmp.valid_hooks,
-			      newinfo, tmp.size, tmp.num_entries,
+			      newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
 			      tmp.hook_entry, tmp.underflow);
 	if (ret != 0)
 		goto free_newinfo_counters;
@@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len)
 	/* Get the old counters. */
 	get_counters(oldinfo, counters);
 	/* Decrease module usage counts and free resource */
-	ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
-	vfree(oldinfo);
+	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+
+	free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
 			 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
@@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len)
 	module_put(t->me);
 	up(&arpt_mutex);
  free_newinfo_counters_untrans:
-	ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
+	ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	return ret;
 }
 
@@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len)
 	struct arpt_counters_info tmp, *paddc;
 	struct arpt_table *t;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len)
 	}
 
 	i = 0;
-	ARPT_ENTRY_ITERATE(t->private->entries,
+	/* Choose the copy that is on our node */
+	loc_cpu_entry = t->private->entries[smp_processor_id()];
+	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   t->private->size,
 			   add_counter_to_entry,
 			   paddc->counters,
@@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table,
 	struct arpt_table_info *newinfo;
 	static struct arpt_table_info bootstrap
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	void *loc_cpu_entry;
 
-	newinfo = vmalloc(sizeof(struct arpt_table_info)
-			  + SMP_ALIGN(repl->size) *
-			  		(highest_possible_processor_id()+1));
+	newinfo = alloc_table_info(repl->size);
 	if (!newinfo) {
 		ret = -ENOMEM;
 		return ret;
 	}
-	memcpy(newinfo->entries, repl->entries, repl->size);
+
+	/* choose the copy on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(table->name, table->valid_hooks,
-			      newinfo, repl->size,
+			      newinfo, loc_cpu_entry, repl->size,
 			      repl->num_entries,
 			      repl->hook_entry,
 			      repl->underflow);
 	duprintf("arpt_register_table: translate table gives %d\n", ret);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
 	ret = down_interruptible(&arpt_mutex);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
@@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table,
 	return ret;
 
  free_unlock:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	goto unlock;
 }
 
 void arpt_unregister_table(struct arpt_table *table)
 {
+	void *loc_cpu_entry;
+
 	down(&arpt_mutex);
 	LIST_DELETE(&arpt_tables, table);
 	up(&arpt_mutex);
 
 	/* Decrease module usage counts and free resources */
-	ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
 			   cleanup_entry, NULL);
-	vfree(table->private);
+	free_table_info(table->private);
 }
 
 /* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 45886c8475e..2a26d167e14 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex);
    context stops packets coming through and allows user context to read
    the counters or update the rules.
 
-   To be cache friendly on SMP, we arrange them like so:
-   [ n-entries ]
-   ... cache-align padding ...
-   [ n-entries ]
-
    Hence the start of any table is given by get_table() below.  */
 
 /* The table itself */
@@ -105,20 +100,15 @@ struct ipt_table_info
 	unsigned int underflow[NF_IP_NUMHOOKS];
 
 	/* ipt_entry tables: one per CPU */
-	char entries[0] ____cacheline_aligned;
+	void *entries[NR_CPUS];
 };
 
 static LIST_HEAD(ipt_target);
 static LIST_HEAD(ipt_match);
 static LIST_HEAD(ipt_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
 #if 0
 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
 #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb,
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private, smp_processor_id());
+	table_base = (void *)table->private->entries[smp_processor_id()];
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip)
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
-mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ipt_table_info *newinfo,
+		   unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
 
@@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
 	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ipt_entry *e
-			= (struct ipt_entry *)(newinfo->entries + pos);
+			= (struct ipt_entry *)(entry0 + pos);
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
@@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
 						goto next;
 
 					e = (struct ipt_entry *)
-						(newinfo->entries + pos);
+						(entry0 + pos);
 				} while (oldpos == pos + e->next_offset);
 
 				/* Move along one */
 				size = e->next_offset;
 				e = (struct ipt_entry *)
-					(newinfo->entries + pos + size);
+					(entry0 + pos + size);
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
 					newpos = pos + e->next_offset;
 				}
 				e = (struct ipt_entry *)
-					(newinfo->entries + newpos);
+					(entry0 + newpos);
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
@@ -855,6 +845,7 @@ static int
 translate_table(const char *name,
 		unsigned int valid_hooks,
 		struct ipt_table_info *newinfo,
+		void *entry0,
 		unsigned int size,
 		unsigned int number,
 		const unsigned int *hook_entries,
@@ -875,11 +866,11 @@ translate_table(const char *name,
 	duprintf("translate_table: size %u\n", newinfo->size);
 	i = 0;
 	/* Walk through entries, checking offsets. */
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry_size_and_hooks,
 				newinfo,
-				newinfo->entries,
-				newinfo->entries + size,
+				entry0,
+				entry0 + size,
 				hook_entries, underflows, &i);
 	if (ret != 0)
 		return ret;
@@ -907,27 +898,24 @@ translate_table(const char *name,
 		}
 	}
 
-	if (!mark_source_chains(newinfo, valid_hooks))
+	if (!mark_source_chains(newinfo, valid_hooks, entry0))
 		return -ELOOP;
 
 	/* Finally, each sanity check must pass */
 	i = 0;
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry, name, size, &i);
 
 	if (ret != 0) {
-		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+		IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				  cleanup_entry, &i);
 		return ret;
 	}
 
 	/* And one copy for every other CPU */
 	for_each_cpu(i) {
-		if (i == 0)
-			continue;
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
-		       newinfo->entries,
-		       SMP_ALIGN(newinfo->size));
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
 	}
 
 	return ret;
@@ -943,15 +931,12 @@ replace_table(struct ipt_table *table,
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	{
-		struct ipt_entry *table_base;
-		unsigned int i;
+		int cpu;
 
-		for_each_cpu(i) {
-			table_base =
-				(void *)newinfo->entries
-				+ TABLE_OFFSET(newinfo, i);
-
-			table_base->comefrom = 0xdead57ac;
+		for_each_cpu(cpu) {
+			struct ipt_entry *table_base = newinfo->entries[cpu];
+			if (table_base)
+				table_base->comefrom = 0xdead57ac;
 		}
 	}
 #endif
@@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e,
 	return 0;
 }
 
+static inline int
+set_entry_to_counter(const struct ipt_entry *e,
+		     struct ipt_counters total[],
+		     unsigned int *i)
+{
+	SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static void
 get_counters(const struct ipt_table_info *t,
 	     struct ipt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
+	unsigned int curcpu;
+
+	/* Instead of clearing (by a previous call to memset())
+	 * the counters and using adds, we set the counters
+	 * with data used by 'current' CPU
+	 * We dont care about preemption here.
+	 */
+	curcpu = raw_smp_processor_id();
+
+	i = 0;
+	IPT_ENTRY_ITERATE(t->entries[curcpu],
+			  t->size,
+			  set_entry_to_counter,
+			  counters,
+			  &i);
 
 	for_each_cpu(cpu) {
+		if (cpu == curcpu)
+			continue;
 		i = 0;
-		IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
@@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size,
 	struct ipt_entry *e;
 	struct ipt_counters *counters;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct ipt_counters) * table->private->number;
-	counters = vmalloc(countersize);
+	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
 		return -ENOMEM;
 
 	/* First, sum counters... */
-	memset(counters, 0, countersize);
 	write_lock_bh(&table->lock);
 	get_counters(table->private, counters);
 	write_unlock_bh(&table->lock);
 
-	/* ... then copy entire thing from CPU 0... */
-	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+	/* choose the copy that is on our node/cpu, ...
+	 * This choice is lazy (because current thread is
+	 * allowed to migrate to another cpu)
+	 */
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	/* ... then copy entire thing ... */
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
 	}
@@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size,
 		struct ipt_entry_match *m;
 		struct ipt_entry_target *t;
 
-		e = (struct ipt_entry *)(table->private->entries + off);
+		e = (struct ipt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ipt_entry, counters),
 				 &counters[num],
@@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries,
 	return ret;
 }
 
+static void free_table_info(struct ipt_table_info *info)
+{
+	int cpu;
+	for_each_cpu(cpu) {
+		if (info->size <= PAGE_SIZE)
+			kfree(info->entries[cpu]);
+		else
+			vfree(info->entries[cpu]);
+	}
+	kfree(info);
+}
+
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+	struct ipt_table_info *newinfo;
+	int cpu;
+
+	newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+
+	newinfo->size = size;
+
+	for_each_cpu(cpu) {
+		if (size <= PAGE_SIZE)
+			newinfo->entries[cpu] = kmalloc_node(size,
+				GFP_KERNEL,
+				cpu_to_node(cpu));
+		else
+			newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu));
+		if (newinfo->entries[cpu] == 0) {
+			free_table_info(newinfo);
+			return NULL;
+		}
+	}
+
+	return newinfo;
+}
+
 static int
 do_replace(void __user *user, unsigned int len)
 {
@@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len)
 	struct ipt_table *t;
 	struct ipt_table_info *newinfo, *oldinfo;
 	struct ipt_counters *counters;
+	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len)
 	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
 		return -ENOMEM;
 
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(tmp.size) * 
-			  	(highest_possible_processor_id()+1));
+	newinfo = alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
 
-	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+	/* choose the copy that is our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
@@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len)
 		ret = -ENOMEM;
 		goto free_newinfo;
 	}
-	memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
 
 	ret = translate_table(tmp.name, tmp.valid_hooks,
-			      newinfo, tmp.size, tmp.num_entries,
+			      newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
 			      tmp.hook_entry, tmp.underflow);
 	if (ret != 0)
 		goto free_newinfo_counters;
@@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len)
 	/* Get the old counters. */
 	get_counters(oldinfo, counters);
 	/* Decrease module usage counts and free resource */
-	IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
-	vfree(oldinfo);
+	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+	free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
 			 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
@@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len)
 	module_put(t->me);
 	up(&ipt_mutex);
  free_newinfo_counters_untrans:
-	IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+	IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	return ret;
 }
 
@@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len)
 	struct ipt_counters_info tmp, *paddc;
 	struct ipt_table *t;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len)
 	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc(len);
+	paddc = vmalloc_node(len, numa_node_id());
 	if (!paddc)
 		return -ENOMEM;
 
@@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len)
 	}
 
 	i = 0;
-	IPT_ENTRY_ITERATE(t->private->entries,
+	/* Choose the copy that is on our node */
+	loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  t->private->size,
 			  add_counter_to_entry,
 			  paddc->counters,
@@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
 	struct ipt_table_info *newinfo;
 	static struct ipt_table_info bootstrap
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	void *loc_cpu_entry;
 
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(repl->size) * 
-			  		(highest_possible_processor_id()+1));
+	newinfo = alloc_table_info(repl->size);
 	if (!newinfo)
 		return -ENOMEM;
 
-	memcpy(newinfo->entries, repl->entries, repl->size);
+	/* choose the copy on our node/cpu
+	 * but dont care of preemption
+	 */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(table->name, table->valid_hooks,
-			      newinfo, repl->size,
+			      newinfo, loc_cpu_entry, repl->size,
 			      repl->num_entries,
 			      repl->hook_entry,
 			      repl->underflow);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
 	ret = down_interruptible(&ipt_mutex);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
@@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
 	return ret;
 
  free_unlock:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	goto unlock;
 }
 
 void ipt_unregister_table(struct ipt_table *table)
 {
+	void *loc_cpu_entry;
+
 	down(&ipt_mutex);
 	LIST_DELETE(&ipt_tables, table);
 	up(&ipt_mutex);
 
 	/* Decrease module usage counts and free resources */
-	IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
 			  cleanup_entry, NULL);
-	vfree(table->private);
+	free_table_info(table->private);
 }
 
 /* Returns 1 if the port is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 95d469271c4..dd80020d874 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -86,11 +86,6 @@ static DECLARE_MUTEX(ip6t_mutex);
    context stops packets coming through and allows user context to read
    the counters or update the rules.
 
-   To be cache friendly on SMP, we arrange them like so:
-   [ n-entries ]
-   ... cache-align padding ...
-   [ n-entries ]
-
    Hence the start of any table is given by get_table() below.  */
 
 /* The table itself */
@@ -108,20 +103,15 @@ struct ip6t_table_info
 	unsigned int underflow[NF_IP6_NUMHOOKS];
 
 	/* ip6t_entry tables: one per CPU */
-	char entries[0] ____cacheline_aligned;
+	void *entries[NR_CPUS];
 };
 
 static LIST_HEAD(ip6t_target);
 static LIST_HEAD(ip6t_match);
 static LIST_HEAD(ip6t_tables);
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
 #if 0
 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
 #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -376,8 +366,7 @@ ip6t_do_table(struct sk_buff **pskb,
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private, smp_processor_id());
+	table_base = (void *)table->private->entries[smp_processor_id()];
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -649,7 +638,8 @@ unconditional(const struct ip6t_ip6 *ipv6)
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
-mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ip6t_table_info *newinfo,
+		   unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
 
@@ -658,7 +648,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
 	for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ip6t_entry *e
-			= (struct ip6t_entry *)(newinfo->entries + pos);
+			= (struct ip6t_entry *)(entry0 + pos);
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
@@ -708,13 +698,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
 						goto next;
 
 					e = (struct ip6t_entry *)
-						(newinfo->entries + pos);
+						(entry0 + pos);
 				} while (oldpos == pos + e->next_offset);
 
 				/* Move along one */
 				size = e->next_offset;
 				e = (struct ip6t_entry *)
-					(newinfo->entries + pos + size);
+					(entry0 + pos + size);
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -731,7 +721,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks)
 					newpos = pos + e->next_offset;
 				}
 				e = (struct ip6t_entry *)
-					(newinfo->entries + newpos);
+					(entry0 + newpos);
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
@@ -941,6 +931,7 @@ static int
 translate_table(const char *name,
 		unsigned int valid_hooks,
 		struct ip6t_table_info *newinfo,
+		void *entry0,
 		unsigned int size,
 		unsigned int number,
 		const unsigned int *hook_entries,
@@ -961,11 +952,11 @@ translate_table(const char *name,
 	duprintf("translate_table: size %u\n", newinfo->size);
 	i = 0;
 	/* Walk through entries, checking offsets. */
-	ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry_size_and_hooks,
 				newinfo,
-				newinfo->entries,
-				newinfo->entries + size,
+				entry0,
+				entry0 + size,
 				hook_entries, underflows, &i);
 	if (ret != 0)
 		return ret;
@@ -993,27 +984,24 @@ translate_table(const char *name,
 		}
 	}
 
-	if (!mark_source_chains(newinfo, valid_hooks))
+	if (!mark_source_chains(newinfo, valid_hooks, entry0))
 		return -ELOOP;
 
 	/* Finally, each sanity check must pass */
 	i = 0;
-	ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry, name, size, &i);
 
 	if (ret != 0) {
-		IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+		IP6T_ENTRY_ITERATE(entry0, newinfo->size,
 				  cleanup_entry, &i);
 		return ret;
 	}
 
 	/* And one copy for every other CPU */
 	for_each_cpu(i) {
-		if (i == 0)
-			continue;
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
-		       newinfo->entries,
-		       SMP_ALIGN(newinfo->size));
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
 	}
 
 	return ret;
@@ -1029,15 +1017,12 @@ replace_table(struct ip6t_table *table,
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	{
-		struct ip6t_entry *table_base;
-		unsigned int i;
+		int cpu;
 
-		for_each_cpu(i) {
-			table_base =
-				(void *)newinfo->entries
-				+ TABLE_OFFSET(newinfo, i);
-
-			table_base->comefrom = 0xdead57ac;
+		for_each_cpu(cpu) {
+			struct ip6t_entry *table_base = newinfo->entries[cpu];
+			if (table_base)
+				table_base->comefrom = 0xdead57ac;
 		}
 	}
 #endif
@@ -1072,16 +1057,44 @@ add_entry_to_counter(const struct ip6t_entry *e,
 	return 0;
 }
 
+static inline int
+set_entry_to_counter(const struct ip6t_entry *e,
+		     struct ip6t_counters total[],
+		     unsigned int *i)
+{
+	SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static void
 get_counters(const struct ip6t_table_info *t,
 	     struct ip6t_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
+	unsigned int curcpu;
+
+	/* Instead of clearing (by a previous call to memset())
+	 * the counters and using adds, we set the counters
+	 * with data used by 'current' CPU
+	 * We dont care about preemption here.
+	 */
+	curcpu = raw_smp_processor_id();
+
+	i = 0;
+	IP6T_ENTRY_ITERATE(t->entries[curcpu],
+			   t->size,
+			   set_entry_to_counter,
+			   counters,
+			   &i);
 
 	for_each_cpu(cpu) {
+		if (cpu == curcpu)
+			continue;
 		i = 0;
-		IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
@@ -1098,6 +1111,7 @@ copy_entries_to_user(unsigned int total_size,
 	struct ip6t_entry *e;
 	struct ip6t_counters *counters;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1109,13 +1123,13 @@ copy_entries_to_user(unsigned int total_size,
 		return -ENOMEM;
 
 	/* First, sum counters... */
-	memset(counters, 0, countersize);
 	write_lock_bh(&table->lock);
 	get_counters(table->private, counters);
 	write_unlock_bh(&table->lock);
 
-	/* ... then copy entire thing from CPU 0... */
-	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+	/* choose the copy that is on ourc node/cpu */
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
 	}
@@ -1127,7 +1141,7 @@ copy_entries_to_user(unsigned int total_size,
 		struct ip6t_entry_match *m;
 		struct ip6t_entry_target *t;
 
-		e = (struct ip6t_entry *)(table->private->entries + off);
+		e = (struct ip6t_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ip6t_entry, counters),
 				 &counters[num],
@@ -1196,6 +1210,46 @@ get_entries(const struct ip6t_get_entries *entries,
 	return ret;
 }
 
+static void free_table_info(struct ip6t_table_info *info)
+{
+	int cpu;
+	for_each_cpu(cpu) {
+		if (info->size <= PAGE_SIZE)
+			kfree(info->entries[cpu]);
+		else
+			vfree(info->entries[cpu]);
+	}
+	kfree(info);
+}
+
+static struct ip6t_table_info *alloc_table_info(unsigned int size)
+{
+	struct ip6t_table_info *newinfo;
+	int cpu;
+
+	newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+
+	newinfo->size = size;
+
+	for_each_cpu(cpu) {
+		if (size <= PAGE_SIZE)
+			newinfo->entries[cpu] = kmalloc_node(size,
+							GFP_KERNEL,
+							cpu_to_node(cpu));
+		else
+			newinfo->entries[cpu] = vmalloc_node(size,
+							     cpu_to_node(cpu));
+		if (newinfo->entries[cpu] == NULL) {
+			free_table_info(newinfo);
+			return NULL;
+		}
+	}
+
+	return newinfo;
+}
+
 static int
 do_replace(void __user *user, unsigned int len)
 {
@@ -1204,6 +1258,7 @@ do_replace(void __user *user, unsigned int len)
 	struct ip6t_table *t;
 	struct ip6t_table_info *newinfo, *oldinfo;
 	struct ip6t_counters *counters;
+	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1212,13 +1267,13 @@ do_replace(void __user *user, unsigned int len)
 	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
 		return -ENOMEM;
 
-	newinfo = vmalloc(sizeof(struct ip6t_table_info)
-			  + SMP_ALIGN(tmp.size) *
-			  		(highest_possible_processor_id()+1));
+	newinfo = alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
 
-	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+	/* choose the copy that is on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
@@ -1229,10 +1284,9 @@ do_replace(void __user *user, unsigned int len)
 		ret = -ENOMEM;
 		goto free_newinfo;
 	}
-	memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters));
 
 	ret = translate_table(tmp.name, tmp.valid_hooks,
-			      newinfo, tmp.size, tmp.num_entries,
+			      newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
 			      tmp.hook_entry, tmp.underflow);
 	if (ret != 0)
 		goto free_newinfo_counters;
@@ -1271,8 +1325,9 @@ do_replace(void __user *user, unsigned int len)
 	/* Get the old counters. */
 	get_counters(oldinfo, counters);
 	/* Decrease module usage counts and free resource */
-	IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
-	vfree(oldinfo);
+	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+	free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
 			 sizeof(struct ip6t_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
@@ -1284,11 +1339,11 @@ do_replace(void __user *user, unsigned int len)
 	module_put(t->me);
 	up(&ip6t_mutex);
  free_newinfo_counters_untrans:
-	IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+	IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	return ret;
 }
 
@@ -1321,6 +1376,7 @@ do_add_counters(void __user *user, unsigned int len)
 	struct ip6t_counters_info tmp, *paddc;
 	struct ip6t_table *t;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1350,7 +1406,9 @@ do_add_counters(void __user *user, unsigned int len)
 	}
 
 	i = 0;
-	IP6T_ENTRY_ITERATE(t->private->entries,
+	/* Choose the copy that is on our node */
+	loc_cpu_entry = t->private->entries[smp_processor_id()];
+	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  t->private->size,
 			  add_counter_to_entry,
 			  paddc->counters,
@@ -1543,28 +1601,29 @@ int ip6t_register_table(struct ip6t_table *table,
 	struct ip6t_table_info *newinfo;
 	static struct ip6t_table_info bootstrap
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	void *loc_cpu_entry;
 
-	newinfo = vmalloc(sizeof(struct ip6t_table_info)
-			  + SMP_ALIGN(repl->size) *
-			  		(highest_possible_processor_id()+1));
+	newinfo = alloc_table_info(repl->size);
 	if (!newinfo)
 		return -ENOMEM;
 
-	memcpy(newinfo->entries, repl->entries, repl->size);
+	/* choose the copy on our node/cpu */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(table->name, table->valid_hooks,
-			      newinfo, repl->size,
+			      newinfo, loc_cpu_entry, repl->size,
 			      repl->num_entries,
 			      repl->hook_entry,
 			      repl->underflow);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
 	ret = down_interruptible(&ip6t_mutex);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
@@ -1593,20 +1652,23 @@ int ip6t_register_table(struct ip6t_table *table,
 	return ret;
 
  free_unlock:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	goto unlock;
 }
 
 void ip6t_unregister_table(struct ip6t_table *table)
 {
+	void *loc_cpu_entry;
+
 	down(&ip6t_mutex);
 	LIST_DELETE(&ip6t_tables, table);
 	up(&ip6t_mutex);
 
 	/* Decrease module usage counts and free resources */
-	IP6T_ENTRY_ITERATE(table->private->entries, table->private->size,
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
 			  cleanup_entry, NULL);
-	vfree(table->private);
+	free_table_info(table->private);
 }
 
 /* Returns 1 if the port is matched by the range, 0 otherwise */
-- 
cgit v1.2.3


From d5228a4f49db32d22a39c653281b527ef371129c Mon Sep 17 00:00:00 2001
From: Bart De Schuymer <bdschuym@pandora.be>
Date: Tue, 13 Dec 2005 23:14:08 -0800
Subject: [NETFILTER] ebtables: Support nf_log API from ebt_log and ebt_ulog

This makes ebt_log and ebt_ulog use the new nf_log api.  This enables
the bridging packet filter to log packets e.g. via nfnetlink_log.

Signed-off-by: Bart De Schuymer <bdschuym@pandora.be>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/Kconfig    |  6 +++-
 net/bridge/netfilter/ebt_log.c  | 72 +++++++++++++++++++++++++++++++----------
 net/bridge/netfilter/ebt_ulog.c | 53 ++++++++++++++++++++++++++++--
 3 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index c70b3be2302..b84fc6075fe 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config BRIDGE_EBT_ULOG
-	tristate "ebt: ulog support"
+	tristate "ebt: ulog support (OBSOLETE)"
 	depends on BRIDGE_NF_EBTABLES
 	help
+	  This option enables the old bridge-specific "ebt_ulog" implementation
+	  which has been obsoleted by the new "nfnetlink_log" code (see
+	  CONFIG_NETFILTER_NETLINK_LOG).
+
 	  This option adds the ulog watcher, that you can use in any rule
 	  in any ebtables table. The packet is passed to a userspace
 	  logging daemon using netlink multicast sockets. This differs
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 662975be3d1..c436e6c6242 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -3,6 +3,7 @@
  *
  *	Authors:
  *	Bart De Schuymer <bdschuym@pandora.be>
+ *	Harald Welte <laforge@netfilter.org>
  *
  *  April, 2002
  *
@@ -10,6 +11,7 @@
 
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_log.h>
+#include <linux/netfilter.h>
 #include <linux/module.h>
 #include <linux/ip.h>
 #include <linux/if_arp.h>
@@ -55,27 +57,30 @@ static void print_MAC(unsigned char *p)
 }
 
 #define myNIPQUAD(a) a[0], a[1], a[2], a[3]
-static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
-   const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+static void
+ebt_log_packet(unsigned int pf, unsigned int hooknum,
+   const struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, const struct nf_loginfo *loginfo,
+   const char *prefix)
 {
-	struct ebt_log_info *info = (struct ebt_log_info *)data;
-	char level_string[4] = "< >";
+	unsigned int bitmask;
 
-	level_string[1] = '0' + info->loglevel;
 	spin_lock_bh(&ebt_log_lock);
-	printk(level_string);
-	printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "",
-	   out ? out->name : "");
+	printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level,
+	       prefix, in ? in->name : "", out ? out->name : "");
 
-	printk("MAC source = ");
 	print_MAC(eth_hdr(skb)->h_source);
 	printk("MAC dest = ");
 	print_MAC(eth_hdr(skb)->h_dest);
 
 	printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto));
 
-	if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
+	if (loginfo->type == NF_LOG_TYPE_LOG)
+		bitmask = loginfo->u.log.logflags;
+	else
+		bitmask = NF_LOG_MASK;
+
+	if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto ==
 	   htons(ETH_P_IP)){
 		struct iphdr _iph, *ih;
 
@@ -84,10 +89,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 			printk(" INCOMPLETE IP header");
 			goto out;
 		}
-		printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
-		   NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
-		printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
-		       ih->protocol);
+		printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP "
+		       "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr),
+		       NIPQUAD(ih->daddr), ih->tos, ih->protocol);
 		if (ih->protocol == IPPROTO_TCP ||
 		    ih->protocol == IPPROTO_UDP) {
 			struct tcpudphdr _ports, *pptr;
@@ -104,7 +108,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 		goto out;
 	}
 
-	if ((info->bitmask & EBT_LOG_ARP) &&
+	if ((bitmask & EBT_LOG_ARP) &&
 	    ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) ||
 	     (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) {
 		struct arphdr _arph, *ah;
@@ -144,6 +148,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 out:
 	printk("\n");
 	spin_unlock_bh(&ebt_log_lock);
+
+}
+
+static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
+   const struct net_device *in, const struct net_device *out,
+   const void *data, unsigned int datalen)
+{
+	struct ebt_log_info *info = (struct ebt_log_info *)data;
+	struct nf_loginfo li;
+
+	li.type = NF_LOG_TYPE_LOG;
+	li.u.log.level = info->loglevel;
+	li.u.log.logflags = info->bitmask;
+
+	nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix);
 }
 
 static struct ebt_watcher log =
@@ -154,13 +173,32 @@ static struct ebt_watcher log =
 	.me		= THIS_MODULE,
 };
 
+static struct nf_logger ebt_log_logger = {
+	.name 		= "ebt_log",
+	.logfn		= &ebt_log_packet,
+	.me		= THIS_MODULE,
+};
+
 static int __init init(void)
 {
-	return ebt_register_watcher(&log);
+	int ret;
+
+	ret = ebt_register_watcher(&log);
+	if (ret < 0)
+		return ret;
+	if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) {
+		printk(KERN_WARNING "ebt_log: not logging via system console "
+		       "since somebody else already registered for PF_INET\n");
+		/* we cannot make module load fail here, since otherwise 
+		 * ebtables userspace would abort */
+	}
+
+	return 0;
 }
 
 static void __exit fini(void)
 {
+	nf_log_unregister_logger(&ebt_log_logger);
 	ebt_unregister_watcher(&log);
 }
 
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index aae26ae2e61..ce617b3dbbb 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -3,6 +3,7 @@
  *
  *	Authors:
  *	Bart De Schuymer <bdschuym@pandora.be>
+ *	Harald Welte <laforge@netfilter.org>
  *
  *  November, 2004
  *
@@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
 	return skb;
 }
 
-static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
    const struct net_device *in, const struct net_device *out,
-   const void *data, unsigned int datalen)
+   const struct ebt_ulog_info *uloginfo, const char *prefix)
 {
 	ebt_ulog_packet_msg_t *pm;
 	size_t size, copy_len;
 	struct nlmsghdr *nlh;
-	struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
 	unsigned int group = uloginfo->nlgroup;
 	ebt_ulog_buff_t *ub = &ulog_buffers[group];
 	spinlock_t *lock = &ub->lock;
@@ -216,6 +216,39 @@ alloc_failure:
 	goto unlock;
 }
 
+/* this function is registered with the netfilter core */
+static void ebt_log_packet(unsigned int pf, unsigned int hooknum,
+   const struct sk_buff *skb, const struct net_device *in,
+   const struct net_device *out, const struct nf_loginfo *li,
+   const char *prefix)
+{
+	struct ebt_ulog_info loginfo;
+
+	if (!li || li->type != NF_LOG_TYPE_ULOG) {
+		loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
+		loginfo.cprange = 0;
+		loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
+		loginfo.prefix[0] = '\0';
+	} else {
+		loginfo.nlgroup = li->u.ulog.group;
+		loginfo.cprange = li->u.ulog.copy_len;
+		loginfo.qthreshold = li->u.ulog.qthreshold;
+		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+	}
+
+	ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
+   const struct net_device *in, const struct net_device *out,
+   const void *data, unsigned int datalen)
+{
+	struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data;
+
+	ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL);
+}
+
+
 static int ebt_ulog_check(const char *tablename, unsigned int hookmask,
    const struct ebt_entry *e, void *data, unsigned int datalen)
 {
@@ -240,6 +273,12 @@ static struct ebt_watcher ulog = {
 	.me		= THIS_MODULE,
 };
 
+static struct nf_logger ebt_ulog_logger = {
+	.name		= EBT_ULOG_WATCHER,
+	.logfn		= &ebt_log_packet,
+	.me		= THIS_MODULE,
+};
+
 static int __init init(void)
 {
 	int i, ret = 0;
@@ -265,6 +304,13 @@ static int __init init(void)
 	else if ((ret = ebt_register_watcher(&ulog)))
 		sock_release(ebtulognl->sk_socket);
 
+	if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) {
+		printk(KERN_WARNING "ebt_ulog: not logging via ulog "
+		       "since somebody else already registered for PF_BRIDGE\n");
+		/* we cannot make module load fail here, since otherwise
+		 * ebtables userspace would abort */
+	}
+
 	return ret;
 }
 
@@ -273,6 +319,7 @@ static void __exit fini(void)
 	ebt_ulog_buff_t *ub;
 	int i;
 
+	nf_log_unregister_logger(&ebt_ulog_logger);
 	ebt_unregister_watcher(&ulog);
 	for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
 		ub = &ulog_buffers[i];
-- 
cgit v1.2.3


From 89cee8b1cbb9dac40c92ef1968aea2b45f82fd18 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 13 Dec 2005 23:14:27 -0800
Subject: [IPV4]: Safer reassembly

Another spin of Herbert Xu's "safer ip reassembly" patch
for 2.6.16.

(The original patch is here:
http://marc.theaimsgroup.com/?l=linux-netdev&m=112281936522415&w=2
and my only contribution is to have tested it.)

This patch (optionally) does additional checks before accepting IP
fragments, which can greatly reduce the possibility of reassembling
fragments which originated from different IP datagrams.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Arthur Kepner <akepner@sgi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 23 ++++++++++++
 include/linux/sysctl.h                 |  1 +
 include/net/inetpeer.h                 |  1 +
 include/net/ip.h                       |  2 +
 net/ipv4/inetpeer.c                    |  1 +
 net/ipv4/ip_fragment.c                 | 68 +++++++++++++++++++++++++++++++++-
 net/ipv4/ip_output.c                   |  1 +
 net/ipv4/sysctl_net_ipv4.c             | 10 +++++
 8 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebc09a159f6..2b7cf19a06a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER
 	for the hash secret) for IP fragments.
 	Default: 600
 
+ipfrag_max_dist - INTEGER
+	ipfrag_max_dist is a non-negative integer value which defines the 
+	maximum "disorder" which is allowed among fragments which share a 
+	common IP source address. Note that reordering of packets is 
+	not unusual, but if a large number of fragments arrive from a source 
+	IP address while a particular fragment queue remains incomplete, it 
+	probably indicates that one or more fragments belonging to that queue 
+	have been lost. When ipfrag_max_dist is positive, an additional check 
+	is done on fragments before they are added to a reassembly queue - if 
+	ipfrag_max_dist (or more) fragments have arrived from a particular IP 
+	address between additions to any IP fragment queue using that source 
+	address, it's presumed that one or more fragments in the queue are 
+	lost. The existing fragment queue will be dropped, and a new one 
+	started. An ipfrag_max_dist value of zero disables this check.
+
+	Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
+	result in unnecessarily dropping fragment queues when normal
+	reordering of packets occurs, which could lead to poor application 
+	performance. Using a very large value, e.g. 50000, increases the 
+	likelihood of incorrectly reassembling IP fragments that originate 
+	from different IP datagrams, which could result in data corruption.
+	Default: 64
+
 INET peer storage:
 
 inet_peer_threshold - INTEGER
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4be34ef8c2f..93fa765e47d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -390,6 +390,7 @@ enum
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
+	NET_IPV4_IPFRAG_MAX_DIST=112,
 };
 
 enum {
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 7fda471002b..0965515f40c 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -25,6 +25,7 @@ struct inet_peer
 	__u32			v4daddr;	/* peer's address */
 	__u16			avl_height;
 	__u16			ip_id_count;	/* IP ID for the next packet */
+	atomic_t		rid;		/* Frag reception counter */
 	__u32			tcp_ts;
 	unsigned long		tcp_ts_stamp;
 };
diff --git a/include/net/ip.h b/include/net/ip.h
index e4563bbee6e..4d6294ba038 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm
 #define IPSKB_TRANSLATED	2
 #define IPSKB_FORWARDED		4
 #define IPSKB_XFRM_TUNNEL_SIZE	8
+#define IPSKB_FRAG_COMPLETE	16
 };
 
 struct ipcm_cookie
@@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh;
 extern int sysctl_ipfrag_low_thresh;
 extern int sysctl_ipfrag_time;
 extern int sysctl_ipfrag_secret_interval;
+extern int sysctl_ipfrag_max_dist;
 
 /* From inetpeer.c */
 extern int inet_peer_threshold;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2fc3fd38924..ce5fe3f74a3 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
 		return NULL;
 	n->v4daddr = daddr;
 	atomic_set(&n->refcnt, 1);
+	atomic_set(&n->rid, 0);
 	n->ip_id_count = secure_ip_id(daddr);
 	n->tcp_ts_stamp = 0;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8ce0ce2ee48..ce2b70ce401 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -22,6 +22,7 @@
  *		Patrick McHardy :	LRU queue of frag heads for evictor.
  */
 
+#include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -38,6 +39,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/checksum.h>
+#include <net/inetpeer.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/inet.h>
@@ -56,6 +58,8 @@
 int sysctl_ipfrag_high_thresh = 256*1024;
 int sysctl_ipfrag_low_thresh = 192*1024;
 
+int sysctl_ipfrag_max_dist = 64;
+
 /* Important NOTE! Fragment queue must be destroyed before MSL expires.
  * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
  */
@@ -89,8 +93,10 @@ struct ipq {
 	spinlock_t	lock;
 	atomic_t	refcnt;
 	struct timer_list timer;	/* when will this queue expire?		*/
-	int		iif;
 	struct timeval	stamp;
+	int             iif;
+	unsigned int    rid;
+	struct inet_peer *peer;
 };
 
 /* Hash table. */
@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
 	BUG_TRAP(qp->last_in&COMPLETE);
 	BUG_TRAP(del_timer(&qp->timer) == 0);
 
+	if (qp->peer)
+		inet_putpeer(qp->peer);
+
 	/* Release all fragment data. */
 	fp = qp->fragments;
 	while (fp) {
@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 	qp->meat = 0;
 	qp->fragments = NULL;
 	qp->iif = 0;
+	qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
 
 	/* Initialize a timer for this entry. */
 	init_timer(&qp->timer);
@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
 	return ip_frag_create(hash, iph, user);
 }
 
+/* Is the fragment too far ahead to be part of ipq? */
+static inline int ip_frag_too_far(struct ipq *qp)
+{
+	struct inet_peer *peer = qp->peer;
+	unsigned int max = sysctl_ipfrag_max_dist;
+	unsigned int start, end;
+
+	int rc;
+
+	if (!peer || !max)
+		return 0;
+
+	start = qp->rid;
+	end = atomic_inc_return(&peer->rid);
+	qp->rid = end;
+
+	rc = qp->fragments && (end - start) > max;
+
+	if (rc) {
+		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+	}
+
+	return rc;
+}
+
+static int ip_frag_reinit(struct ipq *qp)
+{
+	struct sk_buff *fp;
+
+	if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) {
+		atomic_inc(&qp->refcnt);
+		return -ETIMEDOUT;
+	}
+
+	fp = qp->fragments;
+	do {
+		struct sk_buff *xp = fp->next;
+		frag_kfree_skb(fp, NULL);
+		fp = xp;
+	} while (fp);
+
+	qp->last_in = 0;
+	qp->len = 0;
+	qp->meat = 0;
+	qp->fragments = NULL;
+	qp->iif = 0;
+
+	return 0;
+}
+
 /* Add new segment to existing queue. */
 static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (qp->last_in & COMPLETE)
 		goto err;
 
+	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+	    unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
+		ipq_kill(qp);
+		goto err;
+	}
+
  	offset = ntohs(skb->nh.iph->frag_off);
 	flags = offset & ~IP_OFFSET;
 	offset &= IP_OFFSET;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eba64e2bd39..2a830de3a69 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 
 	hlen = iph->ihl * 4;
 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
+	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
 	/* When frag_list is given, use it. First, check its validity:
 	 * some transformers could create wrong frag_list or break existing
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 01444a02b48..dbf82955aab 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -22,6 +22,7 @@
 extern int sysctl_ip_nonlocal_bind;
 
 #ifdef CONFIG_SYSCTL
+static int zero;
 static int tcp_retr1_max = 255; 
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -613,6 +614,15 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec_jiffies,
 		.strategy	= &sysctl_jiffies
 	},
+	{
+		.ctl_name	= NET_IPV4_IPFRAG_MAX_DIST,
+		.procname	= "ipfrag_max_dist",
+		.data		= &sysctl_ipfrag_max_dist,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero
+	},
 	{
 		.ctl_name	= NET_TCP_NO_METRICS_SAVE,
 		.procname	= "tcp_no_metrics_save",
-- 
cgit v1.2.3


From 971af18bbfabb7b7c9c548da34a51e30869c08fc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:14:47 -0800
Subject: [IPV6]: Reuse inet_csk_get_port in tcp_v6_get_port

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  6 ++-
 net/dccp/ipv4.c                    |  3 +-
 net/ipv4/inet_connection_sock.c    | 11 +++--
 net/ipv4/tcp_ipv4.c                |  3 +-
 net/ipv6/tcp_ipv6.c                | 95 ++------------------------------------
 5 files changed, 21 insertions(+), 97 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b0c99060b78..edc68e858d5 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -192,8 +192,12 @@ extern struct request_sock *inet_csk_search_req(const struct sock *sk,
 						const __u16 rport,
 						const __u32 raddr,
 						const __u32 laddr);
+extern int inet_csk_bind_conflict(const struct sock *sk,
+				  const struct inet_bind_bucket *tb);
 extern int inet_csk_get_port(struct inet_hashinfo *hashinfo,
-			     struct sock *sk, unsigned short snum);
+			     struct sock *sk, unsigned short snum,
+			     int (*bind_conflict)(const struct sock *sk,
+						  const struct inet_bind_bucket *tb));
 
 extern struct dst_entry* inet_csk_route_req(struct sock *sk,
 					    const struct request_sock *req);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 656e13e38cf..1ac3e30ae79 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -37,7 +37,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
 
 static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
 {
-	return inet_csk_get_port(&dccp_hashinfo, sk, snum);
+	return inet_csk_get_port(&dccp_hashinfo, sk, snum,
+				 inet_csk_bind_conflict);
 }
 
 static void dccp_v4_hash(struct sock *sk)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3fe021f1a56..f05b6e76110 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
  */
 int sysctl_local_port_range[2] = { 1024, 4999 };
 
-static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+int inet_csk_bind_conflict(const struct sock *sk,
+			   const struct inet_bind_bucket *tb)
 {
 	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
 	struct sock *sk2;
@@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke
 	return node != NULL;
 }
 
+EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
+
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
  */
 int inet_csk_get_port(struct inet_hashinfo *hashinfo,
-		      struct sock *sk, unsigned short snum)
+		      struct sock *sk, unsigned short snum,
+		      int (*bind_conflict)(const struct sock *sk,
+					   const struct inet_bind_bucket *tb))
 {
 	struct inet_bind_hashbucket *head;
 	struct hlist_node *node;
@@ -125,7 +130,7 @@ tb_found:
 			goto success;
 		} else {
 			ret = 1;
-			if (inet_csk_bind_conflict(sk, tb))
+			if (bind_conflict(sk, tb))
 				goto fail_unlock;
 		}
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4d5021e1929..2aa19c89a94 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 {
-	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
+	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
+				 inet_csk_bind_conflict);
 }
 
 static void tcp_v4_hash(struct sock *sk)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8827389abaf..76c8f5a2f7f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -76,8 +76,8 @@ static int	tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
 static struct tcp_func ipv6_mapped;
 static struct tcp_func ipv6_specific;
 
-static inline int tcp_v6_bind_conflict(const struct sock *sk,
-				       const struct inet_bind_bucket *tb)
+int inet6_csk_bind_conflict(const struct sock *sk,
+			    const struct inet_bind_bucket *tb)
 {
 	const struct sock *sk2;
 	const struct hlist_node *node;
@@ -97,97 +97,10 @@ static inline int tcp_v6_bind_conflict(const struct sock *sk,
 	return node != NULL;
 }
 
-/* Grrr, addr_type already calculated by caller, but I don't want
- * to add some silly "cookie" argument to this method just for that.
- * But it doesn't matter, the recalculation is in the rarest path
- * this function ever takes.
- */
 static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 {
-	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	struct hlist_node *node;
-	int ret;
-
-	local_bh_disable();
-	if (snum == 0) {
-		int low = sysctl_local_port_range[0];
-		int high = sysctl_local_port_range[1];
-		int remaining = (high - low) + 1;
-		int rover = net_random() % (high - low) + low;
-
-		do {
-			head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (tb->port == rover)
-					goto next;
-			break;
-		next:
-			spin_unlock(&head->lock);
-			if (++rover > high)
-				rover = low;
-		} while (--remaining > 0);
-
-		/* Exhausted local port range during search?  It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (unlikely(remaining <= 0))
-			goto fail;
-
-		/* OK, here is the one we will use. */
-		snum = rover;
-	} else {
-		head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
-		spin_lock(&head->lock);
-		inet_bind_bucket_for_each(tb, node, &head->chain)
-			if (tb->port == snum)
-				goto tb_found;
-	}
-	tb = NULL;
-	goto tb_not_found;
-tb_found:
-	if (tb && !hlist_empty(&tb->owners)) {
-		if (tb->fastreuse > 0 && sk->sk_reuse &&
-		    sk->sk_state != TCP_LISTEN) {
-			goto success;
-		} else {
-			ret = 1;
-			if (tcp_v6_bind_conflict(sk, tb))
-				goto fail_unlock;
-		}
-	}
-tb_not_found:
-	ret = 1;
-	if (tb == NULL) {
-	       	tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
-		if (tb == NULL)
-			goto fail_unlock;
-	}
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
-			tb->fastreuse = 0;
-	} else if (tb->fastreuse &&
-		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-		tb->fastreuse = 0;
-
-success:
-	if (!inet_csk(sk)->icsk_bind_hash)
-		inet_bind_hash(sk, tb, snum);
-	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
-	ret = 0;
-
-fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
-	return ret;
+	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
+				 inet6_csk_bind_conflict);
 }
 
 static __inline__ void __tcp_v6_hash(struct sock *sk)
-- 
cgit v1.2.3


From 90b19d31695371bd3ed256d4c9e280861cd6ae7e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:15:01 -0800
Subject: [IPV6]: Generalise __tcp_v6_hash, renaming it to __inet6_hash

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_hashtables.h | 26 ++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c            | 34 ++++------------------------------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 5a2beed5a77..a4a204f99ea 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -48,6 +48,32 @@ static inline int inet6_sk_ehashfn(const struct sock *sk)
 	return inet6_ehashfn(laddr, lport, faddr, fport);
 }
 
+static inline void __inet6_hash(struct inet_hashinfo *hashinfo,
+				struct sock *sk)
+{
+	struct hlist_head *list;
+	rwlock_t *lock;
+
+	BUG_TRAP(sk_unhashed(sk));
+
+	if (sk->sk_state == TCP_LISTEN) {
+		list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		lock = &hashinfo->lhash_lock;
+		inet_listen_wlock(hashinfo);
+	} else {
+		unsigned int hash;
+		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
+		hash &= (hashinfo->ehash_size - 1);
+		list = &hashinfo->ehash[hash].chain;
+		lock = &hashinfo->ehash[hash].lock;
+		write_lock(lock);
+	}
+
+	__sk_add_node(sk, list);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(lock);
+}
+
 /*
  * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 76c8f5a2f7f..bf41f84d669 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -103,32 +103,6 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 				 inet6_csk_bind_conflict);
 }
 
-static __inline__ void __tcp_v6_hash(struct sock *sk)
-{
-	struct hlist_head *list;
-	rwlock_t *lock;
-
-	BUG_TRAP(sk_unhashed(sk));
-
-	if (sk->sk_state == TCP_LISTEN) {
-		list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
-		lock = &tcp_hashinfo.lhash_lock;
-		inet_listen_wlock(&tcp_hashinfo);
-	} else {
-		unsigned int hash;
-		sk->sk_hash = hash = inet6_sk_ehashfn(sk);
-		hash &= (tcp_hashinfo.ehash_size - 1);
-		list = &tcp_hashinfo.ehash[hash].chain;
-		lock = &tcp_hashinfo.ehash[hash].lock;
-		write_lock(lock);
-	}
-
-	__sk_add_node(sk, list);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(lock);
-}
-
-
 static void tcp_v6_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
@@ -139,7 +113,7 @@ static void tcp_v6_hash(struct sock *sk)
 			return;
 		}
 		local_bh_disable();
-		__tcp_v6_hash(sk);
+		__inet6_hash(&tcp_hashinfo, sk);
 		local_bh_enable();
 	}
 }
@@ -374,7 +348,7 @@ ok:
  		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
  			inet_sk(sk)->sport = htons(port);
- 			__tcp_v6_hash(sk);
+ 			__inet6_hash(&tcp_hashinfo, sk);
  		}
  		spin_unlock(&head->lock);
 
@@ -392,7 +366,7 @@ ok:
 	spin_lock_bh(&head->lock);
 
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__tcp_v6_hash(sk);
+		__inet6_hash(&tcp_hashinfo, sk);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
@@ -1295,7 +1269,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
 
-	__tcp_v6_hash(newsk);
+	__inet6_hash(&tcp_hashinfo, newsk);
 	inet_inherit_port(&tcp_hashinfo, sk, newsk);
 
 	return newsk;
-- 
cgit v1.2.3


From c2977c2213993bff51911f4117281b31c4612591 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:15:12 -0800
Subject: [ICSK]: make inet_csk_reqsk_queue_hash_add timeout arg unsigned long

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 2 +-
 net/ipv4/inet_connection_sock.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index edc68e858d5..ccc81a1c550 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -211,7 +211,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk,
 
 extern void inet_csk_reqsk_queue_hash_add(struct sock *sk,
 					  struct request_sock *req,
-					  const unsigned timeout);
+					  unsigned long timeout);
 
 static inline void inet_csk_reqsk_queue_removed(struct sock *sk,
 						struct request_sock *req)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f05b6e76110..e2bf508ed77 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -385,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
 EXPORT_SYMBOL_GPL(inet_csk_search_req);
 
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
-				   const unsigned timeout)
+				   unsigned long timeout)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-- 
cgit v1.2.3


From 8129765ac07c2455c927051e3a8b048b619b56ee Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:15:24 -0800
Subject: [IPV6]: Generalise tcp_v6_search_req & tcp_v6_synq_add

More work is needed tho to introduce inet6_request_sock from
tcp6_request_sock, in the same layout considerations as ipv6_pinfo in
inet_sock, next changeset will do that.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_connection_sock.h | 31 ++++++++++++
 include/net/request_sock.h          |  2 +-
 net/ipv6/Makefile                   |  3 +-
 net/ipv6/inet6_connection_sock.c    | 96 +++++++++++++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c                 | 78 +++---------------------------
 5 files changed, 137 insertions(+), 73 deletions(-)
 create mode 100644 include/net/inet6_connection_sock.h
 create mode 100644 net/ipv6/inet6_connection_sock.c

diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
new file mode 100644
index 00000000000..aa30ebde70d
--- /dev/null
+++ b/include/net/inet6_connection_sock.h
@@ -0,0 +1,31 @@
+/*
+ * NET		Generic infrastructure for INET6 connection oriented protocols.
+ *
+ * Authors:	Many people, see the TCPv6 sources
+ *
+ * 		From code originally in TCPv6
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _INET6_CONNECTION_SOCK_H
+#define _INET6_CONNECTION_SOCK_H
+
+#include <linux/types.h>
+
+struct sock;
+struct request_sock;
+
+extern struct request_sock *inet6_csk_search_req(const struct sock *sk,
+						 struct request_sock ***prevp,
+						 const __u16 rport,
+						 const struct in6_addr *raddr,
+						 const struct in6_addr *laddr,
+						 const int iif);
+
+extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
+					   struct request_sock *req,
+					   const unsigned long timeout);
+#endif /* _INET6_CONNECTION_SOCK_H */
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b52cc52ffe3..11641c9384f 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -244,7 +244,7 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
 
 static inline void reqsk_queue_hash_req(struct request_sock_queue *queue,
 					u32 hash, struct request_sock *req,
-					unsigned timeout)
+					unsigned long timeout)
 {
 	struct listen_sock *lopt = queue->listen_opt;
 
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6460eec834b..9601fd7f9d6 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,8 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
 		protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
 		exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
-		ip6_flowlabel.o ipv6_syms.o netfilter.o
+		ip6_flowlabel.o ipv6_syms.o netfilter.o \
+		inet6_connection_sock.o
 
 ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
 	xfrm6_output.o
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
new file mode 100644
index 00000000000..04ff44344f9
--- /dev/null
+++ b/net/ipv6/inet6_connection_sock.c
@@ -0,0 +1,96 @@
+/*
+ * INET        An implementation of the TCP/IP protocol suite for the LINUX
+ *             operating system.  INET is implemented using the  BSD Socket
+ *             interface as the means of communication with the user level.
+ *
+ *             Support for INET6 connection oriented protocols.
+ *
+ * Authors:    See the TCPv6 sources
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/jhash.h>
+
+#include <net/addrconf.h>
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+
+/*
+ * request_sock (formerly open request) hash tables.
+ */
+static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport,
+			   const u32 rnd, const u16 synq_hsize)
+{
+	u32 a = raddr->s6_addr32[0];
+	u32 b = raddr->s6_addr32[1];
+	u32 c = raddr->s6_addr32[2];
+
+	a += JHASH_GOLDEN_RATIO;
+	b += JHASH_GOLDEN_RATIO;
+	c += rnd;
+	__jhash_mix(a, b, c);
+
+	a += raddr->s6_addr32[3];
+	b += (u32)rport;
+	__jhash_mix(a, b, c);
+
+	return c & (synq_hsize - 1);
+}
+
+struct request_sock *inet6_csk_search_req(const struct sock *sk,
+					  struct request_sock ***prevp,
+					  const __u16 rport,
+					  const struct in6_addr *raddr,
+					  const struct in6_addr *laddr,
+					  const int iif)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	struct request_sock *req, **prev;
+
+	for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport,
+						     lopt->hash_rnd,
+						     lopt->nr_table_entries)];
+	     (req = *prev) != NULL;
+	     prev = &req->dl_next) {
+		const struct tcp6_request_sock *treq = tcp6_rsk(req);
+
+		if (inet_rsk(req)->rmt_port == rport &&
+		    req->rsk_ops->family == AF_INET6 &&
+		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
+		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
+		    (!treq->iif || treq->iif == iif)) {
+			BUG_TRAP(req->sk == NULL);
+			*prevp = prev;
+			return req;
+		}
+	}
+
+	return NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_search_req);
+
+void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
+				    struct request_sock *req,
+				    const unsigned long timeout)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	const u32 h = inet6_synq_hash(&tcp6_rsk(req)->rmt_addr,
+				      inet_rsk(req)->rmt_port,
+				      lopt->hash_rnd, lopt->nr_table_entries);
+
+	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+	inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bf41f84d669..5a10d30cec4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -48,6 +48,7 @@
 #include <net/tcp.h>
 #include <net/ndisc.h>
 #include <net/inet6_hashtables.h>
+#include <net/inet6_connection_sock.h>
 #include <net/ipv6.h>
 #include <net/transp_v6.h>
 #include <net/addrconf.h>
@@ -118,60 +119,6 @@ static void tcp_v6_hash(struct sock *sk)
 	}
 }
 
-/*
- * Open request hash tables.
- */
-
-static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
-{
-	u32 a, b, c;
-
-	a = raddr->s6_addr32[0];
-	b = raddr->s6_addr32[1];
-	c = raddr->s6_addr32[2];
-
-	a += JHASH_GOLDEN_RATIO;
-	b += JHASH_GOLDEN_RATIO;
-	c += rnd;
-	__jhash_mix(a, b, c);
-
-	a += raddr->s6_addr32[3];
-	b += (u32) rport;
-	__jhash_mix(a, b, c);
-
-	return c & (TCP_SYNQ_HSIZE - 1);
-}
-
-static struct request_sock *tcp_v6_search_req(const struct sock *sk,
-					      struct request_sock ***prevp,
-					      __u16 rport,
-					      struct in6_addr *raddr,
-					      struct in6_addr *laddr,
-					      int iif)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	struct request_sock *req, **prev;  
-
-	for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
-	     (req = *prev) != NULL;
-	     prev = &req->dl_next) {
-		const struct tcp6_request_sock *treq = tcp6_rsk(req);
-
-		if (inet_rsk(req)->rmt_port == rport &&
-		    req->rsk_ops->family == AF_INET6 &&
-		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
-		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
-		    (!treq->iif || treq->iif == iif)) {
-			BUG_TRAP(req->sk == NULL);
-			*prevp = prev;
-			return req;
-		}
-	}
-
-	return NULL;
-}
-
 static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len,
 				   struct in6_addr *saddr, 
 				   struct in6_addr *daddr, 
@@ -662,8 +609,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (sock_owned_by_user(sk))
 			goto out;
 
-		req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr,
-					&hdr->saddr, inet6_iif(skb));
+		req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
+					   &hdr->saddr, inet6_iif(skb));
 		if (!req)
 			goto out;
 
@@ -978,8 +925,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 	struct sock *nsk;
 
 	/* Find possible connection requests. */
-	req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr,
-				&skb->nh.ipv6h->daddr, inet6_iif(skb));
+	req = inet6_csk_search_req(sk, &prev, th->source,
+				   &skb->nh.ipv6h->saddr,
+				   &skb->nh.ipv6h->daddr, inet6_iif(skb));
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
@@ -1003,17 +951,6 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 	return sk;
 }
 
-static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
-{
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
-	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
-	inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
-}
-
-
 /* FIXME: this is substantially similar to the ipv4 code.
  * Can some kind of merge be done? -- erics
  */
@@ -1083,8 +1020,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (tcp_v6_send_synack(sk, req, NULL))
 		goto drop;
 
-	tcp_v6_synq_add(sk, req);
-
+	inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	return 0;
 
 drop:
-- 
cgit v1.2.3


From ca304b6104ffdd120bb6687a88a0625e58bc71cd Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:15:40 -0800
Subject: [IPV6]: Introduce inet6_rsk()

And inet6_rsk_offset in inet_request_sock, for the same reasons as
inet_sock's pinfo6 member.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ip.h               |  4 ++++
 include/linux/ipv6.h             | 39 +++++++++++++++++++++++++++++++++------
 net/ipv4/inet_diag.c             |  8 ++++----
 net/ipv6/inet6_connection_sock.c |  4 ++--
 net/ipv6/tcp_ipv6.c              | 19 +++++++++----------
 5 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/include/linux/ip.h b/include/linux/ip.h
index 33e8a19a1a0..5a560daeade 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -110,6 +110,10 @@ struct ip_options {
 
 struct inet_request_sock {
 	struct request_sock	req;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	u16			inet6_rsk_offset;
+	/* 2 bytes hole, try to pack */
+#endif
 	u32			loc_addr;
 	u32			rmt_addr;
 	u16			rmt_port;
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index e0b922785d9..7d3e86d9576 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -199,18 +199,17 @@ static inline int inet6_iif(const struct sk_buff *skb)
 	return IP6CB(skb)->iif;
 }
 
-struct tcp6_request_sock {
-	struct tcp_request_sock	req;
+struct inet6_request_sock {
 	struct in6_addr		loc_addr;
 	struct in6_addr		rmt_addr;
 	struct sk_buff		*pktopts;
 	int			iif;
 };
 
-static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk)
-{
-	return (struct tcp6_request_sock *)sk;
-}
+struct tcp6_request_sock {
+	struct tcp_request_sock	  tcp6rsk_tcp;
+	struct inet6_request_sock tcp6rsk_inet6;
+};
 
 /**
  * struct ipv6_pinfo - ipv6 private area
@@ -304,6 +303,28 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 	return inet_sk(__sk)->pinet6;
 }
 
+static inline struct inet6_request_sock *
+			inet6_rsk(const struct request_sock *rsk)
+{
+	return (struct inet6_request_sock *)(((u8 *)rsk) +
+					     inet_rsk(rsk)->inet6_rsk_offset);
+}
+
+static inline u32 inet6_rsk_offset(struct request_sock *rsk)
+{
+	return rsk->rsk_ops->obj_size - sizeof(struct inet6_request_sock);
+}
+
+static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
+{
+	struct request_sock *req = reqsk_alloc(ops);
+
+	if (req != NULL)
+		inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req);
+
+	return req;
+}
+
 static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 {
 	return (struct raw6_sock *)sk;
@@ -361,6 +382,12 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 	return NULL;
 }
 
+static inline struct inet6_request_sock *
+			inet6_rsk(const struct request_sock *rsk)
+{
+	return NULL;
+}
+
 static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 {
 	return NULL;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 39061ed53cf..3ce73b141d7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 	if (r->idiag_family == AF_INET6) {
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-			       &tcp6_rsk(req)->loc_addr);
+			       &inet6_rsk(req)->loc_addr);
 		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-			       &tcp6_rsk(req)->rmt_addr);
+			       &inet6_rsk(req)->rmt_addr);
 	}
 #endif
 	nlh->nlmsg_len = skb->tail - b;
@@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 				entry.saddr =
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->loc_addr.s6_addr32 :
+					inet6_rsk(req)->loc_addr.s6_addr32 :
 #endif
 					&ireq->loc_addr;
 				entry.daddr = 
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->rmt_addr.s6_addr32 :
+					inet6_rsk(req)->rmt_addr.s6_addr32 :
 #endif
 					&ireq->rmt_addr;
 				entry.dport = ntohs(ireq->rmt_port);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 04ff44344f9..fe874eeaa40 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -61,7 +61,7 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk,
 						     lopt->nr_table_entries)];
 	     (req = *prev) != NULL;
 	     prev = &req->dl_next) {
-		const struct tcp6_request_sock *treq = tcp6_rsk(req);
+		const struct inet6_request_sock *treq = inet6_rsk(req);
 
 		if (inet_rsk(req)->rmt_port == rport &&
 		    req->rsk_ops->family == AF_INET6 &&
@@ -85,7 +85,7 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
-	const u32 h = inet6_synq_hash(&tcp6_rsk(req)->rmt_addr,
+	const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr,
 				      inet_rsk(req)->rmt_port,
 				      lopt->hash_rnd, lopt->nr_table_entries);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5a10d30cec4..c2472d77166 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -656,7 +656,7 @@ out:
 static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 			      struct dst_entry *dst)
 {
-	struct tcp6_request_sock *treq = tcp6_rsk(req);
+	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff * skb;
 	struct ipv6_txoptions *opt = NULL;
@@ -722,8 +722,8 @@ done:
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
-	if (tcp6_rsk(req)->pktopts)
-		kfree_skb(tcp6_rsk(req)->pktopts);
+	if (inet6_rsk(req)->pktopts)
+		kfree_skb(inet6_rsk(req)->pktopts);
 }
 
 static struct request_sock_ops tcp6_request_sock_ops = {
@@ -956,7 +956,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
  */
 static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp6_request_sock *treq;
+	struct inet6_request_sock *treq;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -981,7 +981,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
-	req = reqsk_alloc(&tcp6_request_sock_ops);
+	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
 	if (req == NULL)
 		goto drop;
 
@@ -994,7 +994,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	treq = tcp6_rsk(req);
+	treq = inet6_rsk(req);
 	ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
 	ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
 	TCP_ECN_create_request(req, skb->h.th);
@@ -1035,7 +1035,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 					  struct request_sock *req,
 					  struct dst_entry *dst)
 {
-	struct tcp6_request_sock *treq = tcp6_rsk(req);
+	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct tcp6_sock *newtcp6sk;
 	struct inet_sock *newinet;
@@ -1723,14 +1723,13 @@ static int tcp_v6_destroy_sock(struct sock *sk)
 static void get_openreq6(struct seq_file *seq, 
 			 struct sock *sk, struct request_sock *req, int i, int uid)
 {
-	struct in6_addr *dest, *src;
 	int ttd = req->expires - jiffies;
+	struct in6_addr *src = &inet6_rsk(req)->loc_addr;
+	struct in6_addr *dest = &inet6_rsk(req)->rmt_addr;
 
 	if (ttd < 0)
 		ttd = 0;
 
-	src = &tcp6_rsk(req)->loc_addr;
-	dest = &tcp6_rsk(req)->rmt_addr;
 	seq_printf(seq,
 		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
-- 
cgit v1.2.3


From 8292a17a399ffb7c5c8b083db4ad994e090055f7 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:15:52 -0800
Subject: [ICSK]: Rename struct tcp_func to struct inet_connection_sock_af_ops

And move it to struct inet_connection_sock. DCCP will use it in the
upcoming changesets.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h                |  2 --
 include/net/inet_connection_sock.h | 26 ++++++++++++++++++++
 include/net/tcp.h                  | 50 +-------------------------------------
 include/net/transp_v6.h            |  2 +-
 net/ipv4/syncookies.c              |  4 +--
 net/ipv4/tcp.c                     |  8 +++---
 net/ipv4/tcp_input.c               | 11 +++++----
 net/ipv4/tcp_ipv4.c                | 11 ++++-----
 net/ipv4/tcp_minisocks.c           |  8 +++---
 net/ipv4/tcp_output.c              | 17 ++++++-------
 net/ipv6/ipv6_sockglue.c           |  2 +-
 net/ipv6/tcp_ipv6.c                | 28 ++++++++++-----------
 12 files changed, 71 insertions(+), 98 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 0e1da6602e0..4e1434007f4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -295,8 +295,6 @@ struct tcp_sock {
 
 	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here */
 
-	struct tcp_func		*af_specific;	/* Operations which are AF_INET{4,6} specific	*/
-
  	__u32	rcv_wnd;	/* Current receiver window		*/
 	__u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
 	__u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index ccc81a1c550..9e20d201e95 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -15,6 +15,7 @@
 #ifndef _INET_CONNECTION_SOCK_H
 #define _INET_CONNECTION_SOCK_H
 
+#include <linux/compiler.h>
 #include <linux/ip.h>
 #include <linux/string.h>
 #include <linux/timer.h>
@@ -29,6 +30,29 @@ struct inet_bind_bucket;
 struct inet_hashinfo;
 struct tcp_congestion_ops;
 
+/*
+ * Pointers to address related TCP functions
+ * (i.e. things that depend on the address family)
+ */
+struct inet_connection_sock_af_ops {
+	int	    (*queue_xmit)(struct sk_buff *skb, int ipfragok);
+	void	    (*send_check)(struct sock *sk, int len,
+				  struct sk_buff *skb);
+	int	    (*rebuild_header)(struct sock *sk);
+	int	    (*conn_request)(struct sock *sk, struct sk_buff *skb);
+	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
+				      struct request_sock *req,
+				      struct dst_entry *dst);
+	int	    (*remember_stamp)(struct sock *sk);
+	__u16	    net_header_len;
+	int	    (*setsockopt)(struct sock *sk, int level, int optname, 
+				  char __user *optval, int optlen);
+	int	    (*getsockopt)(struct sock *sk, int level, int optname, 
+				  char __user *optval, int __user *optlen);
+	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
+	int sockaddr_len;
+};
+
 /** inet_connection_sock - INET connection oriented sock
  *
  * @icsk_accept_queue:	   FIFO of established children 
@@ -37,6 +61,7 @@ struct tcp_congestion_ops;
  * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_rto:		   Retransmit timeout
  * @icsk_ca_ops		   Pluggable congestion control hook
+ * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
  * @icsk_pending:	   Scheduled timer event
@@ -55,6 +80,7 @@ struct inet_connection_sock {
  	struct timer_list	  icsk_delack_timer;
 	__u32			  icsk_rto;
 	struct tcp_congestion_ops *icsk_ca_ops;
+	struct inet_connection_sock_af_ops *icsk_af_ops;
 	__u8			  icsk_ca_state;
 	__u8			  icsk_retransmits;
 	__u8			  icsk_pending;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d78025f9fbe..83b117a25c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -224,53 +224,6 @@ extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
 extern int tcp_memory_pressure;
 
-/*
- *	Pointers to address related TCP functions
- *	(i.e. things that depend on the address family)
- */
-
-struct tcp_func {
-	int			(*queue_xmit)		(struct sk_buff *skb,
-							 int ipfragok);
-
-	void			(*send_check)		(struct sock *sk,
-							 struct tcphdr *th,
-							 int len,
-							 struct sk_buff *skb);
-
-	int			(*rebuild_header)	(struct sock *sk);
-
-	int			(*conn_request)		(struct sock *sk,
-							 struct sk_buff *skb);
-
-	struct sock *		(*syn_recv_sock)	(struct sock *sk,
-							 struct sk_buff *skb,
-							 struct request_sock *req,
-							 struct dst_entry *dst);
-    
-	int			(*remember_stamp)	(struct sock *sk);
-
-	__u16			net_header_len;
-
-	int			(*setsockopt)		(struct sock *sk, 
-							 int level, 
-							 int optname, 
-							 char __user *optval, 
-							 int optlen);
-
-	int			(*getsockopt)		(struct sock *sk, 
-							 int level, 
-							 int optname, 
-							 char __user *optval, 
-							 int __user *optlen);
-
-
-	void			(*addr2sockaddr)	(struct sock *sk,
-							 struct sockaddr *);
-
-	int sockaddr_len;
-};
-
 /*
  * The next routines deal with comparing 32 bit unsigned ints
  * and worry about wraparound (automatic with unsigned arithmetic).
@@ -405,8 +358,7 @@ extern void			tcp_parse_options(struct sk_buff *skb,
  *	TCP v4 functions exported for the inet6 API
  */
 
-extern void		       	tcp_v4_send_check(struct sock *sk, 
-						  struct tcphdr *th, int len, 
+extern void		       	tcp_v4_send_check(struct sock *sk, int len,
 						  struct sk_buff *skb);
 
 extern int			tcp_v4_conn_request(struct sock *sk,
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index 4e86f2de663..61f724c1036 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -44,7 +44,7 @@ extern int			datagram_send_ctl(struct msghdr *msg,
 /*
  *	address family specific functions
  */
-extern struct tcp_func	ipv4_specific;
+extern struct inet_connection_sock_af_ops ipv4_specific;
 
 extern int inet6_destroy_sock(struct sock *sk);
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a34e60ea48a..e20be3331f6 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 					   struct request_sock *req,
 					   struct dst_entry *dst)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sock *child;
 
-	child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
 	if (child)
 		inet_csk_reqsk_queue_add(sk, req, child);
 	else
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ef98b14ac56..eacfe6a3442 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	int err = 0;
 
 	if (level != SOL_TCP)
-		return tp->af_specific->setsockopt(sk, level, optname,
-						   optval, optlen);
+		return icsk->icsk_af_ops->setsockopt(sk, level, optname,
+						     optval, optlen);
 
 	/* This is a string value all the others are int's */
 	if (optname == TCP_CONGESTION) {
@@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	int val, len;
 
 	if (level != SOL_TCP)
-		return tp->af_specific->getsockopt(sk, level, optname,
-						   optval, optlen);
+		return icsk->icsk_af_ops->getsockopt(sk, level, optname,
+						     optval, optlen);
 
 	if (get_user(len, optlen))
 		return -EFAULT;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bf2e23086bc..7de6184d4bd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4071,8 +4071,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
 
+		icsk = inet_csk(sk);
+
 		/* Make sure socket is routed, for correct metrics.  */
-		tp->af_specific->rebuild_header(sk);
+		icsk->icsk_af_ops->rebuild_header(sk);
 
 		tcp_init_metrics(sk);
 
@@ -4098,8 +4100,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			sk_wake_async(sk, 0, POLL_OUT);
 		}
 
-		icsk = inet_csk(sk);
-
 		if (sk->sk_write_pending ||
 		    icsk->icsk_accept_queue.rskq_defer_accept ||
 		    icsk->icsk_ack.pingpong) {
@@ -4220,6 +4220,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			  struct tcphdr *th, unsigned len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -4236,7 +4237,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			goto discard;
 
 		if(th->syn) {
-			if(tp->af_specific->conn_request(sk, skb) < 0)
+			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
 				return 1;
 
 			/* Now we have several options: In theory there is 
@@ -4349,7 +4350,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				/* Make sure socket is routed, for
 				 * correct metrics.
 				 */
-				tp->af_specific->rebuild_header(sk);
+				icsk->icsk_af_ops->rebuild_header(sk);
 
 				tcp_init_metrics(sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2aa19c89a94..704cf210579 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -86,8 +86,7 @@ int sysctl_tcp_low_latency;
 /* Socket used for sending RSTs */
 static struct socket *tcp_socket;
 
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
-		       struct sk_buff *skb);
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
 
 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
@@ -645,10 +644,10 @@ out:
 }
 
 /* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
-		       struct sk_buff *skb)
+void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct tcphdr *th = skb->h.th;
 
 	if (skb->ip_summed == CHECKSUM_HW) {
 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
@@ -1383,7 +1382,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 	return 0;
 }
 
-struct tcp_func ipv4_specific = {
+struct inet_connection_sock_af_ops ipv4_specific = {
 	.queue_xmit	=	ip_queue_xmit,
 	.send_check	=	tcp_v4_send_check,
 	.rebuild_header	=	inet_sk_rebuild_header,
@@ -1434,7 +1433,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
-	tp->af_specific = &ipv4_specific;
+	icsk->icsk_af_ops = &ipv4_specific;
 
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1b66a2ac432..9c029683a62 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -274,18 +274,18 @@ kill:
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
 	struct inet_timewait_sock *tw = NULL;
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	int recycle_ok = 0;
 
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
-		recycle_ok = tp->af_specific->remember_stamp(sk);
+		recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
 
 	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
 		tw = inet_twsk_alloc(sk, state);
 
 	if (tw != NULL) {
 		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
-		const struct inet_connection_sock *icsk = inet_csk(sk);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
@@ -456,7 +456,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 			   struct request_sock **prev)
 {
 	struct tcphdr *th = skb->h.th;
-	struct tcp_sock *tp = tcp_sk(sk);
 	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
 	int paws_reject = 0;
 	struct tcp_options_received tmp_opt;
@@ -613,7 +612,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		 * ESTABLISHED STATE. If it will be dropped after
 		 * socket is created, wait for troubles.
 		 */
-		child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+		child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb,
+								 req, NULL);
 		if (child == NULL)
 			goto listen_overflow;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b7325e0b406..af1946c52c3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -371,7 +371,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_ECN_send(sk, tp, skb, tcp_header_size);
 	}
 
-	tp->af_specific->send_check(sk, th, skb->len, skb);
+	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
 
 	if (likely(tcb->flags & TCPCB_FLAG_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
@@ -381,7 +381,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
-	err = tp->af_specific->queue_xmit(skb, 0);
+	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
 	if (unlikely(err <= 0))
 		return err;
 
@@ -638,12 +638,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mss_now;
-
 	/* Calculate base mss without TCP options:
 	   It is MMS_S - sizeof(tcphdr) of rfc1122
 	 */
-	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
+		       sizeof(struct tcphdr));
 
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
@@ -705,9 +704,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 	xmit_size_goal = mss_now;
 
 	if (doing_tso) {
-		xmit_size_goal = 65535 -
-			tp->af_specific->net_header_len -
-			tp->ext_header_len - tp->tcp_header_len;
+		xmit_size_goal = (65535 -
+				  inet_csk(sk)->icsk_af_ops->net_header_len -
+				  tp->ext_header_len - tp->tcp_header_len);
 
 		if (tp->max_window &&
 		    (xmit_size_goal > (tp->max_window >> 1)))
@@ -1422,7 +1421,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	   (sysctl_tcp_retrans_collapse != 0))
 		tcp_retrans_try_collapse(sk, skb, cur_mss);
 
-	if(tp->af_specific->rebuild_header(sk))
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
 
 	/* Some Solaris stacks overoptimize and ignore the FIN on a
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3620718defe..b6b63fa8454 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -170,7 +170,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 				sock_prot_inc_use(&tcp_prot);
 				local_bh_enable();
 				sk->sk_prot = &tcp_prot;
-				tp->af_specific = &ipv4_specific;
+				inet_csk(sk)->icsk_af_ops = &ipv4_specific;
 				sk->sk_socket->ops = &inet_stream_ops;
 				sk->sk_family = PF_INET;
 				tcp_sync_mss(sk, tp->pmtu_cookie);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c2472d77166..8ce8a1359d2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -68,14 +68,14 @@
 
 static void	tcp_v6_send_reset(struct sk_buff *skb);
 static void	tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
-static void	tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 
+static void	tcp_v6_send_check(struct sock *sk, int len, 
 				  struct sk_buff *skb);
 
 static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
 static int	tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
 
-static struct tcp_func ipv6_mapped;
-static struct tcp_func ipv6_specific;
+static struct inet_connection_sock_af_ops ipv6_mapped;
+static struct inet_connection_sock_af_ops ipv6_specific;
 
 int inet6_csk_bind_conflict(const struct sock *sk,
 			    const struct inet_bind_bucket *tb)
@@ -107,9 +107,7 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 static void tcp_v6_hash(struct sock *sk)
 {
 	if (sk->sk_state != TCP_CLOSE) {
-		struct tcp_sock *tp = tcp_sk(sk);
-
-		if (tp->af_specific == &ipv6_mapped) {
+		if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
 			tcp_prot.hash(sk);
 			return;
 		}
@@ -417,14 +415,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_port = usin->sin6_port;
 		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
-		tp->af_specific = &ipv6_mapped;
+		inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
 		sk->sk_backlog_rcv = tcp_v4_do_rcv;
 
 		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 
 		if (err) {
 			tp->ext_header_len = exthdrlen;
-			tp->af_specific = &ipv6_specific;
+			inet_csk(sk)->icsk_af_ops = &ipv6_specific;
 			sk->sk_backlog_rcv = tcp_v6_do_rcv;
 			goto failure;
 		} else {
@@ -751,10 +749,10 @@ static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
 }
 
 
-static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 
-			      struct sk_buff *skb)
+static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct tcphdr *th = skb->h.th;
 
 	if (skb->ip_summed == CHECKSUM_HW) {
 		th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP,  0);
@@ -1070,7 +1068,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 		ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
 
-		newtp->af_specific = &ipv6_mapped;
+		inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
 		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
 		newnp->pktoptions  = NULL;
 		newnp->opt	   = NULL;
@@ -1084,7 +1082,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		 */
 
 		/* It is tricky place. Until this moment IPv4 tcp
-		   worked with IPv6 af_tcp.af_specific.
+		   worked with IPv6 icsk.icsk_af_ops.
 		   Sync it now.
 		 */
 		tcp_sync_mss(newsk, newtp->pmtu_cookie);
@@ -1631,7 +1629,7 @@ static int tcp_v6_remember_stamp(struct sock *sk)
 	return 0;
 }
 
-static struct tcp_func ipv6_specific = {
+static struct inet_connection_sock_af_ops ipv6_specific = {
 	.queue_xmit	=	tcp_v6_xmit,
 	.send_check	=	tcp_v6_send_check,
 	.rebuild_header	=	tcp_v6_rebuild_header,
@@ -1650,7 +1648,7 @@ static struct tcp_func ipv6_specific = {
  *	TCP over IPv4 via INET6 API
  */
 
-static struct tcp_func ipv6_mapped = {
+static struct inet_connection_sock_af_ops ipv6_mapped = {
 	.queue_xmit	=	ip_queue_xmit,
 	.send_check	=	tcp_v4_send_check,
 	.rebuild_header	=	inet_sk_rebuild_header,
@@ -1700,7 +1698,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 	sk->sk_state = TCP_CLOSE;
 
-	tp->af_specific = &ipv6_specific;
+	icsk->icsk_af_ops = &ipv6_specific;
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-- 
cgit v1.2.3


From af05dc9394feb193d221bc9d4c6db768facb4b40 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:16:04 -0800
Subject: [ICSK]: Move v4_addr2sockaddr from TCP to icsk

Renaming it to inet_csk_addr2sockaddr.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  2 ++
 net/ipv4/inet_connection_sock.c    | 12 ++++++++++++
 net/ipv4/tcp_ipv4.c                | 12 +-----------
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 9e20d201e95..e50e2b890c6 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -303,4 +303,6 @@ static inline unsigned int inet_csk_listen_poll(const struct sock *sk)
 extern int  inet_csk_listen_start(struct sock *sk, const int nr_table_entries);
 extern void inet_csk_listen_stop(struct sock *sk);
 
+extern void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
+
 #endif /* _INET_CONNECTION_SOCK_H */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index e2bf508ed77..ae20281d8de 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -636,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk)
 }
 
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
+
+void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	const struct inet_sock *inet = inet_sk(sk);
+
+	sin->sin_family		= AF_INET;
+	sin->sin_addr.s_addr	= inet->daddr;
+	sin->sin_port		= inet->dport;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 704cf210579..0b5ab04d3c5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1314,16 +1314,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
-	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
-	struct inet_sock *inet = inet_sk(sk);
-
-	sin->sin_family		= AF_INET;
-	sin->sin_addr.s_addr	= inet->daddr;
-	sin->sin_port		= inet->dport;
-}
-
 /* VJ's idea. Save last timestamp seen from this destination
  * and hold it at least for normal timewait interval to use for duplicate
  * segment detection in subsequent connections, before they enter synchronized
@@ -1392,7 +1382,7 @@ struct inet_connection_sock_af_ops ipv4_specific = {
 	.net_header_len	=	sizeof(struct iphdr),
 	.setsockopt	=	ip_setsockopt,
 	.getsockopt	=	ip_getsockopt,
-	.addr2sockaddr	=	v4_addr2sockaddr,
+	.addr2sockaddr	=	inet_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in),
 };
 
-- 
cgit v1.2.3


From 57cca05af1e20fdc65b55be52c042c234f86c866 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:16:16 -0800
Subject: [DCCP]: Introduce dccp_ipv4_af_ops

And make the core DCCP code AF agnostic, just like TCP, now its time
to work on net/dccp/ipv6.c, we are close to the end!

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/input.c     |  5 +++--
 net/dccp/ipv4.c      | 23 +++++++++++++++++++++++
 net/dccp/minisocks.c |  2 +-
 net/dccp/output.c    | 14 ++++++--------
 net/dccp/proto.c     |  9 ++++++---
 5 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 3454d594190..c81488fa293 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -329,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		dccp_set_state(sk, DCCP_PARTOPEN);
 
 		/* Make sure socket is routed, for correct metrics. */
-		inet_sk_rebuild_header(sk);
+		icsk->icsk_af_ops->rebuild_header(sk);
 
 		if (!sock_flag(sk, SOCK_DEAD)) {
 			sk->sk_state_change(sk);
@@ -444,7 +444,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	 */
 	if (sk->sk_state == DCCP_LISTEN) {
 		if (dh->dccph_type == DCCP_PKT_REQUEST) {
-			if (dccp_v4_conn_request(sk, skb) < 0)
+			if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
+								    skb) < 0)
 				return 1;
 
 			/* FIXME: do congestion control initialization */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 1ac3e30ae79..0ce7d0fe5ee 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -607,6 +607,15 @@ out:
 	sock_put(sk);
 }
 
+/* This routine computes an IPv4 DCCP checksum. */
+static void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct dccp_hdr *dh = dccp_hdr(skb);
+
+	dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr);
+}
+
 int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
 {
 	struct sk_buff *skb;
@@ -1195,6 +1204,19 @@ do_time_wait:
 	goto no_dccp_socket;
 }
 
+struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
+	.queue_xmit	= ip_queue_xmit,
+	.send_check	= dccp_v4_send_check,
+	.rebuild_header	= inet_sk_rebuild_header,
+	.conn_request	= dccp_v4_conn_request,
+	.syn_recv_sock	= dccp_v4_request_recv_sock,
+	.net_header_len	= sizeof(struct iphdr),
+	.setsockopt	= ip_setsockopt,
+	.getsockopt	= ip_getsockopt,
+	.addr2sockaddr	= inet_csk_addr2sockaddr,
+	.sockaddr_len	= sizeof(struct sockaddr_in),
+};
+
 static int dccp_v4_init_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -1240,6 +1262,7 @@ static int dccp_v4_init_sock(struct sock *sk)
 	inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
 	sk->sk_state = DCCP_CLOSED;
 	sk->sk_write_space = dccp_write_space;
+	inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
 	dp->dccps_mss_cache = 536;
 	dp->dccps_role = DCCP_ROLE_UNDEFINED;
 	dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 1393461898b..c7ff80cf53a 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -214,7 +214,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 		goto drop;
 	}
 
-	child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
 	if (child == NULL)
 		goto listen_overflow;
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 74ff8702587..f35880503bb 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -43,6 +43,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	if (likely(skb != NULL)) {
 		const struct inet_sock *inet = inet_sk(sk);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		struct dccp_sock *dp = dccp_sk(sk);
 		struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
 		struct dccp_hdr *dh;
@@ -108,8 +109,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 			break;
 		}
 
-		dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
-						      inet->daddr);
+		icsk->icsk_af_ops->send_check(sk, skb->len, skb);
 
 		if (set_ack)
 			dccp_event_ack_sent(sk);
@@ -117,7 +117,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
 
 		memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-		err = ip_queue_xmit(skb, 0);
+		err = icsk->icsk_af_ops->queue_xmit(skb, 0);
 		if (err <= 0)
 			return err;
 
@@ -135,16 +135,14 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	int mss_now;
-
 	/*
 	 * FIXME: we really should be using the af_specific thing to support
 	 * 	  IPv6.
 	 * mss_now = pmtu - tp->af_specific->net_header_len -
 	 * 	     sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
 	 */
-	mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
-		  sizeof(struct dccp_hdr_ext);
+	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
+		       sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
 
 	/* Now subtract optional transport overhead */
 	mss_now -= dp->dccps_ext_header_len;
@@ -266,7 +264,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
 
 int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
-	if (inet_sk_rebuild_header(sk) != 0)
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
 		return -EHOSTUNREACH; /* Routing failure or similar. */
 
 	return dccp_transmit_skb(sk, (skb_cloned(skb) ?
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8a6b2a9e458..7b30c12147c 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -254,7 +254,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
 	int val;
 
 	if (level != SOL_DCCP)
-		return ip_setsockopt(sk, level, optname, optval, optlen);
+		return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
+							     optname, optval,
+							     optlen);
 
 	if (optlen < sizeof(int))
 		return -EINVAL;
@@ -320,8 +322,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
 	int val, len;
 
 	if (level != SOL_DCCP)
-		return ip_getsockopt(sk, level, optname, optval, optlen);
-
+		return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
+							     optname, optval,
+							     optlen);
 	if (get_user(len, optlen))
 		return -EFAULT;
 
-- 
cgit v1.2.3


From 3305b80c214c642b89cd5c21af83bc91ec13f8bd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 13 Dec 2005 23:16:37 -0800
Subject: [IP]: Simplify and consolidate MSG_PEEK error handling

When a packet is obtained from skb_recv_datagram with MSG_PEEK enabled
it is left on the socket receive queue.  This means that when we detect
a checksum error we have to be careful when trying to free the packet
as someone could have dequeued it in the time being.

Currently this delicate logic is duplicated three times between UDPv4,
UDPv6 and RAWv6.  This patch moves them into a one place and simplifies
the code somewhat.

This is based on a suggestion by Eric Dumazet.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 ++
 net/core/datagram.c    | 36 ++++++++++++++++++++++++++++++++++++
 net/ipv4/udp.c         | 15 +--------------
 net/ipv6/raw.c         | 16 +++-------------
 net/ipv6/udp.c         | 16 ++--------------
 5 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c5d6001a92..97f6580ce03 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1239,6 +1239,8 @@ extern int	       skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
 							int hlen,
 							struct iovec *iov);
 extern void	       skb_free_datagram(struct sock *sk, struct sk_buff *skb);
+extern void	       skb_kill_datagram(struct sock *sk, struct sk_buff *skb,
+					 unsigned int flags);
 extern unsigned int    skb_checksum(const struct sk_buff *skb, int offset,
 				    int len, unsigned int csum);
 extern int	       skb_copy_bits(const struct sk_buff *skb, int offset,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 1bcfef51ac5..f8d322e1ea9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -47,6 +47,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/poll.h>
 #include <linux/highmem.h>
+#include <linux/spinlock.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -199,6 +200,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+/**
+ *	skb_kill_datagram - Free a datagram skbuff forcibly
+ *	@sk: socket
+ *	@skb: datagram skbuff
+ *	@flags: MSG_ flags
+ *
+ *	This function frees a datagram skbuff that was received by
+ *	skb_recv_datagram.  The flags argument must match the one
+ *	used for skb_recv_datagram.
+ *
+ *	If the MSG_PEEK flag is set, and the packet is still on the
+ *	receive queue of the socket, it will be taken off the queue
+ *	before it is freed.
+ *
+ *	This function currently only disables BH when acquiring the
+ *	sk_receive_queue lock.  Therefore it must not be used in a
+ *	context where that lock is acquired in an IRQ context.
+ */
+
+void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
+{
+	if (flags & MSG_PEEK) {
+		spin_lock_bh(&sk->sk_receive_queue.lock);
+		if (skb == skb_peek(&sk->sk_receive_queue)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			atomic_dec(&skb->users);
+		}
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
+	}
+
+	kfree_skb(skb);
+}
+
+EXPORT_SYMBOL(skb_kill_datagram);
+
 /**
  *	skb_copy_datagram_iovec - Copy a datagram to an iovec.
  *	@skb: buffer to copy
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2422a5f7195..012c4621e40 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -846,20 +846,7 @@ out:
 csum_copy_err:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 
-	/* Clear queue. */
-	if (flags&MSG_PEEK) {
-		int clear = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (skb == skb_peek(&sk->sk_receive_queue)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			clear = 1;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		if (clear)
-			kfree_skb(skb);
-	}
-
-	skb_free_datagram(sk, skb);
+	skb_kill_datagram(sk, skb, flags);
 
 	if (noblock)
 		return -EAGAIN;	
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a66900cda2a..66f1d12ea57 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -32,6 +32,7 @@
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
+#include <linux/skbuff.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <asm/bug.h>
@@ -433,25 +434,14 @@ out:
 	return err;
 
 csum_copy_err:
-	/* Clear queue. */
-	if (flags&MSG_PEEK) {
-		int clear = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (skb == skb_peek(&sk->sk_receive_queue)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			clear = 1;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		if (clear)
-			kfree_skb(skb);
-	}
+	skb_kill_datagram(sk, skb, flags);
 
 	/* Error for blocking case is chosen to masquerade
 	   as some normal condition.
 	 */
 	err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
 	/* FIXME: increment a raw6 drops counter here */
-	goto out_free;
+	goto out;
 }
 
 static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5cc8731eb55..d8538dcea81 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -36,6 +36,7 @@
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 #include <linux/init.h>
+#include <linux/skbuff.h>
 #include <asm/uaccess.h>
 
 #include <net/sock.h>
@@ -300,20 +301,7 @@ out:
 	return err;
 
 csum_copy_err:
-	/* Clear queue. */
-	if (flags&MSG_PEEK) {
-		int clear = 0;
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (skb == skb_peek(&sk->sk_receive_queue)) {
-			__skb_unlink(skb, &sk->sk_receive_queue);
-			clear = 1;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		if (clear)
-			kfree_skb(skb);
-	}
-
-	skb_free_datagram(sk, skb);
+	skb_kill_datagram(sk, skb, flags);
 
 	if (flags & MSG_DONTWAIT) {
 		UDP6_INC_STATS_USER(UDP_MIB_INERRORS);
-- 
cgit v1.2.3


From 65a45441d7e91ef6107fadbc55f775db4ba23874 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 13 Dec 2005 23:17:02 -0800
Subject: [UDP]: udp_checksum_init return value

Since udp_checksum_init always returns 0 there is no point in
having it return a value.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 012c4621e40..67c036384e7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1081,7 +1081,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
  * Otherwise, csum completion requires chacksumming packet body,
  * including udp header and folding it to skb->csum.
  */
-static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
+static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
 			     unsigned short ulen, u32 saddr, u32 daddr)
 {
 	if (uh->check == 0) {
@@ -1095,7 +1095,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
 	/* Probably, we should checksum udp header (it should be in cache
 	 * in any case) and data in tiny packets (< rx copybreak).
 	 */
-	return 0;
 }
 
 /*
@@ -1128,8 +1127,7 @@ int udp_rcv(struct sk_buff *skb)
 	if (pskb_trim_rcsum(skb, ulen))
 		goto short_packet;
 
-	if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
-		goto csum_error;
+	udp_checksum_init(skb, uh, ulen, saddr, daddr);
 
 	if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
 		return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
-- 
cgit v1.2.3


From f1f71e03b17db3b9edb0264a8be7719bd5c35582 Mon Sep 17 00:00:00 2001
From: Roberto Nibali <ratz@drugphish.ch>
Date: Tue, 13 Dec 2005 23:17:20 -0800
Subject: [IPVS]: remove dead code

This patch removes dead code. I don't see the reason to keep this cruft
around, besides cluttering the nice and functionally working code.

Signed-off-by: Roberto Nibali <ratz@drugphish.ch>
Signed-off-by: Horms <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipvs/ip_vs_app.c       | 28 ----------------------------
 net/ipv4/ipvs/ip_vs_lblc.c      | 27 ---------------------------
 net/ipv4/ipvs/ip_vs_lblcr.c     | 27 ---------------------------
 net/ipv4/ipvs/ip_vs_proto_tcp.c | 22 ----------------------
 4 files changed, 104 deletions(-)

diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d7eb680101c..9b176a942ac 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
 }
 
 
-#if 0000
-/*
- *	Get reference to app by name (called from user context)
- */
-struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
-{
-	struct ip_vs_app *app, *a = NULL;
-
-	down(&__ip_vs_app_mutex);
-
-	list_for_each_entry(ent, &ip_vs_app_list, a_list) {
-		if (strcmp(app->name, appname))
-			continue;
-
-		/* softirq may call ip_vs_app_get too, so the caller
-		   must disable softirq on the current CPU */
-		if (ip_vs_app_get(app))
-			a = app;
-		break;
-	}
-
-	up(&__ip_vs_app_mutex);
-
-	return a;
-}
-#endif
-
-
 /*
  *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
  */
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 561cda326fa..b6dad7e3710 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -228,33 +228,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
 }
 
 
-#if 0000
-/*
- *	Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
- *	returns bool success.
- */
-static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
-			     struct ip_vs_lblc_entry *en)
-{
-	if (list_empty(&en->list)) {
-		IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
-
-	/*
-	 * Remove it from the table
-	 */
-	write_lock(&tbl->lock);
-	list_del(&en->list);
-	INIT_LIST_HEAD(&en->list);
-	write_unlock(&tbl->lock);
-
-	return 1;
-}
-#endif
-
-
 /*
  *  Get ip_vs_lblc_entry associated with supplied parameters.
  */
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index ce456dbf09a..8c78ef76c12 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -414,33 +414,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
 }
 
 
-#if 0000
-/*
- *	Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
- *	returns bool success.
- */
-static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
-			     struct ip_vs_lblcr_entry *en)
-{
-	if (list_empty(&en->list)) {
-		IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
-
-	/*
-	 * Remove it from the table
-	 */
-	write_lock(&tbl->lock);
-	list_del(&en->list);
-	INIT_LIST_HEAD(&en->list);
-	write_unlock(&tbl->lock);
-
-	return 1;
-}
-#endif
-
-
 /*
  *  Get ip_vs_lblcr_entry associated with supplied parameters.
  */
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 0e878fd6215..638ba8c9e8b 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 	[IP_VS_TCP_S_LAST]		=	2*HZ,
 };
 
-
-#if 0
-
-/* FIXME: This is going to die */
-
-static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
-	[IP_VS_TCP_S_NONE]		=	2*HZ,
-	[IP_VS_TCP_S_ESTABLISHED]	=	8*60*HZ,
-	[IP_VS_TCP_S_SYN_SENT]		=	60*HZ,
-	[IP_VS_TCP_S_SYN_RECV]		=	10*HZ,
-	[IP_VS_TCP_S_FIN_WAIT]		=	60*HZ,
-	[IP_VS_TCP_S_TIME_WAIT]		=	60*HZ,
-	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
-	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
-	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
-	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
-	[IP_VS_TCP_S_SYNACK]		=	100*HZ,
-	[IP_VS_TCP_S_LAST]		=	2*HZ,
-};
-
-#endif
-
 static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
 	[IP_VS_TCP_S_NONE]		=	"NONE",
 	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
-- 
cgit v1.2.3


From c1cbe4b7ad0bc4b1d98ea708a3fecb7362aa4088 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Date: Tue, 13 Dec 2005 23:22:19 -0800
Subject: [NET]: Avoid atomic xchg() for non-error case

It also looks like there were 2 places where the test on sk_err was
missing from the event wait logic (in sk_stream_wait_connect and
sk_stream_wait_memory), while the rest of the sock_error() users look
to be doing the right thing.  This version of the patch fixes those,
and cleans up a few places that were testing ->sk_err directly.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h           |  5 ++++-
 net/bluetooth/af_bluetooth.c |  5 ++---
 net/bluetooth/l2cap.c        |  5 +++--
 net/bluetooth/sco.c          |  5 +++--
 net/core/stream.c            | 10 +++++++---
 net/irda/af_irda.c           |  5 +++--
 net/llc/af_llc.c             |  5 ++---
 7 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 982b4ecd187..0fbae85c6d5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1166,7 +1166,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
  
 static inline int sock_error(struct sock *sk)
 {
-	int err = xchg(&sk->sk_err, 0);
+	int err;
+	if (likely(!sk->sk_err))
+		return 0;
+	err = xchg(&sk->sk_err, 0);
 	return -err;
 }
 
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index ea616e3fc98..fb031fe9be9 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
 		timeo = schedule_timeout(timeo);
 		lock_sock(sk);
 
-		if (sk->sk_err) {
-			err = sock_error(sk);
+		err = sock_error(sk);
+		if (err)
 			break;
-		}
 	}
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(sk->sk_sleep, &wait);
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index e3bb11ca423..95f33cc7a24 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
-	if (sk->sk_err)
-		return sock_error(sk);
+	err = sock_error(sk);
+	if (err)
+		return err;
 
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 9cb00dc6c08..64818143069 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
-	if (sk->sk_err)
-		return sock_error(sk);
+	err = sock_error(sk);
+	if (err)
+		return err;
 
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
diff --git a/net/core/stream.c b/net/core/stream.c
index 15bfd03e802..35e25259fd9 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 	int done;
 
 	do {
-		if (sk->sk_err)
-			return sock_error(sk);
+		int err = sock_error(sk);
+		if (err)
+			return err;
 		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
 			return -EPIPE;
 		if (!*timeo_p)
@@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 		sk->sk_write_pending++;
 		done = sk_wait_event(sk, timeo_p,
+				     !sk->sk_err &&
 				     !((1 << sk->sk_state) & 
 				       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
 		finish_wait(sk->sk_sleep, &wait);
@@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk->sk_write_pending++;
-		sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) &&
+		sk_wait_event(sk, &current_timeo, !sk->sk_err && 
+						  !(sk->sk_shutdown & SEND_SHUTDOWN) &&
+						  sk_stream_memory_free(sk) &&
 						  vm_wait);
 		sk->sk_write_pending--;
 
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 6f92f9c6299..f121f7de203 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
 			/*
 			 *	POSIX 1003.1g mandates this order.
 			 */
-			if (sk->sk_err)
-				ret = sock_error(sk);
+			ret = sock_error(sk);
+			if (ret)
+				break;
 			else if (sk->sk_shutdown & RCV_SHUTDOWN)
 				;
 			else if (noblock)
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index c3f0b078345..b6d3df5c911 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo)
 		/*
 		 * POSIX 1003.1g mandates this order.
 		 */
-		if (sk->sk_err) {
-			rc = sock_error(sk);
+		rc = sock_error(sk);
+		if (rc)
 			break;
-		}
 		rc = 0;
 		if (sk->sk_shutdown & RCV_SHUTDOWN)
 			break;
-- 
cgit v1.2.3


From 830a1e5c212fb3fdc83b66359c780c3b3a294897 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Date: Tue, 13 Dec 2005 23:22:32 -0800
Subject: [AF_UNIX]: Remove superfluous reference counting in
 unix_stream_sendmsg

AF_UNIX stream socket performance on P4 CPUs tends to suffer due to a
lot of pipeline flushes from atomic operations.  The patch below
removes the sock_hold() and sock_put() in unix_stream_sendmsg().  This
should be safe as the socket still holds a reference to its peer which
is only released after the file descriptor's final user invokes
unix_release_sock().  The only consideration is that we must add a
memory barrier before setting the peer initially.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index acc73ba8bad..1dc3685048f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1063,10 +1063,12 @@ restart:
 	/* Set credentials */
 	sk->sk_peercred = other->sk_peercred;
 
-	sock_hold(newsk);
-	unix_peer(sk)	= newsk;
 	sock->state	= SS_CONNECTED;
 	sk->sk_state	= TCP_ESTABLISHED;
+	sock_hold(newsk);
+
+	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
+	unix_peer(sk)	= newsk;
 
 	unix_state_wunlock(sk);
 
@@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	} else {
 		sunaddr = NULL;
 		err = -ENOTCONN;
-		other = unix_peer_get(sk);
+		other = unix_peer(sk);
 		if (!other)
 			goto out_err;
 	}
@@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		other->sk_data_ready(other, size);
 		sent+=size;
 	}
-	sock_put(other);
 
 	scm_destroy(siocb->scm);
 	siocb->scm = NULL;
@@ -1491,8 +1492,6 @@ pipe_err:
 		send_sig(SIGPIPE,current,0);
 	err = -EPIPE;
 out_err:
-        if (other)
-		sock_put(other);
 	scm_destroy(siocb->scm);
 	siocb->scm = NULL;
 	return sent ? : err;
-- 
cgit v1.2.3


From b9750ce13c08aa8a71a9b138d741f3046aefd991 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:22:54 -0800
Subject: [IPV6]: Generalise some functions

Using sk->sk_protocol instead of IPPROTO_TCP.

Will be used by DCCPv6 in the next changesets.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h                |   2 +
 include/net/inet6_connection_sock.h |  13 ++-
 net/ipv6/af_inet6.c                 |  52 ++++++++++++
 net/ipv6/inet6_connection_sock.c    | 103 ++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c                 | 153 +-----------------------------------
 5 files changed, 173 insertions(+), 150 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7d3e86d9576..69a0decfbdf 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -297,6 +297,8 @@ struct tcp6_sock {
 	struct ipv6_pinfo inet6;
 };
 
+extern int inet6_sk_rebuild_header(struct sock *sk);
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
 {
diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index aa30ebde70d..b33b438bffc 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -15,8 +15,15 @@
 
 #include <linux/types.h>
 
-struct sock;
+struct in6_addr;
+struct inet_bind_bucket;
 struct request_sock;
+struct sk_buff;
+struct sock;
+struct sockaddr;
+
+extern int inet6_csk_bind_conflict(const struct sock *sk,
+				   const struct inet_bind_bucket *tb);
 
 extern struct request_sock *inet6_csk_search_req(const struct sock *sk,
 						 struct request_sock ***prevp,
@@ -28,4 +35,8 @@ extern struct request_sock *inet6_csk_search_req(const struct sock *sk,
 extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 					   struct request_sock *req,
 					   const unsigned long timeout);
+
+extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
+
+extern int inet6_csk_xmit(struct sk_buff *skb, int ipfragok);
 #endif /* _INET6_CONNECTION_SOCK_H */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d9546380fa0..fd040e9a1f4 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -609,6 +609,58 @@ inet6_unregister_protosw(struct inet_protosw *p)
 	}
 }
 
+int inet6_sk_rebuild_header(struct sock *sk)
+{
+	int err;
+	struct dst_entry *dst;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	dst = __sk_dst_check(sk, np->dst_cookie);
+
+	if (dst == NULL) {
+		struct inet_sock *inet = inet_sk(sk);
+		struct in6_addr *final_p = NULL, final;
+		struct flowi fl;
+
+		memset(&fl, 0, sizeof(fl));
+		fl.proto = sk->sk_protocol;
+		ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+		fl.fl6_flowlabel = np->flow_label;
+		fl.oif = sk->sk_bound_dev_if;
+		fl.fl_ip_dport = inet->dport;
+		fl.fl_ip_sport = inet->sport;
+
+		if (np->opt && np->opt->srcrt) {
+			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+			ipv6_addr_copy(&final, &fl.fl6_dst);
+			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+			final_p = &final;
+		}
+
+		err = ip6_dst_lookup(sk, &dst, &fl);
+		if (err) {
+			sk->sk_route_caps = 0;
+			return err;
+		}
+		if (final_p)
+			ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+			sk->sk_err_soft = -err;
+			return err;
+		}
+
+		ip6_dst_store(sk, dst, NULL);
+		sk->sk_route_caps = dst->dev->features &
+			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
+
 int
 snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
 {
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index fe874eeaa40..792f90f0f9e 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -21,8 +21,34 @@
 
 #include <net/addrconf.h>
 #include <net/inet_connection_sock.h>
+#include <net/inet_ecn.h>
+#include <net/inet_hashtables.h>
+#include <net/ip6_route.h>
 #include <net/sock.h>
 
+int inet6_csk_bind_conflict(const struct sock *sk,
+			    const struct inet_bind_bucket *tb)
+{
+	const struct sock *sk2;
+	const struct hlist_node *node;
+
+	/* We must walk the whole port owner list in this case. -DaveM */
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		if (sk != sk2 &&
+		    (!sk->sk_bound_dev_if ||
+		     !sk2->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+		    (!sk->sk_reuse || !sk2->sk_reuse ||
+		     sk2->sk_state == TCP_LISTEN) &&
+		     ipv6_rcv_saddr_equal(sk, sk2))
+			break;
+	}
+
+	return node != NULL;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict);
+
 /*
  * request_sock (formerly open request) hash tables.
  */
@@ -94,3 +120,80 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 }
 
 EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
+
+void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
+
+	sin6->sin6_family = AF_INET6;
+	ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
+	sin6->sin6_port	= inet_sk(sk)->dport;
+	/* We do not store received flowlabel for TCP */
+	sin6->sin6_flowinfo = 0;
+	sin6->sin6_scope_id = 0;
+	if (sk->sk_bound_dev_if &&
+	    ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+		sin6->sin6_scope_id = sk->sk_bound_dev_if;
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
+
+int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
+{
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flowi fl;
+	struct dst_entry *dst;
+	struct in6_addr *final_p = NULL, final;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.proto = sk->sk_protocol;
+	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+	fl.fl6_flowlabel = np->flow_label;
+	IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
+	fl.oif = sk->sk_bound_dev_if;
+	fl.fl_ip_sport = inet->sport;
+	fl.fl_ip_dport = inet->dport;
+
+	if (np->opt && np->opt->srcrt) {
+		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+		ipv6_addr_copy(&final, &fl.fl6_dst);
+		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+		final_p = &final;
+	}
+
+	dst = __sk_dst_check(sk, np->dst_cookie);
+
+	if (dst == NULL) {
+		int err = ip6_dst_lookup(sk, &dst, &fl);
+
+		if (err) {
+			sk->sk_err_soft = -err;
+			return err;
+		}
+
+		if (final_p)
+			ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+			sk->sk_route_caps = 0;
+			return err;
+		}
+
+		ip6_dst_store(sk, dst, NULL);
+		sk->sk_route_caps = dst->dev->features &
+			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+	}
+
+	skb->dst = dst_clone(dst);
+
+	/* Restore final destination back after routing done */
+	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+
+	return ip6_xmit(sk, skb, &fl, np->opt, 0);
+}
+
+EXPORT_SYMBOL_GPL(inet6_csk_xmit);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8ce8a1359d2..2f932ce7261 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -72,32 +72,10 @@ static void	tcp_v6_send_check(struct sock *sk, int len,
 				  struct sk_buff *skb);
 
 static int	tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-static int	tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
 
 static struct inet_connection_sock_af_ops ipv6_mapped;
 static struct inet_connection_sock_af_ops ipv6_specific;
 
-int inet6_csk_bind_conflict(const struct sock *sk,
-			    const struct inet_bind_bucket *tb)
-{
-	const struct sock *sk2;
-	const struct hlist_node *node;
-
-	/* We must walk the whole port owner list in this case. -DaveM */
-	sk_for_each_bound(sk2, node, &tb->owners) {
-		if (sk != sk2 &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
-		    (!sk->sk_reuse || !sk2->sk_reuse ||
-		     sk2->sk_state == TCP_LISTEN) &&
-		     ipv6_rcv_saddr_equal(sk, sk2))
-			break;
-	}
-
-	return node != NULL;
-}
-
 static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 {
 	return inet_csk_get_port(&tcp_hashinfo, sk, snum,
@@ -1500,129 +1478,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-static int tcp_v6_rebuild_header(struct sock *sk)
-{
-	int err;
-	struct dst_entry *dst;
-	struct ipv6_pinfo *np = inet6_sk(sk);
-
-	dst = __sk_dst_check(sk, np->dst_cookie);
-
-	if (dst == NULL) {
-		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *final_p = NULL, final;
-		struct flowi fl;
-
-		memset(&fl, 0, sizeof(fl));
-		fl.proto = IPPROTO_TCP;
-		ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
-		fl.fl6_flowlabel = np->flow_label;
-		fl.oif = sk->sk_bound_dev_if;
-		fl.fl_ip_dport = inet->dport;
-		fl.fl_ip_sport = inet->sport;
-
-		if (np->opt && np->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
-
-		err = ip6_dst_lookup(sk, &dst, &fl);
-		if (err) {
-			sk->sk_route_caps = 0;
-			return err;
-		}
-		if (final_p)
-			ipv6_addr_copy(&fl.fl6_dst, final_p);
-
-		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
-			sk->sk_err_soft = -err;
-			return err;
-		}
-
-		ip6_dst_store(sk, dst, NULL);
-		sk->sk_route_caps = dst->dev->features &
-			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
-	}
-
-	return 0;
-}
-
-static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok)
-{
-	struct sock *sk = skb->sk;
-	struct inet_sock *inet = inet_sk(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct flowi fl;
-	struct dst_entry *dst;
-	struct in6_addr *final_p = NULL, final;
-
-	memset(&fl, 0, sizeof(fl));
-	fl.proto = IPPROTO_TCP;
-	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
-	fl.fl6_flowlabel = np->flow_label;
-	IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel);
-	fl.oif = sk->sk_bound_dev_if;
-	fl.fl_ip_sport = inet->sport;
-	fl.fl_ip_dport = inet->dport;
-
-	if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
-
-	dst = __sk_dst_check(sk, np->dst_cookie);
-
-	if (dst == NULL) {
-		int err = ip6_dst_lookup(sk, &dst, &fl);
-
-		if (err) {
-			sk->sk_err_soft = -err;
-			return err;
-		}
-
-		if (final_p)
-			ipv6_addr_copy(&fl.fl6_dst, final_p);
-
-		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
-			sk->sk_route_caps = 0;
-			return err;
-		}
-
-		ip6_dst_store(sk, dst, NULL);
-		sk->sk_route_caps = dst->dev->features &
-			~(NETIF_F_IP_CSUM | NETIF_F_TSO);
-	}
-
-	skb->dst = dst_clone(dst);
-
-	/* Restore final destination back after routing done */
-	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
-
-	return ip6_xmit(sk, skb, &fl, np->opt, 0);
-}
-
-static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
-{
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
-
-	sin6->sin6_family = AF_INET6;
-	ipv6_addr_copy(&sin6->sin6_addr, &np->daddr);
-	sin6->sin6_port	= inet_sk(sk)->dport;
-	/* We do not store received flowlabel for TCP */
-	sin6->sin6_flowinfo = 0;
-	sin6->sin6_scope_id = 0;
-	if (sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
-		sin6->sin6_scope_id = sk->sk_bound_dev_if;
-}
-
 static int tcp_v6_remember_stamp(struct sock *sk)
 {
 	/* Alas, not yet... */
@@ -1630,9 +1485,9 @@ static int tcp_v6_remember_stamp(struct sock *sk)
 }
 
 static struct inet_connection_sock_af_ops ipv6_specific = {
-	.queue_xmit	=	tcp_v6_xmit,
+	.queue_xmit	=	inet6_csk_xmit,
 	.send_check	=	tcp_v6_send_check,
-	.rebuild_header	=	tcp_v6_rebuild_header,
+	.rebuild_header	=	inet6_sk_rebuild_header,
 	.conn_request	=	tcp_v6_conn_request,
 	.syn_recv_sock	=	tcp_v6_syn_recv_sock,
 	.remember_stamp	=	tcp_v6_remember_stamp,
@@ -1640,7 +1495,7 @@ static struct inet_connection_sock_af_ops ipv6_specific = {
 
 	.setsockopt	=	ipv6_setsockopt,
 	.getsockopt	=	ipv6_getsockopt,
-	.addr2sockaddr	=	v6_addr2sockaddr,
+	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in6)
 };
 
@@ -1659,7 +1514,7 @@ static struct inet_connection_sock_af_ops ipv6_mapped = {
 
 	.setsockopt	=	ipv6_setsockopt,
 	.getsockopt	=	ipv6_getsockopt,
-	.addr2sockaddr	=	v6_addr2sockaddr,
+	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
 	.sockaddr_len	=	sizeof(struct sockaddr_in6)
 };
 
-- 
cgit v1.2.3


From 0fa1a53e1f055a6c790f40e7728f42a825b29248 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:23:09 -0800
Subject: [IPV6]: Introduce inet6_timewait_sock

Out of tcp6_timewait_sock, that now is just an aggregation of
inet_timewait_sock and inet6_timewait_sock, using tw_ipv6_offset in struct
inet_timewait_sock, that is common to the IPv6 transport protocols that use
timewait sockets, like DCCP and TCP.

tw_ipv6_offset plays the struct inet_sock pinfo6 role, i.e. for the generic
code to find the IPv6 area in a timewait sock.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             | 32 +++++++++++++++++++++-----------
 include/net/inet6_hashtables.h   |  6 +++---
 include/net/inet_timewait_sock.h |  3 ++-
 net/ipv4/inet_diag.c             |  6 +++---
 net/ipv4/tcp_minisocks.c         |  8 +++++---
 net/ipv6/addrconf.c              |  2 +-
 net/ipv6/tcp_ipv6.c              | 12 ++++++------
 7 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 69a0decfbdf..7d3908594fa 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -348,26 +348,36 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 
 #include <linux/tcp.h>
 
+struct inet6_timewait_sock {
+	struct in6_addr tw_v6_daddr;
+	struct in6_addr	tw_v6_rcv_saddr;
+};
+
 struct tcp6_timewait_sock {
-	struct tcp_timewait_sock tw_v6_sk;
-	struct in6_addr		 tw_v6_daddr;
-	struct in6_addr		 tw_v6_rcv_saddr;
+	struct tcp_timewait_sock   tcp6tw_tcp;
+	struct inet6_timewait_sock tcp6tw_inet6;
 };
 
-static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk)
+static inline u16 inet6_tw_offset(const struct proto *prot)
+{
+	return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock);
+}
+
+static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk)
 {
-	return (struct tcp6_timewait_sock *)sk;
+	return (struct inet6_timewait_sock *)(((u8 *)sk) +
+					      inet_twsk(sk)->tw_ipv6_offset);
 }
 
-static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk)
 {
 	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		&inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr;
+		&inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr;
 }
 
-static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
+static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
 {
-	return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
+	return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL;
 }
 
 static inline int inet_v6_ipv6only(const struct sock *sk)
@@ -395,8 +405,8 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 	return NULL;
 }
 
-#define __tcp_v6_rcv_saddr(__sk)	NULL
-#define tcp_v6_rcv_saddr(__sk)		NULL
+#define __inet6_rcv_saddr(__sk)	NULL
+#define inet6_rcv_saddr(__sk)	NULL
 #define tcp_twsk_ipv6only(__sk)		0
 #define inet_v6_ipv6only(__sk)		0
 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index a4a204f99ea..25f708ff020 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -110,10 +110,10 @@ static inline struct sock *
 
 		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
 		   sk->sk_family		== PF_INET6) {
-			const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+			const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
 
-			if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr)	&&
-			    ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr)	&&
+			if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
+			    ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
 			    (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
 				goto hit;
 		}
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 28f7b210350..ca240f856c4 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -127,7 +127,8 @@ struct inet_timewait_sock {
 	__u16			tw_num;
 	/* And these are ours. */
 	__u8			tw_ipv6only:1;
-	/* 31 bits hole, try to pack */
+	/* 15 bits hole, try to pack */
+	__u16			tw_ipv6_offset;
 	int			tw_timeout;
 	unsigned long		tw_ttd;
 	struct inet_bind_bucket	*tw_tb;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 3ce73b141d7..c4990819204 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
 		r->idiag_inode = 0;
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 		if (r->idiag_family == AF_INET6) {
-			const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+			const struct inet6_timewait_sock *tw6 = inet6_twsk(sk);
 
 			ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
-				       &tcp6tw->tw_v6_rcv_saddr);
+				       &tw6->tw_v6_rcv_saddr);
 			ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
-				       &tcp6tw->tw_v6_daddr);
+				       &tw6->tw_v6_daddr);
 		}
 #endif
 		nlh->nlmsg_len = skb->tail - b;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 9c029683a62..2b9b7f6c7f7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
-			struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+			struct inet6_timewait_sock *tw6;
 
-			ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
-			ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+			tw6 = inet6_twsk((struct sock *)tw);
+			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
 			tw->tw_ipv6only = np->ipv6only;
 		}
 #endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a60585fd85a..704fb73e6c5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 {
 	const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
-	const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
+	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
 	u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
 	u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
 	int sk_ipv6only = ipv6_only_sock(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2f932ce7261..cb880079daf 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -138,14 +138,14 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
 
 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
-		const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
+		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
 
 		tw = inet_twsk(sk2);
 
 		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
 		   sk2->sk_family		== PF_INET6	&&
-		   ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr)	&&
-		   ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr)	&&
+		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
+		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
 		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
 			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 			struct tcp_sock *tp = tcp_sk(sk);
@@ -1663,14 +1663,14 @@ static void get_timewait6_sock(struct seq_file *seq,
 {
 	struct in6_addr *dest, *src;
 	__u16 destp, srcp;
-	struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
+	struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw);
 	int ttd = tw->tw_ttd - jiffies;
 
 	if (ttd < 0)
 		ttd = 0;
 
-	dest = &tcp6tw->tw_v6_daddr;
-	src  = &tcp6tw->tw_v6_rcv_saddr;
+	dest = &tw6->tw_v6_daddr;
+	src  = &tw6->tw_v6_rcv_saddr;
 	destp = ntohs(tw->tw_dport);
 	srcp  = ntohs(tw->tw_sport);
 
-- 
cgit v1.2.3


From 3cf3dc6c2e05e67b12e522f547c0b71d509a516c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:23:20 -0800
Subject: [IPV6]: Export some symbols for DCCPv6

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c      | 2 ++
 net/ipv6/exthdrs.c       | 4 ++++
 net/ipv6/ip6_flowlabel.c | 2 ++
 net/ipv6/ip6_output.c    | 2 ++
 4 files changed, 10 insertions(+)

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fd040e9a1f4..23675ef1f42 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -389,6 +389,8 @@ int inet6_destroy_sock(struct sock *sk)
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+
 /*
  *	This does both peername and sockname.
  */
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index be6faf31138..113374dc342 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
 	return opt;
 }
 
+EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
+
 /**********************************
   Hop-by-hop options.
  **********************************/
@@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
 	return opt2;
 }
 
+EXPORT_SYMBOL_GPL(ipv6_dup_options);
+
 static int ipv6_renew_option(void *ohdr,
 			     struct ipv6_opt_hdr __user *newopt, int newoptlen,
 			     int inherit,
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1cf02765fb5..89d12b4817a 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label)
 	return NULL;
 }
 
+EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+
 void fl6_free_socklist(struct sock *sk)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8523c76ebf7..b4c4beba0ed 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -775,6 +775,8 @@ out_err_release:
 	return err;
 }
 
+EXPORT_SYMBOL_GPL(ip6_dst_lookup);
+
 static inline int ip6_ufo_append_data(struct sock *sk,
 			int getfrag(void *from, char *to, int offset, int len,
 			int odd, struct sk_buff *skb),
-- 
cgit v1.2.3


From 34ca6860810342441f801226b19ae6c9e0ecb34f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:23:32 -0800
Subject: [DCCP]: Just rename dccp_v4_prot to dccp_prot

To match TCP equivalent.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/dccp.h  | 2 +-
 net/dccp/ipv4.c  | 2 +-
 net/dccp/proto.c | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index f97b85d55ad..e711f850053 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
 
 #define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
 
-extern struct proto dccp_v4_prot;
+extern struct proto dccp_prot;
 
 /* is seq1 < seq2 ? */
 static inline int before48(const u64 seq1, const u64 seq2)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 0ce7d0fe5ee..9f69a67a4b0 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1317,7 +1317,7 @@ static struct request_sock_ops dccp_request_sock_ops = {
 	.send_reset	= dccp_v4_ctl_send_reset,
 };
 
-struct proto dccp_v4_prot = {
+struct proto dccp_prot = {
 	.name			= "DCCP",
 	.owner			= THIS_MODULE,
 	.close			= dccp_close,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 7b30c12147c..9cb2989f93b 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -684,7 +684,7 @@ extern struct net_proto_family inet_family_ops;
 static struct inet_protosw dccp_v4_protosw = {
 	.type		= SOCK_DCCP,
 	.protocol	= IPPROTO_DCCP,
-	.prot		= &dccp_v4_prot,
+	.prot		= &dccp_prot,
 	.ops		= &inet_dccp_ops,
 	.capability	= -1,
 	.no_check	= 0,
@@ -769,7 +769,7 @@ static int __init dccp_init(void)
 {
 	unsigned long goal;
 	int ehash_order, bhash_order, i;
-	int rc = proto_register(&dccp_v4_prot, 1);
+	int rc = proto_register(&dccp_prot, 1);
 
 	if (rc)
 		goto out;
@@ -872,7 +872,7 @@ out_free_bind_bucket_cachep:
 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 	dccp_hashinfo.bind_bucket_cachep = NULL;
 out_proto_unregister:
-	proto_unregister(&dccp_v4_prot);
+	proto_unregister(&dccp_prot);
 	goto out;
 }
 
@@ -895,7 +895,7 @@ static void __exit dccp_fini(void)
 		   get_order(dccp_hashinfo.ehash_size *
 			     sizeof(struct inet_ehash_bucket)));
 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
-	proto_unregister(&dccp_v4_prot);
+	proto_unregister(&dccp_prot);
 }
 
 module_init(dccp_init);
-- 
cgit v1.2.3


From f21e68caa0ddffddf98a1e729e734a470957b6ec Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:24:16 -0800
Subject: [DCCP]: Prepare the AF agnostic core for the introduction of DCCPv6

Basically exports a similar set of functions as the one exported by
the non-AF specific TCP code.

In the process moved some non-AF specific code from dccp_v4_connect to
dccp_connect_init and moved the checksum verification from
dccp_invalid_packet to dccp_v4_rcv, so as to use it in dccp_v6_rcv
too.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/dccp.h      | 22 ++++++++++++++++
 net/dccp/input.c     |  4 +++
 net/dccp/ipv4.c      | 73 ++++++++++++++++++++++++----------------------------
 net/dccp/minisocks.c |  8 ++++++
 net/dccp/output.c    | 27 ++++++++++++-------
 net/dccp/proto.c     | 32 ++++++++++++++++++++---
 6 files changed, 114 insertions(+), 52 deletions(-)

diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index e711f850053..93f26dd6e6c 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				const struct dccp_hdr *dh, const unsigned len);
 
+extern int dccp_v4_init_sock(struct sock *sk);
+extern int dccp_v4_destroy_sock(struct sock *sk);
+
 extern void		dccp_close(struct sock *sk, long timeout);
 extern struct sk_buff	*dccp_make_response(struct sock *sk,
 					    struct dst_entry *dst,
@@ -238,6 +241,7 @@ extern struct sk_buff	*dccp_make_reset(struct sock *sk,
 
 extern int	   dccp_connect(struct sock *sk);
 extern int	   dccp_disconnect(struct sock *sk, int flags);
+extern void	   dccp_unhash(struct sock *sk);
 extern int	   dccp_getsockopt(struct sock *sk, int level, int optname,
 				   char __user *optval, int __user *optlen);
 extern int	   dccp_setsockopt(struct sock *sk, int level, int optname,
@@ -249,6 +253,13 @@ extern int	   dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
 				struct msghdr *msg, size_t len, int nonblock,
 				int flags, int *addr_len);
 extern void	   dccp_shutdown(struct sock *sk, int how);
+extern int	   inet_dccp_listen(struct socket *sock, int backlog);
+extern unsigned int dccp_poll(struct file *file, struct socket *sock,
+			     poll_table *wait);
+extern void	   dccp_v4_send_check(struct sock *sk, int len,
+				      struct sk_buff *skb);
+extern int	   dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
+				   int addr_len);
 
 extern int	   dccp_v4_checksum(const struct sk_buff *skb,
 				    const u32 saddr, const u32 daddr);
@@ -256,6 +267,17 @@ extern int	   dccp_v4_checksum(const struct sk_buff *skb,
 extern int	   dccp_v4_send_reset(struct sock *sk,
 				      enum dccp_reset_codes code);
 extern void	   dccp_send_close(struct sock *sk, const int active);
+extern int	   dccp_invalid_packet(struct sk_buff *skb);
+
+static inline int dccp_bad_service_code(const struct sock *sk,
+					const __u32 service)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	if (dp->dccps_service == service)
+		return 0;
+	return !dccp_list_has_service(dp->dccps_service_list, service);
+}
 
 struct dccp_skb_cb {
 	__u8  dccpd_type:4;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index c81488fa293..9a724ff2a62 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -250,6 +250,8 @@ discard:
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(dccp_rcv_established);
+
 static int dccp_rcv_request_sent_state_process(struct sock *sk,
 					       struct sk_buff *skb,
 					       const struct dccp_hdr *dh,
@@ -567,3 +569,5 @@ discard:
 	}
 	return 0;
 }
+
+EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9f69a67a4b0..3108c9d464f 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -46,11 +46,13 @@ static void dccp_v4_hash(struct sock *sk)
 	inet_hash(&dccp_hashinfo, sk);
 }
 
-static void dccp_v4_unhash(struct sock *sk)
+void dccp_unhash(struct sock *sk)
 {
 	inet_unhash(&dccp_hashinfo, sk);
 }
 
+EXPORT_SYMBOL_GPL(dccp_unhash);
+
 /* called with local bh disabled */
 static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
 				      struct inet_timewait_sock **twp)
@@ -209,8 +211,7 @@ out:
 	}
 }
 
-static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
-			   int addr_len)
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -288,16 +289,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
 							    usin->sin_port);
 	dccp_update_gss(sk, dp->dccps_iss);
 
-	/*
-	 * SWL and AWL are initially adjusted so that they are not less than
-	 * the initial Sequence Numbers received and sent, respectively:
-	 *	SWL := max(GSR + 1 - floor(W/4), ISR),
-	 *	AWL := max(GSS - W' + 1, ISS).
-	 * These adjustments MUST be applied only at the beginning of the
-	 * connection.
-	 */
-	dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
-
 	inet->id = dp->dccps_iss ^ jiffies;
 
 	err = dccp_connect(sk);
@@ -317,6 +308,8 @@ failure:
 	goto out;
 }
 
+EXPORT_SYMBOL_GPL(dccp_v4_connect);
+
 /*
  * This routine does path mtu discovery as defined in RFC1191.
  */
@@ -608,7 +601,7 @@ out:
 }
 
 /* This routine computes an IPv4 DCCP checksum. */
-static void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	const struct inet_sock *inet = inet_sk(sk);
 	struct dccp_hdr *dh = dccp_hdr(skb);
@@ -616,6 +609,8 @@ static void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 	dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr);
 }
 
+EXPORT_SYMBOL_GPL(dccp_v4_send_check);
+
 int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
 {
 	struct sk_buff *skb;
@@ -651,16 +646,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk,
 					   dccp_hdr(skb)->dccph_sport);
 }
 
-static inline int dccp_bad_service_code(const struct sock *sk,
-					const __u32 service)
-{
-	const struct dccp_sock *dp = dccp_sk(sk);
-
-	if (dp->dccps_service == service)
-		return 0;
-	return !dccp_list_has_service(dp->dccps_service_list, service);
-}
-
 int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq;
@@ -672,7 +657,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
  	const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
 	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
 	__u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
-	struct dst_entry *dst = NULL;
 
 	/* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
 	if (((struct rtable *)skb->dst)->rt_flags &
@@ -713,7 +697,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	ireq = inet_rsk(req);
 	ireq->loc_addr = daddr;
 	ireq->rmt_addr = saddr;
-	/* FIXME: Merge Aristeu's option parsing code when ready */
 	req->rcv_wnd	= 100; /* Fake, option parsing will get the
 				  right value */
 	ireq->opt	= NULL;
@@ -731,7 +714,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	dreq->dreq_iss	   = dccp_v4_init_sequence(sk, skb);
 	dreq->dreq_service = service;
 
-	if (dccp_v4_send_response(sk, req, dst))
+	if (dccp_v4_send_response(sk, req, NULL))
 		goto drop_and_free;
 
 	inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
@@ -748,6 +731,8 @@ drop:
 	return -1;
 }
 
+EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
+
 /*
  * The three way handshake has completed - we got a valid ACK or DATAACK -
  * now create the new socket.
@@ -802,6 +787,8 @@ exit:
 	return NULL;
 }
 
+EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
+
 static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
 	const struct dccp_hdr *dh = dccp_hdr(skb);
@@ -1021,7 +1008,9 @@ discard:
 	return 0;
 }
 
-static inline int dccp_invalid_packet(struct sk_buff *skb)
+EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
+
+int dccp_invalid_packet(struct sk_buff *skb)
 {
 	const struct dccp_hdr *dh;
 
@@ -1075,17 +1064,11 @@ static inline int dccp_invalid_packet(struct sk_buff *skb)
 		return 1;
 	}
 
-	/* If the header checksum is incorrect, drop packet and return */
-	if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
-				    skb->nh.iph->daddr) < 0) {
-		LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
-					    "incorrect\n");
-		return 1;
-	}
-
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(dccp_invalid_packet);
+
 /* this is called when real data arrives */
 int dccp_v4_rcv(struct sk_buff *skb)
 {
@@ -1098,6 +1081,14 @@ int dccp_v4_rcv(struct sk_buff *skb)
 	if (dccp_invalid_packet(skb))
 		goto discard_it;
 
+	/* If the header checksum is incorrect, drop packet and return */
+	if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
+				    skb->nh.iph->daddr) < 0) {
+		LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n",
+			       __FUNCTION__);
+		goto discard_it;
+	}
+
 	dh = dccp_hdr(skb);
 
 	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_hdr_seq(skb);
@@ -1217,7 +1208,7 @@ struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 	.sockaddr_len	= sizeof(struct sockaddr_in),
 };
 
-static int dccp_v4_init_sock(struct sock *sk)
+int dccp_v4_init_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	static int dccp_ctl_socket_init = 1;
@@ -1270,7 +1261,9 @@ static int dccp_v4_init_sock(struct sock *sk)
 	return 0;
 }
 
-static int dccp_v4_destroy_sock(struct sock *sk)
+EXPORT_SYMBOL_GPL(dccp_v4_init_sock);
+
+int dccp_v4_destroy_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
@@ -1303,6 +1296,8 @@ static int dccp_v4_destroy_sock(struct sock *sk)
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock);
+
 static void dccp_v4_reqsk_destructor(struct request_sock *req)
 {
 	kfree(inet_rsk(req)->opt);
@@ -1331,7 +1326,7 @@ struct proto dccp_prot = {
 	.recvmsg		= dccp_recvmsg,
 	.backlog_rcv		= dccp_v4_do_rcv,
 	.hash			= dccp_v4_hash,
-	.unhash			= dccp_v4_unhash,
+	.unhash			= dccp_unhash,
 	.accept			= inet_csk_accept,
 	.get_port		= dccp_v4_get_port,
 	.shutdown		= dccp_shutdown,
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index c7ff80cf53a..5c767b5e9a5 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = {
 					    (unsigned long)&dccp_death_row),
 };
 
+EXPORT_SYMBOL_GPL(dccp_death_row);
+
 void dccp_time_wait(struct sock *sk, int state, int timeo)
 {
 	struct inet_timewait_sock *tw = NULL;
@@ -170,6 +172,8 @@ out_free:
 	return newsk;
 }
 
+EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
+
 /* 
  * Process an incoming packet for RESPOND sockets represented
  * as an request_sock.
@@ -236,6 +240,8 @@ drop:
 	goto out;
 }
 
+EXPORT_SYMBOL_GPL(dccp_check_req);
+
 /*
  *  Queue segment on the new socket if the new socket is active,
  *  otherwise we just shortcircuit this and continue with
@@ -266,3 +272,5 @@ int dccp_child_process(struct sock *parent, struct sock *child,
 	sock_put(child);
 	return ret;
 }
+
+EXPORT_SYMBOL_GPL(dccp_child_process);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index f35880503bb..c40f7f8a328 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -135,12 +135,6 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	/*
-	 * FIXME: we really should be using the af_specific thing to support
-	 * 	  IPv6.
-	 * mss_now = pmtu - tp->af_specific->net_header_len -
-	 * 	     sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
-	 */
 	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
 		       sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
 
@@ -164,6 +158,8 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 	return mss_now;
 }
 
+EXPORT_SYMBOL_GPL(dccp_sync_mss);
+
 void dccp_write_space(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
@@ -319,6 +315,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	return skb;
 }
 
+EXPORT_SYMBOL_GPL(dccp_make_response);
+
 struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
 				const enum dccp_reset_codes code)
 				   
@@ -375,6 +373,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
  */
 static inline void dccp_connect_init(struct sock *sk)
 {
+	struct dccp_sock *dp = dccp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -383,10 +382,16 @@ static inline void dccp_connect_init(struct sock *sk)
 	
 	dccp_sync_mss(sk, dst_mtu(dst));
 
-	/*
-	 * FIXME: set dp->{dccps_swh,dccps_swl}, with
-	 * something like dccp_inc_seq
-	 */
+	dccp_update_gss(sk, dp->dccps_iss);
+ 	/*
+	 * SWL and AWL are initially adjusted so that they are not less than
+	 * the initial Sequence Numbers received and sent, respectively:
+	 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+	 *	AWL := max(GSS - W' + 1, ISS).
+	 * These adjustments MUST be applied only at the beginning of the
+	 * connection.
+ 	 */
+	dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
 
 	icsk->icsk_retransmits = 0;
 }
@@ -418,6 +423,8 @@ int dccp_connect(struct sock *sk)
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(dccp_connect);
+
 void dccp_send_ack(struct sock *sk)
 {
 	/* If we have been reset, we may not send again. */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 9cb2989f93b..51dfacd22a6 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -41,8 +41,12 @@
 
 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 
+EXPORT_SYMBOL_GPL(dccp_statistics);
+
 atomic_t dccp_orphan_count = ATOMIC_INIT(0);
 
+EXPORT_SYMBOL_GPL(dccp_orphan_count);
+
 static struct net_protocol dccp_protocol = {
 	.handler	= dccp_v4_rcv,
 	.err_handler	= dccp_v4_err,
@@ -149,6 +153,8 @@ int dccp_disconnect(struct sock *sk, int flags)
 	return err;
 }
 
+EXPORT_SYMBOL_GPL(dccp_disconnect);
+
 /*
  *	Wait for a DCCP event.
  *
@@ -156,8 +162,8 @@ int dccp_disconnect(struct sock *sk, int flags)
  *	take care of normal races (between the test and the event) and we don't
  *	go look at any of the socket buffers directly.
  */
-static unsigned int dccp_poll(struct file *file, struct socket *sock,
-			      poll_table *wait)
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+		       poll_table *wait)
 {
 	unsigned int mask;
 	struct sock *sk = sock->sk;
@@ -205,12 +211,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock,
 	return mask;
 }
 
+EXPORT_SYMBOL_GPL(dccp_poll);
+
 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
 	dccp_pr_debug("entry\n");
 	return -ENOIOCTLCMD;
 }
 
+EXPORT_SYMBOL_GPL(dccp_ioctl);
+
 static int dccp_setsockopt_service(struct sock *sk, const u32 service,
 				   char __user *optval, int optlen)
 {
@@ -284,6 +294,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
 	return err;
 }
 
+EXPORT_SYMBOL_GPL(dccp_setsockopt);
+
 static int dccp_getsockopt_service(struct sock *sk, int len,
 				   u32 __user *optval,
 				   int __user *optlen)
@@ -357,6 +369,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(dccp_getsockopt);
+
 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		 size_t len)
 {
@@ -413,6 +427,8 @@ out_discard:
 	goto out_release;
 }
 
+EXPORT_SYMBOL_GPL(dccp_sendmsg);
+
 int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		 size_t len, int nonblock, int flags, int *addr_len)
 {
@@ -510,7 +526,9 @@ out:
 	return len;
 }
 
-static int inet_dccp_listen(struct socket *sock, int backlog)
+EXPORT_SYMBOL_GPL(dccp_recvmsg);
+
+int inet_dccp_listen(struct socket *sock, int backlog)
 {
 	struct sock *sk = sock->sk;
 	unsigned char old_state;
@@ -546,6 +564,8 @@ out:
 	return err;
 }
 
+EXPORT_SYMBOL_GPL(inet_dccp_listen);
+
 static const unsigned char dccp_new_state[] = {
 	/* current state:   new state:      action:	*/
 	[0]		  = DCCP_CLOSED,
@@ -651,11 +671,15 @@ adjudge_to_death:
 	sock_put(sk);
 }
 
+EXPORT_SYMBOL_GPL(dccp_close);
+
 void dccp_shutdown(struct sock *sk, int how)
 {
 	dccp_pr_debug("entry\n");
 }
 
+EXPORT_SYMBOL_GPL(dccp_shutdown);
+
 static struct proto_ops inet_dccp_ops = {
 	.family		= PF_INET,
 	.owner		= THIS_MODULE,
@@ -763,6 +787,8 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
 int dccp_debug;
 module_param(dccp_debug, int, 0444);
 MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+
+EXPORT_SYMBOL_GPL(dccp_debug);
 #endif
 
 static int __init dccp_init(void)
-- 
cgit v1.2.3


From 399c07def62a77678d633f5b3005431423a424a8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:24:28 -0800
Subject: [IPV6]: Export ipv6_opt_accepted

It was already non-TCP specific, will be used by DCCPv6.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h  |  2 ++
 net/ipv6/af_inet6.c | 21 +++++++++++++++++++++
 net/ipv6/tcp_ipv6.c | 16 ----------------
 3 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0a2ad51cff8..851376108ac 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -240,6 +240,8 @@ extern struct ipv6_txoptions *	ipv6_renew_options(struct sock *sk, struct ipv6_t
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 					  struct ipv6_txoptions *opt);
 
+extern int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb);
+
 extern int ip6_frag_nqueues;
 extern atomic_t ip6_frag_mem;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 23675ef1f42..bf17aab9b77 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -663,6 +663,27 @@ int inet6_sk_rebuild_header(struct sock *sk)
 
 EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
 
+int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct inet6_skb_parm *opt = IP6CB(skb);
+
+	if (np->rxopt.all) {
+		if ((opt->hop && (np->rxopt.bits.hopopts ||
+				  np->rxopt.bits.ohopopts)) ||
+		    ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) &&
+		     np->rxopt.bits.rxflow) ||
+		    (opt->srcrt && (np->rxopt.bits.srcrt ||
+		     np->rxopt.bits.osrcrt)) ||
+		    ((opt->dst1 || opt->dst0) &&
+		     (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
+			return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
+
 int
 snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
 {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cb880079daf..e5c8a669e84 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -711,22 +711,6 @@ static struct request_sock_ops tcp6_request_sock_ops = {
 	.send_reset	=	tcp_v6_send_reset
 };
 
-static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
-{
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct inet6_skb_parm *opt = IP6CB(skb);
-
-	if (np->rxopt.all) {
-		if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) ||
-		    ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) ||
-		    (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) ||
-		    ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts)))
-			return 1;
-	}
-	return 0;
-}
-
-
 static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
-- 
cgit v1.2.3


From 3df80d9320bcaea72b1b4761a319c79cb3fdaf5f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:24:53 -0800
Subject: [DCCP]: Introduce DCCPv6

Still needs mucho polishing, specially in the checksum code, but works
just fine, inet_diag/iproute2 and all 8)

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/Makefile    |    4 +
 net/dccp/ipv6.c      | 1438 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/ipv6.h      |   37 ++
 net/dccp/minisocks.c |   13 +-
 4 files changed, 1491 insertions(+), 1 deletion(-)
 create mode 100644 net/dccp/ipv6.c
 create mode 100644 net/dccp/ipv6.h

diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 344a8da153f..87b27fff6e3 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,3 +1,7 @@
+obj-$(CONFIG_IPV6) += dccp_ipv6.o
+
+dccp_ipv6-y := ipv6.o
+
 obj-$(CONFIG_IP_DCCP) += dccp.o
 
 dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
new file mode 100644
index 00000000000..a7d2aee5b3a
--- /dev/null
+++ b/net/dccp/ipv6.c
@@ -0,0 +1,1438 @@
+/*
+ *	DCCP over IPv6
+ *	Linux INET6 implementation 
+ *
+ *	Based on net/dccp6/ipv6.c
+ *
+ *	Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/xfrm.h>
+
+#include <net/addrconf.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_connection_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/xfrm.h>
+
+#include "dccp.h"
+#include "ipv6.h"
+
+static void dccp_v6_ctl_send_reset(struct sk_buff *skb);
+static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
+				   struct request_sock *req);
+static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb);
+
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
+static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
+
+static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
+{
+	return inet_csk_get_port(&dccp_hashinfo, sk, snum,
+				 inet6_csk_bind_conflict);
+}
+
+static void dccp_v6_hash(struct sock *sk)
+{
+	if (sk->sk_state != DCCP_CLOSED) {
+		if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
+			dccp_prot.hash(sk);
+			return;
+		}
+		local_bh_disable();
+		__inet6_hash(&dccp_hashinfo, sk);
+		local_bh_enable();
+	}
+}
+
+static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len,
+				struct in6_addr *saddr, 
+				struct in6_addr *daddr, 
+				unsigned long base)
+{
+	return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base);
+}
+
+static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
+						    skb->nh.ipv6h->saddr.s6_addr32,
+						    dh->dccph_dport,
+						    dh->dccph_sport);
+	else
+		return secure_dccp_sequence_number(skb->nh.iph->daddr,
+						   skb->nh.iph->saddr,
+						   dh->dccph_dport,
+						   dh->dccph_sport);
+}
+
+static int __dccp_v6_check_established(struct sock *sk, const __u16 lport,
+				       struct inet_timewait_sock **twp)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct in6_addr *daddr = &np->rcv_saddr;
+	const struct in6_addr *saddr = &np->daddr;
+	const int dif = sk->sk_bound_dev_if;
+	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const unsigned int hash = inet6_ehashfn(daddr, inet->num,
+						saddr, inet->dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
+	struct sock *sk2;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
+
+	prefetch(head->chain.first);
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
+		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
+
+		tw = inet_twsk(sk2);
+
+		if(*((__u32 *)&(tw->tw_dport))	== ports	 &&
+		   sk2->sk_family		== PF_INET6	 &&
+		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	 &&
+		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
+		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if)
+			goto not_unique;
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sk->sk_hash = hash;
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, &dccp_death_row);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		inet_twsk_put(tw);
+	}
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 dccp_v6_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+
+	return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
+					   np->daddr.s6_addr32,
+					   inet->dport);
+}
+
+static int dccp_v6_hash_connect(struct sock *sk)
+{
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
+	int ret;
+
+ 	if (snum == 0) {
+ 		int low = sysctl_local_port_range[0];
+ 		int high = sysctl_local_port_range[1];
+		int range = high - low;
+ 		int i;
+		int port;
+		static u32 hint;
+		u32 offset = hint + dccp_v6_port_offset(sk);
+		struct hlist_node *node;
+ 		struct inet_timewait_sock *tw = NULL;
+
+ 		local_bh_disable();
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &dccp_hashinfo.bhash[inet_bhashfn(port,
+						    dccp_hashinfo.bhash_size)];
+ 			spin_lock(&head->lock);
+
+ 			/* Does not bother with rcv_saddr checks,
+ 			 * because the established check is already
+ 			 * unique enough.
+ 			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+ 				if (tb->port == port) {
+ 					BUG_TRAP(!hlist_empty(&tb->owners));
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!__dccp_v6_check_established(sk,
+									 port,
+									 &tw))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+
+ 			tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
+						     head, port);
+ 			if (!tb) {
+ 				spin_unlock(&head->lock);
+ 				break;
+ 			}
+ 			tb->fastreuse = -1;
+ 			goto ok;
+
+ 		next_port:
+ 			spin_unlock(&head->lock);
+ 		}
+ 		local_bh_enable();
+
+ 		return -EADDRNOTAVAIL;
+ok:
+		hint += i;
+
+ 		/* Head lock still held and bh's disabled */
+ 		inet_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+ 			inet_sk(sk)->sport = htons(port);
+ 			__inet6_hash(&dccp_hashinfo, sk);
+ 		}
+ 		spin_unlock(&head->lock);
+
+ 		if (tw) {
+ 			inet_twsk_deschedule(tw, &dccp_death_row);
+ 			inet_twsk_put(tw);
+ 		}
+
+		ret = 0;
+		goto out;
+ 	}
+
+ 	head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
+						 dccp_hashinfo.bhash_size)];
+ 	tb   = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+
+	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+		__inet6_hash(&dccp_hashinfo, sk);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __dccp_v6_check_established(sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 
+			   int addr_len)
+{
+	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct in6_addr *saddr = NULL, *final_p = NULL, final;
+	struct flowi fl;
+	struct dst_entry *dst;
+	int addr_type;
+	int err;
+
+	dp->dccps_role = DCCP_ROLE_CLIENT;
+
+	if (addr_len < SIN6_LEN_RFC2133) 
+		return -EINVAL;
+
+	if (usin->sin6_family != AF_INET6) 
+		return -EAFNOSUPPORT;
+
+	memset(&fl, 0, sizeof(fl));
+
+	if (np->sndflow) {
+		fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+		IP6_ECN_flow_init(fl.fl6_flowlabel);
+		if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
+			struct ip6_flowlabel *flowlabel;
+			flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
+			if (flowlabel == NULL)
+				return -EINVAL;
+			ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
+			fl6_sock_release(flowlabel);
+		}
+	}
+
+	/*
+  	 *	connect() to INADDR_ANY means loopback (BSD'ism).
+  	 */
+  	
+  	if (ipv6_addr_any(&usin->sin6_addr))
+		usin->sin6_addr.s6_addr[15] = 0x1; 
+
+	addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+	if(addr_type & IPV6_ADDR_MULTICAST)
+		return -ENETUNREACH;
+
+	if (addr_type & IPV6_ADDR_LINKLOCAL) {
+		if (addr_len >= sizeof(struct sockaddr_in6) &&
+		    usin->sin6_scope_id) {
+			/* If interface is set while binding, indices
+			 * must coincide.
+			 */
+			if (sk->sk_bound_dev_if &&
+			    sk->sk_bound_dev_if != usin->sin6_scope_id)
+				return -EINVAL;
+
+			sk->sk_bound_dev_if = usin->sin6_scope_id;
+		}
+
+		/* Connect to link-local address requires an interface */
+		if (!sk->sk_bound_dev_if)
+			return -EINVAL;
+	}
+
+	ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
+	np->flow_label = fl.fl6_flowlabel;
+
+	/*
+	 *	DCCP over IPv4
+	 */
+
+	if (addr_type == IPV6_ADDR_MAPPED) {
+		u32 exthdrlen = dp->dccps_ext_header_len;
+		struct sockaddr_in sin;
+
+		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+		if (__ipv6_only_sock(sk))
+			return -ENETUNREACH;
+
+		sin.sin_family = AF_INET;
+		sin.sin_port = usin->sin6_port;
+		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+		inet_csk(sk)->icsk_af_ops = &dccp_ipv6_mapped;
+		sk->sk_backlog_rcv = dccp_v4_do_rcv;
+
+		err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+
+		if (err) {
+			dp->dccps_ext_header_len = exthdrlen;
+			inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+			sk->sk_backlog_rcv = dccp_v6_do_rcv;
+			goto failure;
+		} else {
+			ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
+				      inet->saddr);
+			ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
+				      inet->rcv_saddr);
+		}
+
+		return err;
+	}
+
+	if (!ipv6_addr_any(&np->rcv_saddr))
+		saddr = &np->rcv_saddr;
+
+	fl.proto = IPPROTO_DCCP;
+	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+	ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
+	fl.oif = sk->sk_bound_dev_if;
+	fl.fl_ip_dport = usin->sin6_port;
+	fl.fl_ip_sport = inet->sport;
+
+	if (np->opt && np->opt->srcrt) {
+		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
+		ipv6_addr_copy(&final, &fl.fl6_dst);
+		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+		final_p = &final;
+	}
+
+	err = ip6_dst_lookup(sk, &dst, &fl);
+	if (err)
+		goto failure;
+	if (final_p)
+		ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+	if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+		goto failure;
+
+	if (saddr == NULL) {
+		saddr = &fl.fl6_src;
+		ipv6_addr_copy(&np->rcv_saddr, saddr);
+	}
+
+	/* set the source address */
+	ipv6_addr_copy(&np->saddr, saddr);
+	inet->rcv_saddr = LOOPBACK4_IPV6;
+
+	ip6_dst_store(sk, dst, NULL);
+
+	dp->dccps_ext_header_len = 0;
+	if (np->opt)
+		dp->dccps_ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
+
+	inet->dport = usin->sin6_port;
+
+	dccp_set_state(sk, DCCP_REQUESTING);
+	err = dccp_v6_hash_connect(sk);
+	if (err)
+		goto late_failure;
+	/* FIXME */
+#if 0
+	dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32,
+						       np->daddr.s6_addr32,
+						       inet->sport,
+						       inet->dport);
+#endif
+	err = dccp_connect(sk);
+	if (err)
+		goto late_failure;
+
+	return 0;
+
+late_failure:
+	dccp_set_state(sk, DCCP_CLOSED);
+	__sk_dst_reset(sk);
+failure:
+	inet->dport = 0;
+	sk->sk_route_caps = 0;
+	return err;
+}
+
+static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			int type, int code, int offset, __u32 info)
+{
+	struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
+	const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+	struct ipv6_pinfo *np;
+	struct sock *sk;
+	int err;
+	__u64 seq;
+
+	sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
+			  &hdr->saddr, dh->dccph_sport, skb->dev->ifindex);
+
+	if (sk == NULL) {
+		ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+		return;
+	}
+
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		inet_twsk_put((struct inet_timewait_sock *)sk);
+		return;
+	}
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+	if (sk->sk_state == DCCP_CLOSED)
+		goto out;
+
+	np = inet6_sk(sk);
+
+	if (type == ICMPV6_PKT_TOOBIG) {
+		struct dccp_sock *dp = dccp_sk(sk);
+		struct dst_entry *dst = NULL;
+
+		if (sock_owned_by_user(sk))
+			goto out;
+		if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
+			goto out;
+
+		/* icmp should have updated the destination cache entry */
+		dst = __sk_dst_check(sk, np->dst_cookie);
+
+		if (dst == NULL) {
+			struct inet_sock *inet = inet_sk(sk);
+			struct flowi fl;
+
+			/* BUGGG_FUTURE: Again, it is not clear how
+			   to handle rthdr case. Ignore this complexity
+			   for now.
+			 */
+			memset(&fl, 0, sizeof(fl));
+			fl.proto = IPPROTO_DCCP;
+			ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
+			ipv6_addr_copy(&fl.fl6_src, &np->saddr);
+			fl.oif = sk->sk_bound_dev_if;
+			fl.fl_ip_dport = inet->dport;
+			fl.fl_ip_sport = inet->sport;
+
+			if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
+				sk->sk_err_soft = -err;
+				goto out;
+			}
+
+			if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+				sk->sk_err_soft = -err;
+				goto out;
+			}
+
+		} else
+			dst_hold(dst);
+
+		if (dp->dccps_pmtu_cookie > dst_mtu(dst)) {
+			dccp_sync_mss(sk, dst_mtu(dst));
+		} /* else let the usual retransmit timer handle it */
+		dst_release(dst);
+		goto out;
+	}
+
+	icmpv6_err_convert(type, code, &err);
+
+	seq = DCCP_SKB_CB(skb)->dccpd_seq;
+	/* Might be for an request_sock */
+	switch (sk->sk_state) {
+		struct request_sock *req, **prev;
+	case DCCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+
+		req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
+					   &hdr->daddr, &hdr->saddr,
+					   inet6_iif(skb));
+		if (!req)
+			goto out;
+
+		/* ICMPs are not backlogged, hence we cannot get
+		 * an established socket here.
+		 */
+		BUG_TRAP(req->sk == NULL);
+
+		if (seq != dccp_rsk(req)->dreq_iss) {
+			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		goto out;
+
+	case DCCP_REQUESTING:
+	case DCCP_RESPOND:  /* Cannot happen.
+			       It can, it SYNs are crossed. --ANK */ 
+		if (!sock_owned_by_user(sk)) {
+			DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+			sk->sk_err = err;
+			/*
+			 * Wake people up to see the error
+			 * (see connect in sock.c)
+			 */
+			sk->sk_error_report(sk);
+
+			dccp_done(sk);
+		} else
+			sk->sk_err_soft = err;
+		goto out;
+	}
+
+	if (!sock_owned_by_user(sk) && np->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else
+		sk->sk_err_soft = err;
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
+				 struct dst_entry *dst)
+{
+	struct inet6_request_sock *ireq6 = inet6_rsk(req);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sk_buff *skb;
+	struct ipv6_txoptions *opt = NULL;
+	struct in6_addr *final_p = NULL, final;
+	struct flowi fl;
+	int err = -1;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.proto = IPPROTO_DCCP;
+	ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+	ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
+	fl.fl6_flowlabel = 0;
+	fl.oif = ireq6->iif;
+	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
+	fl.fl_ip_sport = inet_sk(sk)->sport;
+
+	if (dst == NULL) {
+		opt = np->opt;
+		if (opt == NULL &&
+		    np->rxopt.bits.osrcrt == 2 &&
+		    ireq6->pktopts) {
+			struct sk_buff *pktopts = ireq6->pktopts;
+			struct inet6_skb_parm *rxopt = IP6CB(pktopts);
+			if (rxopt->srcrt)
+				opt = ipv6_invert_rthdr(sk,
+					(struct ipv6_rt_hdr *)(pktopts->nh.raw +
+							       rxopt->srcrt));
+		}
+
+		if (opt && opt->srcrt) {
+			struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
+			ipv6_addr_copy(&final, &fl.fl6_dst);
+			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+			final_p = &final;
+		}
+
+		err = ip6_dst_lookup(sk, &dst, &fl);
+		if (err)
+			goto done;
+		if (final_p)
+			ipv6_addr_copy(&fl.fl6_dst, final_p);
+		if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+			goto done;
+	}
+
+	skb = dccp_make_response(sk, dst, req);
+	if (skb != NULL) {
+		struct dccp_hdr *dh = dccp_hdr(skb);
+		dh->dccph_checksum = dccp_v6_check(dh, skb->len,
+						   &ireq6->loc_addr,
+						   &ireq6->rmt_addr,
+						   csum_partial((char *)dh,
+								skb->len,
+								skb->csum));
+		ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+		err = ip6_xmit(sk, skb, &fl, opt, 0);
+		if (err == NET_XMIT_CN)
+			err = 0;
+	}
+
+done:
+        if (opt && opt != np->opt)
+		sock_kfree_s(sk, opt, opt->tot_len);
+	return err;
+}
+
+static void dccp_v6_reqsk_destructor(struct request_sock *req)
+{
+	if (inet6_rsk(req)->pktopts != NULL)
+		kfree_skb(inet6_rsk(req)->pktopts);
+}
+
+static struct request_sock_ops dccp6_request_sock_ops = {
+	.family		= AF_INET6,
+	.obj_size	= sizeof(struct dccp6_request_sock),
+	.rtx_syn_ack	= dccp_v6_send_response,
+	.send_ack	= dccp_v6_reqsk_send_ack,
+	.destructor	= dccp_v6_reqsk_destructor,
+	.send_reset	= dccp_v6_ctl_send_reset,
+};
+
+static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct dccp_hdr *dh = dccp_hdr(skb);
+
+	dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr,
+					     len, IPPROTO_DCCP, 
+					     csum_partial((char *)dh,
+							  dh->dccph_doff << 2,
+							  skb->csum));
+}
+
+static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb)
+{
+	struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; 
+	const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+				       sizeof(struct dccp_hdr_ext) +
+				       sizeof(struct dccp_hdr_reset);
+	struct sk_buff *skb;
+	struct flowi fl;
+	u64 seqno;
+
+	if (rxdh->dccph_type == DCCP_PKT_RESET)
+		return;
+
+	if (!ipv6_unicast_destination(rxskb))
+		return; 
+
+	/*
+	 * We need to grab some memory, and put together an RST,
+	 * and then put it into the queue to be sent.
+	 */
+
+	skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
+			dccp_hdr_reset_len, GFP_ATOMIC);
+	if (skb == NULL) 
+	  	return;
+
+	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
+		    dccp_hdr_reset_len);
+
+	skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
+	dh = dccp_hdr(skb);
+	memset(dh, 0, dccp_hdr_reset_len);
+
+	/* Swap the send and the receive. */
+	dh->dccph_type	= DCCP_PKT_RESET;
+	dh->dccph_sport	= rxdh->dccph_dport;
+	dh->dccph_dport	= rxdh->dccph_sport;
+	dh->dccph_doff	= dccp_hdr_reset_len / 4;
+	dh->dccph_x	= 1;
+	dccp_hdr_reset(skb)->dccph_reset_code =
+				DCCP_SKB_CB(rxskb)->dccpd_reset_code;
+
+	/* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
+	seqno = 0;
+	if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
+
+	dccp_hdr_set_seq(dh, seqno);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+			 DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+	memset(&fl, 0, sizeof(fl));
+	ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
+	ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
+	dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
+					     sizeof(*dh), IPPROTO_DCCP,
+					     skb->csum);
+	fl.proto = IPPROTO_DCCP;
+	fl.oif = inet6_iif(rxskb);
+	fl.fl_ip_dport = dh->dccph_dport;
+	fl.fl_ip_sport = dh->dccph_sport;
+
+	/* sk = NULL, but it is safe for now. RST socket required. */
+	if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
+		if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
+			ip6_xmit(NULL, skb, &fl, NULL, 0);
+			DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+			DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+			return;
+		}
+	}
+
+	kfree_skb(skb);
+}
+
+static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb)
+{
+	struct flowi fl;
+	struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+	const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
+				     sizeof(struct dccp_hdr_ext) +
+				     sizeof(struct dccp_hdr_ack_bits);
+	struct sk_buff *skb;
+
+	skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
+			dccp_hdr_ack_len, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
+			 dccp_hdr_ack_len);
+
+	skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
+	dh = dccp_hdr(skb);
+	memset(dh, 0, dccp_hdr_ack_len);
+
+	/* Build DCCP header and checksum it. */
+	dh->dccph_type	= DCCP_PKT_ACK;
+	dh->dccph_sport = rxdh->dccph_dport;
+	dh->dccph_dport = rxdh->dccph_sport;
+	dh->dccph_doff	= dccp_hdr_ack_len / 4;
+	dh->dccph_x	= 1;
+	
+	dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+			 DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+	memset(&fl, 0, sizeof(fl));
+	ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
+	ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
+
+	/* FIXME: calculate checksum, IPv4 also should... */
+
+	fl.proto = IPPROTO_DCCP;
+	fl.oif = inet6_iif(rxskb);
+	fl.fl_ip_dport = dh->dccph_dport;
+	fl.fl_ip_sport = dh->dccph_sport;
+
+	if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
+		if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
+			ip6_xmit(NULL, skb, &fl, NULL, 0);
+			DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+			return;
+		}
+	}
+
+	kfree_skb(skb);
+}
+
+static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
+				   struct request_sock *req)
+{
+	dccp_v6_ctl_send_ack(skb);
+}
+
+static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	const struct ipv6hdr *iph = skb->nh.ipv6h;
+	struct sock *nsk;
+	struct request_sock **prev;
+	/* Find possible connection requests. */
+	struct request_sock *req = inet6_csk_search_req(sk, &prev,
+							dh->dccph_sport,
+							&iph->saddr,
+							&iph->daddr,
+							inet6_iif(skb));
+	if (req != NULL)
+		return dccp_check_req(sk, skb, req, prev);
+
+	nsk = __inet6_lookup_established(&dccp_hashinfo,
+					 &iph->saddr, dh->dccph_sport,
+					 &iph->daddr, ntohs(dh->dccph_dport),
+					 inet6_iif(skb));
+
+	if (nsk != NULL) {
+		if (nsk->sk_state != DCCP_TIME_WAIT) {
+			bh_lock_sock(nsk);
+			return nsk;
+		}
+		inet_twsk_put((struct inet_timewait_sock *)nsk);
+		return NULL;
+	}
+
+	return sk;
+}
+
+static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_request_sock *ireq;
+	struct dccp_sock dp;
+	struct request_sock *req;
+	struct dccp_request_sock *dreq;
+	struct inet6_request_sock *ireq6;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+ 	const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
+	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+	__u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return dccp_v4_conn_request(sk, skb);
+
+	if (!ipv6_unicast_destination(skb))
+		goto drop; 
+
+	if (dccp_bad_service_code(sk, service)) {
+		reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+		goto drop;
+ 	}
+	/*
+	 *	There are no SYN attacks on IPv6, yet...	
+	 */
+	if (inet_csk_reqsk_queue_is_full(sk))
+		goto drop;		
+
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+		goto drop;
+
+	req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot);
+	if (req == NULL)
+		goto drop;
+
+	/* FIXME: process options */
+
+	dccp_openreq_init(req, &dp, skb);
+
+	ireq6 = inet6_rsk(req);
+	ireq = inet_rsk(req);
+	ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
+	ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr);
+	req->rcv_wnd	= 100; /* Fake, option parsing will get the
+				  right value */
+	ireq6->pktopts	= NULL;
+
+	if (ipv6_opt_accepted(sk, skb) ||
+	    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+	    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+		atomic_inc(&skb->users);
+		ireq6->pktopts = skb;
+	}
+	ireq6->iif = sk->sk_bound_dev_if;
+
+	/* So that link locals have meaning */
+	if (!sk->sk_bound_dev_if &&
+	    ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		ireq6->iif = inet6_iif(skb);
+
+	/* 
+	 * Step 3: Process LISTEN state
+	 *
+	 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+	 *
+	 * In fact we defer setting S.GSR, S.SWL, S.SWH to
+	 * dccp_create_openreq_child.
+	 */
+	dreq = dccp_rsk(req);
+	dreq->dreq_isr	   = dcb->dccpd_seq;
+	dreq->dreq_iss	   = dccp_v6_init_sequence(sk, skb);
+	dreq->dreq_service = service;
+
+	if (dccp_v6_send_response(sk, req, NULL))
+		goto drop_and_free;
+
+	inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+	return 0;
+
+drop_and_free:
+	reqsk_free(req);
+drop:
+	DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+	dcb->dccpd_reset_code = reset_code;
+	return -1;
+}
+
+static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
+					      struct sk_buff *skb,
+					      struct request_sock *req,
+					      struct dst_entry *dst)
+{
+	struct inet6_request_sock *ireq6 = inet6_rsk(req);
+	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
+	struct inet_sock *newinet;
+	struct dccp_sock *newdp;
+	struct dccp6_sock *newdp6;
+	struct sock *newsk;
+	struct ipv6_txoptions *opt;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		/*
+		 *	v6 mapped
+		 */
+
+		newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
+		if (newsk == NULL) 
+			return NULL;
+
+		newdp6 = (struct dccp6_sock *)newsk;
+		newdp = dccp_sk(newsk);
+		newinet = inet_sk(newsk);
+		newinet->pinet6 = &newdp6->inet6;
+		newnp = inet6_sk(newsk);
+
+		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+		ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
+			      newinet->daddr);
+
+		ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
+			      newinet->saddr);
+
+		ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+
+		inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
+		newsk->sk_backlog_rcv = dccp_v4_do_rcv;
+		newnp->pktoptions  = NULL;
+		newnp->opt	   = NULL;
+		newnp->mcast_oif   = inet6_iif(skb);
+		newnp->mcast_hops  = skb->nh.ipv6h->hop_limit;
+
+		/*
+		 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+		 * here, dccp_create_openreq_child now does this for us, see the comment in
+		 * that function for the gory details. -acme
+		 */
+
+		/* It is tricky place. Until this moment IPv4 tcp
+		   worked with IPv6 icsk.icsk_af_ops.
+		   Sync it now.
+		 */
+		dccp_sync_mss(newsk, newdp->dccps_pmtu_cookie);
+
+		return newsk;
+	}
+
+	opt = np->opt;
+
+	if (sk_acceptq_is_full(sk))
+		goto out_overflow;
+
+	if (np->rxopt.bits.osrcrt == 2 &&
+	    opt == NULL && ireq6->pktopts) {
+		struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
+		if (rxopt->srcrt)
+			opt = ipv6_invert_rthdr(sk,
+				(struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw +
+						       rxopt->srcrt));
+	}
+
+	if (dst == NULL) {
+		struct in6_addr *final_p = NULL, final;
+		struct flowi fl;
+
+		memset(&fl, 0, sizeof(fl));
+		fl.proto = IPPROTO_DCCP;
+		ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+		if (opt && opt->srcrt) {
+			struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+			ipv6_addr_copy(&final, &fl.fl6_dst);
+			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+			final_p = &final;
+		}
+		ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
+		fl.oif = sk->sk_bound_dev_if;
+		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
+		fl.fl_ip_sport = inet_sk(sk)->sport;
+
+		if (ip6_dst_lookup(sk, &dst, &fl))
+			goto out;
+
+		if (final_p)
+			ipv6_addr_copy(&fl.fl6_dst, final_p);
+
+		if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+			goto out;
+	} 
+
+	newsk = dccp_create_openreq_child(sk, req, skb);
+	if (newsk == NULL)
+		goto out;
+
+	/*
+	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
+	 * count here, dccp_create_openreq_child now does this for us, see the
+	 * comment in that function for the gory details. -acme
+	 */
+
+	ip6_dst_store(newsk, dst, NULL);
+	newsk->sk_route_caps = dst->dev->features &
+		~(NETIF_F_IP_CSUM | NETIF_F_TSO);
+
+	newdp6 = (struct dccp6_sock *)newsk;
+	newinet = inet_sk(newsk);
+	newinet->pinet6 = &newdp6->inet6;
+	newdp = dccp_sk(newsk);
+	newnp = inet6_sk(newsk);
+
+	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+	ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
+	ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
+	ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
+	newsk->sk_bound_dev_if = ireq6->iif;
+
+	/* Now IPv6 options... 
+
+	   First: no IPv4 options.
+	 */
+	newinet->opt = NULL;
+
+	/* Clone RX bits */
+	newnp->rxopt.all = np->rxopt.all;
+
+	/* Clone pktoptions received with SYN */
+	newnp->pktoptions = NULL;
+	if (ireq6->pktopts != NULL) {
+		newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
+		kfree_skb(ireq6->pktopts);
+		ireq6->pktopts = NULL;
+		if (newnp->pktoptions)
+			skb_set_owner_r(newnp->pktoptions, newsk);
+	}
+	newnp->opt	  = NULL;
+	newnp->mcast_oif  = inet6_iif(skb);
+	newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+
+	/* Clone native IPv6 options from listening socket (if any)
+
+	   Yes, keeping reference count would be much more clever,
+	   but we make one more one thing there: reattach optmem
+	   to newsk.
+	 */
+	if (opt) {
+		newnp->opt = ipv6_dup_options(newsk, opt);
+		if (opt != np->opt)
+			sock_kfree_s(sk, opt, opt->tot_len);
+	}
+
+	newdp->dccps_ext_header_len = 0;
+	if (newnp->opt)
+		newdp->dccps_ext_header_len = newnp->opt->opt_nflen +
+					      newnp->opt->opt_flen;
+
+	dccp_sync_mss(newsk, dst_mtu(dst));
+
+	newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+
+	__inet6_hash(&dccp_hashinfo, newsk);
+	inet_inherit_port(&dccp_hashinfo, sk, newsk);
+
+	return newsk;
+
+out_overflow:
+	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+out:
+	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+	if (opt && opt != np->opt)
+		sock_kfree_s(sk, opt, opt->tot_len);
+	dst_release(dst);
+	return NULL;
+}
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sk_buff *opt_skb = NULL;
+
+	/* Imagine: socket is IPv6. IPv4 packet arrives,
+	   goes to IPv4 receive handler and backlogged.
+	   From backlog it always goes here. Kerboom...
+	   Fortunately, dccp_rcv_established and rcv_established
+	   handle them correctly, but it is not case with
+	   dccp_v6_hnd_req and dccp_v6_ctl_send_reset().   --ANK
+	 */
+
+	if (skb->protocol == htons(ETH_P_IP))
+		return dccp_v4_do_rcv(sk, skb);
+
+	if (sk_filter(sk, skb, 0))
+		goto discard;
+
+	/*
+	 *	socket locking is here for SMP purposes as backlog rcv
+	 *	is currently called with bh processing disabled.
+	 */
+
+	/* Do Stevens' IPV6_PKTOPTIONS.
+
+	   Yes, guys, it is the only place in our code, where we
+	   may make it not affecting IPv4.
+	   The rest of code is protocol independent,
+	   and I do not like idea to uglify IPv4.
+
+	   Actually, all the idea behind IPV6_PKTOPTIONS
+	   looks not very well thought. For now we latch
+	   options, received in the last packet, enqueued
+	   by tcp. Feel free to propose better solution.
+	                                       --ANK (980728)
+	 */
+	if (np->rxopt.all)
+		opt_skb = skb_clone(skb, GFP_ATOMIC);
+
+	if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+		if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
+			goto reset;
+		return 0;
+	}
+
+	if (sk->sk_state == DCCP_LISTEN) { 
+		struct sock *nsk = dccp_v6_hnd_req(sk, skb);
+		if (!nsk)
+			goto discard;
+
+		/*
+		 * Queue it on the new socket if the new socket is active,
+		 * otherwise we just shortcircuit this and continue with
+		 * the new socket..
+		 */
+ 		if(nsk != sk) {
+			if (dccp_child_process(sk, nsk, skb))
+				goto reset;
+			if (opt_skb)
+				__kfree_skb(opt_skb);
+			return 0;
+		}
+	}
+
+	if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
+		goto reset;
+	return 0;
+
+reset:
+	dccp_v6_ctl_send_reset(skb);
+discard:
+	if (opt_skb)
+		__kfree_skb(opt_skb);
+	kfree_skb(skb);
+	return 0;
+}
+
+static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+{
+	const struct dccp_hdr *dh;
+	struct sk_buff *skb = *pskb;
+	struct sock *sk;
+	int rc;
+
+	/* Step 1: Check header basics: */
+
+	if (dccp_invalid_packet(skb))
+		goto discard_it;
+
+	dh = dccp_hdr(skb);
+
+	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_hdr_seq(skb);
+	DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+	if (dccp_packet_without_ack(skb))
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+	else
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+
+	/* Step 2:
+	 * 	Look up flow ID in table and get corresponding socket */
+	sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr,
+			    dh->dccph_sport,
+			    &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport),
+			    inet6_iif(skb));
+	/* 
+	 * Step 2:
+	 * 	If no socket ...
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (sk == NULL)
+		goto no_dccp_socket;
+
+	/* 
+	 * Step 2:
+	 * 	... or S.state == TIMEWAIT,
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	       
+	if (sk->sk_state == DCCP_TIME_WAIT)
+                goto do_time_wait;
+
+	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto discard_and_relse;
+
+	if (sk_filter(sk, skb, 0))
+		goto discard_and_relse;
+
+	skb->dev = NULL;
+
+	bh_lock_sock(sk);
+	rc = 0;
+	if (!sock_owned_by_user(sk))
+		rc = dccp_v6_do_rcv(sk, skb);
+	else
+		sk_add_backlog(sk, skb);
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+	return rc ? -1 : 0;
+
+no_dccp_socket:
+	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+	/*
+	 * Step 2:
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (dh->dccph_type != DCCP_PKT_RESET) {
+		DCCP_SKB_CB(skb)->dccpd_reset_code =
+					DCCP_RESET_CODE_NO_CONNECTION;
+		dccp_v6_ctl_send_reset(skb);
+	}
+discard_it:
+
+	/*
+	 *	Discard frame
+	 */
+
+	kfree_skb(skb);
+	return 0;
+
+discard_and_relse:
+	sock_put(sk);
+	goto discard_it;
+
+do_time_wait:
+	inet_twsk_put((struct inet_timewait_sock *)sk);
+	goto no_dccp_socket;
+}
+
+static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
+	.queue_xmit	=	inet6_csk_xmit,
+	.send_check	=	dccp_v6_send_check,
+	.rebuild_header	=	inet6_sk_rebuild_header,
+	.conn_request	=	dccp_v6_conn_request,
+	.syn_recv_sock	=	dccp_v6_request_recv_sock,
+	.net_header_len	=	sizeof(struct ipv6hdr),
+	.setsockopt	=	ipv6_setsockopt,
+	.getsockopt	=	ipv6_getsockopt,
+	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
+	.sockaddr_len	=	sizeof(struct sockaddr_in6)
+};
+
+/*
+ *	DCCP over IPv4 via INET6 API
+ */
+static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
+	.queue_xmit	=	ip_queue_xmit,
+	.send_check	=	dccp_v4_send_check,
+	.rebuild_header	=	inet_sk_rebuild_header,
+	.conn_request	=	dccp_v6_conn_request,
+	.syn_recv_sock	=	dccp_v6_request_recv_sock,
+	.net_header_len	=	sizeof(struct iphdr),
+	.setsockopt	=	ipv6_setsockopt,
+	.getsockopt	=	ipv6_getsockopt,
+	.addr2sockaddr	=	inet6_csk_addr2sockaddr,
+	.sockaddr_len	=	sizeof(struct sockaddr_in6)
+};
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ *       sk_alloc() so need not be done here.
+ */
+static int dccp_v6_init_sock(struct sock *sk)
+{
+	int err = dccp_v4_init_sock(sk);
+
+	if (err == 0)
+		inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+
+	return err;
+}
+
+static int dccp_v6_destroy_sock(struct sock *sk)
+{
+	dccp_v4_destroy_sock(sk);
+	return inet6_destroy_sock(sk);
+}
+
+static struct proto dccp_v6_prot = {
+	.name			= "DCCPv6",
+	.owner			= THIS_MODULE,
+	.close			= dccp_close,
+	.connect		= dccp_v6_connect,
+	.disconnect		= dccp_disconnect,
+	.ioctl			= dccp_ioctl,
+	.init			= dccp_v6_init_sock,
+	.setsockopt		= dccp_setsockopt,
+	.getsockopt		= dccp_getsockopt,
+	.sendmsg		= dccp_sendmsg,
+	.recvmsg		= dccp_recvmsg,
+	.backlog_rcv		= dccp_v6_do_rcv,
+	.hash			= dccp_v6_hash,
+	.unhash			= dccp_unhash,
+	.accept			= inet_csk_accept,
+	.get_port		= dccp_v6_get_port,
+	.shutdown		= dccp_shutdown,
+	.destroy		= dccp_v6_destroy_sock,
+	.orphan_count		= &dccp_orphan_count,
+	.max_header		= MAX_DCCP_HEADER,
+	.obj_size		= sizeof(struct dccp6_sock),
+	.rsk_prot		= &dccp6_request_sock_ops,
+	.twsk_obj_size		= sizeof(struct dccp6_timewait_sock),
+};
+
+static struct inet6_protocol dccp_v6_protocol = {
+	.handler	=	dccp_v6_rcv,
+	.err_handler	=	dccp_v6_err,
+	.flags		=	INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+};
+
+static struct proto_ops inet6_dccp_ops = {
+	.family		= PF_INET6,
+	.owner		= THIS_MODULE,
+	.release	= inet6_release,
+	.bind		= inet6_bind,
+	.connect	= inet_stream_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= inet_accept,
+	.getname	= inet6_getname,
+	.poll		= dccp_poll,
+	.ioctl		= inet6_ioctl,
+	.listen		= inet_dccp_listen,
+	.shutdown	= inet_shutdown,
+	.setsockopt	= sock_common_setsockopt,
+	.getsockopt	= sock_common_getsockopt,
+	.sendmsg	= inet_sendmsg,
+	.recvmsg	= sock_common_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+static struct inet_protosw dccp_v6_protosw = {
+	.type		= SOCK_DCCP,
+	.protocol	= IPPROTO_DCCP,
+	.prot		= &dccp_v6_prot,
+	.ops		= &inet6_dccp_ops,
+	.capability	= -1,
+};
+
+static int __init dccp_v6_init(void)
+{
+	int err = proto_register(&dccp_v6_prot, 1);
+
+	if (err != 0)
+		goto out;
+
+	err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+	if (err != 0)
+		goto out_unregister_proto;
+
+	inet6_register_protosw(&dccp_v6_protosw);
+out:
+	return err;
+out_unregister_proto:
+	proto_unregister(&dccp_v6_prot);
+	goto out;
+}
+
+static void __exit dccp_v6_exit(void)
+{
+	inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+	inet6_unregister_protosw(&dccp_v6_protosw);
+	proto_unregister(&dccp_v6_prot);
+}
+
+module_init(dccp_v6_init);
+module_exit(dccp_v6_exit);
+
+/*
+ * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly, Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
new file mode 100644
index 00000000000..e4d4e930927
--- /dev/null
+++ b/net/dccp/ipv6.h
@@ -0,0 +1,37 @@
+#ifndef _DCCP_IPV6_H
+#define _DCCP_IPV6_H
+/*
+ *  net/dccp/ipv6.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/ipv6.h>
+
+struct dccp6_sock {
+	struct dccp_sock  dccp;
+	/*
+	 * ipv6_pinfo has to be the last member of dccp6_sock,
+	 * see inet6_sk_generic.
+	 */
+	struct ipv6_pinfo inet6;
+};
+
+struct dccp6_request_sock {
+	struct dccp_request_sock  dccp;
+	struct inet6_request_sock inet6;
+};
+
+struct dccp6_timewait_sock {
+	struct inet_timewait_sock   inet;
+	struct inet6_timewait_sock  tw6;
+};
+
+#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 5c767b5e9a5..29261fc198e 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -52,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
 	if (tw != NULL) {
 		const struct inet_connection_sock *icsk = inet_csk(sk);
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (tw->tw_family == PF_INET6) {
+			const struct ipv6_pinfo *np = inet6_sk(sk);
+			struct inet6_timewait_sock *tw6;
+
+			tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
+			tw6 = inet6_twsk((struct sock *)tw);
+			ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_ipv6only = np->ipv6only;
+		}
+#endif
 		/* Linkage updates. */
 		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
 
-- 
cgit v1.2.3


From fc44b9805324c0ad2733ea2feea9935cc056709d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:06 -0800
Subject: [DCCP]: Use reqsk_free in dccp_v4_conn_request

Now we have the destructor (dccp_v4_reqsk_destructor) in our
request_sock_ops vtable.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3108c9d464f..bc28d71905e 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -721,10 +721,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 drop_and_free:
-	/*
-	 * FIXME: should be reqsk_free after implementing req->rsk_ops
-	 */
-	__reqsk_free(req);
+	reqsk_free(req);
 drop:
 	DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
 	dcb->dccpd_reset_code = reset_code;
-- 
cgit v1.2.3


From 6d6ee43e0b8b8d4847627fd43739b98ec2b9404f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:19 -0800
Subject: [TWSK]: Introduce struct timewait_sock_ops

So that we can share several timewait sockets related functions and
make the timewait mini sockets infrastructure closer to the request
mini sockets one.

Next changesets will take advantage of this, moving more code out of
TCP and DCCP v4 and v6 to common infrastructure.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             |  3 +-
 include/net/inet_timewait_sock.h |  3 +-
 include/net/sock.h               |  4 +--
 include/net/tcp.h                |  3 ++
 include/net/timewait_sock.h      | 31 ++++++++++++++++++
 net/core/sock.c                  | 21 ++++++------
 net/dccp/ipv4.c                  |  9 ++++-
 net/dccp/ipv6.c                  |  6 +++-
 net/ipv4/inet_timewait_sock.c    |  5 +--
 net/ipv4/tcp_ipv4.c              | 71 ++++++++++++++++++++++++----------------
 net/ipv6/tcp_ipv6.c              | 25 +++++---------
 11 files changed, 118 insertions(+), 63 deletions(-)
 create mode 100644 include/net/timewait_sock.h

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7d3908594fa..a0d04891fe1 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -360,7 +360,8 @@ struct tcp6_timewait_sock {
 
 static inline u16 inet6_tw_offset(const struct proto *prot)
 {
-	return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock);
+	return prot->twsk_prot->twsk_obj_size -
+			sizeof(struct inet6_timewait_sock);
 }
 
 static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index ca240f856c4..e396a65473d 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -26,6 +26,7 @@
 
 #include <net/sock.h>
 #include <net/tcp_states.h>
+#include <net/timewait_sock.h>
 
 #include <asm/atomic.h>
 
@@ -200,7 +201,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw)
 		printk(KERN_DEBUG "%s timewait_sock %p released\n",
 		       tw->tw_prot->name, tw);
 #endif
-		kmem_cache_free(tw->tw_prot->twsk_slab, tw);
+		kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
 		module_put(owner);
 	}
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index 0fbae85c6d5..91d28957dc1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -493,6 +493,7 @@ extern void sk_stream_kill_queues(struct sock *sk);
 extern int sk_wait_data(struct sock *sk, long *timeo);
 
 struct request_sock_ops;
+struct timewait_sock_ops;
 
 /* Networking protocol blocks we attach to sockets.
  * socket layer -> transport layer interface
@@ -557,11 +558,10 @@ struct proto {
 	kmem_cache_t		*slab;
 	unsigned int		obj_size;
 
-	kmem_cache_t		*twsk_slab;
-	unsigned int		twsk_obj_size;
 	atomic_t		*orphan_count;
 
 	struct request_sock_ops	*rsk_prot;
+	struct timewait_sock_ops *twsk_prot;
 
 	struct module		*owner;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 83b117a25c2..176221cd0cc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -287,6 +287,9 @@ extern int			tcp_rcv_established(struct sock *sk,
 
 extern void			tcp_rcv_space_adjust(struct sock *sk);
 
+extern int			tcp_twsk_unique(struct sock *sk,
+						struct sock *sktw, void *twp);
+
 static inline void tcp_dec_quickack_mode(struct sock *sk,
 					 const unsigned int pkts)
 {
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
new file mode 100644
index 00000000000..2544281e1d5
--- /dev/null
+++ b/include/net/timewait_sock.h
@@ -0,0 +1,31 @@
+/*
+ * NET		Generic infrastructure for Network protocols.
+ *
+ * Authors:	Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _TIMEWAIT_SOCK_H
+#define _TIMEWAIT_SOCK_H
+
+#include <linux/slab.h>
+#include <net/sock.h>
+
+struct timewait_sock_ops {
+	kmem_cache_t	*twsk_slab;
+	unsigned int	twsk_obj_size;
+	int		(*twsk_unique)(struct sock *sk,
+				       struct sock *sktw, void *twp);
+};
+
+static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+	if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
+		return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
+	return 0;
+}
+
+#endif /* _TIMEWAIT_SOCK_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index 13cc3be4f05..6465b0e4c8c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 			}
 		}
 
-		if (prot->twsk_obj_size) {
+		if (prot->twsk_prot != NULL) {
 			static const char mask[] = "tw_sock_%s";
 
 			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
@@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab)
 				goto out_free_request_sock_slab;
 
 			sprintf(timewait_sock_slab_name, mask, prot->name);
-			prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
-							    prot->twsk_obj_size,
-							    0, SLAB_HWCACHE_ALIGN,
-							    NULL, NULL);
-			if (prot->twsk_slab == NULL)
+			prot->twsk_prot->twsk_slab =
+				kmem_cache_create(timewait_sock_slab_name,
+						  prot->twsk_prot->twsk_obj_size,
+						  0, SLAB_HWCACHE_ALIGN,
+						  NULL, NULL);
+			if (prot->twsk_prot->twsk_slab == NULL)
 				goto out_free_timewait_sock_slab_name;
 		}
 	}
@@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot)
 		prot->rsk_prot->slab = NULL;
 	}
 
-	if (prot->twsk_slab != NULL) {
-		const char *name = kmem_cache_name(prot->twsk_slab);
+	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
 
-		kmem_cache_destroy(prot->twsk_slab);
+		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
 		kfree(name);
-		prot->twsk_slab = NULL;
+		prot->twsk_prot->twsk_slab = NULL;
 	}
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index bc28d71905e..e11cda0cb6b 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -20,6 +20,7 @@
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
 #include <net/sock.h>
+#include <net/timewait_sock.h>
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
 
@@ -1309,6 +1310,10 @@ static struct request_sock_ops dccp_request_sock_ops = {
 	.send_reset	= dccp_v4_ctl_send_reset,
 };
 
+static struct timewait_sock_ops dccp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct inet_timewait_sock),
+};
+
 struct proto dccp_prot = {
 	.name			= "DCCP",
 	.owner			= THIS_MODULE,
@@ -1332,5 +1337,7 @@ struct proto dccp_prot = {
 	.max_header		= MAX_DCCP_HEADER,
 	.obj_size		= sizeof(struct dccp_sock),
 	.rsk_prot		= &dccp_request_sock_ops,
-	.twsk_obj_size		= sizeof(struct inet_timewait_sock),
+	.twsk_prot		= &dccp_timewait_sock_ops,
 };
+
+EXPORT_SYMBOL_GPL(dccp_prot);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index a7d2aee5b3a..4d078f5b911 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -652,6 +652,10 @@ static struct request_sock_ops dccp6_request_sock_ops = {
 	.send_reset	= dccp_v6_ctl_send_reset,
 };
 
+static struct timewait_sock_ops dccp6_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct dccp6_timewait_sock),
+};
+
 static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1359,7 +1363,7 @@ static struct proto dccp_v6_prot = {
 	.max_header		= MAX_DCCP_HEADER,
 	.obj_size		= sizeof(struct dccp6_sock),
 	.rsk_prot		= &dccp6_request_sock_ops,
-	.twsk_obj_size		= sizeof(struct dccp6_timewait_sock),
+	.twsk_prot		= &dccp6_timewait_sock_ops,
 };
 
 static struct inet6_protocol dccp_v6_protocol = {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index a010e9a6881..417f126c749 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
 {
-	struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
-							 SLAB_ATOMIC);
+	struct inet_timewait_sock *tw =
+		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+				 SLAB_ATOMIC);
 	if (tw != NULL) {
 		const struct inet_sock *inet = inet_sk(sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0b5ab04d3c5..6728772a943 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -69,6 +69,7 @@
 #include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
+#include <net/timewait_sock.h>
 #include <net/xfrm.h>
 
 #include <linux/inet.h>
@@ -118,6 +119,39 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 					  skb->h.th->source);
 }
 
+int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
+{
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* With PAWS, it is safe from the viewpoint
+	   of data integrity. Even without PAWS it is safe provided sequence
+	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
+
+	   Actually, the idea is close to VJ's one, only timestamp cache is
+	   held not per host, but per port pair and TW bucket is used as state
+	   holder.
+
+	   If TW bucket has been already destroyed we fall back to VJ's scheme
+	   and use initial timestamp retrieved from peer table.
+	 */
+	if (tcptw->tw_ts_recent_stamp &&
+	    (twp == NULL || (sysctl_tcp_tw_reuse &&
+			     xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
+		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+		if (tp->write_seq == 0)
+			tp->write_seq = 1;
+		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		sock_hold(sktw);
+		return 1;
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+
 /* called with local bh disabled */
 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 				      struct inet_timewait_sock **twp)
@@ -142,35 +176,9 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 		tw = inet_twsk(sk2);
 
 		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
-			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
-			struct tcp_sock *tp = tcp_sk(sk);
-
-			/* With PAWS, it is safe from the viewpoint
-			   of data integrity. Even without PAWS it
-			   is safe provided sequence spaces do not
-			   overlap i.e. at data rates <= 80Mbit/sec.
-
-			   Actually, the idea is close to VJ's one,
-			   only timestamp cache is held not per host,
-			   but per port pair and TW bucket is used
-			   as state holder.
-
-			   If TW bucket has been already destroyed we
-			   fall back to VJ's scheme and use initial
-			   timestamp retrieved from peer table.
-			 */
-			if (tcptw->tw_ts_recent_stamp &&
-			    (!twp || (sysctl_tcp_tw_reuse &&
-				      xtime.tv_sec -
-				      tcptw->tw_ts_recent_stamp > 1))) {
-				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-				if (tp->write_seq == 0)
-					tp->write_seq = 1;
-				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
-				sock_hold(sk2);
+			if (twsk_unique(sk, sk2, twp))
 				goto unique;
-			} else
+			else
 				goto not_unique;
 		}
 	}
@@ -869,6 +877,11 @@ struct request_sock_ops tcp_request_sock_ops = {
 	.send_reset	=	tcp_v4_send_reset,
 };
 
+static struct timewait_sock_ops tcp_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+};
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq;
@@ -1979,7 +1992,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
-	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
+	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
 };
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e5c8a669e84..514b57bb80b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -60,6 +60,7 @@
 #include <net/addrconf.h>
 #include <net/snmp.h>
 #include <net/dsfield.h>
+#include <net/timewait_sock.h>
 
 #include <asm/uaccess.h>
 
@@ -147,22 +148,9 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
 		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
 		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
 		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
-			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
-			struct tcp_sock *tp = tcp_sk(sk);
-
-			if (tcptw->tw_ts_recent_stamp &&
-			    (!twp ||
-			     (sysctl_tcp_tw_reuse &&
-			      xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
-				/* See comment in tcp_ipv4.c */
-				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-				if (!tp->write_seq)
-					tp->write_seq = 1;
-				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
-				sock_hold(sk2);
+			if (twsk_unique(sk, sk2, twp))
 				goto unique;
-			} else
+			else
 				goto not_unique;
 		}
 	}
@@ -711,6 +699,11 @@ static struct request_sock_ops tcp6_request_sock_ops = {
 	.send_reset	=	tcp_v6_send_reset
 };
 
+static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
+	.twsk_unique	= tcp_twsk_unique,
+};
+
 static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1752,7 +1745,7 @@ struct proto tcpv6_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
-	.twsk_obj_size		= sizeof(struct tcp6_timewait_sock),
+	.twsk_prot		= &tcp6_timewait_sock_ops,
 	.rsk_prot		= &tcp6_request_sock_ops,
 };
 
-- 
cgit v1.2.3


From a7f5e7f164788a22eb5d3de8e2d3cee1bf58fdca Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:31 -0800
Subject: [INET]: Generalise tcp_v4_hash_connect

Renaming it to inet_hash_connect, making it possible to ditch
dccp_v4_hash_connect and share the same code with TCP instead.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/char/random.c         |   6 +-
 include/linux/random.h        |   2 +-
 include/net/inet_hashtables.h |   3 +
 net/dccp/ipv4.c               | 160 +------------------------------------
 net/ipv4/inet_hashtables.c    | 178 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c           | 173 +---------------------------------------
 6 files changed, 186 insertions(+), 336 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 7999da25fe4..79b59d986af 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1554,10 +1554,8 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 
 EXPORT_SYMBOL(secure_tcp_sequence_number);
 
-
-
-/* Generate secure starting point for ephemeral TCP port search */
-u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
+/* Generate secure starting point for ephemeral IPV4 transport port search */
+u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
 {
 	struct keydata *keyptr = get_keyptr();
 	u32 hash[4];
diff --git a/include/linux/random.h b/include/linux/random.h
index 7b2adb3322d..01424a8e621 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -52,7 +52,7 @@ extern void get_random_bytes(void *buf, int nbytes);
 void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
-extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
+extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
 extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, 
 				       __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 07840baa934..c83baa79f66 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -434,4 +434,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
 
 	return sk;
 }
+
+extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
+			     struct sock *sk);
 #endif /* _INET_HASHTABLES_H */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e11cda0cb6b..671fbf3b237 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -54,164 +54,6 @@ void dccp_unhash(struct sock *sk)
 
 EXPORT_SYMBOL_GPL(dccp_unhash);
 
-/* called with local bh disabled */
-static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
-				      struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	const u32 daddr = inet->rcv_saddr;
-	const u32 saddr = inet->daddr;
-	const int dif = sk->sk_bound_dev_if;
-	INET_ADDR_COOKIE(acookie, saddr, daddr)
-	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
-	const struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
-		tw = inet_twsk(sk2);
-
-		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
-	sk->sk_hash = hash;
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp != NULL) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw != NULL) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &dccp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static int dccp_v4_hash_connect(struct sock *sk)
-{
-	const unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (snum == 0) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
- 		int remaining = (high - low) + 1;
- 		int rover = net_random() % (high - low) + low;
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
- 		do {
- 			head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
-						    dccp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == rover) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__dccp_v4_check_established(sk,
-									 rover,
-									 &tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
-						     head, rover);
- 			if (tb == NULL) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 			if (++rover > high)
- 				rover = low;
- 		} while (--remaining > 0);
-
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-
-ok:
- 		/* All locks still held and bhs disabled */
- 		inet_bind_hash(sk, tb, rover);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(rover);
- 			__inet_hash(&dccp_hashinfo, sk, 0);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw != NULL) {
- 			inet_twsk_deschedule(tw, &dccp_death_row);
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
-						 dccp_hashinfo.bhash_size)];
- 	tb   = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
-		__inet_hash(&dccp_hashinfo, sk, 0);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __dccp_v4_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -272,7 +114,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	 * complete initialization after this.
 	 */
 	dccp_set_state(sk, DCCP_REQUESTING);
-	err = dccp_v4_hash_connect(sk);
+	err = inet_hash_connect(&dccp_death_row, sk);
 	if (err != 0)
 		goto failure;
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index e8d29fe736d..33228115cda 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -15,12 +15,14 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
+#include <net/ip.h>
 
 /*
  * Allocate and initialize a new local port bind bucket.
@@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
 }
 
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+
+/* called with local bh disabled */
+static int __inet_check_established(struct inet_timewait_death_row *death_row,
+				    struct sock *sk, __u16 lport,
+				    struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	struct inet_sock *inet = inet_sk(sk);
+	u32 daddr = inet->rcv_saddr;
+	u32 saddr = inet->daddr;
+	int dif = sk->sk_bound_dev_if;
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
+
+	prefetch(head->chain.first);
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+		tw = inet_twsk(sk2);
+
+		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
+			if (twsk_unique(sk, sk2, twp))
+				goto unique;
+			else
+				goto not_unique;
+		}
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity. */
+	inet->num = lport;
+	inet->sport = htons(lport);
+	sk->sk_hash = hash;
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, death_row);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		inet_twsk_put(tw);
+	}
+
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, 
+					  inet->dport);
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+int inet_hash_connect(struct inet_timewait_death_row *death_row,
+		      struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
+	int ret;
+
+ 	if (!snum) {
+ 		int low = sysctl_local_port_range[0];
+ 		int high = sysctl_local_port_range[1];
+		int range = high - low;
+ 		int i;
+		int port;
+		static u32 hint;
+		u32 offset = hint + inet_sk_port_offset(sk);
+		struct hlist_node *node;
+ 		struct inet_timewait_sock *tw = NULL;
+
+ 		local_bh_disable();
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ 			spin_lock(&head->lock);
+
+ 			/* Does not bother with rcv_saddr checks,
+ 			 * because the established check is already
+ 			 * unique enough.
+ 			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+ 				if (tb->port == port) {
+ 					BUG_TRAP(!hlist_empty(&tb->owners));
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!__inet_check_established(death_row,
+								      sk, port,
+								      &tw))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+
+ 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
+ 			if (!tb) {
+ 				spin_unlock(&head->lock);
+ 				break;
+ 			}
+ 			tb->fastreuse = -1;
+ 			goto ok;
+
+ 		next_port:
+ 			spin_unlock(&head->lock);
+ 		}
+ 		local_bh_enable();
+
+ 		return -EADDRNOTAVAIL;
+
+ok:
+		hint += i;
+
+ 		/* Head lock still held and bh's disabled */
+ 		inet_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+ 			inet_sk(sk)->sport = htons(port);
+ 			__inet_hash(hinfo, sk, 0);
+ 		}
+ 		spin_unlock(&head->lock);
+
+ 		if (tw) {
+ 			inet_twsk_deschedule(tw, death_row);;
+ 			inet_twsk_put(tw);
+ 		}
+
+		ret = 0;
+		goto out;
+ 	}
+
+ 	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ 	tb  = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+		__inet_hash(hinfo, sk, 0);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __inet_check_established(death_row, sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet_hash_connect);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6728772a943..c2fe61becd6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -152,177 +152,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 
 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 
-/* called with local bh disabled */
-static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
-				      struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	u32 daddr = inet->rcv_saddr;
-	u32 saddr = inet->daddr;
-	int dif = sk->sk_bound_dev_if;
-	INET_ADDR_COOKIE(acookie, saddr, daddr)
-	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
-	struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
-		tw = inet_twsk(sk2);
-
-		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
-			if (twsk_unique(sk, sk2, twp))
-				goto unique;
-			else
-				goto not_unique;
-		}
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-unique:
-	/* Must record num and sport now. Otherwise we will see
-	 * in hash table socket with a funny identity. */
-	inet->num = lport;
-	inet->sport = htons(lport);
-	sk->sk_hash = hash;
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &tcp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-static inline u32 connect_port_offset(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-
-	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
-					 inet->dport);
-}
-
-/*
- * Bind a port for a connect operation and hash it.
- */
-static inline int tcp_v4_hash_connect(struct sock *sk)
-{
-	const unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (!snum) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
-		int range = high - low;
- 		int i;
-		int port;
-		static u32 hint;
-		u32 offset = hint + connect_port_offset(sk);
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
-		for (i = 1; i <= range; i++) {
-			port = low + (i + offset) % range;
- 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == port) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__tcp_v4_check_established(sk,
-									port,
-									&tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- 			if (!tb) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 		}
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-
-ok:
-		hint += i;
-
- 		/* Head lock still held and bh's disabled */
- 		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(port);
- 			__inet_hash(&tcp_hashinfo, sk, 0);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw) {
- 			inet_twsk_deschedule(tw, &tcp_death_row);;
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- 	tb  = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet_hash(&tcp_hashinfo, sk, 0);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __tcp_v4_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 /* This will initiate an outgoing connection. */
 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
@@ -403,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	 * complete initialization after this.
 	 */
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = tcp_v4_hash_connect(sk);
+	err = inet_hash_connect(&tcp_death_row, sk);
 	if (err)
 		goto failure;
 
-- 
cgit v1.2.3


From d8313f5ca2b1f86b7df6c99fc4b3fffa1f84e92b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:44 -0800
Subject: [INET6]: Generalise tcp_v6_hash_connect

Renaming it to inet6_hash_connect, making it possible to ditch
dccp_v6_hash_connect and share the same code with TCP instead.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/char/random.c       |   4 +-
 include/linux/random.h      |   4 +-
 include/net/ipv6.h          |   3 +
 net/dccp/ipv6.c             | 171 +----------------------------------------
 net/ipv6/inet6_hashtables.c | 183 +++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/tcp_ipv6.c         | 173 +----------------------------------------
 6 files changed, 190 insertions(+), 348 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 79b59d986af..bdfdfd28594 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1573,7 +1573,7 @@ u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport)
+u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport)
 {
 	struct keydata *keyptr = get_keyptr();
 	u32 hash[12];
@@ -1584,7 +1584,7 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp
 
 	return twothirdsMD4Transform(daddr, hash);
 }
-EXPORT_SYMBOL(secure_tcpv6_port_ephemeral);
+EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
diff --git a/include/linux/random.h b/include/linux/random.h
index 01424a8e621..5d6456bcdeb 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -53,8 +53,8 @@ void generate_random_uuid(unsigned char uuid_out[16]);
 
 extern __u32 secure_ip_id(__u32 daddr);
 extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport);
-extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, 
-				       __u16 dport);
+extern u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, 
+				      __u16 dport);
 extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 					__u16 sport, __u16 dport);
 extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr,
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 851376108ac..e3d5d7bc883 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -527,6 +527,9 @@ extern int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 extern int inet6_ioctl(struct socket *sock, unsigned int cmd, 
 		       unsigned long arg);
 
+extern int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+			      struct sock *sk);
+
 /*
  * reassembly.c
  */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 4d078f5b911..71bf04eb21e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -84,175 +84,6 @@ static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
 						   dh->dccph_sport);
 }
 
-static int __dccp_v6_check_established(struct sock *sk, const __u16 lport,
-				       struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	const struct in6_addr *daddr = &np->rcv_saddr;
-	const struct in6_addr *saddr = &np->daddr;
-	const int dif = sk->sk_bound_dev_if;
-	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	const unsigned int hash = inet6_ehashfn(daddr, inet->num,
-						saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash);
-	struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
-		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
-
-		tw = inet_twsk(sk2);
-
-		if(*((__u32 *)&(tw->tw_dport))	== ports	 &&
-		   sk2->sk_family		== PF_INET6	 &&
-		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	 &&
-		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
-		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if)
-			goto not_unique;
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sk->sk_hash = hash;
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &dccp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-static inline u32 dccp_v6_port_offset(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-
-	return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
-					   np->daddr.s6_addr32,
-					   inet->dport);
-}
-
-static int dccp_v6_hash_connect(struct sock *sk)
-{
-	const unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (snum == 0) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
-		int range = high - low;
- 		int i;
-		int port;
-		static u32 hint;
-		u32 offset = hint + dccp_v6_port_offset(sk);
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
-		for (i = 1; i <= range; i++) {
-			port = low + (i + offset) % range;
- 			head = &dccp_hashinfo.bhash[inet_bhashfn(port,
-						    dccp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == port) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__dccp_v6_check_established(sk,
-									 port,
-									 &tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
-						     head, port);
- 			if (!tb) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 		}
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-ok:
-		hint += i;
-
- 		/* Head lock still held and bh's disabled */
- 		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(port);
- 			__inet6_hash(&dccp_hashinfo, sk);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw) {
- 			inet_twsk_deschedule(tw, &dccp_death_row);
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
-						 dccp_hashinfo.bhash_size)];
- 	tb   = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet6_hash(&dccp_hashinfo, sk);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __dccp_v6_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 
 			   int addr_len)
 {
@@ -403,7 +234,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->dport = usin->sin6_port;
 
 	dccp_set_state(sk, DCCP_REQUESTING);
-	err = dccp_v6_hash_connect(sk);
+	err = inet6_hash_connect(&dccp_death_row, sk);
 	if (err)
 		goto late_failure;
 	/* FIXME */
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 01d5f46d4e4..4154f3a8b6c 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -5,7 +5,8 @@
  *
  *		Generic INET6 transport hashtables
  *
- * Authors:	Lotsa people, from code originally in tcp
+ * Authors:	Lotsa people, from code originally in tcp, generalised here
+ * 		by Arnaldo Carvalho de Melo <acme@mandriva.com>
  *
  *	This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -14,12 +15,13 @@
  */
 
 #include <linux/config.h>
-
 #include <linux/module.h>
+#include <linux/random.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
+#include <net/ip.h>
 
 struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
 				   const struct in6_addr *daddr,
@@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
 }
 
 EXPORT_SYMBOL_GPL(inet6_lookup);
+
+static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+				     struct sock *sk, const __u16 lport,
+				     struct inet_timewait_sock **twp)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct in6_addr *daddr = &np->rcv_saddr;
+	const struct in6_addr *saddr = &np->daddr;
+	const int dif = sk->sk_bound_dev_if;
+	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
+						inet->dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	struct sock *sk2;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
+
+	prefetch(head->chain.first);
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
+
+		tw = inet_twsk(sk2);
+
+		if(*((__u32 *)&(tw->tw_dport)) == ports		 &&
+		   sk2->sk_family	       == PF_INET6	 &&
+		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	 &&
+		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
+		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
+			if (twsk_unique(sk, sk2, twp))
+				goto unique;
+			else
+				goto not_unique;
+		}
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sk->sk_hash = hash;
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp != NULL) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw != NULL) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, death_row);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		inet_twsk_put(tw);
+	}
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 inet6_sk_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32,
+					  np->daddr.s6_addr32,
+					  inet->dport);
+}
+
+int inet6_hash_connect(struct inet_timewait_death_row *death_row,
+		       struct sock *sk)
+{
+	struct inet_hashinfo *hinfo = death_row->hashinfo;
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
+	int ret;
+
+ 	if (snum == 0) {
+ 		const int low = sysctl_local_port_range[0];
+ 		const int high = sysctl_local_port_range[1];
+		const int range = high - low;
+ 		int i, port;
+		static u32 hint;
+		const u32 offset = hint + inet6_sk_port_offset(sk);
+		struct hlist_node *node;
+ 		struct inet_timewait_sock *tw = NULL;
+
+ 		local_bh_disable();
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
+ 			spin_lock(&head->lock);
+
+ 			/* Does not bother with rcv_saddr checks,
+ 			 * because the established check is already
+ 			 * unique enough.
+ 			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+ 				if (tb->port == port) {
+ 					BUG_TRAP(!hlist_empty(&tb->owners));
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!__inet6_check_established(death_row,
+								       sk, port,
+								       &tw))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+
+ 			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+						     head, port);
+ 			if (!tb) {
+ 				spin_unlock(&head->lock);
+ 				break;
+ 			}
+ 			tb->fastreuse = -1;
+ 			goto ok;
+
+ 		next_port:
+ 			spin_unlock(&head->lock);
+ 		}
+ 		local_bh_enable();
+
+ 		return -EADDRNOTAVAIL;
+
+ok:
+		hint += i;
+
+ 		/* Head lock still held and bh's disabled */
+ 		inet_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+ 			inet_sk(sk)->sport = htons(port);
+ 			__inet6_hash(hinfo, sk);
+ 		}
+ 		spin_unlock(&head->lock);
+
+ 		if (tw) {
+ 			inet_twsk_deschedule(tw, death_row);
+ 			inet_twsk_put(tw);
+ 		}
+
+		ret = 0;
+		goto out;
+ 	}
+
+ 	head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
+ 	tb   = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+
+	if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
+		__inet6_hash(hinfo, sk);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __inet6_check_established(death_row, sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 514b57bb80b..a682eb9093e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -119,177 +119,6 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
-				      struct inet_timewait_sock **twp)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	const struct in6_addr *daddr = &np->rcv_saddr;
-	const struct in6_addr *saddr = &np->daddr;
-	const int dif = sk->sk_bound_dev_if;
-	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport);
-	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
-	struct sock *sk2;
-	const struct hlist_node *node;
-	struct inet_timewait_sock *tw;
-
-	prefetch(head->chain.first);
-	write_lock(&head->lock);
-
-	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
-		const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2);
-
-		tw = inet_twsk(sk2);
-
-		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
-		   sk2->sk_family		== PF_INET6	&&
-		   ipv6_addr_equal(&tw6->tw_v6_daddr, saddr)	&&
-		   ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr)	&&
-		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
-			if (twsk_unique(sk, sk2, twp))
-				goto unique;
-			else
-				goto not_unique;
-		}
-	}
-	tw = NULL;
-
-	/* And established part... */
-	sk_for_each(sk2, node, &head->chain) {
-		if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
-			goto not_unique;
-	}
-
-unique:
-	BUG_TRAP(sk_unhashed(sk));
-	__sk_add_node(sk, &head->chain);
-	sk->sk_hash = hash;
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
-
-	if (twp) {
-		*twp = tw;
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-	} else if (tw) {
-		/* Silly. Should hash-dance instead... */
-		inet_twsk_deschedule(tw, &tcp_death_row);
-		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
-
-		inet_twsk_put(tw);
-	}
-	return 0;
-
-not_unique:
-	write_unlock(&head->lock);
-	return -EADDRNOTAVAIL;
-}
-
-static inline u32 tcpv6_port_offset(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-
-	return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32,
-					   np->daddr.s6_addr32,
-					   inet->dport);
-}
-
-static int tcp_v6_hash_connect(struct sock *sk)
-{
-	unsigned short snum = inet_sk(sk)->num;
- 	struct inet_bind_hashbucket *head;
- 	struct inet_bind_bucket *tb;
-	int ret;
-
- 	if (!snum) {
- 		int low = sysctl_local_port_range[0];
- 		int high = sysctl_local_port_range[1];
-		int range = high - low;
- 		int i;
-		int port;
-		static u32 hint;
-		u32 offset = hint + tcpv6_port_offset(sk);
-		struct hlist_node *node;
- 		struct inet_timewait_sock *tw = NULL;
-
- 		local_bh_disable();
-		for (i = 1; i <= range; i++) {
-			port = low + (i + offset) % range;
- 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
- 			spin_lock(&head->lock);
-
- 			/* Does not bother with rcv_saddr checks,
- 			 * because the established check is already
- 			 * unique enough.
- 			 */
-			inet_bind_bucket_for_each(tb, node, &head->chain) {
- 				if (tb->port == port) {
- 					BUG_TRAP(!hlist_empty(&tb->owners));
- 					if (tb->fastreuse >= 0)
- 						goto next_port;
- 					if (!__tcp_v6_check_established(sk,
-									port,
-									&tw))
- 						goto ok;
- 					goto next_port;
- 				}
- 			}
-
- 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
- 			if (!tb) {
- 				spin_unlock(&head->lock);
- 				break;
- 			}
- 			tb->fastreuse = -1;
- 			goto ok;
-
- 		next_port:
- 			spin_unlock(&head->lock);
- 		}
- 		local_bh_enable();
-
- 		return -EADDRNOTAVAIL;
-
-ok:
-		hint += i;
-
- 		/* Head lock still held and bh's disabled */
- 		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
- 			inet_sk(sk)->sport = htons(port);
- 			__inet6_hash(&tcp_hashinfo, sk);
- 		}
- 		spin_unlock(&head->lock);
-
- 		if (tw) {
- 			inet_twsk_deschedule(tw, &tcp_death_row);
- 			inet_twsk_put(tw);
- 		}
-
-		ret = 0;
-		goto out;
- 	}
-
- 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
- 	tb   = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet6_hash(&tcp_hashinfo, sk);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
-		spin_unlock(&head->lock);
-		/* No definite answer... Walk to established hash table */
-		ret = __tcp_v6_check_established(sk, snum, NULL);
-out:
-		local_bh_enable();
-		return ret;
-	}
-}
-
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 
 			  int addr_len)
 {
@@ -450,7 +279,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->dport = usin->sin6_port;
 
 	tcp_set_state(sk, TCP_SYN_SENT);
-	err = tcp_v6_hash_connect(sk);
+	err = inet6_hash_connect(&tcp_death_row, sk);
 	if (err)
 		goto late_failure;
 
-- 
cgit v1.2.3


From 22712813620fa8e682dbfb253a60ca0131da1e07 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:25:56 -0800
Subject: [TCP]: Move the TCPF_ enum to tcp_states.h

Upcoming patches will make, for instance, ip_sockglue.c need just this enum
and not all of tcp.h.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 16 ----------------
 include/net/tcp_states.h | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4e1434007f4..da38eea1994 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -55,22 +55,6 @@ struct tcphdr {
 	__u16	urg_ptr;
 };
 
-#define TCP_ACTION_FIN	(1 << 7)
-
-enum {
-  TCPF_ESTABLISHED = (1 << 1),
-  TCPF_SYN_SENT  = (1 << 2),
-  TCPF_SYN_RECV  = (1 << 3),
-  TCPF_FIN_WAIT1 = (1 << 4),
-  TCPF_FIN_WAIT2 = (1 << 5),
-  TCPF_TIME_WAIT = (1 << 6),
-  TCPF_CLOSE     = (1 << 7),
-  TCPF_CLOSE_WAIT = (1 << 8),
-  TCPF_LAST_ACK  = (1 << 9),
-  TCPF_LISTEN    = (1 << 10),
-  TCPF_CLOSING   = (1 << 11) 
-};
-
 /*
  *	The union cast uses a gcc extension to avoid aliasing problems
  *  (union is compatible to any of its members)
diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
index b9d4176b2d1..b0b645988bd 100644
--- a/include/net/tcp_states.h
+++ b/include/net/tcp_states.h
@@ -31,4 +31,20 @@ enum {
 
 #define TCP_STATE_MASK	0xF
 
+#define TCP_ACTION_FIN	(1 << 7)
+
+enum {
+	TCPF_ESTABLISHED = (1 << 1),
+	TCPF_SYN_SENT	 = (1 << 2),
+	TCPF_SYN_RECV	 = (1 << 3),
+	TCPF_FIN_WAIT1	 = (1 << 4),
+	TCPF_FIN_WAIT2	 = (1 << 5),
+	TCPF_TIME_WAIT	 = (1 << 6),
+	TCPF_CLOSE	 = (1 << 7),
+	TCPF_CLOSE_WAIT	 = (1 << 8),
+	TCPF_LAST_ACK	 = (1 << 9),
+	TCPF_LISTEN	 = (1 << 10),
+	TCPF_CLOSING	 = (1 << 11) 
+};
+
 #endif	/* _LINUX_TCP_STATES_H */
-- 
cgit v1.2.3


From d83d8461f902c672bc1bd8fbc6a94e19f092da97 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 13 Dec 2005 23:26:10 -0800
Subject: [IP_SOCKGLUE]: Remove most of the tcp specific calls

As DCCP needs to be called in the same spots.

Now we have a member in inet_sock (is_icsk), set at sock creation time from
struct inet_protosw->flags (if INET_PROTOSW_ICSK is set, like for TCP and
DCCP) to see if a struct sock instance is a inet_connection_sock for places
like the ones in ip_sockglue.c (v4 and v6) where we previously were looking if
sk_type was SOCK_STREAM, that is insufficient because we now use the same code
for DCCP, that has sk_type SOCK_DCCP.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h               |  4 ----
 include/linux/ip.h                 |  1 +
 include/linux/tcp.h                |  3 +--
 include/net/inet_connection_sock.h |  6 +++++-
 include/net/protocol.h             |  1 +
 net/dccp/diag.c                    |  2 +-
 net/dccp/input.c                   |  2 +-
 net/dccp/ipv4.c                    | 12 +++++++-----
 net/dccp/ipv6.c                    | 26 ++++++++++++++------------
 net/dccp/output.c                  |  7 ++++---
 net/dccp/proto.c                   |  2 +-
 net/ipv4/af_inet.c                 |  4 +++-
 net/ipv4/ip_sockglue.c             | 13 ++++++-------
 net/ipv4/tcp.c                     |  2 +-
 net/ipv4/tcp_input.c               | 10 ++++------
 net/ipv4/tcp_ipv4.c                | 12 ++++++------
 net/ipv4/tcp_output.c              | 18 ++++++++++--------
 net/ipv6/af_inet6.c                |  1 +
 net/ipv6/ipv6_sockglue.c           | 24 +++++++++++++-----------
 net/ipv6/tcp_ipv6.c                | 30 +++++++++++++++++-------------
 20 files changed, 97 insertions(+), 83 deletions(-)

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 71fab4311e9..d0bdb499cf8 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -408,8 +408,6 @@ struct dccp_ackvec;
  * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss
  * @dccps_timestamp_time - time of latest TIMESTAMP option
  * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option
- * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options)
- * @dccps_pmtu_cookie - Last pmtu seen by socket
  * @dccps_packet_size - Set thru setsockopt
  * @dccps_role - Role of this sock, one of %dccp_role
  * @dccps_ndp_count - number of Non Data Packets since last data packet
@@ -434,8 +432,6 @@ struct dccp_sock {
 	__u32				dccps_timestamp_echo;
 	__u32				dccps_packet_size;
 	unsigned long			dccps_ndp_count;
-	__u16				dccps_ext_header_len;
-	__u32				dccps_pmtu_cookie;
 	__u32				dccps_mss_cache;
 	struct dccp_options		dccps_options;
 	struct dccp_ackvec		*dccps_hc_rx_ackvec;
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 5a560daeade..6ccc596c19c 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -155,6 +155,7 @@ struct inet_sock {
 	__u8			mc_ttl;		/* Multicasting TTL */
 	__u8			pmtudisc;
 	unsigned		recverr : 1,
+				is_icsk : 1,	/* inet_connection_sock? */
 				freebind : 1,
 				hdrincl : 1,
 				mc_loop : 1;
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index da38eea1994..f2bb2396853 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -238,10 +238,9 @@ struct tcp_sock {
 	__u32	snd_wl1;	/* Sequence for window update		*/
 	__u32	snd_wnd;	/* The window we expect to receive	*/
 	__u32	max_window;	/* Maximal window ever seen from peer	*/
-	__u32	pmtu_cookie;	/* Last pmtu seen by socket		*/
 	__u32	mss_cache;	/* Cached effective mss, not including SACKS */
 	__u16	xmit_size_goal;	/* Goal for segmenting output packets	*/
-	__u16	ext_header_len;	/* Network protocol overhead (IP/IPv6 options) */
+	/* XXX Two bytes hole, try to pack */
 
 	__u32	window_clamp;	/* Maximal window to advertise		*/
 	__u32	rcv_ssthresh;	/* Current window clamp			*/
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index e50e2b890c6..91888967d3e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -60,6 +60,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_timeout:	   Timeout
  * @icsk_retransmit_timer: Resend (no ack)
  * @icsk_rto:		   Retransmit timeout
+ * @icsk_pmtu_cookie	   Last pmtu seen by socket
  * @icsk_ca_ops		   Pluggable congestion control hook
  * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
  * @icsk_ca_state:	   Congestion control state
@@ -68,6 +69,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_backoff:	   Backoff
  * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
  * @icsk_probes_out:	   unanswered 0 window probes
+ * @icsk_ext_hdr_len:	   Network protocol overhead (IP/IPv6 options)
  * @icsk_ack:		   Delayed ACK control data
  */
 struct inet_connection_sock {
@@ -79,15 +81,17 @@ struct inet_connection_sock {
  	struct timer_list	  icsk_retransmit_timer;
  	struct timer_list	  icsk_delack_timer;
 	__u32			  icsk_rto;
+	__u32			  icsk_pmtu_cookie;
 	struct tcp_congestion_ops *icsk_ca_ops;
 	struct inet_connection_sock_af_ops *icsk_af_ops;
+	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state;
 	__u8			  icsk_retransmits;
 	__u8			  icsk_pending;
 	__u8			  icsk_backoff;
 	__u8			  icsk_syn_retries;
 	__u8			  icsk_probes_out;
-	/* 2 BYTES HOLE, TRY TO PACK! */
+	__u16			  icsk_ext_hdr_len;
 	struct {
 		__u8		  pending;	 /* ACK is pending			   */
 		__u8		  quick;	 /* Scheduled number of quick acks	   */
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 357691f6a45..a29cb29647d 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -76,6 +76,7 @@ struct inet_protosw {
 };
 #define INET_PROTOSW_REUSE 0x01	     /* Are ports automatically reusable? */
 #define INET_PROTOSW_PERMANENT 0x02  /* Permanent protocols are unremovable. */
+#define INET_PROTOSW_ICSK      0x04  /* Is this an inet_connection_sock? */
 
 extern struct net_protocol *inet_protocol_base;
 extern struct net_protocol *inet_protos[MAX_INET_PROTOS];
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index f675d8e642d..3f78c00e382 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_retransmits	= icsk->icsk_retransmits;
 	info->tcpi_probes	= icsk->icsk_probes_out;
 	info->tcpi_backoff	= icsk->icsk_backoff;
-	info->tcpi_pmtu		= dp->dccps_pmtu_cookie;
+	info->tcpi_pmtu		= icsk->icsk_pmtu_cookie;
 
 	if (dp->dccps_options.dccpo_send_ack_vector)
 		info->tcpi_options |= TCPI_OPT_SACK;
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 9a724ff2a62..55e921bdd13 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -311,7 +311,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			goto out_invalid_packet;
 		}
 
-		dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
+		dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 
 		/*
 		 *    Step 10: Process REQUEST state (second part)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 671fbf3b237..c363051a7f1 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -104,9 +104,9 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->dport = usin->sin_port;
 	inet->daddr = daddr;
 
-	dp->dccps_ext_header_len = 0;
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet->opt != NULL)
-		dp->dccps_ext_header_len = inet->opt->optlen;
+		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 	/*
 	 * Socket identity is still unknown (sport may be zero).
 	 * However we set state to DCCP_REQUESTING and not releasing socket
@@ -191,7 +191,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
-	    dp->dccps_pmtu_cookie > mtu) {
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		dccp_sync_mss(sk, mtu);
 
 		/*
@@ -1051,6 +1051,7 @@ struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
 int dccp_v4_init_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	static int dccp_ctl_socket_init = 1;
 
 	dccp_options_init(&dp->dccps_options);
@@ -1090,10 +1091,11 @@ int dccp_v4_init_sock(struct sock *sk)
 		dccp_ctl_socket_init = 0;
 
 	dccp_init_xmit_timers(sk);
-	inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
+	icsk->icsk_rto = DCCP_TIMEOUT_INIT;
 	sk->sk_state = DCCP_CLOSED;
 	sk->sk_write_space = dccp_write_space;
-	inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
+	icsk->icsk_af_ops = &dccp_ipv4_af_ops;
+	icsk->icsk_sync_mss = dccp_sync_mss;
 	dp->dccps_mss_cache = 536;
 	dp->dccps_role = DCCP_ROLE_UNDEFINED;
 	dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 71bf04eb21e..599b0be2151 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -88,6 +88,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			   int addr_len)
 {
 	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -158,7 +159,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	 */
 
 	if (addr_type == IPV6_ADDR_MAPPED) {
-		u32 exthdrlen = dp->dccps_ext_header_len;
+		u32 exthdrlen = icsk->icsk_ext_hdr_len;
 		struct sockaddr_in sin;
 
 		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -170,14 +171,14 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_port = usin->sin6_port;
 		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
-		inet_csk(sk)->icsk_af_ops = &dccp_ipv6_mapped;
+		icsk->icsk_af_ops = &dccp_ipv6_mapped;
 		sk->sk_backlog_rcv = dccp_v4_do_rcv;
 
 		err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 
 		if (err) {
-			dp->dccps_ext_header_len = exthdrlen;
-			inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+			icsk->icsk_ext_hdr_len = exthdrlen;
+			icsk->icsk_af_ops = &dccp_ipv6_af_ops;
 			sk->sk_backlog_rcv = dccp_v6_do_rcv;
 			goto failure;
 		} else {
@@ -227,9 +228,10 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	ip6_dst_store(sk, dst, NULL);
 
-	dp->dccps_ext_header_len = 0;
+	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
-		dp->dccps_ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
+		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+					  np->opt->opt_nflen);
 
 	inet->dport = usin->sin6_port;
 
@@ -292,7 +294,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	np = inet6_sk(sk);
 
 	if (type == ICMPV6_PKT_TOOBIG) {
-		struct dccp_sock *dp = dccp_sk(sk);
 		struct dst_entry *dst = NULL;
 
 		if (sock_owned_by_user(sk))
@@ -332,7 +333,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		} else
 			dst_hold(dst);
 
-		if (dp->dccps_pmtu_cookie > dst_mtu(dst)) {
+		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
 			dccp_sync_mss(sk, dst_mtu(dst));
 		} /* else let the usual retransmit timer handle it */
 		dst_release(dst);
@@ -808,7 +809,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		   worked with IPv6 icsk.icsk_af_ops.
 		   Sync it now.
 		 */
-		dccp_sync_mss(newsk, newdp->dccps_pmtu_cookie);
+		dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
 
 		return newsk;
 	}
@@ -916,10 +917,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 			sock_kfree_s(sk, opt, opt->tot_len);
 	}
 
-	newdp->dccps_ext_header_len = 0;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newnp->opt)
-		newdp->dccps_ext_header_len = newnp->opt->opt_nflen +
-					      newnp->opt->opt_flen;
+		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+						     newnp->opt->opt_flen);
 
 	dccp_sync_mss(newsk, dst_mtu(dst));
 
@@ -1230,6 +1231,7 @@ static struct inet_protosw dccp_v6_protosw = {
 	.prot		= &dccp_v6_prot,
 	.ops		= &inet6_dccp_ops,
 	.capability	= -1,
+	.flags		= INET_PROTOSW_ICSK,
 };
 
 static int __init dccp_v6_init(void)
diff --git a/net/dccp/output.c b/net/dccp/output.c
index c40f7f8a328..95a3c2c6a3c 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -134,12 +134,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 
 unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
-	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
+	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
 		       sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
 
 	/* Now subtract optional transport overhead */
-	mss_now -= dp->dccps_ext_header_len;
+	mss_now -= icsk->icsk_ext_hdr_len;
 
 	/*
 	 * FIXME: this should come from the CCID infrastructure, where, say,
@@ -152,7 +153,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
 	mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
 
 	/* And store cached results */
-	dp->dccps_pmtu_cookie = pmtu;
+	icsk->icsk_pmtu_cookie = pmtu;
 	dp->dccps_mss_cache = mss_now;
 
 	return mss_now;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 51dfacd22a6..40a4c689905 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -712,7 +712,7 @@ static struct inet_protosw dccp_v4_protosw = {
 	.ops		= &inet_dccp_ops,
 	.capability	= -1,
 	.no_check	= 0,
-	.flags		= 0,
+	.flags		= INET_PROTOSW_ICSK,
 };
 
 /*
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d368cf24900..617e858beff 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -302,6 +302,7 @@ lookup_protocol:
 		sk->sk_reuse = 1;
 
 	inet = inet_sk(sk);
+	inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
 
 	if (SOCK_RAW == sock->type) {
 		inet->num = protocol;
@@ -869,7 +870,8 @@ static struct inet_protosw inetsw_array[] =
                 .ops =        &inet_stream_ops,
                 .capability = -1,
                 .no_check =   0,
-                .flags =      INET_PROTOSW_PERMANENT,
+                .flags =      INET_PROTOSW_PERMANENT |
+			      INET_PROTOSW_ICSK,
         },
 
         {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4f2d8725730..add019c746f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -29,8 +29,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/icmp.h>
-#include <net/tcp.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
 #include <linux/udp.h>
 #include <linux/igmp.h>
 #include <linux/netfilter.h>
@@ -427,8 +426,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			err = ip_options_get_from_user(&opt, optval, optlen);
 			if (err)
 				break;
-			if (sk->sk_type == SOCK_STREAM) {
-				struct tcp_sock *tp = tcp_sk(sk);
+			if (inet->is_icsk) {
+				struct inet_connection_sock *icsk = inet_csk(sk);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 				if (sk->sk_family == PF_INET ||
 				    (!((1 << sk->sk_state) &
@@ -436,10 +435,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 				     inet->daddr != LOOPBACK4_IPV6)) {
 #endif
 					if (inet->opt)
-						tp->ext_header_len -= inet->opt->optlen;
+						icsk->icsk_ext_hdr_len -= inet->opt->optlen;
 					if (opt)
-						tp->ext_header_len += opt->optlen;
-					tcp_sync_mss(sk, tp->pmtu_cookie);
+						icsk->icsk_ext_hdr_len += opt->optlen;
+					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 				}
 #endif
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index eacfe6a3442..00aa80e9324 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
 
-	info->tcpi_pmtu = tp->pmtu_cookie;
+	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
 	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
 	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
 	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7de6184d4bd..981d1203b15 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2342,7 +2342,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
 
 			if (nwin > tp->max_window) {
 				tp->max_window = nwin;
-				tcp_sync_mss(sk, tp->pmtu_cookie);
+				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
 			}
 		}
 	}
@@ -3967,12 +3967,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 struct tcphdr *th, unsigned len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
 	tcp_parse_options(skb, &tp->rx_opt, 0);
 
 	if (th->ack) {
-		struct inet_connection_sock *icsk;
 		/* rfc793:
 		 * "If the state is SYN-SENT then
 		 *    first check the ACK bit
@@ -4061,7 +4061,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
 			tp->rx_opt.sack_ok |= 2;
 
-		tcp_sync_mss(sk, tp->pmtu_cookie);
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
 		/* Remember, tcp_poll() does not lock socket!
@@ -4071,8 +4071,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
 
-		icsk = inet_csk(sk);
-
 		/* Make sure socket is routed, for correct metrics.  */
 		icsk->icsk_af_ops->rebuild_header(sk);
 
@@ -4173,7 +4171,7 @@ discard:
 		if (tp->ecn_flags&TCP_ECN_OK)
 			sock_set_flag(sk, SOCK_NO_LARGESEND);
 
-		tcp_sync_mss(sk, tp->pmtu_cookie);
+		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c2fe61becd6..9b62d80bb20 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -220,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->dport = usin->sin_port;
 	inet->daddr = daddr;
 
-	tp->ext_header_len = 0;
+	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet->opt)
-		tp->ext_header_len = inet->opt->optlen;
+		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 
 	tp->rx_opt.mss_clamp = 536;
 
@@ -275,7 +275,6 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 {
 	struct dst_entry *dst;
 	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 	 * send out by Linux are always <576bytes so they should go through
@@ -304,7 +303,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
 	mtu = dst_mtu(dst);
 
 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
-	    tp->pmtu_cookie > mtu) {
+	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 		tcp_sync_mss(sk, mtu);
 
 		/* Resend the TCP packet because it's
@@ -895,9 +894,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	ireq->opt	      = NULL;
 	newinet->mc_index     = inet_iif(skb);
 	newinet->mc_ttl	      = skb->nh.iph->ttl;
-	newtp->ext_header_len = 0;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newinet->opt)
-		newtp->ext_header_len = newinet->opt->optlen;
+		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
 	newinet->id = newtp->write_seq ^ jiffies;
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1266,6 +1265,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
 	icsk->icsk_af_ops = &ipv4_specific;
+	icsk->icsk_sync_mss = tcp_sync_mss;
 
 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index af1946c52c3..3a0a914de91 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -621,7 +621,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    It is minimum of user_mss and mss received with SYN.
    It also does not include TCP options.
 
-   tp->pmtu_cookie is last pmtu, seen by this function.
+   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
 
    tp->mss_cache is current effective sending mss, including
    all tcp options except for SACKs. It is evaluated,
@@ -631,17 +631,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    NOTE1. rfc1122 clearly states that advertised MSS
    DOES NOT include either tcp or ip options.
 
-   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
-   this function.			--ANK (980731)
+   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+   are READ ONLY outside this function.		--ANK (980731)
  */
 
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	/* Calculate base mss without TCP options:
 	   It is MMS_S - sizeof(tcphdr) of rfc1122
 	 */
-	int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len -
+	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
 		       sizeof(struct tcphdr));
 
 	/* Clamp it (mss_clamp does not include tcp options) */
@@ -649,7 +650,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 		mss_now = tp->rx_opt.mss_clamp;
 
 	/* Now subtract optional transport overhead */
-	mss_now -= tp->ext_header_len;
+	mss_now -= icsk->icsk_ext_hdr_len;
 
 	/* Then reserve room for full set of TCP options and 8 bytes of data */
 	if (mss_now < 48)
@@ -663,7 +664,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
 
 	/* And store cached results */
-	tp->pmtu_cookie = pmtu;
+	icsk->icsk_pmtu_cookie = pmtu;
 	tp->mss_cache = mss_now;
 
 	return mss_now;
@@ -693,7 +694,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 
 	if (dst) {
 		u32 mtu = dst_mtu(dst);
-		if (mtu != tp->pmtu_cookie)
+		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
@@ -706,7 +707,8 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 	if (doing_tso) {
 		xmit_size_goal = (65535 -
 				  inet_csk(sk)->icsk_af_ops->net_header_len -
-				  tp->ext_header_len - tp->tcp_header_len);
+				  inet_csk(sk)->icsk_ext_hdr_len -
+				  tp->tcp_header_len);
 
 		if (tp->max_window &&
 		    (xmit_size_goal > (tp->max_window >> 1)))
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index bf17aab9b77..70a510ff31e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,6 +167,7 @@ lookup_protocol:
 		sk->sk_reuse = 1;
 
 	inet = inet_sk(sk);
+	inet->is_icsk = INET_PROTOSW_ICSK & answer_flags;
 
 	if (SOCK_RAW == sock->type) {
 		inet->num = protocol;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b6b63fa8454..c63868dd2ca 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 			sk_refcnt_debug_dec(sk);
 
 			if (sk->sk_protocol == IPPROTO_TCP) {
-				struct tcp_sock *tp = tcp_sk(sk);
+				struct inet_connection_sock *icsk = inet_csk(sk);
 
 				local_bh_disable();
 				sock_prot_dec_use(sk->sk_prot);
 				sock_prot_inc_use(&tcp_prot);
 				local_bh_enable();
 				sk->sk_prot = &tcp_prot;
-				inet_csk(sk)->icsk_af_ops = &ipv4_specific;
+				icsk->icsk_af_ops = &ipv4_specific;
 				sk->sk_socket->ops = &inet_stream_ops;
 				sk->sk_family = PF_INET;
-				tcp_sync_mss(sk, tp->pmtu_cookie);
+				tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 			} else {
 				local_bh_disable();
 				sock_prot_dec_use(sk->sk_prot);
@@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 		}
 
 		retv = 0;
-		if (sk->sk_type == SOCK_STREAM) {
+		if (inet_sk(sk)->is_icsk) {
 			if (opt) {
-				struct tcp_sock *tp = tcp_sk(sk);
+				struct inet_connection_sock *icsk = inet_csk(sk);
 				if (!((1 << sk->sk_state) &
 				      (TCPF_LISTEN | TCPF_CLOSE))
 				    && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
-					tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
-					tcp_sync_mss(sk, tp->pmtu_cookie);
+					icsk->icsk_ext_hdr_len =
+						opt->opt_flen + opt->opt_nflen;
+					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 				}
 			}
 			opt = xchg(&np->opt, opt);
@@ -380,14 +381,15 @@ sticky_done:
 			goto done;
 update:
 		retv = 0;
-		if (sk->sk_type == SOCK_STREAM) {
+		if (inet_sk(sk)->is_icsk) {
 			if (opt) {
-				struct tcp_sock *tp = tcp_sk(sk);
+				struct inet_connection_sock *icsk = inet_csk(sk);
 				if (!((1 << sk->sk_state) &
 				      (TCPF_LISTEN | TCPF_CLOSE))
 				    && inet_sk(sk)->daddr != LOOPBACK4_IPV6) {
-					tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
-					tcp_sync_mss(sk, tp->pmtu_cookie);
+					icsk->icsk_ext_hdr_len =
+						opt->opt_flen + opt->opt_nflen;
+					icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
 				}
 			}
 			opt = xchg(&np->opt, opt);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a682eb9093e..2947bc56d8a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -123,7 +123,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			  int addr_len)
 {
 	struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
-	struct inet_sock *inet = inet_sk(sk);
+ 	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct in6_addr *saddr = NULL, *final_p = NULL, final;
@@ -198,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	 */
 
 	if (addr_type == IPV6_ADDR_MAPPED) {
-		u32 exthdrlen = tp->ext_header_len;
+		u32 exthdrlen = icsk->icsk_ext_hdr_len;
 		struct sockaddr_in sin;
 
 		SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
@@ -210,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		sin.sin_port = usin->sin6_port;
 		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
-		inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
+		icsk->icsk_af_ops = &ipv6_mapped;
 		sk->sk_backlog_rcv = tcp_v4_do_rcv;
 
 		err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
 
 		if (err) {
-			tp->ext_header_len = exthdrlen;
-			inet_csk(sk)->icsk_af_ops = &ipv6_specific;
+			icsk->icsk_ext_hdr_len = exthdrlen;
+			icsk->icsk_af_ops = &ipv6_specific;
 			sk->sk_backlog_rcv = tcp_v6_do_rcv;
 			goto failure;
 		} else {
@@ -270,9 +271,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	sk->sk_route_caps = dst->dev->features &
 		~(NETIF_F_IP_CSUM | NETIF_F_TSO);
 
-	tp->ext_header_len = 0;
+	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
-		tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen;
+		icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+					  np->opt->opt_nflen);
 
 	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 
@@ -385,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		} else
 			dst_hold(dst);
 
-		if (tp->pmtu_cookie > dst_mtu(dst)) {
+		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
 			tcp_sync_mss(sk, dst_mtu(dst));
 			tcp_simple_retransmit(sk);
 		} /* else let the usual retransmit timer handle it */
@@ -869,7 +871,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		   worked with IPv6 icsk.icsk_af_ops.
 		   Sync it now.
 		 */
-		tcp_sync_mss(newsk, newtp->pmtu_cookie);
+		tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
 
 		return newsk;
 	}
@@ -976,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 			sock_kfree_s(sk, opt, opt->tot_len);
 	}
 
-	newtp->ext_header_len = 0;
+	inet_csk(newsk)->icsk_ext_hdr_len = 0;
 	if (newnp->opt)
-		newtp->ext_header_len = newnp->opt->opt_nflen +
-					newnp->opt->opt_flen;
+		inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
+						     newnp->opt->opt_flen);
 
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
@@ -1361,6 +1363,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 	icsk->icsk_af_ops = &ipv6_specific;
 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+	icsk->icsk_sync_mss = tcp_sync_mss;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
@@ -1591,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = {
 	.ops		=	&inet6_stream_ops,
 	.capability	=	-1,
 	.no_check	=	0,
-	.flags		=	INET_PROTOSW_PERMANENT,
+	.flags		=	INET_PROTOSW_PERMANENT |
+				INET_PROTOSW_ICSK,
 };
 
 void __init tcpv6_init(void)
-- 
cgit v1.2.3


From fbe9cc4a87030d5cad5f944ffaef6af7efd119e4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 13 Dec 2005 23:26:29 -0800
Subject: [AF_UNIX]: Use spinlock for unix_table_lock

This lock is actually taken mostly as a writer,
so using a rwlock actually just makes performance
worse especially on chips like the Intel P4.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h |  2 +-
 net/unix/af_unix.c    | 34 +++++++++++++++++-----------------
 net/unix/garbage.c    |  4 ++--
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index b5d785ab4a0..3f302ae98c0 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -13,7 +13,7 @@ extern void unix_gc(void);
 #define UNIX_HASH_SIZE	256
 
 extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
-extern rwlock_t unix_table_lock;
+extern spinlock_t unix_table_lock;
 
 extern atomic_t unix_tot_inflight;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1dc3685048f..04e850e04e3 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -121,7 +121,7 @@
 int sysctl_unix_max_dgram_qlen = 10;
 
 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
-DEFINE_RWLOCK(unix_table_lock);
+DEFINE_SPINLOCK(unix_table_lock);
 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
@@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0);
 
 /*
  *  SMP locking strategy:
- *    hash table is protected with rwlock unix_table_lock
+ *    hash table is protected with spinlock unix_table_lock
  *    each socket state is protected by separate rwlock.
  */
 
@@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
 
 static inline void unix_remove_socket(struct sock *sk)
 {
-	write_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 	__unix_remove_socket(sk);
-	write_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 }
 
 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 {
-	write_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 	__unix_insert_socket(list, sk);
-	write_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 }
 
 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
@@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
 {
 	struct sock *s;
 
-	read_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 	s = __unix_find_socket_byname(sunname, len, type, hash);
 	if (s)
 		sock_hold(s);
-	read_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 	return s;
 }
 
@@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 	struct sock *s;
 	struct hlist_node *node;
 
-	read_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 	sk_for_each(s, node,
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->dentry;
@@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 	}
 	s = NULL;
 found:
-	read_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 	return s;
 }
 
@@ -642,12 +642,12 @@ retry:
 	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
 	addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
 
-	write_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 	ordernum = (ordernum+1)&0xFFFFF;
 
 	if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
 				      addr->hash)) {
-		write_unlock(&unix_table_lock);
+		spin_unlock(&unix_table_lock);
 		/* Sanity yield. It is unusual case, but yet... */
 		if (!(ordernum&0xFF))
 			yield();
@@ -658,7 +658,7 @@ retry:
 	__unix_remove_socket(sk);
 	u->addr = addr;
 	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
-	write_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 	err = 0;
 
 out:	up(&u->readsem);
@@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		addr->hash = UNIX_HASH_SIZE;
 	}
 
-	write_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 
 	if (!sunaddr->sun_path[0]) {
 		err = -EADDRINUSE;
@@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	__unix_insert_socket(list, sk);
 
 out_unlock:
-	write_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 out_up:
 	up(&u->readsem);
 out:
@@ -1916,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos)
 
 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	read_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 	return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
 }
 
@@ -1931,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 static void unix_seq_stop(struct seq_file *seq, void *v)
 {
-	read_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 }
 
 static int unix_seq_show(struct seq_file *seq, void *v)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 6ffc64e1712..411802bd4d3 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -182,7 +182,7 @@ void unix_gc(void)
 	if (down_trylock(&unix_gc_sem))
 		return;
 
-	read_lock(&unix_table_lock);
+	spin_lock(&unix_table_lock);
 
 	forall_unix_sockets(i, s)
 	{
@@ -301,7 +301,7 @@ void unix_gc(void)
 		}
 		u->gc_tree = GC_ORPHAN;
 	}
-	read_unlock(&unix_table_lock);
+	spin_unlock(&unix_table_lock);
 
 	/*
 	 *	Here we are. Hitlist is filled. Die.
-- 
cgit v1.2.3


From 4505a3ef720845b5db3ddb440de13cd4800fd508 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 18:51:49 -0800
Subject: [BRIDGE]: allow setting hardware address of bridge pseudo-dev

Some people are using bridging to hide multiple machines from an ISP
that restricts by MAC address. So in that case allow the bridge mac
address to be set to any of the existing interfaces.  I don't want to
allow any arbitrary value and confuse STP.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c  | 28 ++++++++++++++++++++++++++--
 net/bridge/br_private.h |  1 +
 net/bridge/br_stp_if.c  |  3 +--
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f564ee99782..f7a66abf5de 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -15,7 +15,8 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
-#include <linux/module.h>
+#include <linux/etherdevice.h>
+
 #include <asm/uaccess.h>
 #include "br_private.h"
 
@@ -82,6 +83,29 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+/* Allow setting mac address of pseudo-bridge to be same as
+ * any of the bound interfaces
+ */
+static int br_set_mac_address(struct net_device *dev, void *p)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct sockaddr *addr = p;
+	struct net_bridge_port *port;
+	int err = -EADDRNOTAVAIL;
+
+	spin_lock_bh(&br->lock);
+	list_for_each_entry(port, &br->port_list, list) {
+		if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) {
+			br_stp_change_bridge_id(br, addr->sa_data);
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock_bh(&br->lock);
+
+	return err;
+}
+
 void br_dev_setup(struct net_device *dev)
 {
 	memset(dev->dev_addr, 0, ETH_ALEN);
@@ -98,6 +122,6 @@ void br_dev_setup(struct net_device *dev)
 	SET_MODULE_OWNER(dev);
 	dev->stop = br_dev_stop;
 	dev->tx_queue_len = 0;
-	dev->set_mac_address = NULL;
+	dev->set_mac_address = br_set_mac_address;
 	dev->priv_flags = IFF_EBRIDGE;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index bdf95a74d8c..2c249486476 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -201,6 +201,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br);
 extern void br_stp_enable_port(struct net_bridge_port *p);
 extern void br_stp_disable_port(struct net_bridge_port *p);
 extern void br_stp_recalculate_bridge_id(struct net_bridge *br);
+extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a);
 extern void br_stp_set_bridge_priority(struct net_bridge *br,
 				       u16 newprio);
 extern void br_stp_set_port_priority(struct net_bridge_port *p,
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index ac09b6a2352..2d2e969ae25 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p)
 }
 
 /* called under bridge lock */
-static void br_stp_change_bridge_id(struct net_bridge *br, 
-				    const unsigned char *addr)
+void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
 {
 	unsigned char oldaddr[6];
 	struct net_bridge_port *p;
-- 
cgit v1.2.3


From 4433f420e57afae0ab308b1e2b979f09c86bc115 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 20 Dec 2005 15:19:51 -0800
Subject: [BRIDGE]: handle speed detection after carrier changes

Speed of a interface may not be available until carrier
is detected in the case of autonegotiation. To get the correct value
we need to recheck speed after carrier event.  But the check needs to
be done in a context that is similar to normal ethtool interface (can sleep).

Also, delay check for 1ms to try avoid any carrier bounce transitions.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_if.c      | 49 ++++++++++++++++++++++++++++++++++++++-----------
 net/bridge/br_notify.c  | 14 +++-----------
 net/bridge/br_private.h |  3 +++
 3 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 975abe254b7..e6b8bb51d61 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -32,9 +32,8 @@
  * ethtool, use ethtool_ops.  Also, since driver might sleep need to
  * not be holding any locks.
  */
-static int br_initial_port_cost(struct net_device *dev)
+static int port_cost(struct net_device *dev)
 {
-
 	struct ethtool_cmd ecmd = { ETHTOOL_GSET };
 	struct ifreq ifr;
 	mm_segment_t old_fs;
@@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev)
 			return 2;
 		case SPEED_10:
 			return 100;
-		default:
-			pr_info("bridge: can't decode speed from %s: %d\n",
-				dev->name, ecmd.speed);
-			return 100;
 		}
 	}
 
@@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev)
 	return 100;	/* assume old 10Mbps */
 }
 
+
+/*
+ * Check for port carrier transistions.
+ * Called from work queue to allow for calling functions that
+ * might sleep (such as speed check), and to debounce.
+ */
+static void port_carrier_check(void *arg)
+{
+	struct net_bridge_port *p = arg;
+
+	rtnl_lock();
+	if (netif_carrier_ok(p->dev)) {
+		u32 cost = port_cost(p->dev);
+
+		spin_lock_bh(&p->br->lock);
+		if (p->state == BR_STATE_DISABLED) {
+			p->path_cost = cost;
+			br_stp_enable_port(p);
+		}
+		spin_unlock_bh(&p->br->lock);
+	} else {
+		spin_lock_bh(&p->br->lock);
+		if (p->state != BR_STATE_DISABLED)
+			br_stp_disable_port(p);
+		spin_unlock_bh(&p->br->lock);
+	}
+	rtnl_unlock();
+}
+
 static void destroy_nbp(struct net_bridge_port *p)
 {
 	struct net_device *dev = p->dev;
@@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p)
 	dev->br_port = NULL;
 	dev_set_promiscuity(dev, -1);
 
+	cancel_delayed_work(&p->carrier_check);
+	flush_scheduled_work();
+
 	spin_lock_bh(&br->lock);
 	br_stp_disable_port(p);
 	spin_unlock_bh(&br->lock);
@@ -195,10 +222,9 @@ static int find_portno(struct net_bridge *br)
 	return (index >= BR_MAX_PORTS) ? -EXFULL : index;
 }
 
-/* called with RTNL */
+/* called with RTNL but without bridge lock */
 static struct net_bridge_port *new_nbp(struct net_bridge *br, 
-				       struct net_device *dev,
-				       unsigned long cost)
+				       struct net_device *dev)
 {
 	int index;
 	struct net_bridge_port *p;
@@ -215,12 +241,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->br = br;
 	dev_hold(dev);
 	p->dev = dev;
-	p->path_cost = cost;
+	p->path_cost = port_cost(dev);
  	p->priority = 0x8000 >> BR_PORT_BITS;
 	dev->br_port = p;
 	p->port_no = index;
 	br_init_port(p);
 	p->state = BR_STATE_DISABLED;
+	INIT_WORK(&p->carrier_check, port_carrier_check, p);
 	kobject_init(&p->kobj);
 
 	return p;
@@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (dev->br_port != NULL)
 		return -EBUSY;
 
-	if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev))))
+	if (IS_ERR(p = new_nbp(br, dev)))
 		return PTR_ERR(p);
 
  	if ((err = br_fdb_insert(br, p, dev->dev_addr)))
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 917311c6828..a43a9c1d50d 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 		br_stp_recalculate_bridge_id(br);
 		break;
 
-	case NETDEV_CHANGE:	/* device is up but carrier changed */
-		if (!(br->dev->flags & IFF_UP))
-			break;
-
-		if (netif_carrier_ok(dev)) {
-			if (p->state == BR_STATE_DISABLED)
-				br_stp_enable_port(p);
-		} else {
-			if (p->state != BR_STATE_DISABLED)
-				br_stp_disable_port(p);
-		}
+	case NETDEV_CHANGE:
+		if (br->dev->flags & IFF_UP)
+			schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE);
 		break;
 
 	case NETDEV_FEAT_CHANGE:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2c249486476..7ad53c2aa68 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -27,6 +27,8 @@
 #define BR_PORT_BITS	10
 #define BR_MAX_PORTS	(1<<BR_PORT_BITS)
 
+#define BR_PORT_DEBOUNCE (HZ/10)
+
 typedef struct bridge_id bridge_id;
 typedef struct mac_addr mac_addr;
 typedef __u16 port_id;
@@ -78,6 +80,7 @@ struct net_bridge_port
 	struct timer_list		hold_timer;
 	struct timer_list		message_age_timer;
 	struct kobject			kobj;
+	struct work_struct		carrier_check;
 	struct rcu_head			rcu;
 };
 
-- 
cgit v1.2.3


From 0e5eabac4995e128f06b90df64b93604625de6de Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:00:18 -0800
Subject: [BRIDGE]: filter packets in learning state

While in the learning state, run filters but drop the result.
This prevents us from acquiring bad fdb entries in learning state.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_input.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index b88220a64cd..c387852f753 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	/* insert into forwarding database after filtering to avoid spoofing */
 	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
 
+	if (p->state == BR_STATE_LEARNING) {
+		kfree_skb(skb);
+		goto out;
+	}
+
 	if (br->dev->flags & IFF_PROMISC) {
 		struct sk_buff *skb2;
 
@@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
 		goto err;
 
-	if (p->state == BR_STATE_LEARNING)
-		br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
-
 	if (p->br->stp_enabled &&
 	    !memcmp(dest, bridge_ula, 5) &&
 	    !(dest[5] & 0xF0)) {
@@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
 				NULL, br_stp_handle_bpdu);
 			return 1;
 		}
+		goto err;
 	}
 
-	else if (p->state == BR_STATE_FORWARDING) {
+	if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) {
 		if (br_should_route_hook) {
 			if (br_should_route_hook(pskb)) 
 				return 0;
-- 
cgit v1.2.3


From edb5e46fc03d0a45f2b41e3717631f7af7e9fc19 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:00:58 -0800
Subject: [BRIDGE]: limited ethtool support

Add limited ethtool support to bridge to allow disabling
features.

Note: if underlying device does not support a feature (like checksum
offload), then the bridge device won't inherit it.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_if.c      |  6 ++---
 net/bridge/br_private.h |  1 +
 3 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index f7a66abf5de..0b33a7b3a00 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/ethtool.h>
 
 #include <asm/uaccess.h>
 #include "br_private.h"
@@ -106,6 +107,64 @@ static int br_set_mac_address(struct net_device *dev, void *p)
 	return err;
 }
 
+static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, "bridge");
+	strcpy(info->version, BR_VERSION);
+	strcpy(info->fw_version, "N/A");
+	strcpy(info->bus_info, "N/A");
+}
+
+static int br_set_sg(struct net_device *dev, u32 data)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	if (data)
+		br->feature_mask |= NETIF_F_SG;
+	else
+		br->feature_mask &= ~NETIF_F_SG;
+
+	br_features_recompute(br);
+	return 0;
+}
+
+static int br_set_tso(struct net_device *dev, u32 data)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	if (data)
+		br->feature_mask |= NETIF_F_TSO;
+	else
+		br->feature_mask &= ~NETIF_F_TSO;
+
+	br_features_recompute(br);
+	return 0;
+}
+
+static int br_set_tx_csum(struct net_device *dev, u32 data)
+{
+	struct net_bridge *br = netdev_priv(dev);
+
+	if (data)
+		br->feature_mask |= NETIF_F_IP_CSUM;
+	else
+		br->feature_mask &= ~NETIF_F_IP_CSUM;
+
+	br_features_recompute(br);
+	return 0;
+}
+
+static struct ethtool_ops br_ethtool_ops = {
+	.get_drvinfo = br_getinfo,
+	.get_link = ethtool_op_get_link,
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = br_set_sg,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = br_set_tx_csum,
+	.get_tso = ethtool_op_get_tso,
+	.set_tso = br_set_tso,
+};
+
 void br_dev_setup(struct net_device *dev)
 {
 	memset(dev->dev_addr, 0, ETH_ALEN);
@@ -120,8 +179,12 @@ void br_dev_setup(struct net_device *dev)
 	dev->change_mtu = br_change_mtu;
 	dev->destructor = free_netdev;
 	SET_MODULE_OWNER(dev);
+ 	SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
 	dev->stop = br_dev_stop;
 	dev->tx_queue_len = 0;
 	dev->set_mac_address = br_set_mac_address;
 	dev->priv_flags = IFF_EBRIDGE;
+
+ 	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
+ 		| NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM;
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index e6b8bb51d61..11321197338 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -182,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name)
 	br->bridge_id.prio[1] = 0x00;
 	memset(br->bridge_id.addr, 0, ETH_ALEN);
 
+	br->feature_mask = dev->features;
 	br->stp_enabled = 0;
 	br->designated_root = br->bridge_id;
 	br->root_path_cost = 0;
@@ -349,9 +350,8 @@ void br_features_recompute(struct net_bridge *br)
 	struct net_bridge_port *p;
 	unsigned long features, checksum;
 
-	features = NETIF_F_SG | NETIF_F_FRAGLIST 
-		| NETIF_F_HIGHDMA | NETIF_F_TSO;
-	checksum = NETIF_F_IP_CSUM;	/* least commmon subset */
+	features = br->feature_mask &~ NETIF_F_IP_CSUM;
+	checksum = br->feature_mask & NETIF_F_IP_CSUM;
 
 	list_for_each_entry(p, &br->port_list, list) {
 		if (!(p->dev->features 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 7ad53c2aa68..db7b26b8de9 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -93,6 +93,7 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 	struct list_head		age_list;
+	unsigned long			feature_mask;
 
 	/* STP */
 	bridge_id			designated_root;
-- 
cgit v1.2.3


From 8cbb512e50fb702b5b1d444f76ebcdb53577b2ec Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:01:30 -0800
Subject: [BRIDGE]: add version number

Add version info to bridge module.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c         | 1 +
 net/bridge/br_private.h | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/net/bridge/br.c b/net/bridge/br.c
index f8f184942aa..188cc1ac49e 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook);
 module_init(br_init)
 module_exit(br_deinit)
 MODULE_LICENSE("GPL");
+MODULE_VERSION(BR_VERSION);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index db7b26b8de9..c5bd631ffcd 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -29,6 +29,8 @@
 
 #define BR_PORT_DEBOUNCE (HZ/10)
 
+#define BR_VERSION	"2.1"
+
 typedef struct bridge_id bridge_id;
 typedef struct mac_addr mac_addr;
 typedef __u16 port_id;
-- 
cgit v1.2.3


From c865e5d99e25a171e8262fc0f7ba608568633c64 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:03:44 -0800
Subject: [PKT_SCHED] netem: packet corruption option

Here is a new feature for netem in 2.6.16. It adds the ability to
randomly corrupt packets with netem. A version was done by
Hagen Paul Pfeifer, but I redid it to handle the cases of backwards
compatibility with netlink interface and presence of hardware checksum
offload. It is useful for testing hardware offload in devices.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_sched.h |  7 +++++++
 net/sched/sch_netem.c     | 49 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index e87b233615b..d10f3533850 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -429,6 +429,7 @@ enum
 	TCA_NETEM_CORR,
 	TCA_NETEM_DELAY_DIST,
 	TCA_NETEM_REORDER,
+	TCA_NETEM_CORRUPT,
 	__TCA_NETEM_MAX,
 };
 
@@ -457,6 +458,12 @@ struct tc_netem_reorder
 	__u32	correlation;
 };
 
+struct tc_netem_corrupt
+{
+	__u32	probability;
+	__u32	correlation;
+};
+
 #define NETEM_DIST_SCALE	8192
 
 #endif
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 82fb07aa06a..ba528320483 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,7 +25,7 @@
 
 #include <net/pkt_sched.h>
 
-#define VERSION "1.1"
+#define VERSION "1.2"
 
 /*	Network Emulation Queuing algorithm.
 	====================================
@@ -65,11 +65,12 @@ struct netem_sched_data {
 	u32 jitter;
 	u32 duplicate;
 	u32 reorder;
+	u32 corrupt;
 
 	struct crndstate {
 		unsigned long last;
 		unsigned long rho;
-	} delay_cor, loss_cor, dup_cor, reorder_cor;
+	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 
 	struct disttable {
 		u32  size;
@@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->duplicate = dupsave;
 	}
 
+	/*
+	 * Randomized packet corruption.
+	 * Make copy if needed since we are modifying
+	 * If packet is going to be hardware checksummed, then
+	 * do it now in software before we mangle it.
+	 */
+	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
+		if (!(skb = skb_unshare(skb, GFP_ATOMIC))
+		    || (skb->ip_summed == CHECKSUM_HW
+			&& skb_checksum_help(skb, 0))) {
+			sch->qstats.drops++;
+			return NET_XMIT_DROP;
+		}
+
+		skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
+	}
+
 	if (q->gap == 0 		/* not doing reordering */
 	    || q->counter < q->gap 	/* inside last reordering gap */
 	    || q->reorder < get_crandom(&q->reorder_cor)) {
@@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
 	return 0;
 }
 
+static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_corrupt *r = RTA_DATA(attr);
+
+	if (RTA_PAYLOAD(attr) != sizeof(*r))
+		return -EINVAL;
+
+	q->corrupt = r->probability;
+	init_crandom(&q->corrupt_cor, r->correlation);
+	return 0;
+}
+
+/* Parse netlink message to set options */
 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 			if (ret)
 				return ret;
 		}
+
 		if (tb[TCA_NETEM_REORDER-1]) {
 			ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
 			if (ret)
 				return ret;
 		}
-	}
 
+		if (tb[TCA_NETEM_CORRUPT-1]) {
+			ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]);
+			if (ret)
+				return ret;
+		}
+	}
 
 	return 0;
 }
@@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct tc_netem_qopt qopt;
 	struct tc_netem_corr cor;
 	struct tc_netem_reorder reorder;
+	struct tc_netem_corrupt corrupt;
 
 	qopt.latency = q->latency;
 	qopt.jitter = q->jitter;
@@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	reorder.correlation = q->reorder_cor.rho;
 	RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
 
+	corrupt.probability = q->corrupt;
+	corrupt.correlation = q->corrupt_cor.rho;
+	RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
+
 	rta->rta_len = skb->tail - b;
 
 	return skb->len;
-- 
cgit v1.2.3


From 3821af2fe13700cab6fd67367128fa180e43f8b8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:30:53 -0800
Subject: [FLS64]: generic version

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/asm-alpha/bitops.h     | 1 +
 include/asm-arm/bitops.h       | 2 ++
 include/asm-arm26/bitops.h     | 1 +
 include/asm-cris/bitops.h      | 1 +
 include/asm-frv/bitops.h       | 1 +
 include/asm-generic/bitops.h   | 1 +
 include/asm-h8300/bitops.h     | 1 +
 include/asm-i386/bitops.h      | 1 +
 include/asm-ia64/bitops.h      | 1 +
 include/asm-m32r/bitops.h      | 1 +
 include/asm-m68k/bitops.h      | 1 +
 include/asm-m68knommu/bitops.h | 1 +
 include/asm-mips/bitops.h      | 2 +-
 include/asm-parisc/bitops.h    | 1 +
 include/asm-powerpc/bitops.h   | 1 +
 include/asm-s390/bitops.h      | 1 +
 include/asm-sh/bitops.h        | 1 +
 include/asm-sh64/bitops.h      | 1 +
 include/asm-sparc/bitops.h     | 1 +
 include/asm-sparc64/bitops.h   | 1 +
 include/asm-v850/bitops.h      | 1 +
 include/asm-x86_64/bitops.h    | 1 +
 include/asm-xtensa/bitops.h    | 1 +
 include/linux/bitops.h         | 9 +++++++++
 24 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h
index 578ed3f1a60..302201f1a09 100644
--- a/include/asm-alpha/bitops.h
+++ b/include/asm-alpha/bitops.h
@@ -321,6 +321,7 @@ static inline int fls(int word)
 #else
 #define fls	generic_fls
 #endif
+#define fls64   generic_fls64
 
 /* Compute powers of two for the given integer.  */
 static inline long floor_log2(unsigned long word)
diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h
index 7399d431edf..d02de721ecc 100644
--- a/include/asm-arm/bitops.h
+++ b/include/asm-arm/bitops.h
@@ -332,6 +332,7 @@ static inline unsigned long __ffs(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * ffs: find first bit set. This is defined the same way as
@@ -351,6 +352,7 @@ static inline unsigned long __ffs(unsigned long word)
 #define fls(x) \
 	( __builtin_constant_p(x) ? generic_fls(x) : \
 	  ({ int __r; asm("clz\t%0, %1" : "=r"(__r) : "r"(x) : "cc"); 32-__r; }) )
+#define fls64(x)   generic_fls64(x)
 #define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
 #define __ffs(x) (ffs(x) - 1)
 #define ffz(x) __ffs( ~(x) )
diff --git a/include/asm-arm26/bitops.h b/include/asm-arm26/bitops.h
index 7d062fb2e34..15cc6f2da79 100644
--- a/include/asm-arm26/bitops.h
+++ b/include/asm-arm26/bitops.h
@@ -259,6 +259,7 @@ static inline unsigned long __ffs(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * ffs: find first bit set. This is defined the same way as
diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h
index 1bddb3f3a28..d3eb0f1e420 100644
--- a/include/asm-cris/bitops.h
+++ b/include/asm-cris/bitops.h
@@ -240,6 +240,7 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN - returns the hamming weight of a N-bit word
diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index b664bd5b666..02be7b3a8a8 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -228,6 +228,7 @@ found_middle:
 							\
 	bit ? 33 - bit : bit;				\
 })
+#define fls64(x)   generic_fls64(x)
 
 /*
  * Every architecture must define this function. It's the fastest
diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
index ce31b739fd8..0e6d9852008 100644
--- a/include/asm-generic/bitops.h
+++ b/include/asm-generic/bitops.h
@@ -56,6 +56,7 @@ extern __inline__ int test_bit(int nr, const unsigned long * addr)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h
index 5036f595f8c..c0411ec9d65 100644
--- a/include/asm-h8300/bitops.h
+++ b/include/asm-h8300/bitops.h
@@ -406,5 +406,6 @@ found_middle:
 #endif /* __KERNEL__ */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* _H8300_BITOPS_H */
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index ddf1739dc7f..4807aa1d2e3 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -372,6 +372,7 @@ static inline unsigned long ffz(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h
index 7232528e2d0..36d0fb95ea8 100644
--- a/include/asm-ia64/bitops.h
+++ b/include/asm-ia64/bitops.h
@@ -345,6 +345,7 @@ fls (int t)
 	x |= x >> 16;
 	return ia64_popcnt(x);
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * ffs: find first bit set. This is defined the same way as the libc and compiler builtin
diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h
index e7844398134..abea2fdd868 100644
--- a/include/asm-m32r/bitops.h
+++ b/include/asm-m32r/bitops.h
@@ -465,6 +465,7 @@ static __inline__ unsigned long __ffs(unsigned long word)
  * fls: find last bit set.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h
index b1bcf7c6651..13f4c004846 100644
--- a/include/asm-m68k/bitops.h
+++ b/include/asm-m68k/bitops.h
@@ -310,6 +310,7 @@ static inline int fls(int x)
 
 	return 32 - cnt;
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * Every architecture must define this function. It's the fastest
diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h
index c42f88a9b9f..4058dd086a0 100644
--- a/include/asm-m68knommu/bitops.h
+++ b/include/asm-m68knommu/bitops.h
@@ -499,5 +499,6 @@ found_middle:
  * fls: find last bit set.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* _M68KNOMMU_BITOPS_H */
diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h
index 5496f9064a6..3b0c8aaf6e8 100644
--- a/include/asm-mips/bitops.h
+++ b/include/asm-mips/bitops.h
@@ -695,7 +695,7 @@ static inline unsigned long fls(unsigned long word)
 
 	return flz(~word) + 1;
 }
-
+#define fls64(x)   generic_fls64(x)
 
 /*
  * find_next_zero_bit - find the first zero bit in a memory region
diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h
index 55b98c67fd8..15d8c2b5158 100644
--- a/include/asm-parisc/bitops.h
+++ b/include/asm-parisc/bitops.h
@@ -263,6 +263,7 @@ static __inline__ int fls(int x)
 
 	return ret;
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h
index 5727229b044..1996eaa8aea 100644
--- a/include/asm-powerpc/bitops.h
+++ b/include/asm-powerpc/bitops.h
@@ -310,6 +310,7 @@ static __inline__ int fls(unsigned int x)
 	asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x));
 	return 32 - lz;
 }
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h
index b07c578b22e..61232760cc3 100644
--- a/include/asm-s390/bitops.h
+++ b/include/asm-s390/bitops.h
@@ -839,6 +839,7 @@ static inline int sched_find_first_bit(unsigned long *b)
  * fls: find last bit set.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h
index 5163d1ff2f1..1c526086004 100644
--- a/include/asm-sh/bitops.h
+++ b/include/asm-sh/bitops.h
@@ -470,6 +470,7 @@ found_middle:
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-sh64/bitops.h b/include/asm-sh64/bitops.h
index e1ff63e0922..ce9c3ad45fe 100644
--- a/include/asm-sh64/bitops.h
+++ b/include/asm-sh64/bitops.h
@@ -510,6 +510,7 @@ found_middle:
 
 #define ffs(x)	generic_ffs(x)
 #define fls(x)	generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h
index bfbd795a0a8..41722b5e45e 100644
--- a/include/asm-sparc/bitops.h
+++ b/include/asm-sparc/bitops.h
@@ -298,6 +298,7 @@ static inline int ffs(int x)
  * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
  */
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 /*
  * hweightN: returns the hamming weight (i.e. the number
diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h
index 6388b8376c5..6efc0162fb0 100644
--- a/include/asm-sparc64/bitops.h
+++ b/include/asm-sparc64/bitops.h
@@ -119,6 +119,7 @@ static inline unsigned long __ffs(unsigned long word)
  */
 
 #define fls(x) generic_fls(x)
+#define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h
index b91e799763f..8955d2376ac 100644
--- a/include/asm-v850/bitops.h
+++ b/include/asm-v850/bitops.h
@@ -276,6 +276,7 @@ found_middle:
 
 #define ffs(x) generic_ffs (x)
 #define fls(x) generic_fls (x)
+#define fls64(x) generic_fls64(x)
 #define __ffs(x) ffs(x)
 
 
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 05a0d374404..94b52c8ce97 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -409,6 +409,7 @@ static __inline__ int ffs(int x)
 
 /* find last set bit */
 #define fls(x) generic_fls(x)
+#define fls64(x) generic_fls64(x)
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-xtensa/bitops.h b/include/asm-xtensa/bitops.h
index e76ee889e21..0a2065f1a37 100644
--- a/include/asm-xtensa/bitops.h
+++ b/include/asm-xtensa/bitops.h
@@ -245,6 +245,7 @@ static __inline__ int fls (unsigned int x)
 {
 	return __cntlz(x);
 }
+#define fls64(x)   generic_fls64(x)
 
 static __inline__ int
 find_next_bit(const unsigned long *addr, int size, int offset)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 38c2fb7ebe0..6a2a19f14bb 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -76,6 +76,15 @@ static __inline__ int generic_fls(int x)
  */
 #include <asm/bitops.h>
 
+
+static inline int generic_fls64(__u64 x)
+{
+	__u32 h = x >> 32;
+	if (h)
+		return fls(x) + 32;
+	return fls(x);
+}
+
 static __inline__ int get_bitmask_order(unsigned int count)
 {
 	int order;
-- 
cgit v1.2.3


From 90933fc8ba5cc9034e3c04ee19938a22b0b4fe4e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:31:36 -0800
Subject: [FLS64]: x86_64 version

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/asm-x86_64/bitops.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index 94b52c8ce97..a4d5d090945 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -340,6 +340,20 @@ static __inline__ unsigned long __ffs(unsigned long word)
 	return word;
 }
 
+/*
+ * __fls: find last bit set.
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static __inline__ unsigned long __fls(unsigned long word)
+{
+	__asm__("bsrq %1,%0"
+		:"=r" (word)
+		:"rm" (word));
+	return word;
+}
+
 #ifdef __KERNEL__
 
 static inline int sched_find_first_bit(const unsigned long *b)
@@ -369,6 +383,19 @@ static __inline__ int ffs(int x)
 	return r+1;
 }
 
+/**
+ * fls64 - find last bit set in 64 bit word
+ * @x: the word to search
+ *
+ * This is defined the same way as fls.
+ */
+static __inline__ int fls64(__u64 x)
+{
+	if (x == 0)
+		return 0;
+	return __fls(x) + 1;
+}
+
 /**
  * hweightN - returns the hamming weight of a N-bit word
  * @x: the word to weigh
@@ -409,7 +436,6 @@ static __inline__ int ffs(int x)
 
 /* find last set bit */
 #define fls(x) generic_fls(x)
-#define fls64(x) generic_fls64(x)
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From 89b3d9aaf46791177c5a5fa07a3ed38a035b5ef5 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:32:08 -0800
Subject: [TCP] cubic: precompute constants

Revised version of patch to pre-compute values for TCP cubic.
  * d32,d64 replaced with descriptive names
  * cube_factor replaces
	 srtt[scaled by count] / HZ * ((1 << (10+2*BICTCP_HZ)) / bic_scale)
  * beta_scale replaces
	8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta);

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cubic.c | 133 ++++++++++++++++++++++-----------------------------
 1 file changed, 57 insertions(+), 76 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index bb5dc4bfb6b..44fd40891ac 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -16,7 +16,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <net/tcp.h>
-
+#include <asm/div64.h>
 
 #define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
 					 * max_cwnd = snd_cwnd * beta
@@ -34,15 +34,20 @@ static int initial_ssthresh = 100;
 static int bic_scale = 41;
 static int tcp_friendliness = 1;
 
+static u32 cube_rtt_scale;
+static u32 beta_scale;
+static u64 cube_factor;
+
+/* Note parameters that are used for precomputing scale factors are read-only */
 module_param(fast_convergence, int, 0644);
 MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
 module_param(max_increment, int, 0644);
 MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
-module_param(beta, int, 0644);
+module_param(beta, int, 0444);
 MODULE_PARM_DESC(beta, "beta for multiplicative increase");
 module_param(initial_ssthresh, int, 0644);
 MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
-module_param(bic_scale, int, 0644);
+module_param(bic_scale, int, 0444);
 MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
 module_param(tcp_friendliness, int, 0644);
 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
@@ -151,65 +156,13 @@ static u32 cubic_root(u64 x)
                 return (u32)end;
 }
 
-static inline u32 bictcp_K(u32 dist, u32 srtt)
-{
-        u64 d64;
-        u32 d32;
-        u32 count;
-        u32 result;
-
-        /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
-           so K = cubic_root( (wmax-cwnd)*rtt/c )
-           the unit of K is bictcp_HZ=2^10, not HZ
-
-           c = bic_scale >> 10
-           rtt = (tp->srtt >> 3 ) / HZ
-
-           the following code has been designed and tested for
-           cwnd < 1 million packets
-           RTT < 100 seconds
-           HZ < 1,000,00  (corresponding to 10 nano-second)
-
-        */
-
-        /* 1/c * 2^2*bictcp_HZ */
-        d32 = (1 << (10+2*BICTCP_HZ)) / bic_scale;
-        d64 = (__u64)d32;
-
-        /* srtt * 2^count / HZ
-           1) to get a better accuracy of the following d32,
-           the larger the "count", the better the accuracy
-           2) and avoid overflow of the following d64
-           the larger the "count", the high possibility of overflow
-           3) so find a "count" between bictcp_hz-3 and bictcp_hz
-           "count" may be less than bictcp_HZ,
-           then d64 becomes 0. that is OK
-        */
-        d32 = srtt;
-        count = 0;
-        while (((d32 & 0x80000000)==0) && (count < BICTCP_HZ)){
-                d32 = d32 << 1;
-                count++;
-        }
-        d32 = d32 / HZ;
-
-        /* (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)  */
-        d64 = (d64 * dist * d32) >> (count+3-BICTCP_HZ);
-
-        /* cubic root */
-        d64 = cubic_root(d64);
-
-        result = (u32)d64;
-        return result;
-}
-
 /*
  * Compute congestion window to use.
  */
 static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 {
-	u64 d64;
-	u32 d32, t, srtt, bic_target, min_cnt, max_cnt;
+	u64 offs;
+	u32 delta, t, bic_target, min_cnt, max_cnt;
 
 	ca->ack_cnt++;	/* count the number of ACKs */
 
@@ -220,8 +173,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 	ca->last_cwnd = cwnd;
 	ca->last_time = tcp_time_stamp;
 
-	srtt = (HZ << 3)/10;	/* use real time-based growth function */
-
 	if (ca->epoch_start == 0) {
 		ca->epoch_start = tcp_time_stamp;	/* record the beginning of an epoch */
 		ca->ack_cnt = 1;			/* start counting */
@@ -231,7 +182,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 			ca->bic_K = 0;
 			ca->bic_origin_point = cwnd;
 		} else {
-			ca->bic_K = bictcp_K(ca->last_max_cwnd-cwnd, srtt);
+			/* Compute new K based on
+			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+			 */
+			ca->bic_K = cubic_root(cube_factor
+					       * (ca->last_max_cwnd - cwnd));
 			ca->bic_origin_point = ca->last_max_cwnd;
 		}
 	}
@@ -239,9 +194,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
         /* cubic function - calc*/
         /* calculate c * time^3 / rtt,
          *  while considering overflow in calculation of time^3
-	 * (so time^3 is done by using d64)
+	 * (so time^3 is done by using 64 bit)
 	 * and without the support of division of 64bit numbers
-	 * (so all divisions are done by using d32)
+	 * (so all divisions are done by using 32 bit)
          *  also NOTE the unit of those veriables
          *	  time  = (t - K) / 2^bictcp_HZ
          *	  c = bic_scale >> 10
@@ -255,18 +210,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 	     << BICTCP_HZ) / HZ;
 
         if (t < ca->bic_K)		/* t - K */
-                d32 = ca->bic_K - t;
+		offs = ca->bic_K - t;
         else
-                d32 = t - ca->bic_K;
+                offs = t - ca->bic_K;
 
-        d64 = (u64)d32;
-        d32 = (bic_scale << 3) * HZ / srtt;			/* 1024*c/rtt */
-        d64 = (d32 * d64 * d64 * d64) >> (10+3*BICTCP_HZ);	/* c/rtt * (t-K)^3 */
-        d32 = (u32)d64;
+	/* c/rtt * (t-K)^3 */
+	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
         if (t < ca->bic_K)                                	/* below origin*/
-                bic_target = ca->bic_origin_point - d32;
+                bic_target = ca->bic_origin_point - delta;
         else                                                	/* above origin*/
-                bic_target = ca->bic_origin_point + d32;
+                bic_target = ca->bic_origin_point + delta;
 
         /* cubic function - calc bictcp_cnt*/
         if (bic_target > cwnd) {
@@ -288,16 +241,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 
 	/* TCP Friendly */
 	if (tcp_friendliness) {
-		u32 scale = 8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta);
-		d32 = (cwnd * scale) >> 3;
-	        while (ca->ack_cnt > d32) {		/* update tcp cwnd */
-	                ca->ack_cnt -= d32;
+		u32 scale = beta_scale;
+		delta = (cwnd * scale) >> 3;
+	        while (ca->ack_cnt > delta) {		/* update tcp cwnd */
+	                ca->ack_cnt -= delta;
         	        ca->tcp_cwnd++;
 		}
 
 		if (ca->tcp_cwnd > cwnd){	/* if bic is slower than tcp */
-			d32 = ca->tcp_cwnd - cwnd;
-			max_cnt = cwnd / d32;
+			delta = ca->tcp_cwnd - cwnd;
+			max_cnt = cwnd / delta;
 			if (ca->cnt > max_cnt)
 				ca->cnt = max_cnt;
 		}
@@ -428,6 +381,34 @@ static struct tcp_congestion_ops cubictcp = {
 static int __init cubictcp_register(void)
 {
 	BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
+
+	/* Precompute a bunch of the scaling factors that are used per-packet
+	 * based on SRTT of 100ms
+	 */
+
+	beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta);
+
+	cube_rtt_scale = (bic_scale << 3) / 10;	/* 1024*c/rtt */
+
+	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
+	 * the unit of K is bictcp_HZ=2^10, not HZ
+	 *
+	 *  c = bic_scale >> 10
+	 *  rtt = 100ms
+	 *
+	 * the following code has been designed and tested for
+	 * cwnd < 1 million packets
+	 * RTT < 100 seconds
+	 * HZ < 1,000,00  (corresponding to 10 nano-second)
+	 */
+
+	/* 1/c * 2^2*bictcp_HZ * srtt */
+	cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */
+
+	/* divide by bic_scale and by constant Srtt (100ms) */
+	do_div(cube_factor, bic_scale * 10);
+
 	return tcp_register_congestion_control(&cubictcp);
 }
 
-- 
cgit v1.2.3


From 9eb2d627190a8afe4b9276b24615a9559504fa60 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 21 Dec 2005 19:32:36 -0800
Subject: [TCP] cubic: use Newton-Raphson

Replace cube root algorithim with a faster version using Newton-Raphson.
Surprisingly, doing the scaled div64_64 is faster than a true 64 bit
division on 64 bit CPU's.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_cubic.c | 93 ++++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 54 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 44fd40891ac..31a4986dfbf 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_
 module_param(tcp_friendliness, int, 0644);
 MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
 
+#include <asm/div64.h>
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -93,67 +94,51 @@ static void bictcp_init(struct sock *sk)
 		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
-/* 65536 times the cubic root */
-static const u64 cubic_table[8]
-	= {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};
+/* 64bit divisor, dividend and result. dynamic precision */
+static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+	u_int32_t d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	/* avoid 64 bit division if possible */
+	if (dividend >> 32)
+		do_div(dividend, d);
+	else
+		dividend = (uint32_t) dividend / d;
+
+	return dividend;
+}
 
 /*
- * calculate the cubic root of x
- * the basic idea is that x can be expressed as i*8^j
- * so cubic_root(x) = cubic_root(i)*2^j
- *  in the following code, x is i, and y is 2^j
- *  because of integer calculation, there are errors in calculation
- *  so finally use binary search to find out the exact solution
+ * calculate the cubic root of x using Newton-Raphson
  */
-static u32 cubic_root(u64 x)
+static u32 cubic_root(u64 a)
 {
-        u64 y, app, target, start, end, mid, start_diff, end_diff;
-
-        if (x == 0)
-                return 0;
+	u32 x, x1;
 
-        target = x;
-
-        /* first estimate lower and upper bound */
-        y = 1;
-        while (x >= 8){
-                x = (x >> 3);
-                y = (y << 1);
-        }
-        start = (y*cubic_table[x])>>16;
-        if (x==7)
-                end = (y<<1);
-        else
-                end = (y*cubic_table[x+1]+65535)>>16;
-
-        /* binary search for more accurate one */
-        while (start < end-1) {
-                mid = (start+end) >> 1;
-                app = mid*mid*mid;
-                if (app < target)
-                        start = mid;
-                else if (app > target)
-                        end = mid;
-                else
-                        return mid;
-        }
+	/* Initial estimate is based on:
+	 * cbrt(x) = exp(log(x) / 3)
+	 */
+	x = 1u << (fls64(a)/3);
 
-        /* find the most accurate one from start and end */
-        app = start*start*start;
-        if (app < target)
-                start_diff = target - app;
-        else
-                start_diff = app - target;
-        app = end*end*end;
-        if (app < target)
-                end_diff = target - app;
-        else
-                end_diff = app - target;
+	/*
+	 * Iteration based on:
+	 *                         2
+	 * x    = ( 2 * x  +  a / x  ) / 3
+	 *  k+1          k         k
+	 */
+	do {
+		x1 = x;
+		x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3;
+	} while (abs(x1 - x) > 1);
 
-        if (start_diff < end_diff)
-                return (u32)start;
-        else
-                return (u32)end;
+	return x;
 }
 
 /*
-- 
cgit v1.2.3


From fd9662555cc35f8bf9242cd7bba8b44ae168a68b Mon Sep 17 00:00:00 2001
From: Robert Olsson <robert.olsson@its.uu.se>
Date: Thu, 22 Dec 2005 11:25:10 -0800
Subject: [IPV4] fib_trie: Add credits.

Signed-off-by: Robert Olsson <robert.olsson@its.uu.se>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 705e3ce86df..a3ed7e13f79 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -41,6 +41,13 @@
  *		modify it under the terms of the GNU General Public License
  *		as published by the Free Software Foundation; either version
  *		2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work comes from:
+ *
+ *		David S. Miller, <davem@davemloft.net>
+ *		Stephen Hemminger <shemminger@osdl.org>
+ *		Paul E. McKenney <paulmck@us.ibm.com>
+ *		Patrick McHardy <kaber@trash.net>
  */
 
 #define VERSION "0.404"
-- 
cgit v1.2.3


From 52ccb8e90c0ace233b8b740f2fc5de0dbd706b27 Mon Sep 17 00:00:00 2001
From: Frank Filz <ffilz@us.ibm.com>
Date: Thu, 22 Dec 2005 11:36:46 -0800
Subject: [SCTP]: Update SCTP_PEER_ADDR_PARAMS socket option to the latest api
 draft.

This patch adds support to set/get heartbeat interval, maximum number of
retransmissions, pathmtu, sackdelay time for a particular transport/
association/socket as per the latest SCTP sockets api draft11.

Signed-off-by: Frank Filz <ffilz@us.ibm.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  76 +++++--
 include/net/sctp/user.h    |  16 ++
 net/sctp/associola.c       |  81 ++++---
 net/sctp/input.c           |  36 +++-
 net/sctp/output.c          |  17 +-
 net/sctp/sm_sideeffect.c   |  26 ++-
 net/sctp/sm_statefuns.c    |  20 +-
 net/sctp/socket.c          | 511 ++++++++++++++++++++++++++++++++++-----------
 net/sctp/transport.c       |  32 +--
 9 files changed, 595 insertions(+), 220 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 8e7794ee27f..f5c22d77fea 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -277,6 +277,24 @@ struct sctp_sock {
 	__u32 default_context;
 	__u32 default_timetolive;
 
+	/* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
+	 * the destination address every heartbeat interval. This value
+	 * will be inherited by all new associations.
+	 */
+	__u32 hbinterval;
+
+	/* This is the max_retrans value for new associations. */
+	__u16 pathmaxrxt;
+
+	/* The initial Path MTU to use for new associations. */
+	__u32 pathmtu;
+
+	/* The default SACK delay timeout for new associations. */
+	__u32 sackdelay;
+
+	/* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */
+	__u32 param_flags;
+
 	struct sctp_initmsg initmsg;
 	struct sctp_rtoinfo rtoinfo;
 	struct sctp_paddrparams paddrparam;
@@ -845,9 +863,6 @@ struct sctp_transport {
 	/* Data that has been sent, but not acknowledged. */
 	__u32 flight_size;
 
-	/* PMTU	      : The current known path MTU.  */
-	__u32 pmtu;
-
 	/* Destination */
 	struct dst_entry *dst;
 	/* Source address. */
@@ -862,7 +877,22 @@ struct sctp_transport {
 	/* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
 	 * the destination address every heartbeat interval.
 	 */
-	int hb_interval;
+	__u32 hbinterval;
+
+	/* This is the max_retrans value for the transport and will
+	 * be initialized from the assocs value.  This can be changed
+	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+	 */
+	__u16 pathmaxrxt;
+
+	/* PMTU	      : The current known path MTU.  */
+	__u32 pathmtu;
+
+	/* SACK delay timeout */
+	__u32 sackdelay;
+
+	/* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */
+	__u32 param_flags;
 
 	/* When was the last time (in jiffies) that we heard from this
 	 * transport?  We use this to pick new active and retran paths.
@@ -882,22 +912,11 @@ struct sctp_transport {
 	 */
 	int state;
 
-	/* hb_allowed  : The current heartbeat state of this destination,
-	 *	       :  i.e. ALLOW-HB, NO-HEARTBEAT, etc.
-	 */
-	int hb_allowed;
-
 	/* These are the error stats for this destination.  */
 
 	/* Error count : The current error count for this destination.	*/
 	unsigned short error_count;
 
-	/* This is the max_retrans value for the transport and will
-	 * be initialized to proto.max_retrans.path.  This can be changed
-	 * using SCTP_SET_PEER_ADDR_PARAMS socket option.
-	 */
-	int max_retrans;
-
 	/* Per	       : A timer used by each destination.
 	 * Destination :
 	 * Timer       :
@@ -1502,6 +1521,28 @@ struct sctp_association {
 	/* The largest timeout or RTO value to use in attempting an INIT */
 	__u16 max_init_timeo;
 
+	/* Heartbeat interval: The endpoint sends out a Heartbeat chunk to
+	 * the destination address every heartbeat interval. This value
+	 * will be inherited by all new transports.
+	 */
+	__u32 hbinterval;
+
+	/* This is the max_retrans value for new transports in the
+	 * association.
+	 */
+	__u16 pathmaxrxt;
+
+	/* Association : The smallest PMTU discovered for all of the
+	 * PMTU	       : peer's transport addresses.
+	 */
+	__u32 pathmtu;
+
+	/* SACK delay timeout */
+	__u32 sackdelay;
+
+	/* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */
+	__u32 param_flags;
+
 	int timeouts[SCTP_NUM_TIMEOUT_TYPES];
 	struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES];
 
@@ -1571,11 +1612,6 @@ struct sctp_association {
 	 */
 	wait_queue_head_t	wait;
 
-	/* Association : The smallest PMTU discovered for all of the
-	 * PMTU	       : peer's transport addresses.
-	 */
-	__u32 pmtu;
-
 	/* The message size at which SCTP fragmentation will occur. */
 	__u32 frag_point;
 
diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index f1c3bc54526..b9052864fa5 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -503,11 +503,27 @@ struct sctp_setadaption {
  *   unreachable. The following structure is used to access and modify an
  *   address's parameters:
  */
+enum  sctp_spp_flags {
+	SPP_HB_ENABLE = 1,		/*Enable heartbeats*/
+	SPP_HB_DISABLE = 2,		/*Disable heartbeats*/
+	SPP_HB = SPP_HB_ENABLE | SPP_HB_DISABLE,
+	SPP_HB_DEMAND = 4,		/*Send heartbeat immediately*/
+	SPP_PMTUD_ENABLE = 8,		/*Enable PMTU discovery*/
+	SPP_PMTUD_DISABLE = 16,		/*Disable PMTU discovery*/
+	SPP_PMTUD = SPP_PMTUD_ENABLE | SPP_PMTUD_DISABLE,
+	SPP_SACKDELAY_ENABLE = 32,	/*Enable SACK*/
+	SPP_SACKDELAY_DISABLE = 64,	/*Disable SACK*/
+	SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE,
+};
+
 struct sctp_paddrparams {
 	sctp_assoc_t		spp_assoc_id;
 	struct sockaddr_storage	spp_address;
 	__u32			spp_hbinterval;
 	__u16			spp_pathmaxrxt;
+	__u32			spp_pathmtu;
+	__u32			spp_sackdelay;
+	__u32			spp_flags;
 } __attribute__((packed, aligned(4)));
 
 /*
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index dec68a60477..9d05e13e92f 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000;
 	asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000)
 					* 1000;
-	asoc->pmtu = 0;
 	asoc->frag_point = 0;
 
 	/* Set the association max_retrans and RTO values from the
@@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 
 	asoc->overall_error_count = 0;
 
+	/* Initialize the association's heartbeat interval based on the
+	 * sock configured value.
+	 */
+	asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
+
+	/* Initialize path max retrans value. */
+	asoc->pathmaxrxt = sp->pathmaxrxt;
+
+	/* Initialize default path MTU. */
+	asoc->pathmtu = sp->pathmtu;
+
+	/* Set association default SACK delay */
+	asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
+
+	/* Set the association default flags controlling
+	 * Heartbeat, SACK delay, and Path MTU Discovery.
+	 */
+	asoc->param_flags = sp->param_flags;
+
 	/* Initialize the maximum mumber of new data packets that can be sent
 	 * in a burst.
 	 */
@@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 		= 5 * asoc->rto_max;
 
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
-	asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
-		SCTP_DEFAULT_TIMEOUT_SACK;
+	asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
 		sp->autoclose * HZ;
 	
@@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 
 	sctp_transport_set_owner(peer, asoc);
 
+	/* Initialize the peer's heartbeat interval based on the
+	 * association configured value.
+	 */
+	peer->hbinterval = asoc->hbinterval;
+
+	/* Set the path max_retrans.  */
+	peer->pathmaxrxt = asoc->pathmaxrxt;
+
+	/* Initialize the peer's SACK delay timeout based on the
+	 * association configured value.
+	 */
+	peer->sackdelay = asoc->sackdelay;
+
+	/* Enable/disable heartbeat, SACK delay, and path MTU discovery
+	 * based on association setting.
+	 */
+	peer->param_flags = asoc->param_flags;
+
 	/* Initialize the pmtu of the transport. */
-	sctp_transport_pmtu(peer);
+	if (peer->param_flags & SPP_PMTUD_ENABLE)
+		sctp_transport_pmtu(peer);
+	else if (asoc->pathmtu)
+		peer->pathmtu = asoc->pathmtu;
+	else
+		peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 
 	/* If this is the first transport addr on this association,
 	 * initialize the association PMTU to the peer's PMTU.
 	 * If not and the current association PMTU is higher than the new
 	 * peer's PMTU, reset the association PMTU to the new peer's PMTU.
 	 */
-	if (asoc->pmtu)
-		asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu);
+	if (asoc->pathmtu)
+		asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
 	else
-		asoc->pmtu = peer->pmtu;
+		asoc->pathmtu = peer->pathmtu;
 
 	SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to "
-			  "%d\n", asoc, asoc->pmtu);
+			  "%d\n", asoc, asoc->pathmtu);
 
-	asoc->frag_point = sctp_frag_point(sp, asoc->pmtu);
+	asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu);
 
 	/* The asoc->peer.port might not be meaningful yet, but
 	 * initialize the packet structure anyway.
@@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	 *   (for example, implementations MAY use the size of the
 	 *   receiver advertised window).
 	 */
-	peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380));
+	peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
 
 	/* At this point, we may not have the receiver's advertised window,
 	 * so initialize ssthresh to the default value and it will be set
@@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	peer->partial_bytes_acked = 0;
 	peer->flight_size = 0;
 
-	/* By default, enable heartbeat for peer address. */
-	peer->hb_allowed = 1;
-
-	/* Initialize the peer's heartbeat interval based on the
-	 * sock configured value.
-	 */
-	peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval);
-
-	/* Set the path max_retrans.  */
-	peer->max_retrans = sp->paddrparam.spp_pathmaxrxt;
-
 	/* Set the transport's RTO.initial value */
 	peer->rto = asoc->rto_initial;
 
@@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
 	/* Get the lowest pmtu of all the transports. */
 	list_for_each(pos, &asoc->peer.transport_addr_list) {
 		t = list_entry(pos, struct sctp_transport, transports);
-		if (!pmtu || (t->pmtu < pmtu))
-			pmtu = t->pmtu;
+		if (!pmtu || (t->pathmtu < pmtu))
+			pmtu = t->pathmtu;
 	}
 
 	if (pmtu) {
 		struct sctp_sock *sp = sctp_sk(asoc->base.sk);
-		asoc->pmtu = pmtu;
+		asoc->pathmtu = pmtu;
 		asoc->frag_point = sctp_frag_point(sp, pmtu);
 	}
 
 	SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n",
-			  __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point);
+			  __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point);
 }
 
 /* Should we send a SACK to update our peer? */
@@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc)
 	case SCTP_STATE_SHUTDOWN_SENT:
 		if ((asoc->rwnd > asoc->a_rwnd) &&
 		    ((asoc->rwnd - asoc->a_rwnd) >=
-		     min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu)))
+		     min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu)))
 			return 1;
 		break;
 	default:
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b24ff2c1aef..238f1bffa68 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 			   struct sctp_transport *t, __u32 pmtu)
 {
-	if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
-		printk(KERN_WARNING "%s: Reported pmtu %d too low, "
-		       "using default minimum of %d\n", __FUNCTION__, pmtu,
-		       SCTP_DEFAULT_MINSEGMENT);
-		pmtu = SCTP_DEFAULT_MINSEGMENT;
-	}
+	if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu))
+		return;
 
-	if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) {
-		t->pmtu = pmtu;
+	if (t->param_flags & SPP_PMTUD_ENABLE) {
+		if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
+			printk(KERN_WARNING "%s: Reported pmtu %d too low, "
+			       "using default minimum of %d\n",
+			       __FUNCTION__, pmtu,
+			       SCTP_DEFAULT_MINSEGMENT);
+			/* Use default minimum segment size and disable
+			 * pmtu discovery on this transport.
+			 */
+			t->pathmtu = SCTP_DEFAULT_MINSEGMENT;
+			t->param_flags = (t->param_flags & ~SPP_HB) |
+				SPP_PMTUD_DISABLE;
+		} else {
+			t->pathmtu = pmtu;
+		}
+
+		/* Update association pmtu. */
 		sctp_assoc_sync_pmtu(asoc);
-		sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
 	}
+
+	/* Retransmit with the new pmtu setting.
+	 * Normally, if PMTU discovery is disabled, an ICMP Fragmentation
+	 * Needed will never be sent, but if a message was sent before
+	 * PMTU discovery was disabled that was larger than the PMTU, it
+	 * would not be fragmented, so it must be re-transmitted fragmented.	 
+	 */
+	sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
 }
 
 /*
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 93137163346..a40991ef72c 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet,
 		goto finish;
 
 	pmtu  = ((packet->transport->asoc) ?
-		 (packet->transport->asoc->pmtu) :
-		 (packet->transport->pmtu));
+		 (packet->transport->asoc->pathmtu) :
+		 (packet->transport->pathmtu));
 
 	too_big = (psize + chunk_len > pmtu);
 
@@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	if (!dst || (dst->obsolete > 1)) {
 		dst_release(dst);
 		sctp_transport_route(tp, NULL, sctp_sk(sk));
-		sctp_assoc_sync_pmtu(asoc);
+		if (asoc->param_flags & SPP_PMTUD_ENABLE) {
+			sctp_assoc_sync_pmtu(asoc);
+		}
 	}
 
 	nskb->dst = dst_clone(tp->dst);
@@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n",
 			  nskb->len);
 
-	(*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
+	if (tp->param_flags & SPP_PMTUD_ENABLE)
+		(*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok);
+	else
+		(*tp->af_specific->sctp_xmit)(nskb, tp, 1);
 
 out:
 	packet->size = packet->overhead;
@@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
 	 * 	if ((flightsize + Max.Burst * MTU) < cwnd)
 	 *		cwnd = flightsize + Max.Burst * MTU
 	 */
-	max_burst_bytes = asoc->max_burst * asoc->pmtu;
+	max_burst_bytes = asoc->max_burst * asoc->pathmtu;
 	if ((transport->flight_size + max_burst_bytes) < transport->cwnd) {
 		transport->cwnd = transport->flight_size + max_burst_bytes;
 		SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: "
@@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet,
 		 * data will fit or delay in hopes of bundling a full
 		 * sized packet.
 		 */
-		if (len < asoc->pmtu - packet->overhead) {
+		if (len < asoc->pathmtu - packet->overhead) {
 			retval = SCTP_XMIT_NAGLE_DELAY;
 			goto finish;
 		}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 823947170a3..2d7d8a5db2a 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
 {
 	__u32 ctsn, max_tsn_seen;
 	struct sctp_chunk *sack;
+	struct sctp_transport *trans = asoc->peer.last_data_from;
 	int error = 0;
 
-	if (force)
+	if (force || 
+	    (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) ||
+	    (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE)))
 		asoc->peer.sack_needed = 1;
 
 	ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
@@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
 	if (!asoc->peer.sack_needed) {
 		/* We will need a SACK for the next packet.  */
 		asoc->peer.sack_needed = 1;
-		goto out;
+
+		/* Set the SACK delay timeout based on the
+		 * SACK delay for the last transport
+		 * data was received from, or the default
+		 * for the association.
+		 */
+		if (trans)
+			asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 
+				trans->sackdelay;
+		else
+			asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 
+				asoc->sackdelay;
+
+		/* Restart the SACK timer. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
 	} else {
 		if (asoc->a_rwnd > asoc->rwnd)
 			asoc->a_rwnd = asoc->rwnd;
@@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
 		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
 				SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
 	}
-out:
+
 	return error;
 nomem:
 	error = -ENOMEM;
@@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 	asoc->overall_error_count++;
 
 	if (transport->state != SCTP_INACTIVE &&
-	    (transport->error_count++ >= transport->max_retrans)) {
+	    (transport->error_count++ >= transport->pathmaxrxt)) {
 		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
 					 " transport IP: port:%d failed.\n",
 					 asoc,
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 475bfb4972d..557a7d90b92 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
 	 * HEARTBEAT is sent (see Section 8.3).
 	 */
 
-	if (transport->hb_allowed) {
+	if (transport->param_flags & SPP_HB_ENABLE) {
 		if (SCTP_DISPOSITION_NOMEM ==
 				sctp_sf_heartbeat(ep, asoc, type, arg,
 						  commands))
@@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
 		return SCTP_DISPOSITION_DISCARD;
 	}
 
-	max_interval = link->hb_interval + link->rto;
+	max_interval = link->hbinterval + link->rto;
 
 	/* Check if the timestamp looks valid.  */
 	if (time_after(hbinfo->sent_at, jiffies) ||
@@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep,
 	 * document allow. However, an SCTP transmitter MUST NOT be
 	 * more aggressive than the following algorithms allow.
 	 */
-	if (chunk->end_of_packet) {
+	if (chunk->end_of_packet)
 		sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
 
-		/* Start the SACK timer.  */
-		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
-				SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
-	}
-
 	return SCTP_DISPOSITION_CONSUME;
 
 discard_force:
@@ -2721,13 +2716,9 @@ discard_force:
 	return SCTP_DISPOSITION_DISCARD;
 
 discard_noforce:
-	if (chunk->end_of_packet) {
+	if (chunk->end_of_packet)
 		sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
 
-		/* Start the SACK timer.  */
-		sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
-				SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
-	}
 	return SCTP_DISPOSITION_DISCARD;
 consume:
 	return SCTP_DISPOSITION_CONSUME;
@@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep,
 	 * send another. 
 	 */
 	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
-	/* Start the SACK timer.  */
-	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
-			SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
 
 	return SCTP_DISPOSITION_CONSUME;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9df888e932c..adb5ee62c2a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1941,106 +1941,275 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
  * address's parameters:
  *
  *  struct sctp_paddrparams {
- *      sctp_assoc_t            spp_assoc_id;
- *      struct sockaddr_storage spp_address;
- *      uint32_t                spp_hbinterval;
- *      uint16_t                spp_pathmaxrxt;
- *  };
- *
- *   spp_assoc_id    - (UDP style socket) This is filled in the application,
- *                     and identifies the association for this query.
+ *     sctp_assoc_t            spp_assoc_id;
+ *     struct sockaddr_storage spp_address;
+ *     uint32_t                spp_hbinterval;
+ *     uint16_t                spp_pathmaxrxt;
+ *     uint32_t                spp_pathmtu;
+ *     uint32_t                spp_sackdelay;
+ *     uint32_t                spp_flags;
+ * };
+ *
+ *   spp_assoc_id    - (one-to-many style socket) This is filled in the
+ *                     application, and identifies the association for
+ *                     this query.
  *   spp_address     - This specifies which address is of interest.
  *   spp_hbinterval  - This contains the value of the heartbeat interval,
- *                     in milliseconds.  A value of 0, when modifying the
- *                     parameter, specifies that the heartbeat on this
- *                     address should be disabled. A value of UINT32_MAX
- *                     (4294967295), when modifying the parameter,
- *                     specifies that a heartbeat should be sent
- *                     immediately to the peer address, and the current
- *                     interval should remain unchanged.
+ *                     in milliseconds.  If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
  *   spp_pathmaxrxt  - This contains the maximum number of
  *                     retransmissions before this address shall be
- *                     considered unreachable.
+ *                     considered unreachable. If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
+ *   spp_pathmtu     - When Path MTU discovery is disabled the value
+ *                     specified here will be the "fixed" path mtu.
+ *                     Note that if the spp_address field is empty
+ *                     then all associations on this address will
+ *                     have this fixed path mtu set upon them.
+ *
+ *   spp_sackdelay   - When delayed sack is enabled, this value specifies
+ *                     the number of milliseconds that sacks will be delayed
+ *                     for. This value will apply to all addresses of an
+ *                     association if the spp_address field is empty. Note
+ *                     also, that if delayed sack is enabled and this
+ *                     value is set to 0, no change is made to the last
+ *                     recorded delayed sack timer value.
+ *
+ *   spp_flags       - These flags are used to control various features
+ *                     on an association. The flag field may contain
+ *                     zero or more of the following options.
+ *
+ *                     SPP_HB_ENABLE  - Enable heartbeats on the
+ *                     specified address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     have heartbeats enabled upon them.
+ *
+ *                     SPP_HB_DISABLE - Disable heartbeats on the
+ *                     speicifed address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     will have their heartbeats disabled. Note also
+ *                     that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ *                     mutually exclusive, only one of these two should
+ *                     be specified. Enabling both fields will have
+ *                     undetermined results.
+ *
+ *                     SPP_HB_DEMAND - Request a user initiated heartbeat
+ *                     to be made immediately.
+ *
+ *                     SPP_PMTUD_ENABLE - This field will enable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address feild is empty then all addresses
+ *                     on the association are effected.
+ *
+ *                     SPP_PMTUD_DISABLE - This field will disable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address feild is empty then all addresses
+ *                     on the association are effected. Not also that
+ *                     SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ *                     exclusive. Enabling both will have undetermined
+ *                     results.
+ *
+ *                     SPP_SACKDELAY_ENABLE - Setting this flag turns
+ *                     on delayed sack. The time specified in spp_sackdelay
+ *                     is used to specify the sack delay for this address. Note
+ *                     that if spp_address is empty then all addresses will
+ *                     enable delayed sack and take on the sack delay
+ *                     value specified in spp_sackdelay.
+ *                     SPP_SACKDELAY_DISABLE - Setting this flag turns
+ *                     off delayed sack. If the spp_address field is blank then
+ *                     delayed sack is disabled for the entire association. Note
+ *                     also that this field is mutually exclusive to
+ *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
+ *                     results.
  */
+int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
+				struct sctp_transport   *trans,
+				struct sctp_association *asoc,
+				struct sctp_sock        *sp,
+				int                      hb_change,
+				int                      pmtud_change,
+				int                      sackdelay_change)
+{
+	int error;
+
+	if (params->spp_flags & SPP_HB_DEMAND && trans) {
+		error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
+		if (error)
+			return error;
+	}
+
+	if (params->spp_hbinterval) {
+		if (trans) {
+			trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
+		} else if (asoc) {
+			asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval);
+		} else {
+			sp->hbinterval = params->spp_hbinterval;
+		}
+	}
+
+	if (hb_change) {
+		if (trans) {
+			trans->param_flags =
+				(trans->param_flags & ~SPP_HB) | hb_change;
+		} else if (asoc) {
+			asoc->param_flags =
+				(asoc->param_flags & ~SPP_HB) | hb_change;
+		} else {
+			sp->param_flags =
+				(sp->param_flags & ~SPP_HB) | hb_change;
+		}
+	}
+
+	if (params->spp_pathmtu) {
+		if (trans) {
+			trans->pathmtu = params->spp_pathmtu;
+			sctp_assoc_sync_pmtu(asoc);
+		} else if (asoc) {
+			asoc->pathmtu = params->spp_pathmtu;
+			sctp_frag_point(sp, params->spp_pathmtu);
+		} else {
+			sp->pathmtu = params->spp_pathmtu;
+		}
+	}
+
+	if (pmtud_change) {
+		if (trans) {
+			int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
+				(params->spp_flags & SPP_PMTUD_ENABLE);
+			trans->param_flags =
+				(trans->param_flags & ~SPP_PMTUD) | pmtud_change;
+			if (update) {
+				sctp_transport_pmtu(trans);
+				sctp_assoc_sync_pmtu(asoc);
+			}
+		} else if (asoc) {
+			asoc->param_flags =
+				(asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
+		} else {
+			sp->param_flags =
+				(sp->param_flags & ~SPP_PMTUD) | pmtud_change;
+		}
+	}
+
+	if (params->spp_sackdelay) {
+		if (trans) {
+			trans->sackdelay =
+				msecs_to_jiffies(params->spp_sackdelay);
+		} else if (asoc) {
+			asoc->sackdelay =
+				msecs_to_jiffies(params->spp_sackdelay);
+		} else {
+			sp->sackdelay = params->spp_sackdelay;
+		}
+	}
+
+	if (sackdelay_change) {
+		if (trans) {
+			trans->param_flags =
+				(trans->param_flags & ~SPP_SACKDELAY) |
+				sackdelay_change;
+		} else if (asoc) {
+			asoc->param_flags =
+				(asoc->param_flags & ~SPP_SACKDELAY) |
+				sackdelay_change;
+		} else {
+			sp->param_flags =
+				(sp->param_flags & ~SPP_SACKDELAY) |
+				sackdelay_change;
+		}
+	}
+
+	if (params->spp_pathmaxrxt) {
+		if (trans) {
+			trans->pathmaxrxt = params->spp_pathmaxrxt;
+		} else if (asoc) {
+			asoc->pathmaxrxt = params->spp_pathmaxrxt;
+		} else {
+			sp->pathmaxrxt = params->spp_pathmaxrxt;
+		}
+	}
+
+	return 0;
+}
+
 static int sctp_setsockopt_peer_addr_params(struct sock *sk,
 					    char __user *optval, int optlen)
 {
-	struct sctp_paddrparams params;
-	struct sctp_transport *trans;
+	struct sctp_paddrparams  params;
+	struct sctp_transport   *trans = NULL;
+	struct sctp_association *asoc = NULL;
+	struct sctp_sock        *sp = sctp_sk(sk);
 	int error;
+	int hb_change, pmtud_change, sackdelay_change;
 
 	if (optlen != sizeof(struct sctp_paddrparams))
-		return -EINVAL;
+		return - EINVAL;
+
 	if (copy_from_user(&params, optval, optlen))
 		return -EFAULT;
 
-	/*
-	 * API 7. Socket Options (setting the default value for the endpoint)
-	 * All options that support specific settings on an association by
-	 * filling in either an association id variable or a sockaddr_storage
-	 * SHOULD also support setting of the same value for the entire endpoint
-	 * (i.e. future associations). To accomplish this the following logic is
-	 * used when setting one of these options:
-
-	 * c) If neither the sockaddr_storage or association identification is
-	 *    set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and
-	 *    the association identification is 0, the settings are a default
-	 *    and to be applied to the endpoint (all future associations).
-	 */
+	/* Validate flags and value parameters. */
+	hb_change        = params.spp_flags & SPP_HB;
+	pmtud_change     = params.spp_flags & SPP_PMTUD;
+	sackdelay_change = params.spp_flags & SPP_SACKDELAY;
+
+	if (hb_change        == SPP_HB ||
+	    pmtud_change     == SPP_PMTUD ||
+	    sackdelay_change == SPP_SACKDELAY ||
+	    params.spp_sackdelay > 500 ||
+	    (params.spp_pathmtu
+	    && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT))
+		return -EINVAL;
 
-	/* update default value for endpoint (all future associations) */
-	if (!params.spp_assoc_id && 
-	    sctp_is_any(( union sctp_addr *)&params.spp_address)) {
-		/* Manual heartbeat on an endpoint is invalid. */
-		if (0xffffffff == params.spp_hbinterval)
+	/* If an address other than INADDR_ANY is specified, and
+	 * no transport is found, then the request is invalid.
+	 */
+	if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+		trans = sctp_addr_id2transport(sk, &params.spp_address,
+					       params.spp_assoc_id);
+		if (!trans)
 			return -EINVAL;
-		else if (params.spp_hbinterval)
-			sctp_sk(sk)->paddrparam.spp_hbinterval =
-						params.spp_hbinterval;
-		if (params.spp_pathmaxrxt)
-			sctp_sk(sk)->paddrparam.spp_pathmaxrxt =
-						params.spp_pathmaxrxt;
-		return 0;
 	}
 
-	trans = sctp_addr_id2transport(sk, &params.spp_address,
-				       params.spp_assoc_id);
-	if (!trans)
+	/* Get association, if assoc_id != 0 and the socket is a one
+	 * to many style socket, and an association was not found, then
+	 * the id was invalid.
+	 */
+	asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+	if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP))
 		return -EINVAL;
 
-	/* Applications can enable or disable heartbeats for any peer address
-	 * of an association, modify an address's heartbeat interval, force a
-	 * heartbeat to be sent immediately, and adjust the address's maximum
-	 * number of retransmissions sent before an address is considered
-	 * unreachable.
-	 *
-	 * The value of the heartbeat interval, in milliseconds. A value of
-	 * UINT32_MAX (4294967295), when modifying the parameter, specifies
-	 * that a heartbeat should be sent immediately to the peer address,
-	 * and the current interval should remain unchanged.
-	 */
-	if (0xffffffff == params.spp_hbinterval) {
-		error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans);
-		if (error)
-			return error;
-	} else {
-	/* The value of the heartbeat interval, in milliseconds. A value of 0,
-	 * when modifying the parameter, specifies that the heartbeat on this
-	 * address should be disabled.
+	/* Heartbeat demand can only be sent on a transport or
+	 * association, but not a socket.
 	 */
-		if (params.spp_hbinterval) {
-			trans->hb_allowed = 1;
-			trans->hb_interval = 
-				msecs_to_jiffies(params.spp_hbinterval);
-		} else
-			trans->hb_allowed = 0;
-	}
+	if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc)
+		return -EINVAL;
+
+	/* Process parameters. */
+	error = sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+					    hb_change, pmtud_change,
+					    sackdelay_change);
 
-	/* spp_pathmaxrxt contains the maximum number of retransmissions
-	 * before this address shall be considered unreachable.
+	if (error)
+		return error;
+
+	/* If changes are for association, also apply parameters to each
+	 * transport.
 	 */
-	if (params.spp_pathmaxrxt)
-		trans->max_retrans = params.spp_pathmaxrxt;
+	if (!trans && asoc) {
+		struct list_head *pos;
+
+		list_for_each(pos, &asoc->peer.transport_addr_list) {
+			trans = list_entry(pos, struct sctp_transport,
+					   transports);
+			sctp_apply_peer_addr_params(&params, trans, asoc, sp,
+						    hb_change, pmtud_change,
+						    sackdelay_change);
+		}
+	}
 
 	return 0;
 }
@@ -2334,7 +2503,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl
 	/* Update the frag_point of the existing associations. */
 	list_for_each(pos, &(sp->ep->asocs)) {
 		asoc = list_entry(pos, struct sctp_association, asocs);
-		asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); 
+		asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); 
 	}
 
 	return 0;
@@ -2715,8 +2884,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	/* Default Peer Address Parameters.  These defaults can
 	 * be modified via SCTP_PEER_ADDR_PARAMS
 	 */
-	sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval);
-	sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path;
+	sp->hbinterval  = jiffies_to_msecs(sctp_hb_interval);
+	sp->pathmaxrxt  = sctp_max_retrans_path;
+	sp->pathmtu     = 0; // allow default discovery
+	sp->sackdelay   = sctp_sack_timeout;
+	sp->param_flags = SPP_HB_ENABLE |
+	                  SPP_PMTUD_ENABLE |
+	                  SPP_SACKDELAY_ENABLE;
 
 	/* If enabled no SCTP message fragmentation will be performed.
 	 * Configure through SCTP_DISABLE_FRAGMENTS socket option.
@@ -2865,7 +3039,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
 	status.sstat_primary.spinfo_cwnd = transport->cwnd;
 	status.sstat_primary.spinfo_srtt = transport->srtt;
 	status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
-	status.sstat_primary.spinfo_mtu = transport->pmtu;
+	status.sstat_primary.spinfo_mtu = transport->pathmtu;
 
 	if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
 		status.sstat_primary.spinfo_state = SCTP_ACTIVE;
@@ -2924,7 +3098,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
 	pinfo.spinfo_cwnd = transport->cwnd;
 	pinfo.spinfo_srtt = transport->srtt;
 	pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
-	pinfo.spinfo_mtu = transport->pmtu;
+	pinfo.spinfo_mtu = transport->pathmtu;
 
 	if (pinfo.spinfo_state == SCTP_UNKNOWN)
 		pinfo.spinfo_state = SCTP_ACTIVE;
@@ -3086,69 +3260,154 @@ out:
  * address's parameters:
  *
  *  struct sctp_paddrparams {
- *      sctp_assoc_t            spp_assoc_id;
- *      struct sockaddr_storage spp_address;
- *      uint32_t                spp_hbinterval;
- *      uint16_t                spp_pathmaxrxt;
- *  };
- *
- *   spp_assoc_id    - (UDP style socket) This is filled in the application,
- *                     and identifies the association for this query.
+ *     sctp_assoc_t            spp_assoc_id;
+ *     struct sockaddr_storage spp_address;
+ *     uint32_t                spp_hbinterval;
+ *     uint16_t                spp_pathmaxrxt;
+ *     uint32_t                spp_pathmtu;
+ *     uint32_t                spp_sackdelay;
+ *     uint32_t                spp_flags;
+ * };
+ *
+ *   spp_assoc_id    - (one-to-many style socket) This is filled in the
+ *                     application, and identifies the association for
+ *                     this query.
  *   spp_address     - This specifies which address is of interest.
  *   spp_hbinterval  - This contains the value of the heartbeat interval,
- *                     in milliseconds.  A value of 0, when modifying the
- *                     parameter, specifies that the heartbeat on this
- *                     address should be disabled. A value of UINT32_MAX
- *                     (4294967295), when modifying the parameter,
- *                     specifies that a heartbeat should be sent
- *                     immediately to the peer address, and the current
- *                     interval should remain unchanged.
+ *                     in milliseconds.  If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
  *   spp_pathmaxrxt  - This contains the maximum number of
  *                     retransmissions before this address shall be
- *                     considered unreachable.
+ *                     considered unreachable. If a  value of zero
+ *                     is present in this field then no changes are to
+ *                     be made to this parameter.
+ *   spp_pathmtu     - When Path MTU discovery is disabled the value
+ *                     specified here will be the "fixed" path mtu.
+ *                     Note that if the spp_address field is empty
+ *                     then all associations on this address will
+ *                     have this fixed path mtu set upon them.
+ *
+ *   spp_sackdelay   - When delayed sack is enabled, this value specifies
+ *                     the number of milliseconds that sacks will be delayed
+ *                     for. This value will apply to all addresses of an
+ *                     association if the spp_address field is empty. Note
+ *                     also, that if delayed sack is enabled and this
+ *                     value is set to 0, no change is made to the last
+ *                     recorded delayed sack timer value.
+ *
+ *   spp_flags       - These flags are used to control various features
+ *                     on an association. The flag field may contain
+ *                     zero or more of the following options.
+ *
+ *                     SPP_HB_ENABLE  - Enable heartbeats on the
+ *                     specified address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     have heartbeats enabled upon them.
+ *
+ *                     SPP_HB_DISABLE - Disable heartbeats on the
+ *                     speicifed address. Note that if the address
+ *                     field is empty all addresses for the association
+ *                     will have their heartbeats disabled. Note also
+ *                     that SPP_HB_ENABLE and SPP_HB_DISABLE are
+ *                     mutually exclusive, only one of these two should
+ *                     be specified. Enabling both fields will have
+ *                     undetermined results.
+ *
+ *                     SPP_HB_DEMAND - Request a user initiated heartbeat
+ *                     to be made immediately.
+ *
+ *                     SPP_PMTUD_ENABLE - This field will enable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address feild is empty then all addresses
+ *                     on the association are effected.
+ *
+ *                     SPP_PMTUD_DISABLE - This field will disable PMTU
+ *                     discovery upon the specified address. Note that
+ *                     if the address feild is empty then all addresses
+ *                     on the association are effected. Not also that
+ *                     SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
+ *                     exclusive. Enabling both will have undetermined
+ *                     results.
+ *
+ *                     SPP_SACKDELAY_ENABLE - Setting this flag turns
+ *                     on delayed sack. The time specified in spp_sackdelay
+ *                     is used to specify the sack delay for this address. Note
+ *                     that if spp_address is empty then all addresses will
+ *                     enable delayed sack and take on the sack delay
+ *                     value specified in spp_sackdelay.
+ *                     SPP_SACKDELAY_DISABLE - Setting this flag turns
+ *                     off delayed sack. If the spp_address field is blank then
+ *                     delayed sack is disabled for the entire association. Note
+ *                     also that this field is mutually exclusive to
+ *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
+ *                     results.
  */
 static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
-						char __user *optval, int __user *optlen)
+					    char __user *optval, int __user *optlen)
 {
-	struct sctp_paddrparams params;
-	struct sctp_transport *trans;
+	struct sctp_paddrparams  params;
+	struct sctp_transport   *trans = NULL;
+	struct sctp_association *asoc = NULL;
+	struct sctp_sock        *sp = sctp_sk(sk);
 
 	if (len != sizeof(struct sctp_paddrparams))
 		return -EINVAL;
+
 	if (copy_from_user(&params, optval, len))
 		return -EFAULT;
 
-	/* If no association id is specified retrieve the default value
-	 * for the endpoint that will be used for all future associations
+	/* If an address other than INADDR_ANY is specified, and
+	 * no transport is found, then the request is invalid.
 	 */
-	if (!params.spp_assoc_id &&
-	    sctp_is_any(( union sctp_addr *)&params.spp_address)) {
-		params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval;
-		params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt;
-
-		goto done;
+	if (!sctp_is_any(( union sctp_addr *)&params.spp_address)) {
+		trans = sctp_addr_id2transport(sk, &params.spp_address,
+					       params.spp_assoc_id);
+		if (!trans) {
+			SCTP_DEBUG_PRINTK("Failed no transport\n");
+			return -EINVAL;
+		}
 	}
 
-	trans = sctp_addr_id2transport(sk, &params.spp_address,
-				       params.spp_assoc_id);
-	if (!trans)
-		return -EINVAL;
-
-	/* The value of the heartbeat interval, in milliseconds. A value of 0,
-	 * when modifying the parameter, specifies that the heartbeat on this
-	 * address should be disabled.
+	/* Get association, if assoc_id != 0 and the socket is a one
+	 * to many style socket, and an association was not found, then
+	 * the id was invalid.
 	 */
-	if (!trans->hb_allowed)
-		params.spp_hbinterval = 0;
-	else
-		params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval);
+	asoc = sctp_id2assoc(sk, params.spp_assoc_id);
+	if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) {
+		SCTP_DEBUG_PRINTK("Failed no association\n");
+		return -EINVAL;
+	}
 
-	/* spp_pathmaxrxt contains the maximum number of retransmissions
-	 * before this address shall be considered unreachable.
-	 */
-	params.spp_pathmaxrxt = trans->max_retrans;
+	if (trans) {
+		/* Fetch transport values. */
+		params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
+		params.spp_pathmtu    = trans->pathmtu;
+		params.spp_pathmaxrxt = trans->pathmaxrxt;
+		params.spp_sackdelay  = jiffies_to_msecs(trans->sackdelay);
+
+		/*draft-11 doesn't say what to return in spp_flags*/
+		params.spp_flags      = trans->param_flags;
+	} else if (asoc) {
+		/* Fetch association values. */
+		params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
+		params.spp_pathmtu    = asoc->pathmtu;
+		params.spp_pathmaxrxt = asoc->pathmaxrxt;
+		params.spp_sackdelay  = jiffies_to_msecs(asoc->sackdelay);
+
+		/*draft-11 doesn't say what to return in spp_flags*/
+		params.spp_flags      = asoc->param_flags;
+	} else {
+		/* Fetch socket values. */
+		params.spp_hbinterval = sp->hbinterval;
+		params.spp_pathmtu    = sp->pathmtu;
+		params.spp_sackdelay  = sp->sackdelay;
+		params.spp_pathmaxrxt = sp->pathmaxrxt;
+
+		/*draft-11 doesn't say what to return in spp_flags*/
+		params.spp_flags      = sp->param_flags;
+	}
 
-done:
 	if (copy_to_user(optval, &params, len))
 		return -EFAULT;
 
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 268ddaf2dc0..68d73e2dd15 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 	peer->init_sent_count = 0;
 
 	peer->state = SCTP_ACTIVE;
-	peer->hb_allowed = 0;
+	peer->param_flags = SPP_HB_DISABLE |
+			    SPP_PMTUD_ENABLE |
+			    SPP_SACKDELAY_ENABLE;
+	peer->hbinterval  = 0;
 
 	/* Initialize the default path max_retrans.  */
-	peer->max_retrans = sctp_max_retrans_path;
+	peer->pathmaxrxt  = sctp_max_retrans_path;
 	peer->error_count = 0;
 
 	INIT_LIST_HEAD(&peer->transmitted);
@@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport)
 	dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL);
 
 	if (dst) {
-		transport->pmtu = dst_mtu(dst);
+		transport->pathmtu = dst_mtu(dst);
 		dst_release(dst);
 	} else
-		transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
+		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 }
 
 /* Caches the dst entry and source address for a transport's destination
@@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport,
 		af->get_saddr(asoc, dst, daddr, &transport->saddr);
 
 	transport->dst = dst;
+	if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
+		return;
+	}
 	if (dst) {
-		transport->pmtu = dst_mtu(dst);
+		transport->pathmtu = dst_mtu(dst);
 
 		/* Initialize sk->sk_rcv_saddr, if the transport is the
 		 * association's active path for getsockname().
@@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport,
 			opt->pf->af->to_sk_saddr(&transport->saddr,
 						 asoc->base.sk);
 	} else
-		transport->pmtu = SCTP_DEFAULT_MAXSEGMENT;
+		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 }
 
 /* Hold a reference to a transport.  */
@@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport,
 
 	ssthresh = transport->ssthresh;
 	pba = transport->partial_bytes_acked;
-	pmtu = transport->asoc->pmtu;
+	pmtu = transport->asoc->pathmtu;
 
 	if (cwnd <= ssthresh) {
 		/* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less
@@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
 		 *      partial_bytes_acked = 0
 		 */
 		transport->ssthresh = max(transport->cwnd/2,
-					  4*transport->asoc->pmtu);
-		transport->cwnd = transport->asoc->pmtu;
+					  4*transport->asoc->pathmtu);
+		transport->cwnd = transport->asoc->pathmtu;
 		break;
 
 	case SCTP_LOWER_CWND_FAST_RTX:
@@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
 		 *      partial_bytes_acked = 0
 		 */
 		transport->ssthresh = max(transport->cwnd/2,
-					  4*transport->asoc->pmtu);
+					  4*transport->asoc->pathmtu);
 		transport->cwnd = transport->ssthresh;
 		break;
 
@@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
 		if ((jiffies - transport->last_time_ecne_reduced) >
 		    transport->rtt) {
 			transport->ssthresh = max(transport->cwnd/2,
-					  	  4*transport->asoc->pmtu);
+					  	  4*transport->asoc->pathmtu);
 			transport->cwnd = transport->ssthresh;
 			transport->last_time_ecne_reduced = jiffies;
 		}
@@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
 		 */
 		if ((jiffies - transport->last_time_used) > transport->rto)
 			transport->cwnd = max(transport->cwnd/2,
-						 4*transport->asoc->pmtu);
+						 4*transport->asoc->pathmtu);
 		break;
 	};
 
@@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
 unsigned long sctp_transport_timeout(struct sctp_transport *t)
 {
 	unsigned long timeout;
-	timeout = t->hb_interval + t->rto + sctp_jitter(t->rto);
+	timeout = t->hbinterval + t->rto + sctp_jitter(t->rto);
 	timeout += jiffies;
 	return timeout;
 }
-- 
cgit v1.2.3


From 7708610b1bff4a0ba8a73733d3c7c4bda9f94b21 Mon Sep 17 00:00:00 2001
From: Frank Filz <ffilz@us.ibm.com>
Date: Thu, 22 Dec 2005 11:37:30 -0800
Subject: [SCTP]: Add support for SCTP_DELAYED_ACK_TIME socket option.

Signed-off-by: Frank Filz <ffilz@us.ibm.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/user.h |  14 ++++
 net/sctp/socket.c       | 184 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+)

diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h
index b9052864fa5..8a6bef6f91e 100644
--- a/include/net/sctp/user.h
+++ b/include/net/sctp/user.h
@@ -93,6 +93,8 @@ enum sctp_optname {
 #define SCTP_STATUS SCTP_STATUS
 	SCTP_GET_PEER_ADDR_INFO,
 #define SCTP_GET_PEER_ADDR_INFO SCTP_GET_PEER_ADDR_INFO
+	SCTP_DELAYED_ACK_TIME,
+#define SCTP_DELAYED_ACK_TIME SCTP_DELAYED_ACK_TIME
 
 	/* Internal Socket Options. Some of the sctp library functions are 
 	 * implemented using these socket options.
@@ -526,6 +528,18 @@ struct sctp_paddrparams {
 	__u32			spp_flags;
 } __attribute__((packed, aligned(4)));
 
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ *   This options will get or set the delayed ack timer.  The time is set
+ *   in milliseconds.  If the assoc_id is 0, then this sets or gets the
+ *   endpoints default delayed ack timer value.  If the assoc_id field is
+ *   non-zero, then the set or get effects the specified association.
+ */
+struct sctp_assoc_value {
+    sctp_assoc_t            assoc_id;
+    uint32_t                assoc_value;
+};
+
 /*
  * 7.2.2 Peer Address Information
  *
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index adb5ee62c2a..fc04d185fa3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2214,6 +2214,109 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk,
 	return 0;
 }
 
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ *   This options will get or set the delayed ack timer.  The time is set
+ *   in milliseconds.  If the assoc_id is 0, then this sets or gets the
+ *   endpoints default delayed ack timer value.  If the assoc_id field is
+ *   non-zero, then the set or get effects the specified association.
+ *
+ *   struct sctp_assoc_value {
+ *       sctp_assoc_t            assoc_id;
+ *       uint32_t                assoc_value;
+ *   };
+ *
+ *     assoc_id    - This parameter, indicates which association the
+ *                   user is preforming an action upon. Note that if
+ *                   this field's value is zero then the endpoints
+ *                   default value is changed (effecting future
+ *                   associations only).
+ *
+ *     assoc_value - This parameter contains the number of milliseconds
+ *                   that the user is requesting the delayed ACK timer
+ *                   be set to. Note that this value is defined in
+ *                   the standard to be between 200 and 500 milliseconds.
+ *
+ *                   Note: a value of zero will leave the value alone,
+ *                   but disable SACK delay. A non-zero value will also
+ *                   enable SACK delay.
+ */
+
+static int sctp_setsockopt_delayed_ack_time(struct sock *sk,
+					    char __user *optval, int optlen)
+{
+	struct sctp_assoc_value  params;
+	struct sctp_transport   *trans = NULL;
+	struct sctp_association *asoc = NULL;
+	struct sctp_sock        *sp = sctp_sk(sk);
+
+	if (optlen != sizeof(struct sctp_assoc_value))
+		return - EINVAL;
+
+	if (copy_from_user(&params, optval, optlen))
+		return -EFAULT;
+
+	/* Validate value parameter. */
+	if (params.assoc_value > 500)
+		return -EINVAL;
+
+	/* Get association, if assoc_id != 0 and the socket is a one
+	 * to many style socket, and an association was not found, then
+	 * the id was invalid.
+ 	 */
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	if (params.assoc_value) {
+		if (asoc) {
+			asoc->sackdelay =
+				msecs_to_jiffies(params.assoc_value);
+			asoc->param_flags = 
+				(asoc->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_ENABLE;
+		} else {
+			sp->sackdelay = params.assoc_value;
+			sp->param_flags = 
+				(sp->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_ENABLE;
+		}
+	} else {
+		if (asoc) {
+			asoc->param_flags = 
+				(asoc->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_DISABLE;
+		} else {
+			sp->param_flags = 
+				(sp->param_flags & ~SPP_SACKDELAY) |
+				SPP_SACKDELAY_DISABLE;
+		}
+	}
+
+	/* If change is for association, also apply to each transport. */
+	if (asoc) {
+		struct list_head *pos;
+
+		list_for_each(pos, &asoc->peer.transport_addr_list) {
+			trans = list_entry(pos, struct sctp_transport,
+					   transports);
+			if (params.assoc_value) {
+				trans->sackdelay =
+					msecs_to_jiffies(params.assoc_value);
+				trans->param_flags = 
+					(trans->param_flags & ~SPP_SACKDELAY) |
+					SPP_SACKDELAY_ENABLE;
+			} else {
+				trans->param_flags = 
+					(trans->param_flags & ~SPP_SACKDELAY) |
+					SPP_SACKDELAY_DISABLE;
+			}
+		}
+	}
+ 
+	return 0;
+}
+
 /* 7.1.3 Initialization Parameters (SCTP_INITMSG)
  *
  * Applications can specify protocol parameters for the default association
@@ -2660,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen);
 		break;
 
+	case SCTP_DELAYED_ACK_TIME:
+		retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen);
+		break;
+
 	case SCTP_INITMSG:
 		retval = sctp_setsockopt_initmsg(sk, optval, optlen);
 		break;
@@ -3417,6 +3524,79 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 	return 0;
 }
 
+/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME)
+ *
+ *   This options will get or set the delayed ack timer.  The time is set
+ *   in milliseconds.  If the assoc_id is 0, then this sets or gets the
+ *   endpoints default delayed ack timer value.  If the assoc_id field is
+ *   non-zero, then the set or get effects the specified association.
+ *
+ *   struct sctp_assoc_value {
+ *       sctp_assoc_t            assoc_id;
+ *       uint32_t                assoc_value;
+ *   };
+ *
+ *     assoc_id    - This parameter, indicates which association the
+ *                   user is preforming an action upon. Note that if
+ *                   this field's value is zero then the endpoints
+ *                   default value is changed (effecting future
+ *                   associations only).
+ *
+ *     assoc_value - This parameter contains the number of milliseconds
+ *                   that the user is requesting the delayed ACK timer
+ *                   be set to. Note that this value is defined in
+ *                   the standard to be between 200 and 500 milliseconds.
+ *
+ *                   Note: a value of zero will leave the value alone,
+ *                   but disable SACK delay. A non-zero value will also
+ *                   enable SACK delay.
+ */
+static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len,
+					    char __user *optval,
+					    int __user *optlen)
+{
+	struct sctp_assoc_value  params;
+	struct sctp_association *asoc = NULL;
+	struct sctp_sock        *sp = sctp_sk(sk);
+
+	if (len != sizeof(struct sctp_assoc_value))
+		return - EINVAL;
+
+	if (copy_from_user(&params, optval, len))
+		return -EFAULT;
+
+	/* Get association, if assoc_id != 0 and the socket is a one
+	 * to many style socket, and an association was not found, then
+	 * the id was invalid.
+ 	 */
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc && params.assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	if (asoc) {
+		/* Fetch association values. */
+		if (asoc->param_flags & SPP_SACKDELAY_ENABLE)
+			params.assoc_value = jiffies_to_msecs(
+				asoc->sackdelay);
+		else
+			params.assoc_value = 0;
+	} else {
+		/* Fetch socket values. */
+		if (sp->param_flags & SPP_SACKDELAY_ENABLE)
+			params.assoc_value  = sp->sackdelay;
+		else
+			params.assoc_value  = 0;
+	}
+
+	if (copy_to_user(optval, &params, len))
+		return -EFAULT;
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+
+	return 0;
+}
+
 /* 7.1.3 Initialization Parameters (SCTP_INITMSG)
  *
  * Applications can specify protocol parameters for the default association
@@ -4274,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
 							  optlen);
 		break;
+	case SCTP_DELAYED_ACK_TIME:
+		retval = sctp_getsockopt_delayed_ack_time(sk, len, optval,
+							  optlen);
+		break;
 	case SCTP_INITMSG:
 		retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
 		break;
-- 
cgit v1.2.3


From 77d76ea310b50a9c8ff15bd290fcb4ed4961adf2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 22 Dec 2005 12:43:42 -0800
Subject: [NET]: Small cleanup to socket initialization

sock_init can be done as a core_initcall instead of calling
it directly in init/main.c

Also I removed an out of date #ifdef.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  1 -
 include/linux/socket.h |  1 -
 init/main.c            |  4 ----
 net/socket.c           | 10 +++++-----
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 97f6580ce03..971677178e0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -32,7 +32,6 @@
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
-#define SLAB_SKB 		/* Slabified skbuffs 	   */
 
 #define CHECKSUM_NONE 0
 #define CHECKSUM_HW 1
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 1739c2d5b95..9f4019156fd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -27,7 +27,6 @@ struct __kernel_sockaddr_storage {
 #include <linux/compiler.h>		/* __user			*/
 
 extern int sysctl_somaxconn;
-extern void sock_init(void);
 #ifdef CONFIG_PROC_FS
 struct seq_file;
 extern void socket_seq_show(struct seq_file *seq);
diff --git a/init/main.c b/init/main.c
index 27f97f9b463..54aaf561cf6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -47,7 +47,6 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
-#include <net/sock.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -614,9 +613,6 @@ static void __init do_basic_setup(void)
 	sysctl_init();
 #endif
 
-	/* Networking initialization needs a process context */ 
-	sock_init();
-
 	do_initcalls();
 }
 
diff --git a/net/socket.c b/net/socket.c
index 3145103cdf5..98be7ef3c08 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2036,7 +2036,7 @@ int sock_unregister(int family)
 	return 0;
 }
 
-void __init sock_init(void)
+static int __init sock_init(void)
 {
 	/*
 	 *	Initialize sock SLAB cache.
@@ -2044,12 +2044,10 @@ void __init sock_init(void)
 	 
 	sk_init();
 
-#ifdef SLAB_SKB
 	/*
 	 *	Initialize skbuff SLAB cache 
 	 */
 	skb_init();
-#endif
 
 	/*
 	 *	Initialize the protocols module. 
@@ -2058,8 +2056,8 @@ void __init sock_init(void)
 	init_inodecache();
 	register_filesystem(&sock_fs_type);
 	sock_mnt = kern_mount(&sock_fs_type);
-	/* The real protocol initialization is performed when
-	 *  do_initcalls is run.  
+
+	/* The real protocol initialization is performed in later initcalls.
 	 */
 
 #ifdef CONFIG_NETFILTER
@@ -2067,6 +2065,8 @@ void __init sock_init(void)
 #endif
 }
 
+core_initcall(sock_init);	/* early initcall */
+
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-- 
cgit v1.2.3


From 90ddc4f0470427df306f308ad03db6b6b21644b8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 22 Dec 2005 12:49:22 -0800
Subject: [NET]: move struct proto_ops to const

I noticed that some of 'struct proto_ops' used in the kernel may share
a cache line used by locks or other heavily modified data. (default
linker alignement is 32 bytes, and L1_CACHE_LINE is 64 or 128 at
least)

This patch makes sure a 'struct proto_ops' can be declared as const,
so that all cpus can share all parts of it without false sharing.

This is not mandatory : a driver can still use a read/write structure
if it needs to (and eventually a __read_mostly)

I made a global stubstitute to change all existing occurences to make
them const.

This should reduce the possibility of false sharing on SMP, and
speedup some socket system calls.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h         |  4 ++--
 include/net/inet_common.h   |  4 ++--
 include/net/ipv6.h          |  4 ++--
 include/net/protocol.h      |  2 +-
 net/appletalk/ddp.c         |  4 ++--
 net/atm/pvc.c               |  2 +-
 net/atm/svc.c               |  2 +-
 net/ax25/af_ax25.c          |  4 ++--
 net/bluetooth/bnep/sock.c   |  2 +-
 net/bluetooth/cmtp/sock.c   |  2 +-
 net/bluetooth/hci_sock.c    |  2 +-
 net/bluetooth/hidp/sock.c   |  2 +-
 net/bluetooth/l2cap.c       |  4 ++--
 net/bluetooth/rfcomm/sock.c |  4 ++--
 net/bluetooth/sco.c         |  4 ++--
 net/dccp/proto.c            |  2 +-
 net/decnet/af_decnet.c      |  4 ++--
 net/econet/af_econet.c      |  4 ++--
 net/ipv4/af_inet.c          |  6 +++---
 net/ipv6/af_inet6.c         |  6 +++---
 net/ipx/af_ipx.c            |  4 ++--
 net/irda/af_irda.c          | 16 ++++++++--------
 net/key/af_key.c            |  4 ++--
 net/llc/af_llc.c            |  4 ++--
 net/netlink/af_netlink.c    |  4 ++--
 net/netrom/af_netrom.c      |  4 ++--
 net/packet/af_packet.c      |  8 ++++----
 net/sctp/ipv6.c             |  2 +-
 net/sctp/protocol.c         |  2 +-
 net/sunrpc/svcsock.c        |  2 +-
 net/unix/af_unix.c          |  6 +++---
 net/wanrouter/af_wanpipe.c  |  4 ++--
 net/x25/af_x25.c            |  4 ++--
 33 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index d6a41e6577f..28195a2d8ff 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -107,7 +107,7 @@ enum sock_type {
 struct socket {
 	socket_state		state;
 	unsigned long		flags;
-	struct proto_ops	*ops;
+	const struct proto_ops	*ops;
 	struct fasync_struct	*fasync_list;
 	struct file		*file;
 	struct sock		*sk;
@@ -260,7 +260,7 @@ SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct ms
 SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \
 	      (file, sock, vma)) \
 	      \
-static struct proto_ops name##_ops = {			\
+static const struct proto_ops name##_ops = {			\
 	.family		= fam,				\
 	.owner		= THIS_MODULE,			\
 	.release	= __lock_##name##_release,	\
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index f943306ce5f..227adcbdfec 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -1,8 +1,8 @@
 #ifndef _INET_COMMON_H
 #define _INET_COMMON_H
 
-extern struct proto_ops		inet_stream_ops;
-extern struct proto_ops		inet_dgram_ops;
+extern const struct proto_ops		inet_stream_ops;
+extern const struct proto_ops		inet_dgram_ops;
 
 /*
  *	INET4 prototypes used by INET6
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index e3d5d7bc883..11a725662c3 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -538,8 +538,8 @@ extern int sysctl_ip6frag_low_thresh;
 extern int sysctl_ip6frag_time;
 extern int sysctl_ip6frag_secret_interval;
 
-extern struct proto_ops inet6_stream_ops;
-extern struct proto_ops inet6_dgram_ops;
+extern const struct proto_ops inet6_stream_ops;
+extern const struct proto_ops inet6_dgram_ops;
 
 extern int ip6_mc_source(int add, int omode, struct sock *sk,
 			 struct group_source_req *pgsr);
diff --git a/include/net/protocol.h b/include/net/protocol.h
index a29cb29647d..63f7db99c2a 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -65,7 +65,7 @@ struct inet_protosw {
 	int		 protocol; /* This is the L4 protocol number.  */
 
 	struct proto	 *prot;
-	struct proto_ops *ops;
+	const struct proto_ops *ops;
   
 	int              capability; /* Which (if any) capability do
 				      * we need to use this socket
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 7982656b9c8..296f186802f 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -63,7 +63,7 @@
 #include <linux/atalk.h>
 
 struct datalink_proto *ddp_dl, *aarp_dl;
-static struct proto_ops atalk_dgram_ops;
+static const struct proto_ops atalk_dgram_ops;
 
 /**************************************************************************\
 *                                                                          *
@@ -1841,7 +1841,7 @@ static struct net_proto_family atalk_family_ops = {
 	.owner		= THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = {
 	.family		= PF_APPLETALK,
 	.owner		= THIS_MODULE,
 	.release	= atalk_release,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2684a92da22..f2c541774dc 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr,
 }
 
 
-static struct proto_ops pvc_proto_ops = {
+static const struct proto_ops pvc_proto_ops = {
 	.family =	PF_ATMPVC,
 	.owner =	THIS_MODULE,
 
diff --git a/net/atm/svc.c b/net/atm/svc.c
index d7b266136bf..3a180cfd7b4 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return error;
 }
 
-static struct proto_ops svc_proto_ops = {
+static const struct proto_ops svc_proto_ops = {
 	.family =	PF_ATMSVC,
 	.owner =	THIS_MODULE,
 
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 1b683f30265..8b5d10aaba0 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -54,7 +54,7 @@
 HLIST_HEAD(ax25_list);
 DEFINE_SPINLOCK(ax25_list_lock);
 
-static struct proto_ops ax25_proto_ops;
+static const struct proto_ops ax25_proto_ops;
 
 static void ax25_free_sock(struct sock *sk)
 {
@@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops ax25_proto_ops = {
+static const struct proto_ops ax25_proto_ops = {
 	.family		= PF_AX25,
 	.owner		= THIS_MODULE,
 	.release	= ax25_release,
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 9778c6acd53..ccbaf69afc5 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return 0;
 }
 
-static struct proto_ops bnep_sock_ops = {
+static const struct proto_ops bnep_sock_ops = {
 	.family     = PF_BLUETOOTH,
 	.owner      = THIS_MODULE,
 	.release    = bnep_sock_release,
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index beb045bf571..5e22343b609 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return -EINVAL;
 }
 
-static struct proto_ops cmtp_sock_ops = {
+static const struct proto_ops cmtp_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= cmtp_sock_release,
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1d6d0a15c09..84e6c93a044 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char
 	return 0;
 }
 
-static struct proto_ops hci_sock_ops = {
+static const struct proto_ops hci_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= hci_sock_release,
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index f8986f88143..8f8dd931b29 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
 	return -EINVAL;
 }
 
-static struct proto_ops hidp_sock_ops = {
+static const struct proto_ops hidp_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= hidp_sock_release,
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 95f33cc7a24..7f0781e4326 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -57,7 +57,7 @@
 
 #define VERSION "2.8"
 
-static struct proto_ops l2cap_sock_ops;
+static const struct proto_ops l2cap_sock_ops;
 
 static struct bt_sock_list l2cap_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -2161,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
 
-static struct proto_ops l2cap_sock_ops = {
+static const struct proto_ops l2cap_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= l2cap_sock_release,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 6c34261b232..757d2dd3b02 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -58,7 +58,7 @@
 #define BT_DBG(D...)
 #endif
 
-static struct proto_ops rfcomm_sock_ops;
+static const struct proto_ops rfcomm_sock_ops;
 
 static struct bt_sock_list rfcomm_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
 
-static struct proto_ops rfcomm_sock_ops = {
+static const struct proto_ops rfcomm_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= rfcomm_sock_release,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 64818143069..6b61323ce23 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -56,7 +56,7 @@
 
 #define VERSION "0.5"
 
-static struct proto_ops sco_sock_ops;
+static const struct proto_ops sco_sock_ops;
 
 static struct bt_sock_list sco_sk_list = {
 	.lock = RW_LOCK_UNLOCKED
@@ -914,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf)
 
 static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
 
-static struct proto_ops sco_sock_ops = {
+static const struct proto_ops sco_sock_ops = {
 	.family		= PF_BLUETOOTH,
 	.owner		= THIS_MODULE,
 	.release	= sco_sock_release,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 40a4c689905..e4e629ed9bf 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -680,7 +680,7 @@ void dccp_shutdown(struct sock *sk, int how)
 
 EXPORT_SYMBOL_GPL(dccp_shutdown);
 
-static struct proto_ops inet_dccp_ops = {
+static const struct proto_ops inet_dccp_ops = {
 	.family		= PF_INET,
 	.owner		= THIS_MODULE,
 	.release	= inet_release,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d402e9020c6..65e3baed025 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk);
 #define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
 
 
-static struct proto_ops dn_proto_ops;
+static const struct proto_ops dn_proto_ops;
 static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
@@ -2342,7 +2342,7 @@ static struct net_proto_family	dn_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops dn_proto_ops = {
+static const struct proto_ops dn_proto_ops = {
 	.family =	AF_DECnet,
 	.owner =	THIS_MODULE,
 	.release =	dn_release,
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 34fdac51df9..ff58f49c8b4 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -45,7 +45,7 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
-static struct proto_ops econet_ops;
+static const struct proto_ops econet_ops;
 static struct hlist_head econet_sklist;
 static DEFINE_RWLOCK(econet_lock);
 
@@ -698,7 +698,7 @@ static struct net_proto_family econet_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = {
 	.family =	PF_ECONET,
 	.owner =	THIS_MODULE,
 	.release =	econet_release,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 617e858beff..4ed8a814c6c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -785,7 +785,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-struct proto_ops inet_stream_ops = {
+const struct proto_ops inet_stream_ops = {
 	.family =	PF_INET,
 	.owner =	THIS_MODULE,
 	.release =	inet_release,
@@ -806,7 +806,7 @@ struct proto_ops inet_stream_ops = {
 	.sendpage =	tcp_sendpage
 };
 
-struct proto_ops inet_dgram_ops = {
+const struct proto_ops inet_dgram_ops = {
 	.family =	PF_INET,
 	.owner =	THIS_MODULE,
 	.release =	inet_release,
@@ -831,7 +831,7 @@ struct proto_ops inet_dgram_ops = {
  * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
  * udp_poll
  */
-static struct proto_ops inet_sockraw_ops = {
+static const struct proto_ops inet_sockraw_ops = {
 	.family =	PF_INET,
 	.owner =	THIS_MODULE,
 	.release =	inet_release,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 70a510ff31e..7c9f19269f2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -462,7 +462,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return(0);
 }
 
-struct proto_ops inet6_stream_ops = {
+const struct proto_ops inet6_stream_ops = {
 	.family =	PF_INET6,
 	.owner =	THIS_MODULE,
 	.release =	inet6_release,
@@ -483,7 +483,7 @@ struct proto_ops inet6_stream_ops = {
 	.sendpage =	tcp_sendpage
 };
 
-struct proto_ops inet6_dgram_ops = {
+const struct proto_ops inet6_dgram_ops = {
 	.family =	PF_INET6,
 	.owner =	THIS_MODULE,
 	.release =	inet6_release,
@@ -511,7 +511,7 @@ static struct net_proto_family inet6_family_ops = {
 };
 
 /* Same as inet6_dgram_ops, sans udp_poll.  */
-static struct proto_ops inet6_sockraw_ops = {
+static const struct proto_ops inet6_sockraw_ops = {
 	.family =	PF_INET6,
 	.owner =	THIS_MODULE,
 	.release =	inet6_release,
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 34b3bb86840..6c464c11bb0 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink;
 static struct datalink_proto *p8023_datalink;
 static struct datalink_proto *pSNAP_datalink;
 
-static struct proto_ops ipx_dgram_ops;
+static const struct proto_ops ipx_dgram_ops;
 
 LIST_HEAD(ipx_interfaces);
 DEFINE_SPINLOCK(ipx_interfaces_lock);
@@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = {
 	.owner		= THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
 	.family		= PF_IPX,
 	.owner		= THIS_MODULE,
 	.release	= ipx_release,
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index f121f7de203..e57683d424f 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -62,12 +62,12 @@
 
 static int irda_create(struct socket *sock, int protocol);
 
-static struct proto_ops irda_stream_ops;
-static struct proto_ops irda_seqpacket_ops;
-static struct proto_ops irda_dgram_ops;
+static const struct proto_ops irda_stream_ops;
+static const struct proto_ops irda_seqpacket_ops;
+static const struct proto_ops irda_dgram_ops;
 
 #ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops irda_ultra_ops;
+static const struct proto_ops irda_ultra_ops;
 #define ULTRA_MAX_DATA 382
 #endif /* CONFIG_IRDA_ULTRA */
 
@@ -2464,7 +2464,7 @@ static struct net_proto_family irda_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
@@ -2485,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
@@ -2506,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
@@ -2528,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = {
 };
 
 #ifdef CONFIG_IRDA_ULTRA
-static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = {
 	.family =	PF_IRDA,
 	.owner =	THIS_MODULE,
 	.release =	irda_release,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index d32f7791f1e..52efd04cbed 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void)
 }
 
 
-static struct proto_ops pfkey_ops;
+static const struct proto_ops pfkey_ops;
 
 static void pfkey_insert(struct sock *sk)
 {
@@ -3127,7 +3127,7 @@ out:
 	return err;
 }
 
-static struct proto_ops pfkey_ops = {
+static const struct proto_ops pfkey_ops = {
 	.family		=	PF_KEY,
 	.owner		=	THIS_MODULE,
 	/* Operations that make no sense on pfkey sockets. */
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index b6d3df5c911..9cf65f9d890 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -36,7 +36,7 @@
 static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
 static u16 llc_ui_sap_link_no_max[256];
 static struct sockaddr_llc llc_ui_addrnull;
-static struct proto_ops llc_ui_ops;
+static const struct proto_ops llc_ui_ops;
 
 static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
 static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
@@ -1098,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-static struct proto_ops llc_ui_ops = {
+static const struct proto_ops llc_ui_ops = {
 	.family	     = PF_LLC,
 	.owner       = THIS_MODULE,
 	.release     = llc_ui_release,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 96020d7087e..7849cac14d3 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len)
 	return 0;
 }
 
-static struct proto_ops netlink_ops;
+static const struct proto_ops netlink_ops;
 
 static int netlink_insert(struct sock *sk, u32 pid)
 {
@@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb)
 	return notifier_chain_unregister(&netlink_chain, nb);
 }
                 
-static struct proto_ops netlink_ops = {
+static const struct proto_ops netlink_ops = {
 	.family =	PF_NETLINK,
 	.owner =	THIS_MODULE,
 	.release =	netlink_release,
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index e5d82d711ca..05b653c0597 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -63,7 +63,7 @@ static unsigned short circuit = 0x101;
 static HLIST_HEAD(nr_list);
 static DEFINE_SPINLOCK(nr_list_lock);
 
-static struct proto_ops nr_proto_ops;
+static const struct proto_ops nr_proto_ops;
 
 /*
  *	Socket removal during an interrupt is now safe.
@@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = {
 	.owner		=	THIS_MODULE,
 };
 
-static struct proto_ops nr_proto_ops = {
+static const struct proto_ops nr_proto_ops = {
 	.family		=	PF_NETROM,
 	.owner		=	THIS_MODULE,
 	.release	=	nr_release,
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3e246276041..deda6fdb1e5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk)
 }
 
 
-static struct proto_ops packet_ops;
+static const struct proto_ops packet_ops;
 
 #ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt;
+static const struct proto_ops packet_ops_spkt;
 
 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
 {
@@ -1784,7 +1784,7 @@ out:
 
 
 #ifdef CONFIG_SOCK_PACKET
-static struct proto_ops packet_ops_spkt = {
+static const struct proto_ops packet_ops_spkt = {
 	.family =	PF_PACKET,
 	.owner =	THIS_MODULE,
 	.release =	packet_release,
@@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = {
 };
 #endif
 
-static struct proto_ops packet_ops = {
+static const struct proto_ops packet_ops = {
 	.family =	PF_PACKET,
 	.owner =	THIS_MODULE,
 	.release =	packet_release,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fa3be2b8fb5..15c05165c90 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
 	return 2;
 }
 
-static struct proto_ops inet6_seqpacket_ops = {
+static const struct proto_ops inet6_seqpacket_ops = {
 	.family     = PF_INET6,
 	.owner      = THIS_MODULE,
 	.release    = inet6_release,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f775d78aa59..d1b0747a5b9 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -829,7 +829,7 @@ static struct notifier_block sctp_inetaddr_notifier = {
 };
 
 /* Socket operations.  */
-static struct proto_ops inet_seqpacket_ops = {
+static const struct proto_ops inet_seqpacket_ops = {
 	.family      = PF_INET,
 	.owner       = THIS_MODULE,
 	.release     = inet_release,       /* Needs to be wrapped... */
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c6a51911e71..d68eba48129 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk)
 	struct svc_serv	*serv = svsk->sk_server;
 	struct socket	*sock = svsk->sk_sock;
 	struct socket	*newsock;
-	struct proto_ops *ops;
+	const struct proto_ops *ops;
 	struct svc_sock	*newsvsk;
 	int		err, slen;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 04e850e04e3..7d3fe6aebcd 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *,
 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
 				  struct msghdr *, size_t);
 
-static struct proto_ops unix_stream_ops = {
+static const struct proto_ops unix_stream_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
 	.release =	unix_release,
@@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops unix_dgram_ops = {
+static const struct proto_ops unix_dgram_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
 	.release =	unix_release,
@@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = {
 	.sendpage =	sock_no_sendpage,
 };
 
-static struct proto_ops unix_seqpacket_ops = {
+static const struct proto_ops unix_seqpacket_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
 	.release =	unix_release,
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 59fec59b213..67948bf22dc 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -181,7 +181,7 @@ struct wanpipe_opt
 #endif
 
 static int sk_count;
-extern struct proto_ops wanpipe_ops;
+extern const struct proto_ops wanpipe_ops;
 static unsigned long find_free_critical;
 
 static void wanpipe_unlink_driver(struct sock *sk);
@@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr
 	return 0;
 }
 
-struct proto_ops wanpipe_ops = {
+const struct proto_ops wanpipe_ops = {
 	.family = 	PF_WANPIPE,
 	.owner =	THIS_MODULE,
 	.release = 	wanpipe_release,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 020d73cc841..ca8b3b0b920 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout    = X25_DEFAULT_T2;
 HLIST_HEAD(x25_list);
 DEFINE_RWLOCK(x25_list_lock);
 
-static struct proto_ops x25_proto_ops;
+static const struct proto_ops x25_proto_ops;
 
 static struct x25_address null_x25_address = {"               "};
 
@@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = {
 	.owner	=	THIS_MODULE,
 };
 
-static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
+static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
 	.family =	AF_X25,
 	.owner =	THIS_MODULE,
 	.release =	x25_release,
-- 
cgit v1.2.3


From f34fbb971368c20f757f8758833a534590b16518 Mon Sep 17 00:00:00 2001
From: Jaco Kroon <jaco@kroon.co.za>
Date: Thu, 22 Dec 2005 12:51:46 -0800
Subject: [PKTGEN]: Deinitialise static variables.

static variables should not be explicitly initialised to 0.  This causes
them to be placed in .data instead of .bss.  This patch de-initialises 3
static variables in net/core/pktgen.c.

There are approximately 800 more such variables in the source tree
(2.6.15rc5).  If there is more interrest I'd be willing to track down the
rest of these as well and de-initialise them as well.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7fc3e9e28c3..06cad2d63e8 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]);
 
 /* Module parameters, defaults. */
 static int pg_count_d = 1000; /* 1000 pkts by default */
-static int pg_delay_d = 0;
-static int pg_clone_skb_d = 0;
-static int debug = 0;
+static int pg_delay_d;
+static int pg_clone_skb_d;
+static int debug;
 
 static DECLARE_MUTEX(pktgen_sem);
 static struct pktgen_thread *pktgen_threads = NULL;
-- 
cgit v1.2.3


From cbeb321a64af5437fbde249605b191ff0fdfa21c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 22 Dec 2005 12:58:55 -0800
Subject: [NET]: Fix sock_init() return value.

It needs to return zero now that it is an initcall.

Also, net/nonet.c no longer needs a dummy sock_init().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/nonet.c  | 5 -----
 net/socket.c | 2 ++
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/nonet.c b/net/nonet.c
index e5241dceaa5..1230f0ae832 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -14,11 +14,6 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 
-void __init sock_init(void)
-{
-	printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n");
-}
-
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 {
 	return -ENXIO;
diff --git a/net/socket.c b/net/socket.c
index 98be7ef3c08..f40957adc7c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2063,6 +2063,8 @@ static int __init sock_init(void)
 #ifdef CONFIG_NETFILTER
 	netfilter_init();
 #endif
+
+	return 0;
 }
 
 core_initcall(sock_init);	/* early initcall */
-- 
cgit v1.2.3


From ce1d4d3e88b3a69d23c3feb436a0b36b6ca0642b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 22 Dec 2005 21:08:46 -0800
Subject: [NET]: restructure sock_aio_{read,write} / sock_{readv,writev}

Mid-term I plan to restructure the file_operations so that we don't need
to have all these duplicate aio and vectored versions.  This patch is
a small step in that direction but also a worthwile cleanup on it's own:

(1) introduce a alloc_sock_iocb helper that encapsulates allocating a
    proper sock_iocb
(2) add do_sock_read and do_sock_write helpers for common read/write
    code

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 224 +++++++++++++++++++++++++++++------------------------------
 1 file changed, 110 insertions(+), 114 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index f40957adc7c..0dde6dade2b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb)
 	kfree(iocb->private);
 }
 
-/*
- *	Read data from a socket. ubuf is a user mode pointer. We make sure the user
- *	area ubuf...ubuf+size-1 is writable before asking the protocol.
- */
-
-static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
-			 size_t size, loff_t pos)
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+			     int offset, size_t size, loff_t *ppos, int more)
 {
-	struct sock_iocb *x, siocb;
 	struct socket *sock;
 	int flags;
 
-	if (pos != 0)
-		return -ESPIPE;
-	if (size==0)		/* Match SYS5 behaviour */
-		return 0;
+	sock = file->private_data;
 
-	if (is_sync_kiocb(iocb))
-		x = &siocb;
-	else {
-		x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
-		if (!x)
-			return -ENOMEM;
+	flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+	if (more)
+		flags |= MSG_MORE;
+
+	return sock->ops->sendpage(sock, page, offset, size, flags);
+}
+
+static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
+		char __user *ubuf, size_t size, struct sock_iocb *siocb)
+{
+	if (!is_sync_kiocb(iocb)) {
+		siocb = kmalloc(sizeof(*siocb), GFP_KERNEL);
+		if (!siocb)
+			return NULL;
 		iocb->ki_dtor = sock_aio_dtor;
 	}
-	iocb->private = x;
-	x->kiocb = iocb;
-	sock = iocb->ki_filp->private_data; 
 
-	x->async_msg.msg_name = NULL;
-	x->async_msg.msg_namelen = 0;
-	x->async_msg.msg_iov = &x->async_iov;
-	x->async_msg.msg_iovlen = 1;
-	x->async_msg.msg_control = NULL;
-	x->async_msg.msg_controllen = 0;
-	x->async_iov.iov_base = ubuf;
-	x->async_iov.iov_len = size;
-	flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
+	siocb->kiocb = iocb;
+	siocb->async_iov.iov_base = ubuf;
+	siocb->async_iov.iov_len = size;
 
-	return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags);
+	iocb->private = siocb;
+	return siocb;
 }
 
+static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
+		struct file *file, struct iovec *iov, unsigned long nr_segs)
+{
+	struct socket *sock = file->private_data;
+	size_t size = 0;
+	int i;
 
-/*
- *	Write data to a socket. We verify that the user area ubuf..ubuf+size-1
- *	is readable by the user process.
- */
+        for (i = 0 ; i < nr_segs ; i++)
+                size += iov[i].iov_len;
 
-static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
-			  size_t size, loff_t pos)
+	msg->msg_name = NULL;
+	msg->msg_namelen = 0;
+	msg->msg_control = NULL;
+	msg->msg_controllen = 0;
+	msg->msg_iov = (struct iovec *) iov;
+	msg->msg_iovlen = nr_segs;
+	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+
+	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
+}
+
+static ssize_t sock_readv(struct file *file, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t *ppos)
 {
-	struct sock_iocb *x, siocb;
-	struct socket *sock;
-	
+	struct kiocb iocb;
+	struct sock_iocb siocb;
+	struct msghdr msg;
+	int ret;
+
+        init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
+
+	ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&iocb);
+	return ret;
+}
+
+static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
+			 size_t count, loff_t pos)
+{
+	struct sock_iocb siocb, *x;
+
 	if (pos != 0)
 		return -ESPIPE;
-	if(size==0)		/* Match SYS5 behaviour */
+	if (count == 0)		/* Match SYS5 behaviour */
 		return 0;
 
-	if (is_sync_kiocb(iocb))
-		x = &siocb;
-	else {
-		x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL);
-		if (!x)
-			return -ENOMEM;
-		iocb->ki_dtor = sock_aio_dtor;
-	}
-	iocb->private = x;
-	x->kiocb = iocb;
-	sock = iocb->ki_filp->private_data; 
-
-	x->async_msg.msg_name = NULL;
-	x->async_msg.msg_namelen = 0;
-	x->async_msg.msg_iov = &x->async_iov;
-	x->async_msg.msg_iovlen = 1;
-	x->async_msg.msg_control = NULL;
-	x->async_msg.msg_controllen = 0;
-	x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
-	if (sock->type == SOCK_SEQPACKET)
-		x->async_msg.msg_flags |= MSG_EOR;
-	x->async_iov.iov_base = (void __user *)ubuf;
-	x->async_iov.iov_len = size;
-	
-	return __sock_sendmsg(iocb, sock, &x->async_msg, size);
+	x = alloc_sock_iocb(iocb, ubuf, count, &siocb);
+	if (!x)
+		return -ENOMEM;
+	return do_sock_read(&x->async_msg, iocb, iocb->ki_filp,
+			&x->async_iov, 1);
 }
 
-static ssize_t sock_sendpage(struct file *file, struct page *page,
-			     int offset, size_t size, loff_t *ppos, int more)
+static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
+		struct file *file, struct iovec *iov, unsigned long nr_segs)
 {
-	struct socket *sock;
-	int flags;
+	struct socket *sock = file->private_data;
+	size_t size = 0;
+	int i;
 
-	sock = file->private_data;
+        for (i = 0 ; i < nr_segs ; i++)
+                size += iov[i].iov_len;
 
-	flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
-	if (more)
-		flags |= MSG_MORE;
+	msg->msg_name = NULL;
+	msg->msg_namelen = 0;
+	msg->msg_control = NULL;
+	msg->msg_controllen = 0;
+	msg->msg_iov = (struct iovec *) iov;
+	msg->msg_iovlen = nr_segs;
+	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+	if (sock->type == SOCK_SEQPACKET)
+		msg->msg_flags |= MSG_EOR;
 
-	return sock->ops->sendpage(sock, page, offset, size, flags);
+	return __sock_sendmsg(iocb, sock, msg, size);
 }
 
-static int sock_readv_writev(int type,
-			     struct file * file, const struct iovec * iov,
-			     long count, size_t size)
+static ssize_t sock_writev(struct file *file, const struct iovec *iov,
+			   unsigned long nr_segs, loff_t *ppos)
 {
 	struct msghdr msg;
-	struct socket *sock;
+	struct kiocb iocb;
+	struct sock_iocb siocb;
+	int ret;
 
-	sock = file->private_data;
+	init_sync_kiocb(&iocb, NULL);
+	iocb.private = &siocb;
 
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-	msg.msg_control = NULL;
-	msg.msg_controllen = 0;
-	msg.msg_iov = (struct iovec *) iov;
-	msg.msg_iovlen = count;
-	msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+	ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+	if (-EIOCBQUEUED == ret)
+		ret = wait_on_sync_kiocb(&iocb);
+	return ret;
+}
 
-	/* read() does a VERIFY_WRITE */
-	if (type == VERIFY_WRITE)
-		return sock_recvmsg(sock, &msg, size, msg.msg_flags);
+static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
+			  size_t count, loff_t pos)
+{
+	struct sock_iocb siocb, *x;
 
-	if (sock->type == SOCK_SEQPACKET)
-		msg.msg_flags |= MSG_EOR;
+	if (pos != 0)
+		return -ESPIPE;
+	if (count == 0)		/* Match SYS5 behaviour */
+		return 0;
 
-	return sock_sendmsg(sock, &msg, size);
-}
+	x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb);
+	if (!x)
+		return -ENOMEM;
 
-static ssize_t sock_readv(struct file *file, const struct iovec *vector,
-			  unsigned long count, loff_t *ppos)
-{
-	size_t tot_len = 0;
-	int i;
-        for (i = 0 ; i < count ; i++)
-                tot_len += vector[i].iov_len;
-	return sock_readv_writev(VERIFY_WRITE,
-				 file, vector, count, tot_len);
-}
-	
-static ssize_t sock_writev(struct file *file, const struct iovec *vector,
-			   unsigned long count, loff_t *ppos)
-{
-	size_t tot_len = 0;
-	int i;
-        for (i = 0 ; i < count ; i++)
-                tot_len += vector[i].iov_len;
-	return sock_readv_writev(VERIFY_READ,
-				 file, vector, count, tot_len);
+	return do_sock_write(&x->async_msg, iocb, iocb->ki_filp,
+			&x->async_iov, 1);
 }
 
 
-- 
cgit v1.2.3


From 25995ff577675b58dbd848b7758e7bad87411947 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 27 Dec 2005 02:42:22 -0200
Subject: [SOCK]: Introduce sk_receive_skb

Its common enough to to justify that, TCP still can't use it as it has the
prequeueing stuff, still to be made generic in the not so distant future :-)

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppoe.c    | 22 ++--------------------
 include/net/sock.h     | 23 +++++++++++++++++++++++
 net/dccp/ipv4.c        | 23 ++---------------------
 net/dccp/ipv6.c        | 17 +----------------
 net/decnet/dn_nsp_in.c | 17 +----------------
 5 files changed, 29 insertions(+), 73 deletions(-)

diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index a842ecc60a3..71e303b2864 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -383,8 +383,6 @@ static int pppoe_rcv(struct sk_buff *skb,
 {
 	struct pppoe_hdr *ph;
 	struct pppox_sock *po;
-	struct sock *sk;
-	int ret;
 
 	if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
 		goto drop;
@@ -395,24 +393,8 @@ static int pppoe_rcv(struct sk_buff *skb,
 	ph = (struct pppoe_hdr *) skb->nh.raw;
 
 	po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source);
-	if (!po) 
-		goto drop;
-
-	sk = sk_pppox(po);
-	bh_lock_sock(sk);
-
-	/* Socket state is unknown, must put skb into backlog. */
-	if (sock_owned_by_user(sk) != 0) {
-		sk_add_backlog(sk, skb);
-		ret = NET_RX_SUCCESS;
-	} else {
-		ret = pppoe_rcv_core(sk, skb);
-	}
-
-	bh_unlock_sock(sk);
-	sock_put(sk);
-
-	return ret;
+	if (po != NULL) 
+		return sk_receive_skb(sk_pppox(po), skb);
 drop:
 	kfree_skb(skb);
 out:
diff --git a/include/net/sock.h b/include/net/sock.h
index 91d28957dc1..6961700ff3a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -926,6 +926,29 @@ static inline void sock_put(struct sock *sk)
 		sk_free(sk);
 }
 
+static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int rc = NET_RX_SUCCESS;
+
+	if (sk_filter(sk, skb, 0))
+		goto discard_and_relse;
+
+	skb->dev = NULL;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk))
+		rc = sk->sk_backlog_rcv(sk, skb);
+	else
+		sk_add_backlog(sk, skb);
+	bh_unlock_sock(sk);
+out:
+	sock_put(sk);
+	return rc;
+discard_and_relse:
+	kfree_skb(skb);
+	goto out;
+}
+
 /* Detach socket from process context.
  * Announce socket dead, detach it from wait queue and inode.
  * Note that parent inode held reference count on this struct sock,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index c363051a7f1..99e8afa7ba1 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -914,7 +914,6 @@ int dccp_v4_rcv(struct sk_buff *skb)
 {
 	const struct dccp_hdr *dh;
 	struct sock *sk;
-	int rc;
 
 	/* Step 1: Check header basics: */
 
@@ -984,28 +983,10 @@ int dccp_v4_rcv(struct sk_buff *skb)
                 goto do_time_wait;
 	}
 
-	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
-		dccp_pr_debug("xfrm4_policy_check failed\n");
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
-	}
-
-        if (sk_filter(sk, skb, 0)) {
-		dccp_pr_debug("sk_filter failed\n");
-                goto discard_and_relse;
-	}
-
-	skb->dev = NULL;
 
-	bh_lock_sock(sk);
-	rc = 0;
-	if (!sock_owned_by_user(sk))
-		rc = dccp_v4_do_rcv(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
-
-	sock_put(sk);
-	return rc;
+	return sk_receive_skb(sk, skb);
 
 no_dccp_socket:
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 599b0be2151..2e194c8f995 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1032,7 +1032,6 @@ static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 	const struct dccp_hdr *dh;
 	struct sk_buff *skb = *pskb;
 	struct sock *sk;
-	int rc;
 
 	/* Step 1: Check header basics: */
 
@@ -1077,21 +1076,7 @@ static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
 
-	if (sk_filter(sk, skb, 0))
-		goto discard_and_relse;
-
-	skb->dev = NULL;
-
-	bh_lock_sock(sk);
-	rc = 0;
-	if (!sock_owned_by_user(sk))
-		rc = dccp_v6_do_rcv(sk, skb);
-	else
-		sk_add_backlog(sk, skb);
-	bh_unlock_sock(sk);
-
-	sock_put(sk);
-	return rc ? -1 : 0;
+	return sk_receive_skb(sk, skb) ? -1 : 0;
 
 no_dccp_socket:
 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 369f25b60f3..44bda85e678 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb)
 got_it:
 	if (sk != NULL) {
 		struct dn_scp *scp = DN_SK(sk);
-		int ret;
 
 		/* Reset backoff */
 		scp->nsp_rxtshift = 0;
@@ -807,21 +806,7 @@ got_it:
 				goto free_out;
 		}
 
-		bh_lock_sock(sk);
-		ret = NET_RX_SUCCESS;
-		if (decnet_debug_level & 8)
-			printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n",
-				(int)cb->rt_flags, (int)cb->nsp_flags, 
-				(int)cb->src_port, (int)cb->dst_port, 
-				!!sock_owned_by_user(sk));
-		if (!sock_owned_by_user(sk))
-			ret = dn_nsp_backlog_rcv(sk, skb);
-		else
-			sk_add_backlog(sk, skb);
-		bh_unlock_sock(sk);
-		sock_put(sk);
-
-		return ret;
+		return sk_receive_skb(sk, skb);
 	}
 
 	return dn_nsp_no_socket(skb, reason);
-- 
cgit v1.2.3


From 14c850212ed8f8cbb5972ad6b8812e08a0bc901c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 27 Dec 2005 02:43:12 -0200
Subject: [INET_SOCK]: Move struct inet_sock & helper functions to
 net/inet_sock.h

To help in reducing the number of include dependencies, several files were
touched as they were getting needed headers indirectly for stuff they use.

Thanks also to Alan Menegotto for pointing out that net/dccp/proto.c had
linux/dccp.h include twice.

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |   2 +
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   2 +
 drivers/net/ns83820.c                          |   1 +
 drivers/net/sk98lin/skge.c                     |   1 +
 drivers/net/skge.c                             |   1 +
 drivers/net/tg3.c                              |   1 +
 fs/9p/trans_sock.c                             |   1 +
 fs/nfs/callback.c                              |   3 +
 include/linux/dccp.h                           |   3 +-
 include/linux/ip.h                             | 126 +---------------
 include/linux/ipv6.h                           |   7 +-
 include/linux/udp.h                            |   6 +-
 include/net/atmclip.h                          |   2 +-
 include/net/dst.h                              |   1 +
 include/net/icmp.h                             |   9 +-
 include/net/ieee80211_crypt.h                  |   9 +-
 include/net/inet_connection_sock.h             |   3 +-
 include/net/inet_ecn.h                         |   2 +
 include/net/inet_hashtables.h                  |  21 +--
 include/net/inet_sock.h                        | 193 +++++++++++++++++++++++++
 include/net/inet_timewait_sock.h               |   2 +-
 include/net/ip.h                               |  17 ++-
 include/net/ip_fib.h                           |   2 +
 include/net/ip_vs.h                            |  12 +-
 include/net/ipv6.h                             |   3 +
 include/net/ndisc.h                            |  17 ++-
 include/net/neighbour.h                        |   2 +-
 include/net/pkt_act.h                          |   1 -
 include/net/raw.h                              |   2 +
 include/net/udp.h                              |   4 +-
 include/net/xfrm.h                             |   3 +-
 net/bridge/br_netfilter.c                      |   4 +
 net/bridge/netfilter/ebt_log.c                 |   1 +
 net/core/netpoll.c                             |   1 +
 net/dccp/ccid.h                                |   2 +
 net/dccp/ipv4.c                                |   1 +
 net/dccp/ipv6.c                                |   1 +
 net/dccp/output.c                              |   1 +
 net/dccp/proto.c                               |   3 +-
 net/econet/af_econet.c                         |   1 +
 net/ipv4/af_inet.c                             |   1 +
 net/ipv4/ah4.c                                 |   1 +
 net/ipv4/arp.c                                 |   1 +
 net/ipv4/devinet.c                             |   1 +
 net/ipv4/esp4.c                                |   1 +
 net/ipv4/fib_frontend.c                        |   1 +
 net/ipv4/fib_hash.c                            |   1 +
 net/ipv4/fib_rules.c                           |   1 +
 net/ipv4/fib_semantics.c                       |   2 +
 net/ipv4/icmp.c                                |   1 +
 net/ipv4/igmp.c                                |   2 +
 net/ipv4/ip_input.c                            |   1 +
 net/ipv4/ip_options.c                          |   1 +
 net/ipv4/ip_sockglue.c                         |   1 +
 net/ipv4/ipcomp.c                              |   1 +
 net/ipv4/ipconfig.c                            |   2 +
 net/ipv4/ipmr.c                                |   1 +
 net/ipv4/ipvs/ip_vs_conn.c                     |   2 +
 net/ipv4/ipvs/ip_vs_ctl.c                      |   1 +
 net/ipv4/ipvs/ip_vs_dh.c                       |   2 +
 net/ipv4/ipvs/ip_vs_est.c                      |   3 +
 net/ipv4/ipvs/ip_vs_lblc.c                     |   2 +
 net/ipv4/ipvs/ip_vs_lblcr.c                    |   2 +
 net/ipv4/ipvs/ip_vs_proto_ah.c                 |   2 +
 net/ipv4/ipvs/ip_vs_proto_esp.c                |   2 +
 net/ipv4/ipvs/ip_vs_proto_udp.c                |   3 +
 net/ipv4/ipvs/ip_vs_sh.c                       |   2 +
 net/ipv4/ipvs/ip_vs_sync.c                     |   2 +
 net/ipv4/netfilter/ip_conntrack_amanda.c       |   2 +
 net/ipv4/netfilter/ip_conntrack_proto_gre.c    |   1 +
 net/ipv4/netfilter/ip_conntrack_proto_udp.c    |   1 +
 net/ipv4/netfilter/ip_conntrack_standalone.c   |   1 +
 net/ipv4/netfilter/ip_nat_snmp_basic.c         |   2 +
 net/ipv4/netfilter/ipt_MASQUERADE.c            |   2 +
 net/ipv4/netfilter/ipt_physdev.c               |   1 +
 net/ipv4/proc.c                                |   1 +
 net/ipv4/sysctl_net_ipv4.c                     |   1 +
 net/ipv4/udp.c                                 |   1 +
 net/ipv6/ah6.c                                 |   1 +
 net/ipv6/esp6.c                                |   1 +
 net/ipv6/ipcomp6.c                             |   1 +
 net/ipv6/netfilter/ip6_tables.c                |   1 +
 net/ipv6/netfilter/ip6t_LOG.c                  |   1 +
 net/ipv6/netfilter/ip6t_ah.c                   |   1 +
 net/ipv6/netfilter/ip6t_esp.c                  |   1 +
 net/sched/sch_teql.c                           |   1 +
 net/sctp/protocol.c                            |   1 +
 87 files changed, 355 insertions(+), 184 deletions(-)
 create mode 100644 include/net/inet_sock.h

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 475d98fa9e2..780009c7eaa 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -47,6 +47,8 @@
 #include <linux/ip.h>
 #include <linux/in.h>
 
+#include <net/dst.h>
+
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
 MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index ef3ee035bbc..ed0c2ead8bc 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -43,6 +43,8 @@
 #include <linux/delay.h>
 #include <linux/completion.h>
 
+#include <net/dst.h>
+
 #include "ipoib.h"
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index f857ae94d26..b0c3b6ab626 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -115,6 +115,7 @@
 #include <linux/ethtool.h>
 #include <linux/timer.h>
 #include <linux/if_vlan.h>
+#include <linux/rtnetlink.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c
index ae734393475..e1a2d52cc1f 100644
--- a/drivers/net/sk98lin/skge.c
+++ b/drivers/net/sk98lin/skge.c
@@ -107,6 +107,7 @@
 
 #include	"h/skversion.h"
 
+#include	<linux/in.h>
 #include	<linux/module.h>
 #include	<linux/moduleparam.h>
 #include	<linux/init.h>
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 00d683063c0..d8cc3aea032 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -25,6 +25,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 2fc9893d69e..59d916ccc81 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -24,6 +24,7 @@
 #include <linux/compiler.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/in.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c
index a93c2bf94c3..6a9a75d40f7 100644
--- a/fs/9p/trans_sock.c
+++ b/fs/9p/trans_sock.c
@@ -26,6 +26,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/module.h>
 #include <linux/net.h>
 #include <linux/ipv6.h>
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f2ca782aba3..30cae360286 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,9 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
+
+#include <net/inet_sock.h>
+
 #include "nfs4_fs.h"
 #include "callback.h"
 
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index d0bdb499cf8..088529f5496 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -192,10 +192,9 @@ enum {
 #include <linux/workqueue.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/inet_sock.h>
 #include <net/inet_timewait_sock.h>
-#include <net/sock.h>
 #include <net/tcp_states.h>
-#include <net/tcp.h>
 
 enum dccp_state {
 	DCCP_OPEN	= TCP_ESTABLISHED,
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 6ccc596c19c..9e2eb9a602e 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -16,6 +16,7 @@
  */
 #ifndef _LINUX_IP_H
 #define _LINUX_IP_H
+#include <linux/types.h>
 #include <asm/byteorder.h>
 
 #define IPTOS_TOS_MASK		0x1E
@@ -78,131 +79,6 @@
 #define	IPOPT_TS_TSANDADDR	1		/* timestamps and addresses */
 #define	IPOPT_TS_PRESPEC	3		/* specified modules only */
 
-#ifdef __KERNEL__
-#include <linux/config.h>
-#include <linux/types.h>
-#include <net/request_sock.h>
-#include <net/sock.h>
-#include <linux/igmp.h>
-#include <net/flow.h>
-
-struct ip_options {
-  __u32		faddr;				/* Saved first hop address */
-  unsigned char	optlen;
-  unsigned char srr;
-  unsigned char rr;
-  unsigned char ts;
-  unsigned char is_setbyuser:1,			/* Set by setsockopt?			*/
-                is_data:1,			/* Options in __data, rather than skb	*/
-                is_strictroute:1,		/* Strict source route			*/
-                srr_is_hit:1,			/* Packet destination addr was our one	*/
-                is_changed:1,			/* IP checksum more not valid		*/	
-                rr_needaddr:1,			/* Need to record addr of outgoing dev	*/
-                ts_needtime:1,			/* Need to record timestamp		*/
-                ts_needaddr:1;			/* Need to record addr of outgoing dev  */
-  unsigned char router_alert;
-  unsigned char __pad1;
-  unsigned char __pad2;
-  unsigned char __data[0];
-};
-
-#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
-
-struct inet_request_sock {
-	struct request_sock	req;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-	u16			inet6_rsk_offset;
-	/* 2 bytes hole, try to pack */
-#endif
-	u32			loc_addr;
-	u32			rmt_addr;
-	u16			rmt_port;
-	u16			snd_wscale : 4, 
-				rcv_wscale : 4, 
-				tstamp_ok  : 1,
-				sack_ok	   : 1,
-				wscale_ok  : 1,
-				ecn_ok	   : 1,
-				acked	   : 1;
-	struct ip_options	*opt;
-};
-
-static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
-{
-	return (struct inet_request_sock *)sk;
-}
-
-struct ipv6_pinfo;
-
-struct inet_sock {
-	/* sk and pinet6 has to be the first two members of inet_sock */
-	struct sock		sk;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-	struct ipv6_pinfo	*pinet6;
-#endif
-	/* Socket demultiplex comparisons on incoming packets. */
-	__u32			daddr;		/* Foreign IPv4 addr */
-	__u32			rcv_saddr;	/* Bound local IPv4 addr */
-	__u16			dport;		/* Destination port */
-	__u16			num;		/* Local port */
-	__u32			saddr;		/* Sending source */
-	__s16			uc_ttl;		/* Unicast TTL */
-	__u16			cmsg_flags;
-	struct ip_options	*opt;
-	__u16			sport;		/* Source port */
-	__u16			id;		/* ID counter for DF pkts */
-	__u8			tos;		/* TOS */
-	__u8			mc_ttl;		/* Multicasting TTL */
-	__u8			pmtudisc;
-	unsigned		recverr : 1,
-				is_icsk : 1,	/* inet_connection_sock? */
-				freebind : 1,
-				hdrincl : 1,
-				mc_loop : 1;
-	int			mc_index;	/* Multicast device index */
-	__u32			mc_addr;
-	struct ip_mc_socklist	*mc_list;	/* Group array */
-	/*
-	 * Following members are used to retain the infomation to build
-	 * an ip header on each ip fragmentation while the socket is corked.
-	 */
-	struct {
-		unsigned int		flags;
-		unsigned int		fragsize;
-		struct ip_options	*opt;
-		struct rtable		*rt;
-		int			length; /* Total length of all frames */
-		u32			addr;
-		struct flowi		fl;
-	} cork;
-};
-
-#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
-#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
-
-static inline struct inet_sock *inet_sk(const struct sock *sk)
-{
-	return (struct inet_sock *)sk;
-}
-
-static inline void __inet_sk_copy_descendant(struct sock *sk_to,
-					     const struct sock *sk_from,
-					     const int ancestor_size)
-{
-	memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
-	       sk_from->sk_prot->obj_size - ancestor_size);
-}
-#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
-static inline void inet_sk_copy_descendant(struct sock *sk_to,
-					   const struct sock *sk_from)
-{
-	__inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
-}
-#endif
-#endif
-
-extern int inet_sk_rebuild_header(struct sock *sk);
-
 struct iphdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	__u8	ihl:4,
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index a0d04891fe1..93bbed5c6cf 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -171,12 +171,13 @@ enum {
 };
 
 #ifdef __KERNEL__
-#include <linux/in6.h>          /* struct sockaddr_in6 */
 #include <linux/icmpv6.h>
-#include <net/if_inet6.h>       /* struct ipv6_mc_socklist */
 #include <linux/tcp.h>
 #include <linux/udp.h>
 
+#include <net/if_inet6.h>       /* struct ipv6_mc_socklist */
+#include <net/inet_sock.h>
+
 /* 
    This structure contains results of exthdrs parsing
    as offsets from skb->nh.
@@ -346,8 +347,6 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 #define __ipv6_only_sock(sk)	(inet6_sk(sk)->ipv6only)
 #define ipv6_only_sock(sk)	((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
 
-#include <linux/tcp.h>
-
 struct inet6_timewait_sock {
 	struct in6_addr tw_v6_daddr;
 	struct in6_addr	tw_v6_rcv_saddr;
diff --git a/include/linux/udp.h b/include/linux/udp.h
index b60e0b4a25c..85a55658831 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -35,10 +35,10 @@ struct udphdr {
 #define UDP_ENCAP_ESPINUDP	2 /* draft-ietf-ipsec-udp-encaps-06 */
 
 #ifdef __KERNEL__
-
 #include <linux/config.h>
-#include <net/sock.h>
-#include <linux/ip.h>
+#include <linux/types.h>
+
+#include <net/inet_sock.h>
 
 struct udp_sock {
 	/* inet_sock has to be the first member */
diff --git a/include/net/atmclip.h b/include/net/atmclip.h
index 47048b1d179..90fcc98e676 100644
--- a/include/net/atmclip.h
+++ b/include/net/atmclip.h
@@ -7,7 +7,6 @@
 #define _ATMCLIP_H
 
 #include <linux/netdevice.h>
-#include <linux/skbuff.h>
 #include <linux/atm.h>
 #include <linux/atmdev.h>
 #include <linux/atmarp.h>
@@ -18,6 +17,7 @@
 #define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back))
 #define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key)
 
+struct sk_buff;
 
 struct clip_vcc {
 	struct atm_vcc	*vcc;		/* VCC descriptor */
diff --git a/include/net/dst.h b/include/net/dst.h
index 6c196a5baf2..bee8b84d329 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -9,6 +9,7 @@
 #define _NET_DST_H
 
 #include <linux/config.h>
+#include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
 #include <linux/jiffies.h>
diff --git a/include/net/icmp.h b/include/net/icmp.h
index 6cdebeee5f9..e7c3f20fbaf 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -20,12 +20,9 @@
 
 #include <linux/config.h>
 #include <linux/icmp.h>
-#include <linux/skbuff.h>
 
-#include <net/sock.h>
-#include <net/protocol.h>
+#include <net/inet_sock.h>
 #include <net/snmp.h>
-#include <linux/ip.h>
 
 struct icmp_err {
   int		errno;
@@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
 #define ICMP_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmp_statistics, field)
 #define ICMP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(icmp_statistics, field)
 
+struct dst_entry;
+struct net_proto_family;
+struct sk_buff;
+
 extern void	icmp_send(struct sk_buff *skb_in,  int type, int code, u32 info);
 extern int	icmp_rcv(struct sk_buff *skb);
 extern int	icmp_ioctl(struct sock *sk, int cmd, unsigned long arg);
diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h
index 225fc751d46..03b766afdc3 100644
--- a/include/net/ieee80211_crypt.h
+++ b/include/net/ieee80211_crypt.h
@@ -23,12 +23,17 @@
 #ifndef IEEE80211_CRYPT_H
 #define IEEE80211_CRYPT_H
 
-#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
 
 enum {
 	IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0),
 };
 
+struct sk_buff;
+struct module;
+
 struct ieee80211_crypto_ops {
 	const char *name;
 	struct list_head list;
@@ -87,6 +92,8 @@ struct ieee80211_crypt_data {
 	atomic_t refcnt;
 };
 
+struct ieee80211_device;
+
 int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops);
 int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops);
 struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 91888967d3e..50234fa56a6 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -16,9 +16,10 @@
 #define _INET_CONNECTION_SOCK_H
 
 #include <linux/compiler.h>
-#include <linux/ip.h>
 #include <linux/string.h>
 #include <linux/timer.h>
+
+#include <net/inet_sock.h>
 #include <net/request_sock.h>
 
 #define INET_CSK_DEBUG 1
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index b0c47e2eccf..d599c6bfbb8 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -3,6 +3,8 @@
 
 #include <linux/ip.h>
 #include <linux/skbuff.h>
+
+#include <net/inet_sock.h>
 #include <net/dsfield.h>
 
 enum {
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index c83baa79f66..135d80fd658 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -26,6 +26,7 @@
 #include <linux/wait.h>
 
 #include <net/inet_connection_sock.h>
+#include <net/inet_sock.h>
 #include <net/route.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
@@ -128,26 +129,6 @@ struct inet_hashinfo {
 	kmem_cache_t			*bind_bucket_cachep;
 };
 
-static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
-			       const __u32 faddr, const __u16 fport)
-{
-	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
-	h ^= h >> 16;
-	h ^= h >> 8;
-	return h;
-}
-
-static inline int inet_sk_ehashfn(const struct sock *sk)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const __u32 laddr = inet->rcv_saddr;
-	const __u16 lport = inet->num;
-	const __u32 faddr = inet->daddr;
-	const __u16 fport = inet->dport;
-
-	return inet_ehashfn(laddr, lport, faddr, fport);
-}
-
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
 	struct inet_hashinfo *hashinfo,
 	unsigned int hash)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
new file mode 100644
index 00000000000..883eb529ef8
--- /dev/null
+++ b/include/net/inet_sock.h
@@ -0,0 +1,193 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for inet_sock
+ *
+ * Authors:	Many, reorganised here by
+ * 		Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _INET_SOCK_H
+#define _INET_SOCK_H
+
+#include <linux/config.h>
+
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <net/flow.h>
+#include <net/sock.h>
+#include <net/request_sock.h>
+
+/** struct ip_options - IP Options
+ *
+ * @faddr - Saved first hop address
+ * @is_setbyuser - Set by setsockopt?
+ * @is_data - Options in __data, rather than skb
+ * @is_strictroute - Strict source route
+ * @srr_is_hit - Packet destination addr was our one
+ * @is_changed - IP checksum more not valid
+ * @rr_needaddr - Need to record addr of outgoing dev
+ * @ts_needtime - Need to record timestamp
+ * @ts_needaddr - Need to record addr of outgoing dev
+ */
+struct ip_options {
+	__u32		faddr;
+	unsigned char	optlen;
+	unsigned char	srr;
+	unsigned char	rr;
+	unsigned char	ts;
+	unsigned char	is_setbyuser:1,
+			is_data:1,
+			is_strictroute:1,
+			srr_is_hit:1,
+			is_changed:1,
+			rr_needaddr:1,
+			ts_needtime:1,
+			ts_needaddr:1;
+	unsigned char	router_alert;
+	unsigned char	__pad1;
+	unsigned char	__pad2;
+	unsigned char	__data[0];
+};
+
+#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
+
+struct inet_request_sock {
+	struct request_sock	req;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	u16			inet6_rsk_offset;
+	/* 2 bytes hole, try to pack */
+#endif
+	u32			loc_addr;
+	u32			rmt_addr;
+	u16			rmt_port;
+	u16			snd_wscale : 4, 
+				rcv_wscale : 4, 
+				tstamp_ok  : 1,
+				sack_ok	   : 1,
+				wscale_ok  : 1,
+				ecn_ok	   : 1,
+				acked	   : 1;
+	struct ip_options	*opt;
+};
+
+static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
+{
+	return (struct inet_request_sock *)sk;
+}
+
+struct ip_mc_socklist;
+struct ipv6_pinfo;
+struct rtable;
+
+/** struct inet_sock - representation of INET sockets
+ *
+ * @sk - ancestor class
+ * @pinet6 - pointer to IPv6 control block
+ * @daddr - Foreign IPv4 addr
+ * @rcv_saddr - Bound local IPv4 addr
+ * @dport - Destination port
+ * @num - Local port
+ * @saddr - Sending source
+ * @uc_ttl - Unicast TTL
+ * @sport - Source port
+ * @id - ID counter for DF pkts
+ * @tos - TOS
+ * @mc_ttl - Multicasting TTL
+ * @is_icsk - is this an inet_connection_sock?
+ * @mc_index - Multicast device index
+ * @mc_list - Group array
+ * @cork - info to build ip hdr on each ip frag while socket is corked
+ */
+struct inet_sock {
+	/* sk and pinet6 has to be the first two members of inet_sock */
+	struct sock		sk;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	struct ipv6_pinfo	*pinet6;
+#endif
+	/* Socket demultiplex comparisons on incoming packets. */
+	__u32			daddr;
+	__u32			rcv_saddr;
+	__u16			dport;
+	__u16			num;
+	__u32			saddr;
+	__s16			uc_ttl;
+	__u16			cmsg_flags;
+	struct ip_options	*opt;
+	__u16			sport;
+	__u16			id;
+	__u8			tos;
+	__u8			mc_ttl;
+	__u8			pmtudisc;
+	__u8			recverr:1,
+				is_icsk:1,
+				freebind:1,
+				hdrincl:1,
+				mc_loop:1;
+	int			mc_index;
+	__u32			mc_addr;
+	struct ip_mc_socklist	*mc_list;
+	struct {
+		unsigned int		flags;
+		unsigned int		fragsize;
+		struct ip_options	*opt;
+		struct rtable		*rt;
+		int			length; /* Total length of all frames */
+		u32			addr;
+		struct flowi		fl;
+	} cork;
+};
+
+#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
+#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
+
+static inline struct inet_sock *inet_sk(const struct sock *sk)
+{
+	return (struct inet_sock *)sk;
+}
+
+static inline void __inet_sk_copy_descendant(struct sock *sk_to,
+					     const struct sock *sk_from,
+					     const int ancestor_size)
+{
+	memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
+	       sk_from->sk_prot->obj_size - ancestor_size);
+}
+#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE))
+static inline void inet_sk_copy_descendant(struct sock *sk_to,
+					   const struct sock *sk_from)
+{
+	__inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
+}
+#endif
+
+extern int inet_sk_rebuild_header(struct sock *sk);
+
+static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
+					const __u32 faddr, const __u16 fport)
+{
+	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
+	h ^= h >> 16;
+	h ^= h >> 8;
+	return h;
+}
+
+static inline int inet_sk_ehashfn(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const __u32 laddr = inet->rcv_saddr;
+	const __u16 lport = inet->num;
+	const __u32 faddr = inet->daddr;
+	const __u16 fport = inet->dport;
+
+	return inet_ehashfn(laddr, lport, faddr, fport);
+}
+
+#endif	/* _INET_SOCK_H */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index e396a65473d..1da294c4752 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -17,13 +17,13 @@
 
 #include <linux/config.h>
 
-#include <linux/ip.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/timer.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
+#include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
 #include <net/timewait_sock.h>
diff --git a/include/net/ip.h b/include/net/ip.h
index 4d6294ba038..f7e7fd728b6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -24,14 +24,10 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
-#include <linux/socket.h>
 #include <linux/ip.h>
 #include <linux/in.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/in_route.h>
-#include <net/route.h>
-#include <net/arp.h>
+
+#include <net/inet_sock.h>
 #include <net/snmp.h>
 
 struct sock;
@@ -75,6 +71,13 @@ extern rwlock_t ip_ra_lock;
 
 #define IP_FRAG_TIME	(30 * HZ)		/* fragment lifetime	*/
 
+struct msghdr;
+struct net_device;
+struct packet_type;
+struct rtable;
+struct sk_buff;
+struct sockaddr;
+
 extern void		ip_mc_dropsocket(struct sock *);
 extern void		ip_mc_dropdevice(struct net_device *dev);
 extern int		igmp_mc_proc_init(void);
@@ -184,6 +187,8 @@ extern int sysctl_ip_dynaddr;
 extern void ipfrag_init(void);
 
 #ifdef CONFIG_INET
+#include <net/dst.h>
+
 /* The function in 2.2 was invalid, producing wrong result for
  * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
 static inline
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 14de4ebd121..e000fa2cd5f 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
 			       struct net_device *dev, u32 *spec_dst, u32 *itag);
 extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res);
 
+struct rtentry;
+
 /* Exported by fib_semantics.c */
 extern int ip_fib_check_default(u32 gw, struct net_device *dev);
 extern int fib_sync_down(u32 local, struct net_device *dev, int force);
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 3b5559a023a..7d2674fde19 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -251,16 +251,15 @@ struct ip_vs_daemon_user {
 #include <linux/config.h>
 #include <linux/list.h>                 /* for struct list_head */
 #include <linux/spinlock.h>             /* for struct rwlock_t */
-#include <linux/skbuff.h>               /* for struct sk_buff */
-#include <linux/ip.h>                   /* for struct iphdr */
 #include <asm/atomic.h>                 /* for struct atomic_t */
-#include <linux/netdevice.h>		/* for struct neighbour */
-#include <net/dst.h>			/* for struct dst_entry */
-#include <net/udp.h>
 #include <linux/compiler.h>
+#include <linux/timer.h>
 
+#include <net/checksum.h>
 
 #ifdef CONFIG_IP_VS_DEBUG
+#include <linux/net.h>
+
 extern int ip_vs_get_debug_level(void);
 #define IP_VS_DBG(level, msg...)			\
     do {						\
@@ -429,8 +428,11 @@ struct ip_vs_stats
 	spinlock_t              lock;           /* spin lock */
 };
 
+struct dst_entry;
+struct iphdr;
 struct ip_vs_conn;
 struct ip_vs_app;
+struct sk_buff;
 
 struct ip_vs_protocol {
 	struct ip_vs_protocol	*next;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 11a725662c3..860bbac4c4e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -541,6 +541,9 @@ extern int sysctl_ip6frag_secret_interval;
 extern const struct proto_ops inet6_stream_ops;
 extern const struct proto_ops inet6_dgram_ops;
 
+struct group_source_req;
+struct group_filter;
+
 extern int ip6_mc_source(int add, int omode, struct sock *sk,
 			 struct group_source_req *pgsr);
 extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index f85d6e4b744..bbac87eeb42 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -35,11 +35,20 @@ enum {
 
 #ifdef __KERNEL__
 
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include <linux/config.h>
+#include <linux/compiler.h>
 #include <linux/icmpv6.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+
 #include <net/neighbour.h>
-#include <asm/atomic.h>
+
+struct ctl_table;
+struct file;
+struct inet6_dev;
+struct net_device;
+struct net_proto_family;
+struct sk_buff;
 
 extern struct neigh_table nd_tbl;
 
@@ -108,7 +117,7 @@ extern int			igmp6_event_report(struct sk_buff *skb);
 extern void			igmp6_cleanup(void);
 
 #ifdef CONFIG_SYSCTL
-extern int 			ndisc_ifinfo_sysctl_change(ctl_table *ctl,
+extern int 			ndisc_ifinfo_sysctl_change(struct ctl_table *ctl,
 							   int write,
 							   struct file * filp,
 							   void __user *buffer,
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 34c07731933..6fa9ae19074 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -49,8 +49,8 @@
 #ifdef __KERNEL__
 
 #include <asm/atomic.h>
-#include <linux/skbuff.h>
 #include <linux/netdevice.h>
+#include <linux/skbuff.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
 
diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h
index bd08964b72c..b225d8472b7 100644
--- a/include/net/pkt_act.h
+++ b/include/net/pkt_act.h
@@ -15,7 +15,6 @@
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
-#include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/module.h>
diff --git a/include/net/raw.h b/include/net/raw.h
index f47917469b1..e67b28a0248 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -19,6 +19,8 @@
 
 #include <linux/config.h>
 
+#include <net/protocol.h>
+
 extern struct proto raw_prot;
 
 extern void 	raw_err(struct sock *, struct sk_buff *, u32 info);
diff --git a/include/net/udp.h b/include/net/udp.h
index 107b9d791a1..766fba1369c 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -22,9 +22,8 @@
 #ifndef _UDP_H
 #define _UDP_H
 
-#include <linux/udp.h>
-#include <linux/ip.h>
 #include <linux/list.h>
+#include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/snmp.h>
 #include <linux/seq_file.h>
@@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num)
 
 extern struct proto udp_prot;
 
+struct sk_buff;
 
 extern void	udp_err(struct sk_buff *, u32);
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 487abca3ca6..07d7b50cdd7 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2,11 +2,12 @@
 #define _NET_XFRM_H
 
 #include <linux/compiler.h>
+#include <linux/in.h>
 #include <linux/xfrm.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/skbuff.h>
-#include <linux/netdevice.h>
+#include <linux/socket.h>
 #include <linux/crypto.h>
 #include <linux/pfkeyv2.h>
 #include <linux/in6.h>
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 23422bd53a5..223f8270dae 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -26,6 +26,7 @@
 #include <linux/ip.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
+#include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/netfilter_bridge.h>
@@ -33,8 +34,11 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_arp.h>
 #include <linux/in_route.h>
+
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/route.h>
+
 #include <asm/uaccess.h>
 #include <asm/checksum.h>
 #include "br_private.h"
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index c436e6c6242..9f6e0193ae1 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -9,6 +9,7 @@
  *
  */
 
+#include <linux/in.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_log.h>
 #include <linux/netfilter.h>
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 49424a42a2c..281a632fa6a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -13,6 +13,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/string.h>
+#include <linux/if_arp.h>
 #include <linux/inetdevice.h>
 #include <linux/inet.h>
 #include <linux/interrupt.h>
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index c37eeeaf5c6..de681c6ad08 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -21,6 +21,8 @@
 
 #define CCID_MAX 255
 
+struct tcp_info;
+
 struct ccid {
 	unsigned char	ccid_id;
 	const char	*ccid_name;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 99e8afa7ba1..3f244670764 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -19,6 +19,7 @@
 
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/timewait_sock.h>
 #include <net/tcp_states.h>
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 2e194c8f995..c609dc78f48 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -20,6 +20,7 @@
 #include <net/addrconf.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
+#include <net/inet_sock.h>
 #include <net/inet6_connection_sock.h>
 #include <net/inet6_hashtables.h>
 #include <net/ip6_route.h>
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 95a3c2c6a3c..efd7ffb903a 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 
+#include <net/inet_sock.h>
 #include <net/sock.h>
 
 #include "ackvec.h"
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e4e629ed9bf..65b11ea90d8 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,7 +24,7 @@
 #include <net/checksum.h>
 
 #include <net/inet_common.h>
-#include <net/ip.h>
+#include <net/inet_sock.h>
 #include <net/protocol.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
@@ -34,7 +34,6 @@
 #include <linux/timer.h>
 #include <linux/delay.h>
 #include <linux/poll.h>
-#include <linux/dccp.h>
 
 #include "ccid.h"
 #include "dccp.h"
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index ff58f49c8b4..70fb2b88da6 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -31,6 +31,7 @@
 #include <linux/if_arp.h>
 #include <linux/wireless.h>
 #include <linux/skbuff.h>
+#include <linux/udp.h>
 #include <net/sock.h>
 #include <net/inet_common.h>
 #include <linux/stat.h>
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4ed8a814c6c..36a6306ca5a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -93,6 +93,7 @@
 #include <linux/smp_lock.h>
 #include <linux/inet.h>
 #include <linux/igmp.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <net/ip.h>
 #include <net/protocol.h>
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 035ad2c9e1b..aed537fa2c8 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -6,6 +6,7 @@
 #include <linux/crypto.h>
 #include <linux/pfkeyv2.h>
 #include <net/icmp.h>
+#include <net/protocol.h>
 #include <asm/scatterlist.h>
 
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b425748f02d..37432088fe6 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -86,6 +86,7 @@
 #include <linux/in.h>
 #include <linux/mm.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/fddidevice.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 04a6fe3e95a..7b9bb28e2ee 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -58,6 +58,7 @@
 #endif
 #include <linux/kmod.h>
 
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b18ce66e7b..73bfcae8af9 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -9,6 +9,7 @@
 #include <linux/pfkeyv2.h>
 #include <linux/random.h>
 #include <net/icmp.h>
+#include <net/protocol.h>
 #include <net/udp.h>
 
 /* decapsulation data for use when post-processing */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 19b1b984d68..18f5e509281 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -30,6 +30,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 7ea0209cb16..e2890ec8159 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0b298bbc151..0dd4d06e456 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -33,6 +33,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6d2a6ac070e..ef4724de735 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
@@ -36,6 +37,7 @@
 #include <linux/netlink.h>
 #include <linux/init.h>
 
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 92e23b2ad4d..be5a519cd2f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -73,6 +73,7 @@
 #include <linux/socket.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/string.h>
 #include <linux/netfilter_ipv4.h>
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4a195c724f0..34758118c10 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -91,6 +91,8 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/times.h>
+
+#include <net/arp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 473d0f2b2e0..e45846ae570 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -128,6 +128,7 @@
 #include <linux/sockios.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index dbe12da8d8b..d3f6c468faf 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -22,6 +22,7 @@
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/icmp.h>
+#include <net/route.h>
 
 /* 
  * Write options to IP header, record destination address to
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index add019c746f..6986e11d65c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -25,6 +25,7 @@
 #include <linux/skbuff.h>
 #include <linux/ip.h>
 #include <linux/icmp.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <net/sock.h>
 #include <net/ip.h>
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index fc718df17b4..d64e2ec8da7 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -28,6 +28,7 @@
 #include <net/xfrm.h>
 #include <net/icmp.h>
 #include <net/ipcomp.h>
+#include <net/protocol.h>
 
 struct ipcomp_tfms {
 	struct list_head list;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e8674baaa8d..bb3613ec448 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -42,6 +42,7 @@
 #include <linux/in.h>
 #include <linux/if.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
@@ -58,6 +59,7 @@
 #include <net/arp.h>
 #include <net/ip.h>
 #include <net/ipconfig.h>
+#include <net/route.h>
 
 #include <asm/uaccess.h>
 #include <net/checksum.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 302b7eb507c..caa3b7d2e48 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -52,6 +52,7 @@
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
+#include <net/route.h>
 #include <net/sock.h>
 #include <net/icmp.h>
 #include <net/udp.h>
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 2a3a8c59c65..d35cea31cb5 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -24,7 +24,9 @@
  *
  */
 
+#include <linux/in.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/proc_fs.h>		/* for proc_net_* */
 #include <linux/seq_file.h>
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 9bdcf31b760..fe2c39d2a00 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,6 +35,7 @@
 #include <linux/netfilter_ipv4.h>
 
 #include <net/ip.h>
+#include <net/route.h>
 #include <net/sock.h>
 
 #include <asm/uaccess.h>
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index f3bc320dce9..9fee19c4c61 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -37,8 +37,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 67b3e2fc1fa..e7004741ac7 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -13,7 +13,10 @@
  * Changes:
  *
  */
+#include <linux/config.h>
 #include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 #include <net/ip_vs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index b6dad7e3710..6e5cb92a5c8 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -41,8 +41,10 @@
  * me to write this module.
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 /* for sysctl */
 #include <linux/fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 8c78ef76c12..32ba37ba72d 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -39,8 +39,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 /* for sysctl */
 #include <linux/fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
index 453e94a0bbd..8b0505b0931 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -12,6 +12,8 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
index 478e5c7c7e8..c36ccf057a1 100644
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -12,6 +12,8 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 8ae5f2e0aef..89d9175d8f2 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -15,8 +15,11 @@
  *
  */
 
+#include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/kernel.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 6f7c50e44a3..7775e6cc68b 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -34,8 +34,10 @@
  *
  */
 
+#include <linux/ip.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 2e5ced3d806..1bca714bda3 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -21,12 +21,14 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/inetdevice.h>
 #include <linux/net.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
 #include <linux/skbuff.h>
 #include <linux/in.h>
 #include <linux/igmp.h>                 /* for ip_mc_join_group */
+#include <linux/udp.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index e52847fa10f..0366eedb4d7 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -18,11 +18,13 @@
  *
  */
 
+#include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/netfilter.h>
 #include <linux/ip.h>
 #include <linux/moduleparam.h>
+#include <linux/udp.h>
 #include <net/checksum.h>
 #include <net/udp.h>
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 744abb9d377..57956dee60c 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -31,6 +31,7 @@
 #include <linux/ip.h>
 #include <linux/in.h>
 #include <linux/list.h>
+#include <linux/seq_file.h>
 
 static DEFINE_RWLOCK(ip_ct_gre_lock);
 #define ASSERT_READ_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index f2dcac7c766..46becbe4fe5 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -11,6 +11,7 @@
 #include <linux/timer.h>
 #include <linux/netfilter.h>
 #include <linux/in.h>
+#include <linux/ip.h>
 #include <linux/udp.h>
 #include <linux/seq_file.h>
 #include <net/checksum.h>
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dd476b191f4..a88bcc55124 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -27,6 +27,7 @@
 #endif
 #include <net/checksum.h>
 #include <net/ip.h>
+#include <net/route.h>
 
 #define ASSERT_READ_LOCK(x)
 #define ASSERT_WRITE_LOCK(x)
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 8acb7ed40b4..4f95d477805 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -44,6 +44,7 @@
  *
  */
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -53,6 +54,7 @@
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_nat_helper.h>
 #include <linux/ip.h>
+#include <linux/udp.h>
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <asm/uaccess.h>
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 275a174c6fe..27860510ca6 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -11,6 +11,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/inetdevice.h>
 #include <linux/ip.h>
 #include <linux/timer.h>
 #include <linux/module.h>
@@ -18,6 +19,7 @@
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/checksum.h>
+#include <net/route.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv4/ip_nat_rule.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
index 1a53924041f..03f554857a4 100644
--- a/net/ipv4/netfilter/ipt_physdev.c
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/netfilter_ipv4/ipt_physdev.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 0d7dc668db4..39d49dc333a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -38,6 +38,7 @@
 #include <net/protocol.h>
 #include <net/tcp.h>
 #include <net/udp.h>
+#include <linux/inetdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <net/sock.h>
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index dbf82955aab..16984d4a8a0 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
 #include <linux/sysctl.h>
 #include <linux/config.h>
 #include <linux/igmp.h>
+#include <linux/inetdevice.h>
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 67c036384e7..223abaa72bc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -86,6 +86,7 @@
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
+#include <linux/igmp.h>
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/timer.h>
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index f3629730eb1..13cc7f89558 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <net/icmp.h>
 #include <net/ipv6.h>
+#include <net/protocol.h>
 #include <net/xfrm.h>
 #include <asm/scatterlist.h>
 
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8bfbe997079..6de8ee1a5ad 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -36,6 +36,7 @@
 #include <linux/random.h>
 #include <net/icmp.h>
 #include <net/ipv6.h>
+#include <net/protocol.h>
 #include <linux/icmpv6.h>
 
 static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 55917fb1709..626dd39685f 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -47,6 +47,7 @@
 #include <linux/rtnetlink.h>
 #include <net/icmp.h>
 #include <net/ipv6.h>
+#include <net/protocol.h>
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index dd80020d874..ea43ef1d94a 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -15,6 +15,7 @@
  *      - new extension header parser code
  */
 #include <linux/config.h>
+#include <linux/in.h>
 #include <linux/skbuff.h>
 #include <linux/kmod.h>
 #include <linux/vmalloc.h>
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0cd1d1bd903..ae4653bfd65 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/skbuff.h>
+#include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <linux/spinlock.h>
 #include <linux/icmpv6.h>
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index dde37793d20..268918d5dee 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -9,6 +9,7 @@
 
 #include <linux/module.h>
 #include <linux/skbuff.h>
+#include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/types.h>
 #include <net/checksum.h>
diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c
index 24bc0cde43a..65937de1b58 100644
--- a/net/ipv6/netfilter/ip6t_esp.c
+++ b/net/ipv6/netfilter/ip6t_esp.c
@@ -9,6 +9,7 @@
 
 #include <linux/module.h>
 #include <linux/skbuff.h>
+#include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/types.h>
 #include <net/checksum.h>
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 6cf0342706b..c4a2a8c4c33 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -22,6 +22,7 @@
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/inet.h>
 #include <linux/netdevice.h>
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d1b0747a5b9..de693b43c8e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -54,6 +54,7 @@
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/route.h>
 #include <net/sctp/sctp.h>
 #include <net/addrconf.h>
 #include <net/inet_common.h>
-- 
cgit v1.2.3


From 8639a11e23d9eb0a6ceac2feed27acdfbb158f95 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Tue, 27 Dec 2005 15:17:57 -0200
Subject: [TCP]: Don't use __constant_htonl for a non const arg

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 176221cd0cc..36993049740 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -994,11 +994,11 @@ static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock
 		struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
 		int this_sack;
 
-		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
-					  (TCPOPT_NOP << 16) |
-					  (TCPOPT_SACK << 8) |
-					  (TCPOLEN_SACK_BASE +
-					   (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)));
+		*ptr++ = htonl((TCPOPT_NOP  << 24) |
+			       (TCPOPT_NOP  << 16) |
+			       (TCPOPT_SACK <<  8) |
+			       (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
+						     TCPOLEN_SACK_PERBLOCK)));
 		for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
 			*ptr++ = htonl(sp[this_sack].start_seq);
 			*ptr++ = htonl(sp[this_sack].end_seq);
-- 
cgit v1.2.3


From 17ba15fb6264f27374bc87f4c3f8519b80289d85 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Tue, 27 Dec 2005 20:57:40 -0800
Subject: [PPPOX]: Fix assignment into const proto_ops.

And actually, with this, the whole pppox layer can basically
be removed and subsumed into pppoe.c, no other pppox sub-protocol
implementation exists and we've had this thing for at least 4
years.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pppoe.c      |  9 ++++-----
 drivers/net/pppox.c      | 10 +++-------
 include/linux/if_pppox.h |  3 +--
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 71e303b2864..9369f811075 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -85,7 +85,7 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb);
 static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb);
 
-static struct proto_ops pppoe_ops;
+static const struct proto_ops pppoe_ops;
 static DEFINE_RWLOCK(pppoe_hash_lock);
 
 static struct ppp_channel_ops pppoe_chan_ops;
@@ -1063,9 +1063,7 @@ static int __init pppoe_proc_init(void)
 static inline int pppoe_proc_init(void) { return 0; }
 #endif /* CONFIG_PROC_FS */
 
-/* ->ioctl are set at pppox_create */
-
-static struct proto_ops pppoe_ops = {
+static const struct proto_ops pppoe_ops = {
     .family		= AF_PPPOX,
     .owner		= THIS_MODULE,
     .release		= pppoe_release,
@@ -1081,7 +1079,8 @@ static struct proto_ops pppoe_ops = {
     .getsockopt		= sock_no_getsockopt,
     .sendmsg		= pppoe_sendmsg,
     .recvmsg		= pppoe_recvmsg,
-    .mmap		= sock_no_mmap
+    .mmap		= sock_no_mmap,
+    .ioctl		= pppox_ioctl,
 };
 
 static struct pppox_proto pppoe_proto = {
diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c
index 0c1e114527f..9315046b3f5 100644
--- a/drivers/net/pppox.c
+++ b/drivers/net/pppox.c
@@ -68,8 +68,7 @@ EXPORT_SYMBOL(register_pppox_proto);
 EXPORT_SYMBOL(unregister_pppox_proto);
 EXPORT_SYMBOL(pppox_unbind_sock);
 
-static int pppox_ioctl(struct socket* sock, unsigned int cmd, 
-		       unsigned long arg)
+int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct sock *sk = sock->sk;
 	struct pppox_sock *po = pppox_sk(sk);
@@ -105,6 +104,7 @@ static int pppox_ioctl(struct socket* sock, unsigned int cmd,
 	return rc;
 }
 
+EXPORT_SYMBOL(pppox_ioctl);
 
 static int pppox_create(struct socket *sock, int protocol)
 {
@@ -119,11 +119,7 @@ static int pppox_create(struct socket *sock, int protocol)
 		goto out;
 
 	rc = pppox_protos[protocol]->create(sock);
-	if (!rc) {
-		/* We get to set the ioctl handler. */
-		/* For everything else, pppox is just a shell. */
-		sock->ops->ioctl = pppox_ioctl;
-	}
+
 	module_put(pppox_protos[protocol]->owner);
 out:
 	return rc;
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index e677f73f13d..4fab3d0a4bc 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -157,8 +157,7 @@ struct pppox_proto {
 extern int register_pppox_proto(int proto_num, struct pppox_proto *pp);
 extern void unregister_pppox_proto(int proto_num);
 extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */
-extern int pppox_channel_ioctl(struct ppp_channel *pc, unsigned int cmd,
-			       unsigned long arg);
+extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 
 /* PPPoX socket states */
 enum {
-- 
cgit v1.2.3


From 4947d3ef8de7b4f42aed6ea9ba689dc8fb45b5a5 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Tue, 3 Jan 2006 14:06:50 -0800
Subject: [NET]: Speed up __alloc_skb()

From: Benjamin LaHaise <bcrl@kvack.org>

In __alloc_skb(), the use of skb_shinfo() which casts a u8 * to the
shared info structure results in gcc being forced to do a reload of the
pointer since it has no information on possible aliasing.  Fix this by
using a pointer to refer to skb_shared_info.

By initializing skb_shared_info sequentially, the write combining buffers
can reduce the number of memory transactions to a single write.  Reorder
the initialization in __alloc_skb() to match the structure definition.
There is also an alignment issue on 64 bit systems with skb_shared_info
by converting nr_frags to a short everything packs up nicely.

Also, pass the slab cache pointer according to the fclone flag instead
of using two almost identical function calls.

This raises bw_unix performance up to a peak of 707KB/s when combined
with the spinlock patch.  It should help other networking protocols, too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 +-
 net/core/skbuff.c      | 27 +++++++++++++--------------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 971677178e0..483cfc47ec3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -133,7 +133,7 @@ struct skb_frag_struct {
  */
 struct skb_shared_info {
 	atomic_t	dataref;
-	unsigned int	nr_frags;
+	unsigned short	nr_frags;
 	unsigned short	tso_size;
 	unsigned short	tso_segs;
 	unsigned short  ufo_size;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83fee37de38..070f91cfde5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 			    int fclone)
 {
+	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
 
 	/* Get the HEAD */
-	if (fclone)
-		skb = kmem_cache_alloc(skbuff_fclone_cache,
-				       gfp_mask & ~__GFP_DMA);
-	else
-		skb = kmem_cache_alloc(skbuff_head_cache,
-				       gfp_mask & ~__GFP_DMA);
-
+	skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache,
+				gfp_mask & ~__GFP_DMA);
 	if (!skb)
 		goto out;
 
@@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	skb->data = data;
 	skb->tail = data;
 	skb->end  = data + size;
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	atomic_set(&shinfo->dataref, 1);
+	shinfo->nr_frags  = 0;
+	shinfo->tso_size = 0;
+	shinfo->tso_segs = 0;
+	shinfo->ufo_size = 0;
+	shinfo->ip6_frag_id = 0;
+	shinfo->frag_list = NULL;
+
 	if (fclone) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
 	}
-	atomic_set(&(skb_shinfo(skb)->dataref), 1);
-	skb_shinfo(skb)->nr_frags  = 0;
-	skb_shinfo(skb)->tso_size = 0;
-	skb_shinfo(skb)->tso_segs = 0;
-	skb_shinfo(skb)->frag_list = NULL;
-	skb_shinfo(skb)->ufo_size = 0;
-	skb_shinfo(skb)->ip6_frag_id = 0;
 out:
 	return skb;
 nodata:
-- 
cgit v1.2.3


From fd19f329a32bdc4eb07885e0b3889567cfe00aa7 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Tue, 3 Jan 2006 14:10:46 -0800
Subject: [AF_UNIX]: Convert to use a spinlock instead of rwlock

From: Benjamin LaHaise <bcrl@kvack.org>

In af_unix, a rwlock is used to protect internal state.  At least on my
P4 with HT it is faster to use a spinlock due to the simpler memory
barrier used to unlock.  This patch raises bw_unix to ~690K/s.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h | 10 +++++-----
 net/unix/af_unix.c    |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 3f302ae98c0..bfc1779fc75 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -58,10 +58,10 @@ struct unix_skb_parms {
 #define UNIXCB(skb) 	(*(struct unix_skb_parms*)&((skb)->cb))
 #define UNIXCREDS(skb)	(&UNIXCB((skb)).creds)
 
-#define unix_state_rlock(s)	read_lock(&unix_sk(s)->lock)
-#define unix_state_runlock(s)	read_unlock(&unix_sk(s)->lock)
-#define unix_state_wlock(s)	write_lock(&unix_sk(s)->lock)
-#define unix_state_wunlock(s)	write_unlock(&unix_sk(s)->lock)
+#define unix_state_rlock(s)	spin_lock(&unix_sk(s)->lock)
+#define unix_state_runlock(s)	spin_unlock(&unix_sk(s)->lock)
+#define unix_state_wlock(s)	spin_lock(&unix_sk(s)->lock)
+#define unix_state_wunlock(s)	spin_unlock(&unix_sk(s)->lock)
 
 #ifdef __KERNEL__
 /* The AF_UNIX socket */
@@ -76,7 +76,7 @@ struct unix_sock {
         struct sock		*other;
         struct sock		*gc_tree;
         atomic_t                inflight;
-        rwlock_t                lock;
+        spinlock_t		lock;
         wait_queue_head_t       peer_wait;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7d3fe6aebcd..1ddd36d5009 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock)
 	u	  = unix_sk(sk);
 	u->dentry = NULL;
 	u->mnt	  = NULL;
-	rwlock_init(&u->lock);
+	spin_lock_init(&u->lock);
 	atomic_set(&u->inflight, sock ? 0 : -1);
 	init_MUTEX(&u->readsem); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
-- 
cgit v1.2.3


From b461d2f2188c1c578ed651e4cdf608be7a993cd4 Mon Sep 17 00:00:00 2001
From: Per Liden <per.liden@ericsson.com>
Date: Tue, 3 Jan 2006 14:13:29 -0800
Subject: [NETLINK] genetlink: fix cmd type in genl_ops to be consistent to u8

Signed-off-by: Per Liden <per.liden@ericsson.com>
ACKed-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 2 +-
 net/netlink/genetlink.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 52d8b1a73d5..c5b96b2b815 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -60,7 +60,7 @@ struct genl_info
  */
 struct genl_ops
 {
-	unsigned int		cmd;
+	u8			cmd;
 	unsigned int		flags;
 	struct nla_policy	*policy;
 	int		       (*doit)(struct sk_buff *skb,
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 287cfcc5695..3b1378498d5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -441,7 +441,7 @@ errout:
 }
 
 static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
-				      int seq, int cmd)
+				      int seq, u8 cmd)
 {
 	struct sk_buff *skb;
 	int err;
-- 
cgit v1.2.3


From 5ff7630e4aa6c3969094dc30ff1cdaa6f52b0ed0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 3 Jan 2006 14:14:46 -0800
Subject: [NETROM]: Remove unessecary lock_sock calls in netrom_ioctl()

lock_sock is needed only in very few cases, so do it there instead of
around the switch statement.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netrom/af_netrom.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 05b653c0597..9ee672e19ec 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	void __user *argp = (void __user *)arg;
 	int ret;
 
-	lock_sock(sk);
 	switch (cmd) {
 	case TIOCOUTQ: {
 		long amount;
+
+		lock_sock(sk);
 		amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc);
 		if (amount < 0)
 			amount = 0;
@@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case TIOCINQ: {
 		struct sk_buff *skb;
 		long amount = 0L;
+
+		lock_sock(sk);
 		/* These two are safe on a single CPU system as only user tasks fiddle here */
 		if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
 			amount = skb->len;
@@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	}
 
 	case SIOCGSTAMP:
+		lock_sock(sk);
 		ret = sock_get_timestamp(sk, argp);
 		release_sock(sk);
 		return ret;
@@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCSIFNETMASK:
 	case SIOCGIFMETRIC:
 	case SIOCSIFMETRIC:
-		release_sock(sk);
 		return -EINVAL;
 
 	case SIOCADDRT:
 	case SIOCDELRT:
 	case SIOCNRDECOBS:
-		release_sock(sk);
 		if (!capable(CAP_NET_ADMIN)) return -EPERM;
 		return nr_rt_ioctl(cmd, argp);
 
 	default:
-		release_sock(sk);
 		return dev_ioctl(cmd, argp);
 	}
-	release_sock(sk);
 
 	return 0;
 }
-- 
cgit v1.2.3


From b5e5fa5e093e42cab4ee3d6dcbc4f450ad29a723 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 3 Jan 2006 14:18:33 -0800
Subject: [NET]: Add a dev_ioctl() fallback to sock_ioctl()

Currently all network protocols need to call dev_ioctl as the default
fallback in their ioctl implementations.  This patch adds a fallback
to dev_ioctl to sock_ioctl if the protocol returned -ENOIOCTLCMD.
This way all the procotol ioctl handlers can be simplified and we don't
need to export dev_ioctl.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/appletalk/ddp.c        | 19 +------------------
 net/ax25/af_ax25.c         |  2 +-
 net/core/dev.c             |  1 -
 net/decnet/af_decnet.c     |  2 +-
 net/econet/af_econet.c     |  2 +-
 net/ipv4/af_inet.c         |  8 ++++----
 net/ipv6/af_inet6.c        |  8 +++-----
 net/ipx/af_ipx.c           |  2 +-
 net/irda/af_irda.c         |  2 +-
 net/llc/af_llc.c           |  2 +-
 net/netrom/af_netrom.c     |  2 +-
 net/packet/af_packet.c     |  2 +-
 net/rose/af_rose.c         |  2 +-
 net/socket.c               |  7 +++++++
 net/unix/af_unix.c         |  2 +-
 net/wanrouter/af_wanpipe.c |  2 +-
 net/x25/af_x25.c           |  2 +-
 17 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 296f186802f..a5144e43aae 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
  */
 static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
-	int rc = -EINVAL;
+	int rc = -ENOIOCTLCMD;
 	struct sock *sk = sock->sk;
 	void __user *argp = (void __user *)arg;
 
@@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			rc = atif_ioctl(cmd, argp);
 			rtnl_unlock();
 			break;
-		/* Physical layer ioctl calls */
-		case SIOCSIFLINK:
-		case SIOCGIFHWADDR:
-		case SIOCSIFHWADDR:
-		case SIOCGIFFLAGS:
-		case SIOCSIFFLAGS:
-		case SIOCGIFTXQLEN:
-		case SIOCSIFTXQLEN:
-		case SIOCGIFMTU:
-		case SIOCGIFCONF:
-		case SIOCADDMULTI:
-		case SIOCDELMULTI:
-		case SIOCGIFCOUNT:
-		case SIOCGIFINDEX:
-		case SIOCGIFNAME:
-			rc = dev_ioctl(cmd, argp);
-			break;
 	}
 
 	return rc;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 8b5d10aaba0..e8753c7fcad 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		break;
 
 	default:
-		res = dev_ioctl(cmd, argp);
+		res = -ENOIOCTLCMD;
 		break;
 	}
 	release_sock(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index a5efc9ae010..29ba109d3e5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3276,7 +3276,6 @@ EXPORT_SYMBOL(dev_close);
 EXPORT_SYMBOL(dev_get_by_flags);
 EXPORT_SYMBOL(dev_get_by_index);
 EXPORT_SYMBOL(dev_get_by_name);
-EXPORT_SYMBOL(dev_ioctl);
 EXPORT_SYMBOL(dev_open);
 EXPORT_SYMBOL(dev_queue_xmit);
 EXPORT_SYMBOL(dev_remove_pack);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 65e3baed025..78ec5344be8 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		break;
 
 	default:
-		err = dev_ioctl(cmd, (void __user *)arg);
+		err = -ENOIOCTLCMD;
 		break;
 	}
 
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 70fb2b88da6..4fc245243cc 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -687,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg
 			break;
 
 		default:
-			return dev_ioctl(cmd, argp);
+			return -ENOIOCTLCMD;
 	}
 	/*NOTREACHED*/
 	return 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 36a6306ca5a..966a071a408 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -777,10 +777,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			err = devinet_ioctl(cmd, (void __user *)arg);
 			break;
 		default:
-			if (!sk->sk_prot->ioctl ||
-			    (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
-			    					-ENOIOCTLCMD)
-				err = dev_ioctl(cmd, (void __user *)arg);
+			if (sk->sk_prot->ioctl)
+				err = sk->sk_prot->ioctl(sk, cmd, arg);
+			else
+				err = -ENOIOCTLCMD;
 			break;
 	}
 	return err;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7c9f19269f2..68afc53be66 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -434,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
 	struct sock *sk = sock->sk;
-	int err = -EINVAL;
 
 	switch(cmd) 
 	{
@@ -453,10 +452,9 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCSIFDSTADDR:
 		return addrconf_set_dstaddr((void __user *) arg);
 	default:
-		if (!sk->sk_prot->ioctl ||
-		    (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD)
-			return(dev_ioctl(cmd,(void __user *) arg));		
-		return err;
+		if (!sk->sk_prot->ioctl)
+			return -ENOIOCTLCMD;
+		return sk->sk_prot->ioctl(sk, cmd, arg);
 	}
 	/*NOTREACHED*/
 	return(0);
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 6c464c11bb0..0dc519b4040 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		rc = -EINVAL;
 		break;
 	default:
-		rc = dev_ioctl(cmd, argp);
+		rc = -ENOIOCTLCMD;
 		break;
 	}
 
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index e57683d424f..fbfa9675441 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1822,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		return -EINVAL;
 	default:
 		IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__);
-		return dev_ioctl(cmd, (void __user *) arg);
+		return -ENOIOCTLCMD;
 	}
 
 	/*NOTREACHED*/
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 9cf65f9d890..8171c53bc0e 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -959,7 +959,7 @@ out:
 static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
 			unsigned long arg)
 {
-	return dev_ioctl(cmd, (void __user *)arg);
+	return -ENOIOCTLCMD;
 }
 
 /**
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 9ee672e19ec..63b0e4afeb3 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1215,7 +1215,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		return nr_rt_ioctl(cmd, argp);
 
 	default:
-		return dev_ioctl(cmd, argp);
+		return -ENOIOCTLCMD;
 	}
 
 	return 0;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index deda6fdb1e5..f69e5ed9bd0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
 #endif
 
 		default:
-			return dev_ioctl(cmd, (void __user *)arg);
+			return -ENOIOCTLCMD;
 	}
 	return 0;
 }
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 829fdbc4400..63090be2315 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		return 0;
 
 	default:
-		return dev_ioctl(cmd, argp);
+		return -ENOIOCTLCMD;
 	}
 
 	return 0;
diff --git a/net/socket.c b/net/socket.c
index 0dde6dade2b..06fa217f58a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -900,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 			break;
 		default:
 			err = sock->ops->ioctl(sock, cmd, arg);
+
+			/*
+			 * If this ioctl is unknown try to hand it down
+			 * to the NIC driver.
+			 */
+			if (err == -ENOIOCTLCMD)
+				err = dev_ioctl(cmd, argp);
 			break;
 	}
 	return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1ddd36d5009..5f6ae79b8b1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1859,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		}
 
 		default:
-			err = dev_ioctl(cmd, (void __user *)arg);
+			err = -ENOIOCTLCMD;
 			break;
 	}
 	return err;
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 67948bf22dc..7a43ae4721e 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar
 #endif
 
 		default:
-			return dev_ioctl(cmd,(void __user *) arg);
+			return -ENOIOCTLCMD;
 	}
 	/*NOTREACHED*/
 }
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index ca8b3b0b920..16459c7f54b 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		}
 
  		default:
-			rc = dev_ioctl(cmd, argp);
+			rc = -ENOIOCTLCMD;
 			break;
 	}
 
-- 
cgit v1.2.3


From fd30333d0fab9e870af89e112454996c188655e9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 3 Jan 2006 14:19:25 -0800
Subject: [TG3]: fixup tot_len calculation

Turning struct iphdr::tot_len into __be16 added sparse warning.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 59d916ccc81..eb86b059809 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -3651,7 +3651,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			       TXD_FLAG_CPU_POST_DMA);
 
 		skb->nh.iph->check = 0;
-		skb->nh.iph->tot_len = ntohs(mss + ip_tcp_len + tcp_opt_len);
+		skb->nh.iph->tot_len = htons(mss + ip_tcp_len + tcp_opt_len);
 		if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) {
 			skb->h.th->check = 0;
 			base_flags &= ~TXD_FLAG_TCPUDP_CSUM;
-- 
cgit v1.2.3


From 4b5bdf5cc3695dc5caba011b9c616b40e6299638 Mon Sep 17 00:00:00 2001
From: Roberto Nibali <ratz@drugphish.ch>
Date: Tue, 3 Jan 2006 14:22:59 -0800
Subject: [IPVS]: Cleanup IP_VS_DBG statements.

From: Roberto Nibali <ratz@drugphish.ch>

The attached patch (against current -GIT) is a cleanup patch which does
following:

o lookup debug messages shifted back to 9
o added more informational value to flags and refcnt since those
entries can be in multiple referenced structures
o cleanup 80 char violation

It's the prepatch to the session pool implementation and helps very much
to debug and monitor important variables and structures regarding the
threshold limitation and persistency without the thousands of lookup
messages which noone is interested in.

Signed-off-by: Horms <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipvs/ip_vs_conn.c      | 18 ++++++++++--------
 net/ipv4/ipvs/ip_vs_core.c      |  2 +-
 net/ipv4/ipvs/ip_vs_ctl.c       |  9 +++++----
 net/ipv4/ipvs/ip_vs_proto_tcp.c |  2 +-
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index d35cea31cb5..ed5bd56a501 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -221,7 +221,7 @@ struct ip_vs_conn *ip_vs_conn_in_get
 	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
 		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
 
-	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+	IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 		  ip_vs_proto_name(protocol),
 		  NIPQUAD(s_addr), ntohs(s_port),
 		  NIPQUAD(d_addr), ntohs(d_port),
@@ -256,7 +256,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
   out:
 	ct_read_unlock(hash);
 
-	IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+	IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 		  ip_vs_proto_name(protocol),
 		  NIPQUAD(s_addr), ntohs(s_port),
 		  NIPQUAD(d_addr), ntohs(d_port),
@@ -297,7 +297,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
 
 	ct_read_unlock(hash);
 
-	IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+	IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
 		  ip_vs_proto_name(protocol),
 		  NIPQUAD(s_addr), ntohs(s_port),
 		  NIPQUAD(d_addr), ntohs(d_port),
@@ -393,8 +393,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 	cp->flags |= atomic_read(&dest->conn_flags);
 	cp->dest = dest;
 
-	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+	IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		  "dest->refcnt:%d\n",
 		  ip_vs_proto_name(cp->protocol),
 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -432,8 +433,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
 	if (!dest)
 		return;
 
-	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+	IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+		  "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+		  "dest->refcnt:%d\n",
 		  ip_vs_proto_name(cp->protocol),
 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -573,7 +575,7 @@ static void ip_vs_conn_expire(unsigned long data)
 	ip_vs_conn_hash(cp);
 
   expire_later:
-	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
 		  atomic_read(&cp->refcnt)-1,
 		  atomic_read(&cp->n_control));
 
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 1a0843cd58a..1aca94a9fd8 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		return NULL;
 
 	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
-		  "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+		  "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
 		  ip_vs_fwd_tag(cp),
 		  NIPQUAD(cp->caddr), ntohs(cp->cport),
 		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index fe2c39d2a00..c935c5086d3 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -448,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
   out:
 	read_unlock(&__ip_vs_svc_lock);
 
-	IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+	IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
 		  fwmark, ip_vs_proto_name(protocol),
 		  NIPQUAD(vaddr), ntohs(vport),
 		  svc?"hit":"not hit");
@@ -598,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
 	 */
 	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
 		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
-			  "refcnt=%d\n",
+			  "dest->refcnt=%d\n",
 			  dest->vfwmark,
 			  NIPQUAD(dest->addr), ntohs(dest->port),
 			  atomic_read(&dest->refcnt));
@@ -805,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	dest = ip_vs_trash_get_dest(svc, daddr, dport);
 	if (dest != NULL) {
 		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
-			  "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+			  "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
 			  NIPQUAD(daddr), ntohs(dport),
 			  atomic_read(&dest->refcnt),
 			  dest->vfwmark,
@@ -950,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
 		atomic_dec(&dest->svc->refcnt);
 		kfree(dest);
 	} else {
-		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
+			  "dest->refcnt=%d\n",
 			  NIPQUAD(dest->addr), ntohs(dest->port),
 			  atomic_read(&dest->refcnt));
 		list_add(&dest->n_list, &ip_vs_dest_trash);
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index 638ba8c9e8b..bc28b1160a3 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -426,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 		struct ip_vs_dest *dest = cp->dest;
 
 		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
-			  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+			  "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
 			  pp->name,
 			  (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
 			  th->syn? 'S' : '.',
-- 
cgit v1.2.3


From 5062430c5cc526655e3d10c670fc9c263656f66c Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <patrick@tykepenguin.com>
Date: Tue, 3 Jan 2006 14:24:02 -0800
Subject: [DECNET]: Only use local routers

The attached patch makes DECnet routing only use routers from the same
area - rather than the highest rated router seen.

In theory there should not be an out-of-area router on a local network
but some networks are bridged rather than properly routed. VMS seems
to behave similarly: if I bring up a VMS node with no router then it
can't see anything else on the global network.

Signed-off-by: Patrick Caulfield <patrick@tykepenguin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_neigh.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 8d0cc3cf3e4..33ab256cfd4 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb)
 			}
 		}
 
-		if (!dn_db->router) {
-			dn_db->router = neigh_clone(neigh);
-		} else {
-			if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
-				neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+		/* Only use routers in our area */
+		if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) {
+			if (!dn_db->router) {
+				dn_db->router = neigh_clone(neigh);
+			} else {
+				if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
+					neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
+			}
 		}
 		write_unlock(&neigh->lock);
 		neigh_release(neigh);
-- 
cgit v1.2.3


From 709dd3aaf5304993083c2297c73f5531c36fba5a Mon Sep 17 00:00:00 2001
From: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Date: Tue, 3 Jan 2006 14:25:17 -0800
Subject: [DCCP]: Do not process a packet twice when it's not in state
 DCCP_OPEN.

When packets are received, the connection is either in DCCP_OPEN
[fast-path] or it isn't.  If it's not [e.g. DCCP_PARTOPEN] upper
layers will perform sanity checks and parse options.  If it is in
DCCP_OPEN, dccp_rcv_established() will do it.  It is important not to
re-parse options in dccp_rcv_established() when it is not called from
the fast-path.  Else, fore example, the ack vector will be added twice
and the CCID will see the packet twice.

The solution is to always enfore sanity checks from the upper layers.
When packets arrive in the fast-path, sanity checks will be performed
before calling dccp_rcv_established().

Note(acme): I rewrote the patch to achieve the same result but keeping
dccp_rcv_established with the previous semantics and having it split
into __dccp_rcv_established, that doesn't does do any sanity check,
code in state != DCCP_OPEN use this lighter version as they already do
the sanity checks.

Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/input.c | 56 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 55e921bdd13..5e312b04a7d 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			 const struct dccp_hdr *dh, const unsigned len)
+static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+					 const struct dccp_hdr *dh,
+					 const unsigned len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	if (dccp_check_seqno(sk, skb))
-		goto discard;
-
-	if (dccp_parse_options(sk, skb))
-		goto discard;
-
-	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
-		dccp_event_ack_recv(sk, skb);
-
-	if (dp->dccps_options.dccpo_send_ack_vector &&
-	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-			    DCCP_SKB_CB(skb)->dccpd_seq,
-			    DCCP_ACKVEC_STATE_RECEIVED))
-		goto discard;
-
-	ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
-	ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-
 	switch (dccp_hdr(skb)->dccph_type) {
 	case DCCP_PKT_DATAACK:
 	case DCCP_PKT_DATA:
@@ -250,6 +233,35 @@ discard:
 	return 0;
 }
 
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			 const struct dccp_hdr *dh, const unsigned len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (dccp_check_seqno(sk, skb))
+		goto discard;
+
+	if (dccp_parse_options(sk, skb))
+		goto discard;
+
+	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_event_ack_recv(sk, skb);
+
+	if (dp->dccps_options.dccpo_send_ack_vector &&
+	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+			    DCCP_SKB_CB(skb)->dccpd_seq,
+			    DCCP_ACKVEC_STATE_RECEIVED))
+		goto discard;
+
+	ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+	ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+	return __dccp_rcv_established(sk, skb, dh, len);
+discard:
+	__kfree_skb(skb);
+	return 0;
+}
+
 EXPORT_SYMBOL_GPL(dccp_rcv_established);
 
 static int dccp_rcv_request_sent_state_process(struct sock *sk,
@@ -400,9 +412,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
 
 		if (dh->dccph_type == DCCP_PKT_DATAACK ||
 		    dh->dccph_type == DCCP_PKT_DATA) {
-			dccp_rcv_established(sk, skb, dh, len);
+			__dccp_rcv_established(sk, skb, dh, len);
 			queued = 1; /* packet was queued
-				       (by dccp_rcv_established) */
+				       (by __dccp_rcv_established) */
 		}
 		break;
 	}
-- 
cgit v1.2.3


From 9e377202d2c968dde8efd6121d94c7f0a77787aa Mon Sep 17 00:00:00 2001
From: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Date: Tue, 3 Jan 2006 14:25:49 -0800
Subject: [DCCP]: Send an ACK vector when ACKing a response packet

If ACK vectors are used, each packet with an ACK should contain an ACK
vector.  The only exception currently is response packets.  It
probably is not a good idea to store ACK vector state before the
connection is completed (to help protect from syn floods).

Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/input.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/dccp/input.c b/net/dccp/input.c
index 5e312b04a7d..cb0f5c9af1b 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -300,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			goto out_invalid_packet;
 		}
 
+                if (dp->dccps_options.dccpo_send_ack_vector &&
+                    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
+                                    DCCP_SKB_CB(skb)->dccpd_seq,
+                                    DCCP_ACKVEC_STATE_RECEIVED))
+                        goto out_invalid_packet; /* FIXME: change error code */
+
 		dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
 		dccp_update_gsr(sk, dp->dccps_isr);
 		/*
-- 
cgit v1.2.3


From e84a9f5e9cd2b229dda24002334bc3cd36c1109d Mon Sep 17 00:00:00 2001
From: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Date: Tue, 3 Jan 2006 14:26:15 -0800
Subject: [DCCP]: Notify CCID only after ACK vectors have been processed.

The CCID should be notified of packet reception only when a packet is
valid.  Therefore, the ACK vector needs to be processed before
notifying the CCID.  Also, the CCID might need information provided by
the ACK vector.

Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/input.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/dccp/input.c b/net/dccp/input.c
index cb0f5c9af1b..b6cba72b44e 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -492,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 			dccp_event_ack_recv(sk, skb);
 
-		ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
-		ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-
  		if (dp->dccps_options.dccpo_send_ack_vector &&
 		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
  				    DCCP_SKB_CB(skb)->dccpd_seq,
  				    DCCP_ACKVEC_STATE_RECEIVED))
  			goto discard;
+
+		ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+		ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
 	}
 
 	/*
-- 
cgit v1.2.3


From 554c9a8ec37729bff69951cb740074abbae21afa Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 3 Jan 2006 14:35:54 -0800
Subject: [BRIDGE]: Fix faulty check in br_stp_recalculate_bridge_id()

One of the conversions from memcmp to compare_ether_addr is incorrect.
We need to do relative comparison to determine min MAC address to
use in bridge id.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_stp_if.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 2d2e969ae25..cc047f7fb6e 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -157,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br)
 
 	list_for_each_entry(p, &br->port_list, list) {
 		if (addr == br_mac_zero ||
-		    compare_ether_addr(p->dev->dev_addr, addr) < 0)
+		    memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
 			addr = p->dev->dev_addr;
 
 	}
-- 
cgit v1.2.3


From cd8787ab04d23f925f440b712b43a6fd5cb31ece Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 3 Jan 2006 14:38:34 -0800
Subject: [IPV4] fib_trie: build fix

Need this to fix build of fib_trie in net-2.6.16 (rebased) tree.
The code needs the new inet_make_mask inline.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a3ed7e13f79..e320b32373e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -66,6 +66,7 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/inet.h>
+#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
-- 
cgit v1.2.3


From 88df8ef59a3eb54b1e2412765ff2736d2376d1ca Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 3 Jan 2006 15:25:45 -0800
Subject: [NET]: Don't exclude broadcast addresses from
 is_multicast_ether_addr()

The check for multicast shouldn't exclude broadcast type addresses.
This reverts the incorrect change done in 2.6.13.

The broadcast address is a multicast address and should be excluded
from being a valid_ether_address for use in bridging or device address.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 5f49a30eb6f..745c988359c 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -63,10 +63,11 @@ static inline int is_zero_ether_addr(const u8 *addr)
  * @addr: Pointer to a six-byte array containing the Ethernet address
  *
  * Return true if the address is a multicast address.
+ * By definition the broadcast address is also a multicast address.
  */
 static inline int is_multicast_ether_addr(const u8 *addr)
 {
-	return ((addr[0] != 0xff) && (0x01 & addr[0]));
+	return (0x01 & addr[0]);
 }
 
 /**
-- 
cgit v1.2.3


From 3c19065a1e2c862becc576bc65e54f2bc1cbffe6 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 3 Jan 2006 15:27:38 -0800
Subject: [IEEE80211] ipw2200: Simplify multicast checks.

From: Stephen Hemminger <shemminger@osdl.org>

is_multicast_ether_addr() accepts broadcast too, so the
is_broadcast_ether_addr() calls are redundant.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wireless/ipw2200.c | 15 +++++----------
 net/ieee80211/ieee80211_rx.c   |  5 ++---
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index 5e7c7e944c9..64f6d1f2575 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -7456,8 +7456,7 @@ static void ipw_handle_data_packet(struct ipw_priv *priv,
 	/* HW decrypt will not clear the WEP bit, MIC, PN, etc. */
 	hdr = (struct ieee80211_hdr_4addr *)rxb->skb->data;
 	if (priv->ieee->iw_mode != IW_MODE_MONITOR &&
-	    ((is_multicast_ether_addr(hdr->addr1) ||
-	      is_broadcast_ether_addr(hdr->addr1)) ?
+	    (is_multicast_ether_addr(hdr->addr1) ?
 	     !priv->ieee->host_mc_decrypt : !priv->ieee->host_decrypt))
 		ipw_rebuild_decrypted_skb(priv, rxb->skb);
 
@@ -7648,8 +7647,7 @@ static inline int is_network_packet(struct ipw_priv *priv,
 			return 0;
 
 		/* {broad,multi}cast packets to our BSSID go through */
-		if (is_multicast_ether_addr(header->addr1) ||
-		    is_broadcast_ether_addr(header->addr1))
+		if (is_multicast_ether_addr(header->addr1))
 			return !memcmp(header->addr3, priv->bssid, ETH_ALEN);
 
 		/* packets to our adapter go through */
@@ -7662,8 +7660,7 @@ static inline int is_network_packet(struct ipw_priv *priv,
 			return 0;
 
 		/* {broad,multi}cast packets to our BSS go through */
-		if (is_multicast_ether_addr(header->addr1) ||
-		    is_broadcast_ether_addr(header->addr1))
+		if (is_multicast_ether_addr(header->addr1))
 			return !memcmp(header->addr2, priv->bssid, ETH_ALEN);
 
 		/* packets to our adapter go through */
@@ -9657,8 +9654,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb,
 	switch (priv->ieee->iw_mode) {
 	case IW_MODE_ADHOC:
 		hdr_len = IEEE80211_3ADDR_LEN;
-		unicast = !(is_multicast_ether_addr(hdr->addr1) ||
-			    is_broadcast_ether_addr(hdr->addr1));
+		unicast = !is_multicast_ether_addr(hdr->addr1);
 		id = ipw_find_station(priv, hdr->addr1);
 		if (id == IPW_INVALID_STATION) {
 			id = ipw_add_station(priv, hdr->addr1);
@@ -9673,8 +9669,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb,
 
 	case IW_MODE_INFRA:
 	default:
-		unicast = !(is_multicast_ether_addr(hdr->addr3) ||
-			    is_broadcast_ether_addr(hdr->addr3));
+		unicast = !is_multicast_ether_addr(hdr->addr3);
 		hdr_len = IEEE80211_3ADDR_LEN;
 		id = 0;
 		break;
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index 03efaacbdb7..4cc6f41c693 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
 		return 1;
 	}
 
-	if ((is_multicast_ether_addr(hdr->addr1) ||
-	     is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt :
-	    ieee->host_decrypt) {
+	if (is_multicast_ether_addr(hdr->addr1)
+	    ? ieee->host_mc_decrypt : ieee->host_decrypt) {
 		int idx = 0;
 		if (skb->len >= hdrlen + 3)
 			idx = skb->data[hdrlen + 3] >> 6;
-- 
cgit v1.2.3


From 40efc6fa179f440a008333ea98f701bc35a1f97f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 3 Jan 2006 16:03:49 -0800
Subject: [TCP]: less inline's

TCP inline usage cleanup:
 * get rid of inline in several places
 * replace __inline__ with inline where possible
 * move functions used in one file out of tcp.h
 * let compiler decide on used once cases

On x86_64:
   text	   data	    bss	    dec	    hex	filename
3594701	 648348	 567400	4810449	 4966d1	vmlinux.orig
3593133	 648580	 567400	4809113	 496199	vmlinux

On sparc64:
   text	   data	    bss	    dec	    hex	filename
2538278	 406152	 530392	3474822	 350586	vmlinux.ORIG
2536382	 406384	 530392	3473158	 34ff06	vmlinux

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 193 +++++++-------------------------------------------
 net/ipv4/tcp_cong.c   |  28 ++++++++
 net/ipv4/tcp_input.c  |  82 +++++++++++++++------
 net/ipv4/tcp_ipv4.c   |   9 ++-
 net/ipv4/tcp_output.c |  87 ++++++++++++++++++++---
 5 files changed, 198 insertions(+), 201 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 36993049740..77f21c65bbc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -445,34 +445,16 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			 sk_read_actor_t recv_actor);
 
-/* Initialize RCV_MSS value.
- * RCV_MSS is an our guess about MSS used by the peer.
- * We haven't any direct information about the MSS.
- * It's better to underestimate the RCV_MSS rather than overestimate.
- * Overestimations make us ACKing less frequently than needed.
- * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
- */
+extern void tcp_initialize_rcv_mss(struct sock *sk);
 
-static inline void tcp_initialize_rcv_mss(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
-
-	hint = min(hint, tp->rcv_wnd/2);
-	hint = min(hint, TCP_MIN_RCVMSS);
-	hint = max(hint, TCP_MIN_MSS);
-
-	inet_csk(sk)->icsk_ack.rcv_mss = hint;
-}
-
-static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
+static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
 			       ntohl(TCP_FLAG_ACK) |
 			       snd_wnd);
 }
 
-static __inline__ void tcp_fast_path_on(struct tcp_sock *tp)
+static inline void tcp_fast_path_on(struct tcp_sock *tp)
 {
 	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
 }
@@ -490,7 +472,7 @@ static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp)
  * Rcv_nxt can be after the window if our peer push more data
  * than the offered window.
  */
-static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp)
+static inline u32 tcp_receive_window(const struct tcp_sock *tp)
 {
 	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
 
@@ -662,6 +644,7 @@ extern void tcp_cleanup_congestion_control(struct sock *sk);
 extern int tcp_set_default_congestion_control(const char *name);
 extern void tcp_get_default_congestion_control(char *name);
 extern int tcp_set_congestion_control(struct sock *sk, const char *name);
+extern void tcp_slow_start(struct tcp_sock *tp);
 
 extern struct tcp_congestion_ops tcp_init_congestion_ops;
 extern u32 tcp_reno_ssthresh(struct sock *sk);
@@ -701,7 +684,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
  *	"Packets left network, but not honestly ACKed yet" PLUS
  *	"Packets fast retransmitted"
  */
-static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
+static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 {
 	return (tp->packets_out - tp->left_out + tp->retrans_out);
 }
@@ -721,33 +704,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 			    (tp->snd_cwnd >> 2)));
 }
 
-/*
- * Linear increase during slow start
- */
-static inline void tcp_slow_start(struct tcp_sock *tp)
-{
-	if (sysctl_tcp_abc) {
-		/* RFC3465: Slow Start
-		 * TCP sender SHOULD increase cwnd by the number of
-		 * previously unacknowledged bytes ACKed by each incoming
-		 * acknowledgment, provided the increase is not more than L
-		 */
-		if (tp->bytes_acked < tp->mss_cache)
-			return;
-
-		/* We MAY increase by 2 if discovered delayed ack */
-		if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
-			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-				tp->snd_cwnd++;
-		}
-	}
-	tp->bytes_acked = 0;
-
-	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-		tp->snd_cwnd++;
-}
-
-
 static inline void tcp_sync_left_out(struct tcp_sock *tp)
 {
 	if (tp->rx_opt.sack_ok &&
@@ -756,34 +712,7 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp)
 	tp->left_out = tp->sacked_out + tp->lost_out;
 }
 
-/* Set slow start threshold and cwnd not falling to slow start */
-static inline void __tcp_enter_cwr(struct sock *sk)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->undo_marker = 0;
-	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
-	tp->snd_cwnd = min(tp->snd_cwnd,
-			   tcp_packets_in_flight(tp) + 1U);
-	tp->snd_cwnd_cnt = 0;
-	tp->high_seq = tp->snd_nxt;
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-	TCP_ECN_queue_cwr(tp);
-}
-
-static inline void tcp_enter_cwr(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->prior_ssthresh = 0;
-	tp->bytes_acked = 0;
-	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
-		__tcp_enter_cwr(sk);
-		tcp_set_ca_state(sk, TCP_CA_CWR);
-	}
-}
-
+extern void tcp_enter_cwr(struct sock *sk);
 extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
 
 /* Slow start with delack produces 3 packets of burst, so that
@@ -815,14 +744,14 @@ static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
 		return left <= tcp_max_burst(tp);
 }
 
-static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, 
-					   const struct sk_buff *skb)
+static inline void tcp_minshall_update(struct tcp_sock *tp, int mss,
+				       const struct sk_buff *skb)
 {
 	if (skb->len < mss)
 		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
 }
 
-static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
+static inline void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	if (!tp->packets_out && !icsk->icsk_pending)
@@ -830,18 +759,18 @@ static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *t
 					  icsk->icsk_rto, TCP_RTO_MAX);
 }
 
-static __inline__ void tcp_push_pending_frames(struct sock *sk,
-					       struct tcp_sock *tp)
+static inline void tcp_push_pending_frames(struct sock *sk,
+					   struct tcp_sock *tp)
 {
 	__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle);
 }
 
-static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
+static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
 {
 	tp->snd_wl1 = seq;
 }
 
-static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
+static inline void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
 {
 	tp->snd_wl1 = seq;
 }
@@ -849,19 +778,19 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
 /*
  * Calculate(/check) TCP checksum
  */
-static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len,
-				   unsigned long saddr, unsigned long daddr, 
-				   unsigned long base)
+static inline u16 tcp_v4_check(struct tcphdr *th, int len,
+			       unsigned long saddr, unsigned long daddr, 
+			       unsigned long base)
 {
 	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
 }
 
-static __inline__ int __tcp_checksum_complete(struct sk_buff *skb)
+static inline int __tcp_checksum_complete(struct sk_buff *skb)
 {
 	return __skb_checksum_complete(skb);
 }
 
-static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
+static inline int tcp_checksum_complete(struct sk_buff *skb)
 {
 	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
 		__tcp_checksum_complete(skb);
@@ -869,7 +798,7 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb)
 
 /* Prequeue for VJ style copy to user, combined with checksumming. */
 
-static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
+static inline void tcp_prequeue_init(struct tcp_sock *tp)
 {
 	tp->ucopy.task = NULL;
 	tp->ucopy.len = 0;
@@ -885,7 +814,7 @@ static __inline__ void tcp_prequeue_init(struct tcp_sock *tp)
  *
  * NOTE: is this not too big to inline?
  */
-static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -926,7 +855,7 @@ static const char *statename[]={
 };
 #endif
 
-static __inline__ void tcp_set_state(struct sock *sk, int state)
+static inline void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
@@ -960,7 +889,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state)
 #endif	
 }
 
-static __inline__ void tcp_done(struct sock *sk)
+static inline void tcp_done(struct sock *sk)
 {
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
@@ -973,81 +902,13 @@ static __inline__ void tcp_done(struct sock *sk)
 		inet_csk_destroy_sock(sk);
 }
 
-static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt)
+static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
 {
 	rx_opt->dsack = 0;
 	rx_opt->eff_sacks = 0;
 	rx_opt->num_sacks = 0;
 }
 
-static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, __u32 tstamp)
-{
-	if (tp->rx_opt.tstamp_ok) {
-		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
-					  (TCPOPT_NOP << 16) |
-					  (TCPOPT_TIMESTAMP << 8) |
-					  TCPOLEN_TIMESTAMP);
-		*ptr++ = htonl(tstamp);
-		*ptr++ = htonl(tp->rx_opt.ts_recent);
-	}
-	if (tp->rx_opt.eff_sacks) {
-		struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
-		int this_sack;
-
-		*ptr++ = htonl((TCPOPT_NOP  << 24) |
-			       (TCPOPT_NOP  << 16) |
-			       (TCPOPT_SACK <<  8) |
-			       (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
-						     TCPOLEN_SACK_PERBLOCK)));
-		for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
-			*ptr++ = htonl(sp[this_sack].start_seq);
-			*ptr++ = htonl(sp[this_sack].end_seq);
-		}
-		if (tp->rx_opt.dsack) {
-			tp->rx_opt.dsack = 0;
-			tp->rx_opt.eff_sacks--;
-		}
-	}
-}
-
-/* Construct a tcp options header for a SYN or SYN_ACK packet.
- * If this is every changed make sure to change the definition of
- * MAX_SYN_SIZE to match the new maximum number of options that you
- * can generate.
- */
-static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
-					     int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent)
-{
-	/* We always get an MSS option.
-	 * The option bytes which will be seen in normal data
-	 * packets should timestamps be used, must be in the MSS
-	 * advertised.  But we subtract them from tp->mss_cache so
-	 * that calculations in tcp_sendmsg are simpler etc.
-	 * So account for this fact here if necessary.  If we
-	 * don't do this correctly, as a receiver we won't
-	 * recognize data packets as being full sized when we
-	 * should, and thus we won't abide by the delayed ACK
-	 * rules correctly.
-	 * SACKs don't matter, we never delay an ACK when we
-	 * have any of those going out.
-	 */
-	*ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
-	if (ts) {
-		if(sack)
-			*ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
-						  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
-		else
-			*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
-						  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
-		*ptr++ = htonl(tstamp);		/* TSVAL */
-		*ptr++ = htonl(ts_recent);	/* TSECR */
-	} else if(sack)
-		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
-					  (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
-	if (offer_wscale)
-		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
-}
-
 /* Determine a window scaling and initial window to offer. */
 extern void tcp_select_initial_window(int __space, __u32 mss,
 				      __u32 *rcv_wnd, __u32 *window_clamp,
@@ -1072,9 +933,9 @@ static inline int tcp_full_space(const struct sock *sk)
 	return tcp_win_from_space(sk->sk_rcvbuf); 
 }
 
-static __inline__ void tcp_openreq_init(struct request_sock *req,
-					struct tcp_options_received *rx_opt,
-					struct sk_buff *skb)
+static inline void tcp_openreq_init(struct request_sock *req,
+				    struct tcp_options_received *rx_opt,
+				    struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c7cc62c8dc1..e688c687d62 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 	return err;
 }
 
+
+/*
+ * Linear increase during slow start
+ */
+void tcp_slow_start(struct tcp_sock *tp)
+{
+	if (sysctl_tcp_abc) {
+		/* RFC3465: Slow Start
+		 * TCP sender SHOULD increase cwnd by the number of
+		 * previously unacknowledged bytes ACKed by each incoming
+		 * acknowledgment, provided the increase is not more than L
+		 */
+		if (tp->bytes_acked < tp->mss_cache)
+			return;
+
+		/* We MAY increase by 2 if discovered delayed ack */
+		if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+		}
+	}
+	tp->bytes_acked = 0;
+
+	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+		tp->snd_cwnd++;
+}
+EXPORT_SYMBOL_GPL(tcp_slow_start);
+
 /*
  * TCP Reno congestion control
  * This is special case used for fallback as well.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 981d1203b15..0a461232329 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1;
 /* Adapt the MSS value used to make delayed ack decision to the 
  * real world.
  */ 
-static inline void tcp_measure_rcv_mss(struct sock *sk,
-				       const struct sk_buff *skb)
+static void tcp_measure_rcv_mss(struct sock *sk,
+				const struct sk_buff *skb)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const unsigned int lss = icsk->icsk_ack.last_seg_size; 
@@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
 	return 0;
 }
 
-static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
-				   struct sk_buff *skb)
+static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+			    struct sk_buff *skb)
 {
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
@@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
 }
 
+
+/* Initialize RCV_MSS value.
+ * RCV_MSS is an our guess about MSS used by the peer.
+ * We haven't any direct information about the MSS.
+ * It's better to underestimate the RCV_MSS rather than overestimate.
+ * Overestimations make us ACKing less frequently than needed.
+ * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
+ */
+void tcp_initialize_rcv_mss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
+
+	hint = min(hint, tp->rcv_wnd/2);
+	hint = min(hint, TCP_MIN_RCVMSS);
+	hint = max(hint, TCP_MIN_MSS);
+
+	inet_csk(sk)->icsk_ack.rcv_mss = hint;
+}
+
 /* Receiver "autotuning" code.
  *
  * The algorithm for RTT estimation w/o timestamps is based on
@@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
+/* Set slow start threshold and cwnd not falling to slow start */
+void tcp_enter_cwr(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->prior_ssthresh = 0;
+	tp->bytes_acked = 0;
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+		tp->undo_marker = 0;
+		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+		tp->snd_cwnd = min(tp->snd_cwnd,
+				   tcp_packets_in_flight(tp) + 1U);
+		tp->snd_cwnd_cnt = 0;
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+		TCP_ECN_queue_cwr(tp);
+
+		tcp_set_ca_state(sk, TCP_CA_CWR);
+	}
+}
+
 /* Initialize metrics on socket. */
 
 static void tcp_init_metrics(struct sock *sk)
@@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
 		tcp_ack_no_tstamp(sk, seq_rtt, flag);
 }
 
-static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
-				  u32 in_flight, int good)
+static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
+			   u32 in_flight, int good)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
@@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
  * RFC2988 recommends to restart timer to now+rto.
  */
 
-static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
+static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
 {
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
@@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
 	return acked;
 }
 
-static inline u32 tcp_usrtt(const struct sk_buff *skb)
+static u32 tcp_usrtt(const struct sk_buff *skb)
 {
 	struct timeval tv, now;
 
@@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 /* Fast parse options. This hopes to only see timestamps.
  * If it is wrong it falls back on tcp_parse_options().
  */
-static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
-					 struct tcp_sock *tp)
+static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
+				  struct tcp_sock *tp)
 {
 	if (th->doff == sizeof(struct tcphdr)>>2) {
 		tp->rx_opt.saw_tstamp = 0;
@@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
 	}
 }
 
-static __inline__ int
-tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
 {
 	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
 		if (before(seq, sp->start_seq))
@@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
 	return 0;
 }
 
-static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
 {
 	if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
 		if (before(seq, tp->rcv_nxt))
@@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
 	}
 }
 
-static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
+static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
 {
 	if (!tp->rx_opt.dsack)
 		tcp_dsack_set(tp, seq, end_seq);
@@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
 	}
 }
 
-static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
 {
 	__u32 tmp;
 
@@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
+static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
 {
 	/* If the user specified a specific send buffer setting, do
 	 * not modify it.
@@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk)
 	sk->sk_write_space(sk);
 }
 
-static inline void tcp_check_space(struct sock *sk)
+static void tcp_check_space(struct sock *sk)
 {
 	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
 		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
@@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk)
 	}
 }
 
-static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
+static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
 {
 	tcp_push_pending_frames(sk, tp);
 	tcp_check_space(sk);
@@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	}
 }
 
-static __inline__ void tcp_ack_snd_check(struct sock *sk)
+static inline void tcp_ack_snd_check(struct sock *sk)
 {
 	if (!inet_csk_ack_scheduled(sk)) {
 		/* We sent a data segment already. */
@@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
 	return result;
 }
 
-static __inline__ int
-tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
 {
 	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
 		__tcp_checksum_complete_user(sk, skb);
@@ -4474,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc);
 EXPORT_SYMBOL(tcp_parse_options);
 EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9b62d80bb20..5c70493dff0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -270,8 +270,7 @@ failure:
 /*
  * This routine does path mtu discovery as defined in RFC1191.
  */
-static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
-				     u32 mtu)
+static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 {
 	struct dst_entry *dst;
 	struct inet_sock *inet = inet_sk(sk);
@@ -662,7 +661,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-static inline void syn_flood_warning(struct sk_buff *skb)
+static void syn_flood_warning(struct sk_buff *skb)
 {
 	static unsigned long warntime;
 
@@ -677,8 +676,8 @@ static inline void syn_flood_warning(struct sk_buff *skb)
 /*
  * Save and compile IPv4 options into the request_sock if needed.
  */
-static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
-						     struct sk_buff *skb)
+static struct ip_options *tcp_v4_save_options(struct sock *sk,
+					      struct sk_buff *skb)
 {
 	struct ip_options *opt = &(IPCB(skb)->opt);
 	struct ip_options *dopt = NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3a0a914de91..a7623ead39a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1;
  */
 int sysctl_tcp_tso_win_divisor = 3;
 
-static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
-				    struct sk_buff *skb)
+static void update_send_head(struct sock *sk, struct tcp_sock *tp,
+			     struct sk_buff *skb)
 {
 	sk->sk_send_head = skb->next;
 	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
@@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
 	tp->snd_cwnd_used = 0;
 }
 
-static inline void tcp_event_data_sent(struct tcp_sock *tp,
-				       struct sk_buff *skb, struct sock *sk)
+static void tcp_event_data_sent(struct tcp_sock *tp,
+				struct sk_buff *skb, struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const u32 now = tcp_time_stamp;
@@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
 		icsk->icsk_ack.pingpong = 1;
 }
 
-static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
 	tcp_dec_quickack_mode(sk, pkts);
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
@@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
  * value can be stuffed directly into th->window for an outgoing
  * frame.
  */
-static __inline__ u16 tcp_select_window(struct sock *sk)
+static u16 tcp_select_window(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 cur_win = tcp_receive_window(tp);
@@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
 	return new_win;
 }
 
+static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp,
+					 __u32 tstamp)
+{
+	if (tp->rx_opt.tstamp_ok) {
+		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) |
+					  (TCPOPT_NOP << 16) |
+					  (TCPOPT_TIMESTAMP << 8) |
+					  TCPOLEN_TIMESTAMP);
+		*ptr++ = htonl(tstamp);
+		*ptr++ = htonl(tp->rx_opt.ts_recent);
+	}
+	if (tp->rx_opt.eff_sacks) {
+		struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
+		int this_sack;
+
+		*ptr++ = htonl((TCPOPT_NOP  << 24) |
+			       (TCPOPT_NOP  << 16) |
+			       (TCPOPT_SACK <<  8) |
+			       (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
+						     TCPOLEN_SACK_PERBLOCK)));
+		for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
+			*ptr++ = htonl(sp[this_sack].start_seq);
+			*ptr++ = htonl(sp[this_sack].end_seq);
+		}
+		if (tp->rx_opt.dsack) {
+			tp->rx_opt.dsack = 0;
+			tp->rx_opt.eff_sacks--;
+		}
+	}
+}
+
+/* Construct a tcp options header for a SYN or SYN_ACK packet.
+ * If this is every changed make sure to change the definition of
+ * MAX_SYN_SIZE to match the new maximum number of options that you
+ * can generate.
+ */
+static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack,
+				  int offer_wscale, int wscale, __u32 tstamp,
+				  __u32 ts_recent)
+{
+	/* We always get an MSS option.
+	 * The option bytes which will be seen in normal data
+	 * packets should timestamps be used, must be in the MSS
+	 * advertised.  But we subtract them from tp->mss_cache so
+	 * that calculations in tcp_sendmsg are simpler etc.
+	 * So account for this fact here if necessary.  If we
+	 * don't do this correctly, as a receiver we won't
+	 * recognize data packets as being full sized when we
+	 * should, and thus we won't abide by the delayed ACK
+	 * rules correctly.
+	 * SACKs don't matter, we never delay an ACK when we
+	 * have any of those going out.
+	 */
+	*ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
+	if (ts) {
+		if(sack)
+			*ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
+						  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+		else
+			*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+						  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+		*ptr++ = htonl(tstamp);		/* TSVAL */
+		*ptr++ = htonl(ts_recent);	/* TSECR */
+	} else if(sack)
+		*ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+					  (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
+	if (offer_wscale)
+		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));
+}
 
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
@@ -724,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 
 /* Congestion window validation. (RFC2861) */
 
-static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 {
 	__u32 packets_out = tp->packets_out;
 
@@ -773,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
 /* This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
 	int tso_segs = tcp_skb_pcount(skb);
 
@@ -1794,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 /* 
  * Do all connect socket setups that can be done AF independent.
  */ 
-static inline void tcp_connect_init(struct sock *sk)
+static void tcp_connect_init(struct sock *sk)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-- 
cgit v1.2.3


From 6742bbcbb8a0959e1dff0ce055768e3217d9967a Mon Sep 17 00:00:00 2001
From: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Date: Wed, 4 Jan 2006 01:45:17 -0200
Subject: [DCCP] ackvec: Fix spelling of "throw"

Signed-off-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ackvec.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index c9a62cca22f..a979f4e5041 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -291,7 +291,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av)
 }
 #endif
 
-static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av)
+static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av)
 {
 	/*
 	 * As we're keeping track of the ack vector size (dccpav_vec_len) and
@@ -326,7 +326,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
 			      debug_prefix, 1,
 			      (unsigned long long)av->dccpav_ack_seqno,
 			      (unsigned long long)av->dccpav_ack_ackno);
-		dccp_ackvec_trow_away_ack_record(av);
+		dccp_ackvec_throw_away_ack_record(av);
 		av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
 	}
 }
@@ -389,7 +389,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
 					      av->dccpav_ack_seqno,
 					      (unsigned long long)
 					      av->dccpav_ack_ackno);
-				dccp_ackvec_trow_away_ack_record(av);
+				dccp_ackvec_throw_away_ack_record(av);
 			}
 			/*
 			 * If dccpav_ack_seqno was not received, no problem
-- 
cgit v1.2.3


From e4dfd449c80a41bb615b23d0fc198ba08360a1f8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Wed, 4 Jan 2006 01:46:34 -0200
Subject: [DCCP] ackvec: use u8 for the buf offsets

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ackvec.c | 27 +++++++++++++++++----------
 net/dccp/ackvec.h | 12 ++++++------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index a979f4e5041..ce9cb77c5c2 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	from = av->dccpav_buf + av->dccpav_buf_head;
 
 	/* Check if buf_head wraps */
-	if (av->dccpav_buf_head + len > av->dccpav_vec_len) {
-		const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head);
+	if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) {
+		const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head;
 
 		memcpy(to, from, tailsize);
 		to   += tailsize;
@@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
 				      const gfp_t priority)
 {
-	struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority);
+	struct dccp_ackvec *av;
 
+	BUG_ON(len == 0);
+
+	if (len > DCCP_MAX_ACKVEC_LEN)
+		return NULL;
+
+	av = kmalloc(sizeof(*av) + len, priority);
 	if (av != NULL) {
 		av->dccpav_buf_len	= len;
 		av->dccpav_buf_head	=
@@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
 }
 
 static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
-				   const unsigned int index)
+				   const u8 index)
 {
 	return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
 }
 
 static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
-				 const unsigned int index)
+				 const u8 index)
 {
 	return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
 }
@@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
  */
 static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 						 const unsigned int packets,
-						  const unsigned char state)
+						 const unsigned char state)
 {
 	unsigned int gap;
 	signed long new_head;
@@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 		 *	could reduce the complexity of this scan.)
 		 */
 		u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
-		unsigned int index = av->dccpav_buf_head;
+		u8 index = av->dccpav_buf_head;
 
 		while (1) {
 			const u8 len = dccp_ackvec_len(av, index);
@@ -301,9 +307,10 @@ static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av)
 	 * draft-ietf-dccp-spec-11.txt Appendix A. -acme
 	 */
 #if 0
-	av->dccpav_buf_tail = av->dccpav_ack_ptr + 1;
-	if (av->dccpav_buf_tail >= av->dccpav_vec_len)
-		av->dccpav_buf_tail -= av->dccpav_vec_len;
+	u32 new_buf_tail = av->dccpav_ack_ptr + 1;
+	if (new_buf_tail >= av->dccpav_vec_len)
+		new_buf_tail -= av->dccpav_vec_len;
+	av->dccpav_buf_tail = new_buf_tail;
 #endif
 	av->dccpav_vec_len -= av->dccpav_sent_len;
 }
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index d0fd6c60c57..f7dfb5f67b8 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -54,16 +54,16 @@
  * @dccpav_buf - circular buffer of acknowledgeable packets
  */
 struct dccp_ackvec {
-	unsigned int	dccpav_buf_head;
-	unsigned int	dccpav_buf_tail;
 	u64		dccpav_buf_ackno;
 	u64		dccpav_ack_seqno;
 	u64		dccpav_ack_ackno;
-	unsigned int	dccpav_ack_ptr;
-	unsigned int	dccpav_sent_len;
-	unsigned int	dccpav_vec_len;
-	unsigned int	dccpav_buf_len;
 	struct timeval	dccpav_time;
+	u8		dccpav_buf_head;
+	u8		dccpav_buf_tail;
+	u8		dccpav_ack_ptr;
+	u8		dccpav_sent_len;
+	u8		dccpav_vec_len;
+	u8		dccpav_buf_len;
 	u8		dccpav_buf_nonce;
 	u8		dccpav_ack_nonce;
 	u8		dccpav_buf[0];
-- 
cgit v1.2.3


From 80e40daa4797a156781d1594642b654eb1c461df Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Wed, 4 Jan 2006 01:58:06 -0200
Subject: [TCP]: syn_flood_warning is only needed if CONFIG_SYN_COOKIES is
 selected

  CC      net/ipv4/tcp_ipv4.o
  /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/tcp_ipv4.c:665: warning:
  'syn_flood_warning' defined but not used

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/ipv4/tcp_ipv4.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5c70493dff0..e9f83e5b28c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -661,6 +661,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
+#ifdef CONFIG_SYN_COOKIES
 static void syn_flood_warning(struct sk_buff *skb)
 {
 	static unsigned long warntime;
@@ -672,6 +673,7 @@ static void syn_flood_warning(struct sk_buff *skb)
 		       ntohs(skb->h.th->dest));
 	}
 }
+#endif
 
 /*
  * Save and compile IPv4 options into the request_sock if needed.
-- 
cgit v1.2.3


From f190055ff5c08a877d3e1ac2e0300fd92c264b06 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@mandriva.com>
Date: Wed, 4 Jan 2006 02:02:20 -0200
Subject: [IPVS]: Add missing include <linux/net.h>

  CC [M]  net/ipv4/ipvs/ip_vs_conn.o
  /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c: In
  function 'ip_vs_conn_new':
  /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c:606:
  warning: implicit declaration of function 'net_ratelimit'
  /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c: In
  function 'ip_vs_random_dropentry':
  /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c:810:
  warning: implicit declaration of function 'net_random'

Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/ipv4/ipvs/ip_vs_conn.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index ed5bd56a501..81d90354c92 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -25,6 +25,7 @@
  */
 
 #include <linux/in.h>
+#include <linux/net.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
-- 
cgit v1.2.3


From ca4033024858fcbd392465ba9cbf4c838aedfb58 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 4 Jan 2006 13:56:08 -0800
Subject: [ECONET]: Use macro for spinlock_t definition.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/econet/af_econet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 4fc245243cc..c792994d795 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -57,7 +57,7 @@ static struct net_device *net2dev_map[256];
 #define EC_PORT_IP	0xd2
 
 #ifdef CONFIG_ECONET_AUNUDP
-static spinlock_t aun_queue_lock;
+static DEFINE_SPINLOCK(aun_queue_lock);
 static struct socket *udpsock;
 #define AUN_PORT	0x8000
 
-- 
cgit v1.2.3


From 196433c5b788eb732fdcf92449274e302f089ce4 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 4 Jan 2006 13:56:31 -0800
Subject: [IPV6]: Use macro for rwlock_t initialization.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f829a4ad3cc..1cf305a9f8d 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
 
 	mc_lst->ifindex = dev->ifindex;
 	mc_lst->sfmode = MCAST_EXCLUDE;
-	mc_lst->sflock = RW_LOCK_UNLOCKED;
+	rwlock_init(&mc_lst->sflock);
 	mc_lst->sflist = NULL;
 
 	/*
-- 
cgit v1.2.3


From 181a46a56e9f852060c54247209e93740329b6eb Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 4 Jan 2006 13:56:54 -0800
Subject: [NETFILTER]: Use macro for spinlock_t/rwlock_t
 initializations/definition.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++--
 net/netfilter/nfnetlink_log.c           | 2 +-
 net/netfilter/nfnetlink_queue.c         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c2c52af9e56..f3e5ffbd592 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -98,7 +98,7 @@ struct nf_ct_frag6_queue
 #define FRAG6Q_HASHSZ	64
 
 static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
-static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(nf_ct_frag6_lock);
 static u32 nf_ct_frag6_hash_rnd;
 static LIST_HEAD(nf_ct_frag6_lru_list);
 int nf_ct_frag6_nqueues = 0;
@@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src,				   struct
 	init_timer(&fq->timer);
 	fq->timer.function = nf_ct_frag6_expire;
 	fq->timer.data = (long) fq;
-	fq->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&fq->lock);
 	atomic_set(&fq->refcnt, 1);
 
 	return nf_ct_frag6_intern(hash, fq);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cba63729313..e10512e229b 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid)
 		goto out_unlock;
 
 	INIT_HLIST_NODE(&inst->hlist);
-	inst->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&inst->lock);
 	/* needs to be two, since we _put() after creation */
 	atomic_set(&inst->use, 2);
 
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f28460b61e4..55afdda3d94 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid)
 	atomic_set(&inst->id_sequence, 0);
 	/* needs to be two, since we _put() after creation */
 	atomic_set(&inst->use, 2);
-	inst->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&inst->lock);
 	INIT_LIST_HEAD(&inst->queue_list);
 
 	if (!try_module_get(THIS_MODULE))
-- 
cgit v1.2.3


From 9369986306d4692f37b61302d4e1ce3054d8833e Mon Sep 17 00:00:00 2001
From: Kris Katterjohn <kjak@users.sourceforge.net>
Date: Wed, 4 Jan 2006 13:58:36 -0800
Subject: [NET]: More instruction checks fornet/core/filter.c

Signed-off-by: Kris Katterjohn <kjak@users.sourceforge.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 112 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 80 insertions(+), 32 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 3a10e0bc90e..8964d344558 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -13,6 +13,7 @@
  * 2 of the License, or (at your option) any later version.
  *
  * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter()
  */
 
 #include <linux/module.h>
@@ -250,7 +251,7 @@ load_b:
 			mem[fentry->k] = X;
 			continue;
 		default:
-			/* Invalid instruction counts as RET */
+			WARN_ON(1);
 			return 0;
 		}
 
@@ -283,8 +284,8 @@ load_b:
  *
  * Check the user's filter code. If we let some ugly
  * filter code slip through kaboom! The filter must contain
- * no references or jumps that are out of range, no illegal instructions
- * and no backward jumps. It must end with a RET instruction
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
  *
  * Returns 0 if the rule set is legal or a negative errno code if not.
  */
@@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 	for (pc = 0; pc < flen; pc++) {
 		/* all jumps are forward as they are not signed */
 		ftest = &filter[pc];
-		if (BPF_CLASS(ftest->code) == BPF_JMP) {
-			/* but they mustn't jump off the end */
-			if (BPF_OP(ftest->code) == BPF_JA) {
-				/*
-				 * Note, the large ftest->k might cause loops.
-				 * Compare this with conditional jumps below,
-				 * where offsets are limited. --ANK (981016)
-				 */
-				if (ftest->k >= (unsigned)(flen-pc-1))
-					return -EINVAL;
-			} else {
-				/* for conditionals both must be safe */
- 				if (pc + ftest->jt +1 >= flen ||
-				    pc + ftest->jf +1 >= flen)
-					return -EINVAL;
-			}
-		}
 
-		/* check for division by zero   -Kris Katterjohn 2005-10-30 */
-		if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0)
-			return -EINVAL;
+		/* Only allow valid instructions */
+		switch (ftest->code) {
+		case BPF_ALU|BPF_ADD|BPF_K:
+		case BPF_ALU|BPF_ADD|BPF_X:
+		case BPF_ALU|BPF_SUB|BPF_K:
+		case BPF_ALU|BPF_SUB|BPF_X:
+		case BPF_ALU|BPF_MUL|BPF_K:
+		case BPF_ALU|BPF_MUL|BPF_X:
+		case BPF_ALU|BPF_DIV|BPF_X:
+		case BPF_ALU|BPF_AND|BPF_K:
+		case BPF_ALU|BPF_AND|BPF_X:
+		case BPF_ALU|BPF_OR|BPF_K:
+		case BPF_ALU|BPF_OR|BPF_X:
+		case BPF_ALU|BPF_LSH|BPF_K:
+		case BPF_ALU|BPF_LSH|BPF_X:
+		case BPF_ALU|BPF_RSH|BPF_K:
+		case BPF_ALU|BPF_RSH|BPF_X:
+		case BPF_ALU|BPF_NEG:
+		case BPF_LD|BPF_W|BPF_ABS:
+		case BPF_LD|BPF_H|BPF_ABS:
+		case BPF_LD|BPF_B|BPF_ABS:
+		case BPF_LD|BPF_W|BPF_LEN:
+		case BPF_LD|BPF_W|BPF_IND:
+		case BPF_LD|BPF_H|BPF_IND:
+		case BPF_LD|BPF_B|BPF_IND:
+		case BPF_LD|BPF_IMM:
+		case BPF_LDX|BPF_W|BPF_LEN:
+		case BPF_LDX|BPF_B|BPF_MSH:
+		case BPF_LDX|BPF_IMM:
+		case BPF_MISC|BPF_TAX:
+		case BPF_MISC|BPF_TXA:
+		case BPF_RET|BPF_K:
+		case BPF_RET|BPF_A:
+			break;
+
+		/* Some instructions need special checks */
 
-		/* check that memory operations use valid addresses. */
-		if (ftest->k >= BPF_MEMWORDS) {
-			/* but it might not be a memory operation... */
-			switch (ftest->code) {
-			case BPF_ST:	
-			case BPF_STX:	
-			case BPF_LD|BPF_MEM:	
-			case BPF_LDX|BPF_MEM:	
+		case BPF_ALU|BPF_DIV|BPF_K:
+			/* check for division by zero */
+			if (ftest->k == 0)
 				return -EINVAL;
-			}
+			break;
+
+		case BPF_LD|BPF_MEM:
+		case BPF_LDX|BPF_MEM:
+		case BPF_ST:
+		case BPF_STX:
+			/* check for invalid memory addresses */
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			break;
+
+		case BPF_JMP|BPF_JA:
+			/*
+			 * Note, the large ftest->k might cause loops.
+			 * Compare this with conditional jumps below,
+			 * where offsets are limited. --ANK (981016)
+			 */
+			if (ftest->k >= (unsigned)(flen-pc-1))
+				return -EINVAL;
+			break;
+
+		case BPF_JMP|BPF_JEQ|BPF_K:
+		case BPF_JMP|BPF_JEQ|BPF_X:
+		case BPF_JMP|BPF_JGE|BPF_K:
+		case BPF_JMP|BPF_JGE|BPF_X:
+		case BPF_JMP|BPF_JGT|BPF_K:
+		case BPF_JMP|BPF_JGT|BPF_X:
+		case BPF_JMP|BPF_JSET|BPF_K:
+		case BPF_JMP|BPF_JSET|BPF_X:
+			/* for conditionals both must be safe */
+ 			if (pc + ftest->jt + 1 >= flen ||
+			    pc + ftest->jf + 1 >= flen)
+				return -EINVAL;
+			break;
+
+		default:
+			return -EINVAL;
 		}
 	}
 
-- 
cgit v1.2.3


From 74cb8798222bb7d1aecb0acb91e6eeedf5feb948 Mon Sep 17 00:00:00 2001
From: Thomas Young <tyo@ee.mu.oz.au>
Date: Wed, 4 Jan 2006 13:59:32 -0800
Subject: [TCP] tcp_vegas: Fix slow start

Vegas' slow start was only adding one MSS per RTT rather than one for
every ack. Slow start behavior should now match Reno.

Signed-off-by: Thomas Young <tyo@ee.mu.oz.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_vegas.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13e7e6e8df1..3b740349505 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
 		vegas->cntRTT = 0;
 		vegas->minRTT = 0x7fffffff;
 	}
+	/* Use normal slow start */
+	else if (tp->snd_cwnd <= tp->snd_ssthresh) 
+		tcp_slow_start(tp);
+	
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-- 
cgit v1.2.3