Merge branch 'master'

author: Jeff Garzik <jgarzik@pobox.com> 2005-12-12 15:24:45 -0500
committer: Jeff Garzik <jgarzik@pobox.com> 2005-12-12 15:24:45 -0500
commit: b1086eef813ecee09bd6b8ae364acf0fad065cba (patch)
tree: bc723bbdfc2898252e3fd8e14320d7fac58dca4b /net
parent: 003a20c81ec278595820d3829b544e90919f6f61 (diff)
parent: 49d7bc64283970ee83d2c954d04ba00d04e5943d (diff)
16 files changed, 283 insertions, 213 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 94e642ee6e2..1a59f2173f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1113,7 +1113,8 @@ out:
 void netdev_rx_csum_fault(struct net_device *dev)
 {
 	if (net_ratelimit()) {
-		printk(KERN_ERR "%s: hw csum failure.\n", dev->name);
+		printk(KERN_ERR "%s: hw csum failure.\n", 
+			dev ? dev->name : "<unknown>");
 		dump_stack();
 	}
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b7d13a4fff4..83fee37de38 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1725,7 +1725,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
  * of the skb if any page alloc fails user this procedure returns  -ENOMEM
  */
 int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
-			int getfrag(void *from, char *to, int offset,
+			int (*getfrag)(void *from, char *to, int offset,
 					int len, int odd, struct sk_buff *skb),
 			void *from, int length)
 {
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index f89e55f814d..d402e9020c6 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -153,6 +153,7 @@ static struct proto_ops dn_proto_ops;
 static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
+static atomic_t decnet_memory_allocated;
 
 static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags);
 static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
@@ -446,10 +447,26 @@ static void dn_destruct(struct sock *sk)
 	dst_release(xchg(&sk->sk_dst_cache, NULL));
 }
 
+static int dn_memory_pressure;
+
+static void dn_enter_memory_pressure(void)
+{
+	if (!dn_memory_pressure) {
+		dn_memory_pressure = 1;
+	}
+}
+
 static struct proto dn_proto = {
-	.name	  = "DECNET",
-	.owner	  = THIS_MODULE,
-	.obj_size = sizeof(struct dn_sock),
+	.name			= "NSP",
+	.owner			= THIS_MODULE,
+	.enter_memory_pressure	= dn_enter_memory_pressure,
+	.memory_pressure	= &dn_memory_pressure,
+	.memory_allocated	= &decnet_memory_allocated,
+	.sysctl_mem		= sysctl_decnet_mem,
+	.sysctl_wmem		= sysctl_decnet_wmem,
+	.sysctl_rmem		= sysctl_decnet_rmem,
+	.max_header		= DN_MAX_NSP_DATA_HEADER + 64,
+	.obj_size		= sizeof(struct dn_sock),
 };
 
 static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp)
@@ -470,6 +487,8 @@ static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp)
 	sk->sk_family      = PF_DECnet;
 	sk->sk_protocol    = 0;
 	sk->sk_allocation  = gfp;
+	sk->sk_sndbuf	   = sysctl_decnet_wmem[1];
+	sk->sk_rcvbuf	   = sysctl_decnet_rmem[1];
 
 	/* Initialization of DECnet Session Control Port		*/
 	scp = DN_SK(sk);
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 02bca49cb50..0e9d2c57116 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -10,6 +10,7 @@
  *
  * Changes:
  * Steve Whitehouse - C99 changes and default device handling
+ * Steve Whitehouse - Memory buffer settings, like the tcp ones
  *
  */
 #include <linux/config.h>
@@ -37,6 +38,11 @@ int decnet_dr_count = 3;
 int decnet_log_martians = 1;
 int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
 
+/* Reasonable defaults, I hope, based on tcp's defaults */
+int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
+int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
+int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
+
 #ifdef CONFIG_SYSCTL
 extern int decnet_dst_gc_interval;
 static int min_decnet_time_wait[] = { 5 };
@@ -428,6 +434,33 @@ static ctl_table dn_table[] = {
 		.extra1 = &min_decnet_no_fc_max_cwnd,
 		.extra2 = &max_decnet_no_fc_max_cwnd
 	},
+       {
+                .ctl_name = NET_DECNET_MEM,
+                .procname = "decnet_mem",
+                .data = &sysctl_decnet_mem,
+                .maxlen = sizeof(sysctl_decnet_mem),
+                .mode = 0644,
+                .proc_handler = &proc_dointvec,
+                .strategy = &sysctl_intvec,
+        },
+        {
+                .ctl_name = NET_DECNET_RMEM,
+                .procname = "decnet_rmem",
+                .data = &sysctl_decnet_rmem,
+                .maxlen = sizeof(sysctl_decnet_rmem),
+                .mode = 0644,
+                .proc_handler = &proc_dointvec,
+                .strategy = &sysctl_intvec,
+        },
+        {
+                .ctl_name = NET_DECNET_WMEM,
+                .procname = "decnet_wmem",
+                .data = &sysctl_decnet_wmem,
+                .maxlen = sizeof(sysctl_decnet_wmem),
+                .mode = 0644,
+                .proc_handler = &proc_dointvec,
+                .strategy = &sysctl_intvec,
+        },
 	{
 		.ctl_name = NET_DECNET_DEBUG_LEVEL,
 		.procname = "debug",
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 0bc00528d88..88a60650e6b 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK
 	  instead of the individual packets.
 	
 config IP_NF_CONNTRACK_EVENTS
-	bool "Connection tracking events"
-	depends on IP_NF_CONNTRACK
+	bool "Connection tracking events (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && IP_NF_CONNTRACK
 	help
 	  If this option is enabled, the connection tracking code will
 	  provide a notifier chain that can be used by other kernel code
@@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS
 	  IF unsure, say `N'.
 
 config IP_NF_CONNTRACK_NETLINK
-	tristate 'Connection tracking netlink interface'
-	depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+	tristate 'Connection tracking netlink interface (EXPERIMENTAL)'
+	depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK
 	depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m
 	help
 	  This option enables support for a netlink-based userspace interface
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 7a4ecddd597..84c66dbfeda 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data)
 	return 1;
 }
 
+void ip_conntrack_flush(void)
+{
+	ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
 {
 	if (vmalloced)
@@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
 			   get_order(sizeof(struct list_head) * size));
 }
 
-void ip_conntrack_flush(void)
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
 {
+	ip_ct_attach = NULL;
+
 	/* This makes sure all current packets have passed through
            netfilter framework.  Roll on, two-stage module
            delete... */
@@ -1363,7 +1372,7 @@ void ip_conntrack_flush(void)
 
 	ip_ct_event_cache_flush();
  i_see_dead_people:
-	ip_ct_iterate_cleanup(kill_all, NULL);
+	ip_conntrack_flush();
 	if (atomic_read(&ip_conntrack_count) != 0) {
 		schedule();
 		goto i_see_dead_people;
@@ -1371,14 +1380,7 @@ void ip_conntrack_flush(void)
 	/* wait until all references to ip_conntrack_untracked are dropped */
 	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
 		schedule();
-}
 
-/* Mishearing the voices in his head, our hero wonders how he's
-   supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
-{
-	ip_ct_attach = NULL;
-	ip_conntrack_flush();
 	kmem_cache_destroy(ip_conntrack_cachep);
 	kmem_cache_destroy(ip_conntrack_expect_cachep);
 	free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 3fce91bcc0b..91fe8f2e38f 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -503,7 +503,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
 }
 
 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
-	[CTA_PROTO_NUM-1]	= sizeof(u_int16_t),
+	[CTA_PROTO_NUM-1]	= sizeof(u_int8_t),
 	[CTA_PROTO_SRC_PORT-1]	= sizeof(u_int16_t),
 	[CTA_PROTO_DST_PORT-1]	= sizeof(u_int16_t),
 	[CTA_PROTO_ICMP_TYPE-1]	= sizeof(u_int8_t),
@@ -528,7 +528,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr,
 
 	if (!tb[CTA_PROTO_NUM-1])
 		return -EINVAL;
-	tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+	tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
 
 	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
 
@@ -728,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
 			return -ENOENT;
 		}
 	}	
-	if (del_timer(&ct->timeout)) {
-		ip_conntrack_put(ct);
+	if (del_timer(&ct->timeout))
 		ct->timeout.function((unsigned long)ct);
-		return 0;
-	}
+
 	ip_conntrack_put(ct);
 	DEBUGP("leaving\n");
 
@@ -877,7 +875,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
 		DEBUGP("NAT status: %lu\n", 
 		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
 		
-		if (ip_nat_initialized(ct, hooknum))
+		if (ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
 			return -EEXIST;
 		ip_nat_setup_info(ct, &range, hooknum);
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index aeb7353d477..e7fa29e576d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -341,9 +341,10 @@ static int tcp_print_conntrack(struct seq_file *s,
 static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
 			 const struct ip_conntrack *ct)
 {
-	struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
+	struct nfattr *nest_parms;
 	
 	read_lock_bh(&tcp_lock);
+	nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
 	NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
 		&ct->proto.tcp.state);
 	read_unlock_bh(&tcp_lock);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 029c70dfb58..b7325e0b406 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
 {
-	if (skb != NULL) {
-		const struct inet_connection_sock *icsk = inet_csk(sk);
-		struct inet_sock *inet = inet_sk(sk);
-		struct tcp_sock *tp = tcp_sk(sk);
-		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-		int tcp_header_size = tp->tcp_header_len;
-		struct tcphdr *th;
-		int sysctl_flags;
-		int err;
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet;
+	struct tcp_sock *tp;
+	struct tcp_skb_cb *tcb;
+	int tcp_header_size;
+	struct tcphdr *th;
+	int sysctl_flags;
+	int err;
+
+	BUG_ON(!skb || !tcp_skb_pcount(skb));
+
+	/* If congestion control is doing timestamping, we must
+	 * take such a timestamp before we potentially clone/copy.
+	 */
+	if (icsk->icsk_ca_ops->rtt_sample)
+		__net_timestamp(skb);
+
+	if (likely(clone_it)) {
+		if (unlikely(skb_cloned(skb)))
+			skb = pskb_copy(skb, gfp_mask);
+		else
+			skb = skb_clone(skb, gfp_mask);
+		if (unlikely(!skb))
+			return -ENOBUFS;
+	}
 
-		BUG_ON(!tcp_skb_pcount(skb));
+	inet = inet_sk(sk);
+	tp = tcp_sk(sk);
+	tcb = TCP_SKB_CB(skb);
+	tcp_header_size = tp->tcp_header_len;
 
 #define SYSCTL_FLAG_TSTAMPS	0x1
 #define SYSCTL_FLAG_WSCALE	0x2
 #define SYSCTL_FLAG_SACK	0x4
 
-		/* If congestion control is doing timestamping */
-		if (icsk->icsk_ca_ops->rtt_sample)
-			__net_timestamp(skb);
-
-		sysctl_flags = 0;
-		if (tcb->flags & TCPCB_FLAG_SYN) {
-			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
-			if(sysctl_tcp_timestamps) {
-				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
-				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
-			}
-			if(sysctl_tcp_window_scaling) {
-				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
-				sysctl_flags |= SYSCTL_FLAG_WSCALE;
-			}
-			if(sysctl_tcp_sack) {
-				sysctl_flags |= SYSCTL_FLAG_SACK;
-				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
-					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
-			}
-		} else if (tp->rx_opt.eff_sacks) {
-			/* A SACK is 2 pad bytes, a 2 byte header, plus
-			 * 2 32-bit sequence numbers for each SACK block.
-			 */
-			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
-					    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+	sysctl_flags = 0;
+	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+		tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+		if(sysctl_tcp_timestamps) {
+			tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+			sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
 		}
-		
-		if (tcp_packets_in_flight(tp) == 0)
-			tcp_ca_event(sk, CA_EVENT_TX_START);
-
-		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
-		skb->h.th = th;
-		skb_set_owner_w(skb, sk);
-
-		/* Build TCP header and checksum it. */
-		th->source		= inet->sport;
-		th->dest		= inet->dport;
-		th->seq			= htonl(tcb->seq);
-		th->ack_seq		= htonl(tp->rcv_nxt);
-		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
-		if (tcb->flags & TCPCB_FLAG_SYN) {
-			/* RFC1323: The window in SYN & SYN/ACK segments
-			 * is never scaled.
-			 */
-			th->window	= htons(tp->rcv_wnd);
-		} else {
-			th->window	= htons(tcp_select_window(sk));
+		if (sysctl_tcp_window_scaling) {
+			tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+			sysctl_flags |= SYSCTL_FLAG_WSCALE;
 		}
-		th->check		= 0;
-		th->urg_ptr		= 0;
-
-		if (tp->urg_mode &&
-		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
-			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
-			th->urg			= 1;
+		if (sysctl_tcp_sack) {
+			sysctl_flags |= SYSCTL_FLAG_SACK;
+			if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+				tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
 		}
+	} else if (unlikely(tp->rx_opt.eff_sacks)) {
+		/* A SACK is 2 pad bytes, a 2 byte header, plus
+		 * 2 32-bit sequence numbers for each SACK block.
+		 */
+		tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+				    (tp->rx_opt.eff_sacks *
+				     TCPOLEN_SACK_PERBLOCK));
+	}
+		
+	if (tcp_packets_in_flight(tp) == 0)
+		tcp_ca_event(sk, CA_EVENT_TX_START);
+
+	th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+	skb->h.th = th;
+	skb_set_owner_w(skb, sk);
+
+	/* Build TCP header and checksum it. */
+	th->source		= inet->sport;
+	th->dest		= inet->dport;
+	th->seq			= htonl(tcb->seq);
+	th->ack_seq		= htonl(tp->rcv_nxt);
+	*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
+					tcb->flags);
+
+	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+		/* RFC1323: The window in SYN & SYN/ACK segments
+		 * is never scaled.
+		 */
+		th->window	= htons(tp->rcv_wnd);
+	} else {
+		th->window	= htons(tcp_select_window(sk));
+	}
+	th->check		= 0;
+	th->urg_ptr		= 0;
 
-		if (tcb->flags & TCPCB_FLAG_SYN) {
-			tcp_syn_build_options((__u32 *)(th + 1),
-					      tcp_advertise_mss(sk),
-					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
-					      (sysctl_flags & SYSCTL_FLAG_SACK),
-					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
-					      tp->rx_opt.rcv_wscale,
-					      tcb->when,
-		      			      tp->rx_opt.ts_recent);
-		} else {
-			tcp_build_and_update_options((__u32 *)(th + 1),
-						     tp, tcb->when);
+	if (unlikely(tp->urg_mode &&
+		     between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
+		th->urg_ptr		= htons(tp->snd_up-tcb->seq);
+		th->urg			= 1;
+	}
 
-			TCP_ECN_send(sk, tp, skb, tcp_header_size);
-		}
-		tp->af_specific->send_check(sk, th, skb->len, skb);
+	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+		tcp_syn_build_options((__u32 *)(th + 1),
+				      tcp_advertise_mss(sk),
+				      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+				      (sysctl_flags & SYSCTL_FLAG_SACK),
+				      (sysctl_flags & SYSCTL_FLAG_WSCALE),
+				      tp->rx_opt.rcv_wscale,
+				      tcb->when,
+				      tp->rx_opt.ts_recent);
+	} else {
+		tcp_build_and_update_options((__u32 *)(th + 1),
+					     tp, tcb->when);
+		TCP_ECN_send(sk, tp, skb, tcp_header_size);
+	}
 
-		if (tcb->flags & TCPCB_FLAG_ACK)
-			tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+	tp->af_specific->send_check(sk, th, skb->len, skb);
 
-		if (skb->len != tcp_header_size)
-			tcp_event_data_sent(tp, skb, sk);
+	if (likely(tcb->flags & TCPCB_FLAG_ACK))
+		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
-		TCP_INC_STATS(TCP_MIB_OUTSEGS);
+	if (skb->len != tcp_header_size)
+		tcp_event_data_sent(tp, skb, sk);
 
-		err = tp->af_specific->queue_xmit(skb, 0);
-		if (err <= 0)
-			return err;
+	TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
-		tcp_enter_cwr(sk);
+	err = tp->af_specific->queue_xmit(skb, 0);
+	if (unlikely(err <= 0))
+		return err;
+
+	tcp_enter_cwr(sk);
+
+	/* NET_XMIT_CN is special. It does not guarantee,
+	 * that this packet is lost. It tells that device
+	 * is about to start to drop packets or already
+	 * drops some packets of the same priority and
+	 * invokes us to send less aggressively.
+	 */
+	return err == NET_XMIT_CN ? 0 : err;
 
-		/* NET_XMIT_CN is special. It does not guarantee,
-		 * that this packet is lost. It tells that device
-		 * is about to start to drop packets or already
-		 * drops some packets of the same priority and
-		 * invokes us to send less aggressively.
-		 */
-		return err == NET_XMIT_CN ? 0 : err;
-	}
-	return -ENOBUFS;
 #undef SYSCTL_FLAG_TSTAMPS
 #undef SYSCTL_FLAG_WSCALE
 #undef SYSCTL_FLAG_SACK
@@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-		if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
 			break;
 
 		/* Advance the send_head.  This one is sent out.
@@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
 		/* Send it out now. */
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-		if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
 			update_send_head(sk, tp, skb);
 			tcp_cwnd_validate(sk, tp);
 			return;
@@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
-				    pskb_copy(skb, GFP_ATOMIC):
-				    skb_clone(skb, GFP_ATOMIC)));
+	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 
 	if (err == 0) {
 		/* Update global TCP statistics. */
@@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	if (tcp_transmit_skb(sk, skb))
+	if (tcp_transmit_skb(sk, skb, 0, priority))
 		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
 }
 
@@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk)
 		TCP_ECN_send_synack(tcp_sk(sk), skb);
 	}
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
 
 /*
@@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk)
 	__skb_queue_tail(&sk->sk_write_queue, buff);
 	sk_charge_skb(sk, buff);
 	tp->packets_out += tcp_skb_pcount(buff);
-	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+	tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
 	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the SYN until an answer. */
@@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk)
 		/* Send it off, this clears delayed acks for us. */
 		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
 		TCP_SKB_CB(buff)->when = tcp_time_stamp;
-		tcp_transmit_skb(sk, buff);
+		tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
 	}
 }
 
@@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	return tcp_transmit_skb(sk, skb);
+	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
 }
 
 int tcp_write_wakeup(struct sock *sk)
@@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk)
 
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+			err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 			if (!err) {
 				update_send_head(sk, tp, skb);
 			}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index b7d296a8ac6..13e7e6e8df1 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
 		vegas->beg_snd_nxt  = tp->snd_nxt;
 		vegas->beg_snd_cwnd = tp->snd_cwnd;
 
-		/* Take into account the current RTT sample too, to
-		 * decrease the impact of delayed acks. This double counts
-		 * this sample since we count it for the next window as well,
-		 * but that's not too awful, since we're taking the min,
-		 * rather than averaging.
-		 */
-		tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
-
 		/* We do the Vegas calculations only if we got enough RTT
 		 * samples that we can be reasonably sure that we got
 		 * at least one RTT sample that wasn't from a delayed ACK.
@@ -333,11 +325,11 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
 			else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
 				tp->snd_cwnd = tp->snd_cwnd_clamp;
 		}
-	}
 
-	/* Wipe the slate clean for the next RTT. */
-	vegas->cntRTT = 0;
-	vegas->minRTT = 0x7fffffff;
+		/* Wipe the slate clean for the next RTT. */
+		vegas->cntRTT = 0;
+		vegas->minRTT = 0x7fffffff;
+	}
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 40d9a1935ab..8bfbe997079 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -248,7 +248,7 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu)
 	if (esp->conf.padlen)
 		mtu = ALIGN(mtu, esp->conf.padlen);
 
-	return mtu + x->props.header_len + esp->auth.icv_full_len;
+	return mtu + x->props.header_len + esp->auth.icv_trunc_len;
 }
 
 static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index c0f1da5497a..a7e03cfacd0 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -68,8 +68,8 @@ static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 		[ICMPV6_NI_REPLY - 128]		= ICMPV6_NI_REPLY +1
 	};
 
-	__u8 type = orig->dst.u.icmp.type - 128;
-	if (type >= sizeof(invmap) || !invmap[type])
+	int type = orig->dst.u.icmp.type - 128;
+	if (type < 0 || type >= sizeof(invmap) || !invmap[type])
 		return 0;
 
 	tuple->src.u.icmp.id   = orig->src.u.icmp.id;
@@ -129,12 +129,12 @@ static int icmpv6_new(struct nf_conn *conntrack,
 		[ICMPV6_ECHO_REQUEST - 128] = 1,
 		[ICMPV6_NI_QUERY - 128] = 1
 	};
+	int type = conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128;
 
-	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new)
-	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) {
+	if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
 		/* Can't create a new ICMPv6 `conn' with this. */
-		DEBUGP("icmp: can't create new conn with type %u\n",
-		       conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+		DEBUGP("icmpv6: can't create new conn with type %u\n",
+		       type + 128);
 		NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
 		return 0;
 	}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index a84f9221e5f..794c41d19b2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -61,8 +61,8 @@ config NF_CONNTRACK_MARK
 	  instead of the individual packets.
 
 config NF_CONNTRACK_EVENTS
-	bool "Connection tracking events"
-	depends on NF_CONNTRACK
+	bool "Connection tracking events (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && NF_CONNTRACK
 	help
 	  If this option is enabled, the connection tracking code will
 	  provide a notifier chain that can be used by other kernel code
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 1da678303d7..a7c7b490cf2 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1383,6 +1383,9 @@ void nf_conntrack_cleanup(void)
 		schedule();
 		goto i_see_dead_people;
 	}
+	/* wait until all references to nf_conntrack_untracked are dropped */
+	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+		schedule();
 
 	for (i = 0; i < NF_CT_F_NUM; i++) {
 		if (nf_ct_cache[i].use == 0)
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a60c59b9763..95fdf04f1d8 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -162,7 +162,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
 		return -EINVAL;
 	}
 
-	min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg));
+	min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
 	if (unlikely(nlh->nlmsg_len < min_len))
 		return -EINVAL;
 
@@ -236,8 +236,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
 	}
 
 	/* All the messages must at least contain nfgenmsg */
-	if (nlh->nlmsg_len < 
-			NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) {
+	if (nlh->nlmsg_len < NLMSG_SPACE(sizeof(struct nfgenmsg))) {
 		DEBUGP("received message was too short\n");
 		return 0;
 	}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 499ae3df4a4..3e246276041 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1587,23 +1587,47 @@ static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
 	return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
 }
 
-static void free_pg_vec(char **pg_vec, unsigned order, unsigned len)
+static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
 {
 	int i;
 
-	for (i=0; i<len; i++) {
-		if (pg_vec[i]) {
-			struct page *page, *pend;
-
-			pend = pg_vec_endpage(pg_vec[i], order);
-			for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
-				ClearPageReserved(page);
-			free_pages((unsigned long)pg_vec[i], order);
-		}
+	for (i = 0; i < len; i++) {
+		if (likely(pg_vec[i]))
+			free_pages((unsigned long) pg_vec[i], order);
 	}
 	kfree(pg_vec);
 }
 
+static inline char *alloc_one_pg_vec_page(unsigned long order)
+{
+	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+					 order);
+}
+
+static char **alloc_pg_vec(struct tpacket_req *req, int order)
+{
+	unsigned int block_nr = req->tp_block_nr;
+	char **pg_vec;
+	int i;
+
+	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	if (unlikely(!pg_vec))
+		goto out;
+
+	for (i = 0; i < block_nr; i++) {
+		pg_vec[i] = alloc_one_pg_vec_page(order);
+		if (unlikely(!pg_vec[i]))
+			goto out_free_pgvec;
+	}
+
+out:
+	return pg_vec;
+
+out_free_pgvec:
+	free_pg_vec(pg_vec, order, block_nr);
+	pg_vec = NULL;
+	goto out;
+}
 
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
 {
@@ -1617,64 +1641,46 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 
 		/* Sanity tests and some calculations */
 
-		if (po->pg_vec)
+		if (unlikely(po->pg_vec))
 			return -EBUSY;
 
-		if ((int)req->tp_block_size <= 0)
+		if (unlikely((int)req->tp_block_size <= 0))
 			return -EINVAL;
-		if (req->tp_block_size&(PAGE_SIZE-1))
+		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
 			return -EINVAL;
-		if (req->tp_frame_size < TPACKET_HDRLEN)
+		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
 			return -EINVAL;
-		if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
+		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
 			return -EINVAL;
 
 		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
-		if (po->frames_per_block <= 0)
+		if (unlikely(po->frames_per_block <= 0))
 			return -EINVAL;
-		if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
+		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
+			     req->tp_frame_nr))
 			return -EINVAL;
-		/* OK! */
-
-		/* Allocate page vector */
-		while ((PAGE_SIZE<<order) < req->tp_block_size)
-			order++;
 
 		err = -ENOMEM;
-
-		pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL);
-		if (pg_vec == NULL)
+		order = get_order(req->tp_block_size);
+		pg_vec = alloc_pg_vec(req, order);
+		if (unlikely(!pg_vec))
 			goto out;
-		memset(pg_vec, 0, req->tp_block_nr*sizeof(char **));
-
-		for (i=0; i<req->tp_block_nr; i++) {
-			struct page *page, *pend;
-			pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order);
-			if (!pg_vec[i])
-				goto out_free_pgvec;
-
-			pend = pg_vec_endpage(pg_vec[i], order);
-			for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
-				SetPageReserved(page);
-		}
-		/* Page vector is allocated */
 
 		l = 0;
-		for (i=0; i<req->tp_block_nr; i++) {
+		for (i = 0; i < req->tp_block_nr; i++) {
 			char *ptr = pg_vec[i];
 			struct tpacket_hdr *header;
 			int k;
 
-			for (k=0; k<po->frames_per_block; k++) {
-				
-				header = (struct tpacket_hdr*)ptr;
+			for (k = 0; k < po->frames_per_block; k++) {
+				header = (struct tpacket_hdr *) ptr;
 				header->tp_status = TP_STATUS_KERNEL;
 				ptr += req->tp_frame_size;
 			}
 		}
 		/* Done */
 	} else {
-		if (req->tp_frame_nr)
+		if (unlikely(req->tp_frame_nr))
 			return -EINVAL;
 	}
 
@@ -1701,7 +1707,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 
 		spin_lock_bh(&sk->sk_receive_queue.lock);
 		pg_vec = XC(po->pg_vec, pg_vec);
-		po->frame_max = req->tp_frame_nr-1;
+		po->frame_max = (req->tp_frame_nr - 1);
 		po->head = 0;
 		po->frame_size = req->tp_frame_size;
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -1728,7 +1734,6 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 
 	release_sock(sk);
 
-out_free_pgvec:
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
 out:
@@ -1755,17 +1760,19 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
 	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
 		goto out;
 
-	atomic_inc(&po->mapped);
 	start = vma->vm_start;
-	err = -EAGAIN;
-	for (i=0; i<po->pg_vec_len; i++) {
-		if (remap_pfn_range(vma, start,
-				     __pa(po->pg_vec[i]) >> PAGE_SHIFT,
-				     po->pg_vec_pages*PAGE_SIZE,
-				     vma->vm_page_prot))
-			goto out;
-		start += po->pg_vec_pages*PAGE_SIZE;
+	for (i = 0; i < po->pg_vec_len; i++) {
+		struct page *page = virt_to_page(po->pg_vec[i]);
+		int pg_num;
+
+		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
+			err = vm_insert_page(vma, start, page);
+			if (unlikely(err))
+				goto out;
+			start += PAGE_SIZE;
+		}
 	}
+	atomic_inc(&po->mapped);
 	vma->vm_ops = &packet_mmap_ops;
 	err = 0;
author	Jeff Garzik <jgarzik@pobox.com>	2005-12-12 15:24:45 -0500
committer	Jeff Garzik <jgarzik@pobox.com>	2005-12-12 15:24:45 -0500
commit	b1086eef813ecee09bd6b8ae364acf0fad065cba (patch)
tree	bc723bbdfc2898252e3fd8e14320d7fac58dca4b /net
parent	003a20c81ec278595820d3829b544e90919f6f61 (diff)
parent	49d7bc64283970ee83d2c954d04ba00d04e5943d (diff)