From 3428c209c6820bbbb7dfb323caef8d402b3deb4c Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 3 Nov 2005 14:27:07 +0100 Subject: [NETFILTER] PPTP helper: Fix compilation of conntrack helper without NAT This patch fixes compilation of the PPTP conntrack helper when NAT is configured off. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/netfilter/ip_conntrack_helper_pptp.c | 4 ---- net/ipv4/netfilter/ip_nat_helper_pptp.c | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 926a6684643..4108a5e12b3 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c @@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master, exp_orig->expectfn = pptp_expectfn; exp_orig->flags = 0; - exp_orig->dir = IP_CT_DIR_ORIGINAL; - /* both expectations are identical apart from tuple */ memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple)); - exp_reply->dir = !exp_orig->dir; - if (ip_nat_pptp_hook_exp_gre) ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply); else { diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index 3cdd0684d30..ee6ab74ad3a 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c @@ -216,6 +216,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); + expect_orig->dir = IP_CT_DIR_ORIGINAL; inv_t.src.ip = reply_t->src.ip; inv_t.dst.ip = reply_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id); @@ -233,6 +234,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id); expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id); expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id); + expect_reply->dir = IP_CT_DIR_REPLY; inv_t.src.ip = orig_t->src.ip; inv_t.dst.ip = orig_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id); -- cgit v1.2.3 From d811552eda2476215d69d485e437d2dcae1ab0b4 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 3 Nov 2005 13:05:20 +0100 Subject: [NETFILTER] PPTP helper: Fix endianness bug in GRE key / CallID NAT This endianness bug slipped through while changing the 'gre.key' field in the conntrack tuple from 32bit to 16bit. None of my tests caught the problem, since the linux pptp client always has '0' as call id / gre key. Only windows clients actually trigger the bug. Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/netfilter/ip_nat_proto_gre.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c index 7c128540167..f7cad7cf1ae 100644 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ b/net/ipv4/netfilter/ip_nat_proto_gre.c @@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb, break; case GRE_VERSION_PPTP: DEBUGP("call_id -> 0x%04x\n", - ntohl(tuple->dst.u.gre.key)); - pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key)); + ntohs(tuple->dst.u.gre.key)); + pgreh->call_id = tuple->dst.u.gre.key; break; default: DEBUGP("can't nat unknown GRE version\n"); -- cgit v1.2.3 From d2a7bb7141a1fac7b11523538b2d2407e928baeb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 3 Nov 2005 20:17:51 +0100 Subject: [NETFILTER] NAT: Fix module refcount dropping too far The unknown protocol is used as a fallback when a protocol isn't known. Hence we cannot handle it failing, so don't set ".me". It's OK, since we only grab a reference from within the same module (iptable_nat.ko), so we never take the module refcount from 0 to 1. Also, remove the "protocol is NULL" test: it's never NULL. Signed-off-by: Rusty Rusty Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/netfilter/ip_nat_core.c | 6 ++---- net/ipv4/netfilter/ip_nat_proto_unknown.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index c5e3abd2467..762f4d93936 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum) * removed until we've grabbed the reference */ preempt_disable(); p = __ip_nat_proto_find(protonum); - if (p) { - if (!try_module_get(p->me)) - p = &ip_nat_unknown_protocol; - } + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; preempt_enable(); return p; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index 99bbef56f84..f0099a646a0 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) struct ip_nat_protocol ip_nat_unknown_protocol = { .name = "unknown", - .me = THIS_MODULE, + /* .me isn't set: getting a ref to this cannot fail. */ .manip_pkt = unknown_manip_pkt, .in_range = unknown_in_range, .unique_tuple = unknown_unique_tuple, -- cgit v1.2.3 From 0f81eb4db4f1cc560318b6e7762a7a1d7d8c7095 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 3 Nov 2005 19:05:37 +0100 Subject: [NETFILTER]: Fix double free after netlink_unicast() in ctnetlink It's not necessary to free skb if netlink_unicast() failed. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/netfilter/ip_conntrack_netlink.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 166e6069f12..82a65043a8e 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -815,7 +815,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, IPCTNL_MSG_CT_NEW, 1, ct); ip_conntrack_put(ct); if (err <= 0) - goto out; + goto free; err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); if (err < 0) @@ -824,9 +824,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("leaving\n"); return 0; +free: + kfree_skb(skb2); out: - if (skb2) - kfree_skb(skb2); return -1; } @@ -1322,21 +1322,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, 1, exp); if (err <= 0) - goto out; + goto free; ip_conntrack_expect_put(exp); - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - if (err < 0) - goto free; - - return err; + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); +free: + kfree_skb(skb2); out: ip_conntrack_expect_put(exp); -free: - if (skb2) - kfree_skb(skb2); return err; } -- cgit v1.2.3 From 433a4d3b5456dec5bcca5a0f236bf622da1267b3 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 3 Nov 2005 19:25:56 +0100 Subject: [NETFILTER]: CONNMARK target needs ip_conntrack There's a missing dependency from the CONNMARK target to ip_conntrack. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/netfilter/ipt_CONNMARK.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 13463802133..05d66ab5942 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = { static int __init init(void) { + need_ip_conntrack(); return ipt_register_target(&ipt_connmark_reg); } -- cgit v1.2.3 From 6df716340da3a6fdd33d73d7ed4c6f7590ca1c42 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 3 Nov 2005 16:33:23 -0800 Subject: [TCP/DCCP]: Randomize port selection This patch randomizes the port selected on bind() for connections to help with possible security attacks. It should also be faster in most cases because there is no need for a global lock. Signed-off-by: Stephen Hemminger Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/inet_connection_sock.c | 14 +++----------- net/ipv4/tcp.c | 1 - net/ipv4/tcp_ipv4.c | 2 -- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 94468a76c5b..3fe021f1a56 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; - int rover; + int rover = net_random() % (high - low) + low; - spin_lock(&hashinfo->portalloc_lock); - if (hashinfo->port_rover < low) - rover = low; - else - rover = hashinfo->port_rover; do { - rover++; - if (rover > high) - rover = low; head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) @@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, break; next: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - hashinfo->port_rover = rover; - spin_unlock(&hashinfo->portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3f0013a958..72b7c22e1ea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2112,7 +2112,6 @@ void __init tcp_init(void) sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c85819d8474..49d67cd75ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), - .portalloc_lock = SPIN_LOCK_UNLOCKED, - .port_rover = 1024 - 1, }; static int tcp_v4_get_port(struct sock *sk, unsigned short snum) -- cgit v1.2.3 From dc8103f25fd7cfac2c2b295f33edc10f255b4c80 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 8 Nov 2005 09:40:05 -0800 Subject: [IPVS]: fix connection leak if expire_nodest_conn=1 There was a fix in 2.6.13 that changed the behaviour of ip_vs_conn_expire_now function not to put reference to connection, its callers should hold write lock or connection refcnt. But we forgot to convert one caller, when the real server for connection is unavailable caller should put the connection reference. It happens only when sysctl var expire_nodest_conn is set to 1 and such connections never expire. Thanks to Roberto Nibali who found the problem and tested a 2.4.32-rc2 patch, which is equal to this 2.6 version. Patch for 2.4 is already sent to Marcelo. Signed-off-by: Julian Anastasov Signed-off-by: Roberto Nibali Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 981cc3244ef..1a0843cd58a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1009,11 +1009,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, if (sysctl_ip_vs_expire_nodest_conn) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); - } else { - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); return NF_DROP; } -- cgit v1.2.3 From a51482bde22f99c63fbbb57d5d46cc666384e379 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 8 Nov 2005 09:41:34 -0800 Subject: [NET]: kfree cleanup From: Jesper Juhl This is the net/ part of the big kfree cleanup patch. Remove pointless checks for NULL prior to calling kfree() in net/. Signed-off-by: Jesper Juhl Cc: "David S. Miller" Cc: Arnaldo Carvalho de Melo Acked-by: Marcel Holtmann Acked-by: YOSHIFUJI Hideaki Signed-off-by: Andrew Morton --- net/ipv4/af_inet.c | 3 +-- net/ipv4/fib_frontend.c | 3 +-- net/ipv4/ip_options.c | 3 +-- net/ipv4/ip_output.c | 12 ++++-------- net/ipv4/ip_sockglue.c | 12 ++++-------- net/ipv4/ipvs/ip_vs_app.c | 6 ++---- net/ipv4/multipath_wrandom.c | 10 +++------- net/ipv4/netfilter/ip_nat_snmp_basic.c | 3 +-- net/ipv4/tcp_ipv4.c | 3 +-- 9 files changed, 18 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a9d84f93442..eaa150c33b0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -147,8 +147,7 @@ void inet_sock_destruct(struct sock *sk) BUG_TRAP(!sk->sk_wmem_queued); BUG_TRAP(!sk->sk_forward_alloc); - if (inet->opt) - kfree(inet->opt); + kfree(inet->opt); dst_release(sk->sk_dst_cache); sk_refcnt_debug_dec(sk); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 990633c09df..2267c1fad87 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -266,8 +266,7 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg) if (tb) err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); } - if (rta.rta_mx) - kfree(rta.rta_mx); + kfree(rta.rta_mx); } rtnl_unlock(); return err; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bce4e875193..dbe12da8d8b 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -510,8 +510,7 @@ static int ip_options_get_finish(struct ip_options **optp, kfree(opt); return -EINVAL; } - if (*optp) - kfree(*optp); + kfree(*optp); *optp = opt; return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 17758234a3e..df7f20da422 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1262,10 +1262,8 @@ int ip_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1289,10 +1287,8 @@ void ip_flush_pending_frames(struct sock *sk) kfree_skb(skb); inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2f0b47da5b3..4f2d8725730 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -202,8 +202,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s if (ra->sk == sk) { if (on) { write_unlock_bh(&ip_ra_lock); - if (new_ra) - kfree(new_ra); + kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; @@ -446,8 +445,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, #endif } opt = xchg(&inet->opt, opt); - if (opt) - kfree(opt); + kfree(opt); break; } case IP_PKTINFO: @@ -828,10 +826,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_mc_msfilter(sk, msf, ifindex); mc_msf_out: - if (msf) - kfree(msf); - if (gsf) - kfree(gsf); + kfree(msf); + kfree(gsf); break; } case IP_ROUTER_ALERT: diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index fc6f95aaa96..d7eb680101c 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -110,8 +110,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) return 0; out: - if (inc->timeout_table) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); return ret; } @@ -136,8 +135,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) list_del(&inc->a_list); - if (inc->timeout_table != NULL) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); } diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c index bd7d75b6abe..d34a9fa608e 100644 --- a/net/ipv4/multipath_wrandom.c +++ b/net/ipv4/multipath_wrandom.c @@ -207,16 +207,12 @@ static void wrandom_select_route(const struct flowi *flp, decision = mpc->rt; last_power = mpc->power; - if (last_mpc) - kfree(last_mpc); - + kfree(last_mpc); last_mpc = mpc; } - if (last_mpc) { - /* concurrent __multipath_flush may lead to !last_mpc */ - kfree(last_mpc); - } + /* concurrent __multipath_flush may lead to !last_mpc */ + kfree(last_mpc); decision->u.dst.__use++; *rp = decision; diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 93b2c5111bb..8acb7ed40b4 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1161,8 +1161,7 @@ static int snmp_parse_mangle(unsigned char *msg, if (!snmp_object_decode(&ctx, obj)) { if (*obj) { - if ((*obj)->id) - kfree((*obj)->id); + kfree((*obj)->id); kfree(*obj); } kfree(obj); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 49d67cd75ed..634dabb558f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -823,8 +823,7 @@ out: */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { - if (inet_rsk(req)->opt) - kfree(inet_rsk(req)->opt); + kfree(inet_rsk(req)->opt); } static inline void syn_flood_warning(struct sk_buff *skb) -- cgit v1.2.3 From 89f5f0aeed14ac7245f760b0b96c9269c87bcbbe Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 8 Nov 2005 09:41:56 -0800 Subject: [IPV4]: Fix ip_queue_xmit identity increment for TSO packets When ip_queue_xmit calls ip_select_ident_more for IP identity selection it gives it the wrong packet count for TSO packets. The ip_select_* functions expect one less than the number of packets, so we need to subtract one for TSO packets. This bug was diagnosed and fixed by Tom Young. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index df7f20da422..11c2f68254f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -353,7 +353,8 @@ packet_routed: ip_options_build(skb, opt, inet->daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + ip_select_ident_more(iph, &rt->u.dst, sk, + (skb_shinfo(skb)->tso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); -- cgit v1.2.3 From 46998f59c03ecbd7c2250810f35af6fe24868845 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 12:58:05 -0800 Subject: [NETFILTER]: packet counter of conntrack is 32bits The packet counter variable of conntrack was changed to 32bits from 64bits. This follows that change. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 82a65043a8e..431a446994f 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -175,7 +175,7 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, { enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nfattr *nest_count = NFA_NEST(skb, type); - u_int64_t tmp; + u_int32_t tmp; tmp = htonl(ct->counters[dir].packets); NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); -- cgit v1.2.3 From eaae4fa45e0f4cd1da0f00ae93551edb1002b2b9 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 12:58:46 -0800 Subject: [NETFILTER]: refcount leak of proto when ctnetlink dumping tuple Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 431a446994f..02f303cf201 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -58,14 +58,17 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct ip_conntrack_tuple *tuple) { struct ip_conntrack_protocol *proto; + int ret = 0; NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (proto && proto->tuple_to_nfattr) - return proto->tuple_to_nfattr(skb, tuple); + if (likely(proto && proto->tuple_to_nfattr)) { + ret = proto->tuple_to_nfattr(skb, tuple); + ip_conntrack_proto_put(proto); + } - return 0; + return ret; nfattr_failure: return -1; -- cgit v1.2.3 From a2506c04322ca266fe2f9bd7d02a67b1972da611 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 9 Nov 2005 12:59:13 -0800 Subject: [NETFILTER] nfnetlink: nfattr_parse() can never fail, make it void nfattr_parse (and thus nfattr_parse_nested) always returns success. So we can make them 'void' and remove all the checking at the caller side. Based on original patch by Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 45 +++++------------------------ net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 6 +--- 2 files changed, 9 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 02f303cf201..838262e1737 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -482,9 +482,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("entered %s\n", __FUNCTION__); - - if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_IP_MAX, attr); if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) return -EINVAL; @@ -500,9 +498,6 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } static const int cta_min_proto[CTA_PROTO_MAX] = { @@ -524,8 +519,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) return -EINVAL; @@ -542,9 +536,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, } return ret; - -nfattr_failure: - return -1; } static inline int @@ -558,8 +549,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, memset(tuple, 0, sizeof(*tuple)); - if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); if (!tb[CTA_TUPLE_IP-1]) return -EINVAL; @@ -586,9 +576,6 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #ifdef CONFIG_IP_NF_NAT_NEEDED @@ -606,11 +593,10 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - goto nfattr_failure; + return -1; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); if (!npt) @@ -629,9 +615,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } static inline int @@ -645,8 +628,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -668,9 +650,6 @@ ctnetlink_parse_nat(struct nfattr *cda[], DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #endif @@ -681,8 +660,7 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_HELP_MAX, attr); if (!tb[CTA_HELP_NAME-1]) return -EINVAL; @@ -690,9 +668,6 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); return 0; - -nfattr_failure: - return -1; } static int @@ -960,8 +935,7 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; int err = 0; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); proto = ip_conntrack_proto_find_get(npt); if (!proto) @@ -972,9 +946,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) ip_conntrack_proto_put(proto); return err; - -nfattr_failure: - return -ENOMEM; } static int diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index d6701cafbcc..6ea4b22ff28 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -362,8 +362,7 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; @@ -374,9 +373,6 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) write_unlock_bh(&tcp_lock); return 0; - -nfattr_failure: - return -1; } #endif -- cgit v1.2.3 From 51df784ed739246a3774b300e5f536e17bec36ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 12:59:41 -0800 Subject: [NETFILTER] ctnetlink: check if protoinfo is present This fixes an oops triggered from userspace. If we don't pass information about the private protocol info, the reference to attr will be NULL. This is likely to happen in update messages. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 6ea4b22ff28..468c6003b4c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -362,6 +362,11 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; + /* updates could not contain anything about the private + * protocol info, in that case skip the parsing */ + if (!attr) + return 0; + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); if (!tb[CTA_PROTOINFO_TCP_STATE-1]) -- cgit v1.2.3 From 02a78cdf425156b86abdb6883f837a70fb7106da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:00:04 -0800 Subject: [NETFILTER] ctnetlink: add marking support from userspace This patch adds support for conntrack marking from user space. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 838262e1737..09957f9be97 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -979,6 +979,11 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) return err; } +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("all done\n"); return 0; } @@ -1022,6 +1027,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[], if (ct->helper) ip_conntrack_helper_put(ct->helper); +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("conntrack with id %u inserted\n", ct->id); return 0; -- cgit v1.2.3 From 119a31849442215fa66e4d18a33443a55c45e631 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:00:29 -0800 Subject: [NETFILTER] ctnetlink: add module alias to fix autoloading Add missing module alias. This is a must to load ctnetlink on demand. For example, the conntrack tool will fail if the module isn't loaded. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 09957f9be97..842fe2c081c 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1538,6 +1538,8 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = { .cb = ctnl_exp_cb, }; +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); + static int __init ctnetlink_init(void) { int ret; -- cgit v1.2.3 From 7a4fe3664b3cfecd2a40a46f54c71333639e28b7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:00:47 -0800 Subject: [NETFILTER] ctnetlink: kill unused includes Kill some useless headers included in ctnetlink. They aren't used in any way. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 842fe2c081c..bf584249571 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -28,11 +28,8 @@ #include #include #include -#include #include -#include -#include #include #include #include -- cgit v1.2.3 From 81e5c27d08bb39e646fe822ea80ab8feba62b94d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:01:19 -0800 Subject: [NETFILTER] ctnetlink: get_conntrack can use GFP_KERNEL ctnetlink_get_conntrack is always called from user context, so GFP_KERNEL is enough. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index bf584249571..d1dde1417ef 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -779,7 +779,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, ct = tuplehash_to_ctrack(h); err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) { ip_conntrack_put(ct); return -ENOMEM; -- cgit v1.2.3 From 5978a9b82c55b82a1087bd86e0ae8b00f94d0d0b Mon Sep 17 00:00:00 2001 From: Philip Craig Date: Wed, 9 Nov 2005 13:01:53 -0800 Subject: [NETFILTER] PPTP helper: fix PNS-PAC expectation call id The reply tuple of the PNS->PAC expectation was using the wrong call id. So we had the following situation: - PNS behind NAT firewall - PNS call id requires NATing - PNS->PAC gre packet arrives first then the PNS->PAC expectation is matched, and the other expectation is deleted, but the PAC->PNS gre packets do not match the gre conntrack because the call id is wrong. We also cannot use ip_nat_follow_master(). Signed-off-by: Philip Craig Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_helper_pptp.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index ee6ab74ad3a..e546203f566 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c @@ -73,6 +73,7 @@ static void pptp_nat_expected(struct ip_conntrack *ct, struct ip_conntrack_tuple t; struct ip_ct_pptp_master *ct_pptp_info; struct ip_nat_pptp *nat_pptp_info; + struct ip_nat_range range; ct_pptp_info = &master->help.ct_pptp_info; nat_pptp_info = &master->nat.help.nat_pptp_info; @@ -110,7 +111,30 @@ static void pptp_nat_expected(struct ip_conntrack *ct, DEBUGP("not found!\n"); } - ip_nat_follow_master(ct, exp); + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.ip; + if (exp->dir == IP_CT_DIR_ORIGINAL) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.ip; + if (exp->dir == IP_CT_DIR_REPLY) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); } /* outbound packets == from PNS to PAC */ @@ -213,7 +237,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, /* alter expectation for PNS->PAC direction */ invert_tuplepr(&inv_t, &expect_orig->tuple); - expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); + expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id); expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); expect_orig->dir = IP_CT_DIR_ORIGINAL; -- cgit v1.2.3 From d63a92810807e8da298895236f2b99697e884014 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 13:02:45 -0800 Subject: [NETFILTER]: stop tracking ICMP error at early point Currently connection tracking handles ICMP error like normal packets if it failed to get related connection. But it fails that after all. This makes connection tracking stop tracking ICMP error at early point. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 98f0015dd25..9481d159acb 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -151,13 +151,13 @@ icmp_error_message(struct sk_buff *skb, /* Not enough header? */ inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); if (inside == NULL) - return NF_ACCEPT; + return -NF_ACCEPT; /* Ignore ICMP's containing fragments (shouldn't happen) */ if (inside->ip.frag_off & htons(IP_OFFSET)) { DEBUGP("icmp_error_track: fragment of proto %u\n", inside->ip.protocol); - return NF_ACCEPT; + return -NF_ACCEPT; } innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); @@ -166,7 +166,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Ordinarily, we'd expect the inverted tupleproto, but it's @@ -174,7 +174,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } ip_conntrack_proto_put(innerproto); @@ -190,7 +190,7 @@ icmp_error_message(struct sk_buff *skb, if (!h) { DEBUGP("icmp_error_track: no match\n"); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Reverse direction from that found */ if (DIRECTION(h) != IP_CT_DIR_REPLY) -- cgit v1.2.3 From fe902a91ff427af7dbf20e7c196623b2a4eade13 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:03:09 -0800 Subject: [NETFILTER] ctnetlink: return -EINVAL if size is wrong Return -EINVAL if the size isn't OK instead of -EPERM. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d1dde1417ef..cfc5487e627 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -593,7 +593,7 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - return -1; + return -EINVAL; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); if (!npt) -- cgit v1.2.3 From fcda46128d5cb50075339b79ce585ab767337e9e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:03:26 -0800 Subject: [NETFILTER] ctnetlink: propagate error instaed of returning -EPERM Propagate the error to userspace instead of returning -EPERM if the get conntrack operation fails. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index cfc5487e627..7fe74565964 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -802,7 +802,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, free: kfree_skb(skb2); out: - return -1; + return err; } static inline int -- cgit v1.2.3 From a856a19a9f3ee14fc0d555470f3af138aeb0245c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Nov 2005 13:03:42 -0800 Subject: [NETFILTER] ctnetlink: Add support to identify expectations by ID's Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 7fe74565964..5c1c0a3d1c4 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1293,6 +1293,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (!exp) return -ENOENT; + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + err = -ENOMEM; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) -- cgit v1.2.3 From 439a9994bb6ae3c7cab1f0b776bca6bc7aa58a11 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Wed, 9 Nov 2005 13:04:08 -0800 Subject: [NETFILTER] ctnetlink: Fix oops when no ICMP ID info in message This patch fixes an userspace triggered oops. If there is no ICMP_ID info the reference to attr will be NULL. Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 9481d159acb..083951e2069 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -296,7 +296,8 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], struct ip_conntrack_tuple *tuple) { if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1]) + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) return -1; tuple->dst.u.icmp.type = -- cgit v1.2.3 From 5fd52fe0989f8c84abd8d4a40ded79d4da911744 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Wed, 9 Nov 2005 13:04:32 -0800 Subject: [NETFILTER] ctnetlink: ICMP_ID is u_int16_t not u_int8_t. Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 083951e2069..5198f3a1e2c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -305,7 +305,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], tuple->dst.u.icmp.code = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); tuple->src.u.icmp.id = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); return 0; } -- cgit v1.2.3 From 9fb9cbb1082d6b31fb45aa1a14432449a0df6cf1 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 9 Nov 2005 16:38:16 -0800 Subject: [NETFILTER]: Add nf_conntrack subsystem. The existing connection tracking subsystem in netfilter can only handle ipv4. There were basically two choices present to add connection tracking support for ipv6. We could either duplicate all of the ipv4 connection tracking code into an ipv6 counterpart, or (the choice taken by these patches) we could design a generic layer that could handle both ipv4 and ipv6 and thus requiring only one sub-protocol (TCP, UDP, etc.) connection tracking helper module to be written. In fact nf_conntrack is capable of working with any layer 3 protocol. The existing ipv4 specific conntrack code could also not deal with the pecularities of doing connection tracking on ipv6, which is also cured here. For example, these issues include: 1) ICMPv6 handling, which is used for neighbour discovery in ipv6 thus some messages such as these should not participate in connection tracking since effectively they are like ARP messages 2) fragmentation must be handled differently in ipv6, because the simplistic "defrag, connection track and NAT, refrag" (which the existing ipv4 connection tracking does) approach simply isn't feasible in ipv6 3) ipv6 extension header parsing must occur at the correct spots before and after connection tracking decisions, and there were no provisions for this in the existing connection tracking design 4) ipv6 has no need for stateful NAT The ipv4 specific conntrack layer is kept around, until all of the ipv4 specific conntrack helpers are ported over to nf_conntrack and it is feature complete. Once that occurs, the old conntrack stuff will get placed into the feature-removal-schedule and we will fully kill it off 6 months later. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/netfilter/Kconfig | 41 +- net/ipv4/netfilter/Makefile | 6 + net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 12 +- net/ipv4/netfilter/ipt_CONNMARK.c | 22 +- net/ipv4/netfilter/ipt_NOTRACK.c | 4 +- net/ipv4/netfilter/ipt_connbytes.c | 39 +- net/ipv4/netfilter/ipt_connmark.c | 10 +- net/ipv4/netfilter/ipt_conntrack.c | 96 +++++ net/ipv4/netfilter/ipt_helper.c | 54 +++ net/ipv4/netfilter/ipt_state.c | 6 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 571 +++++++++++++++++++++++++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 301 +++++++++++++ 13 files changed, 1106 insertions(+), 58 deletions(-) create mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c create mode 100644 net/ipv4/netfilter/nf_conntrack_proto_icmp.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 7d917e4ce1d..9d3c8b5f327 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -5,6 +5,20 @@ menu "IP: Netfilter Configuration" depends on INET && NETFILTER +config NF_CONNTRACK_IPV4 + tristate "IPv4 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv4 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + # connection tracking, helpers and protocols config IP_NF_CONNTRACK tristate "Connection tracking (required for masq/NAT)" @@ -209,8 +223,8 @@ config IP_NF_MATCH_PKTTYPE tristate "Packet type match support" depends on IP_NF_IPTABLES help - Packet type matching allows you to match a packet by - its "class", eg. BROADCAST, MULTICAST, ... + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... Typical usage: iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG @@ -317,7 +331,8 @@ config IP_NF_MATCH_TCPMSS config IP_NF_MATCH_HELPER tristate "Helper match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Helper matching allows you to match packets in dynamic connections tracked by a conntrack-helper, ie. ip_conntrack_ftp @@ -326,7 +341,8 @@ config IP_NF_MATCH_HELPER config IP_NF_MATCH_STATE tristate "Connection state match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Connection state matching allows you to match packets based on their relationship to a tracked connection (ie. previous packets). This @@ -336,7 +352,8 @@ config IP_NF_MATCH_STATE config IP_NF_MATCH_CONNTRACK tristate "Connection tracking match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help This is a general conntrack match module, a superset of the state match. @@ -422,7 +439,8 @@ config IP_NF_MATCH_COMMENT config IP_NF_MATCH_CONNMARK tristate 'Connection mark match support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -433,7 +451,8 @@ config IP_NF_MATCH_CONNMARK config IP_NF_MATCH_CONNBYTES tristate 'Connection byte/packet counter match support' - depends on IP_NF_CT_ACCT && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -747,7 +766,8 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE + depends on IP_NF_MANGLE + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. Similar to the MARK target, but @@ -759,7 +779,8 @@ config IP_NF_TARGET_CONNMARK config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing @@ -782,7 +803,7 @@ config IP_NF_RAW config IP_NF_TARGET_NOTRACK tristate 'NOTRACK target support' depends on IP_NF_RAW - depends on IP_NF_CONNTRACK + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help The NOTRACK target allows a select rule to specify which packets *not* to enter the conntrack/NAT diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index dab4b58dd31..058c48e258f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -103,3 +103,9 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o + +# objects for l3 independent conntrack +nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 5c1c0a3d1c4..d2a4fec2286 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1376,7 +1376,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, ip_conntrack_expect_put(exp); } } - write_unlock(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } else { /* This basically means we have to flush everything*/ write_lock_bh(&ip_conntrack_lock); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9bcb398fbc1..45c52d8f4d9 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,7 +29,7 @@ #include #include -#include +#include #define CLUSTERIP_VERSION "0.8" @@ -316,14 +316,14 @@ target(struct sk_buff **pskb, { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - u_int32_t hash; + u_int32_t *mark, hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ - if (!ct) { + mark = nf_ct_get_mark((*pskb), &ctinfo); + if (mark == NULL) { printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); /* FIXME: need to drop invalid ones, since replies * to outgoing connections of other nodes will be @@ -346,7 +346,7 @@ target(struct sk_buff **pskb, switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + *mark = hash; break; case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: @@ -363,7 +363,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, *mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 05d66ab5942..8acac5a40a9 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -29,7 +29,7 @@ MODULE_LICENSE("GPL"); #include #include -#include +#include static unsigned int target(struct sk_buff **pskb, @@ -43,24 +43,24 @@ target(struct sk_buff **pskb, u_int32_t diff; u_int32_t nfmark; u_int32_t newmark; + u_int32_t ctinfo; + u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo); - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - if (ct) { + if (ctmark) { switch(markinfo->mode) { case IPT_CONNMARK_SET: - newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | markinfo->mark; + if (newmark != *ctmark) + *ctmark = newmark; break; case IPT_CONNMARK_SAVE: - newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); - if (ct->mark != newmark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); + if (*ctmark != newmark) + *ctmark = newmark; break; case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; - diff = (ct->mark ^ nfmark) & markinfo->mask; + diff = (*ctmark ^ nfmark) & markinfo->mask; if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; break; diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c index a4bb9b3bc29..e3c69d072c6 100644 --- a/net/ipv4/netfilter/ipt_NOTRACK.c +++ b/net/ipv4/netfilter/ipt_NOTRACK.c @@ -5,7 +5,7 @@ #include #include -#include +#include static unsigned int target(struct sk_buff **pskb, @@ -23,7 +23,7 @@ target(struct sk_buff **pskb, If there is a real ct entry correspondig to this packet, it'll hang aroun till timing out. We don't deal with it for performance reasons. JK */ - (*pskb)->nfct = &ip_conntrack_untracked.ct_general; + nf_ct_untrack(*pskb); (*pskb)->nfctinfo = IP_CT_NEW; nf_conntrack_get((*pskb)->nfct); diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index df4a42c6da2..d68a048b717 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -10,7 +10,7 @@ */ #include #include -#include +#include #include #include @@ -46,60 +46,59 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connbytes_info *sinfo = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct; u_int64_t what = 0; /* initialize to make gcc happy */ + const struct ip_conntrack_counter *counters; - if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) + if (!(counters = nf_ct_get_counters(skb))) return 0; /* no match */ switch (sinfo->what) { case IPT_CONNBYTES_PKTS: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_REPLY].packets; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; - what += ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; + what += counters[IP_CT_DIR_REPLY].packets; break; } break; case IPT_CONNBYTES_BYTES: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_REPLY].bytes; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; - what += ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; + what += counters[IP_CT_DIR_REPLY].bytes; break; } break; case IPT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, - ct->counters[IP_CT_DIR_ORIGINAL].packets); + what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes, + counters[IP_CT_DIR_ORIGINAL].packets); break; case IPT_CONNBYTES_DIR_REPLY: - what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, - ct->counters[IP_CT_DIR_REPLY].packets); + what = div64_64(counters[IP_CT_DIR_REPLY].bytes, + counters[IP_CT_DIR_REPLY].packets); break; case IPT_CONNBYTES_DIR_BOTH: { u_int64_t bytes; u_int64_t pkts; - bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + - ct->counters[IP_CT_DIR_REPLY].bytes; - pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ - ct->counters[IP_CT_DIR_REPLY].packets; + bytes = counters[IP_CT_DIR_ORIGINAL].bytes + + counters[IP_CT_DIR_REPLY].bytes; + pkts = counters[IP_CT_DIR_ORIGINAL].packets+ + counters[IP_CT_DIR_REPLY].packets; /* FIXME_THEORETICAL: what to do if sum * overflows ? */ diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index bf8de47ce00..5306ef293b9 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -28,7 +28,7 @@ MODULE_LICENSE("GPL"); #include #include -#include +#include static int match(const struct sk_buff *skb, @@ -39,12 +39,12 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connmark_info *info = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); - if (!ct) + u_int32_t ctinfo; + const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo); + if (!ctmark) return 0; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return (((*ctmark) & info->mask) == info->mark) ^ info->invert; } static int diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c index c1d22801b7c..c8d18705469 100644 --- a/net/ipv4/netfilter/ipt_conntrack.c +++ b/net/ipv4/netfilter/ipt_conntrack.c @@ -10,7 +10,14 @@ #include #include + +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include +#include +#else +#include +#endif + #include #include @@ -18,6 +25,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher "); MODULE_DESCRIPTION("iptables connection tracking match module"); +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) + static int match(const struct sk_buff *skb, const struct net_device *in, @@ -102,6 +111,93 @@ match(const struct sk_buff *skb, return 1; } +#else /* CONFIG_IP_NF_CONNTRACK */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_conntrack_info *sinfo = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + +#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) + + if (ct == &nf_conntrack_untracked) + statebit = IPT_CONNTRACK_STATE_UNTRACKED; + else if (ct) + statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = IPT_CONNTRACK_STATE_INVALID; + + if(sinfo->flags & IPT_CONNTRACK_STATE) { + if (ct) { + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip) + statebit |= IPT_CONNTRACK_STATE_SNAT; + + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip) + statebit |= IPT_CONNTRACK_STATE_DNAT; + } + + if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + unsigned long expires; + + if(!ct) + return 0; + + expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0; + + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + return 0; + } + + return 1; +} + +#endif /* CONFIG_NF_IP_CONNTRACK */ + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c index 3e7dd014de4..bf14e1c7798 100644 --- a/net/ipv4/netfilter/ipt_helper.c +++ b/net/ipv4/netfilter/ipt_helper.c @@ -13,9 +13,15 @@ #include #include #include +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include #include #include +#else +#include +#include +#include +#endif #include #include @@ -29,6 +35,7 @@ MODULE_DESCRIPTION("iptables helper match module"); #define DEBUGP(format, args...) #endif +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) static int match(const struct sk_buff *skb, const struct net_device *in, @@ -73,6 +80,53 @@ out_unlock: return ret; } +#else /* CONFIG_IP_NF_CONNTRACK */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_helper_info *info = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int ret = info->invert; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + if (!ct) { + DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); + return ret; + } + + if (!ct->master) { + DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + return ret; + } + + read_lock_bh(&nf_conntrack_lock); + if (!ct->master->helper) { + DEBUGP("ipt_helper: master ct %p has no helper\n", + exp->expectant); + goto out_unlock; + } + + DEBUGP("master's name = %s , info->name = %s\n", + ct->master->helper->name, info->name); + + if (info->name[0] == '\0') + ret ^= 1; + else + ret ^= !strncmp(ct->master->helper->name, info->name, + strlen(ct->master->helper->name)); +out_unlock: + read_unlock_bh(&nf_conntrack_lock); + return ret; +} +#endif + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c index b1511b97ea5..4d7f16b70ce 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/ipv4/netfilter/ipt_state.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include @@ -30,9 +30,9 @@ match(const struct sk_buff *skb, enum ip_conntrack_info ctinfo; unsigned int statebit; - if (skb->nfct == &ip_conntrack_untracked.ct_general) + if (nf_ct_is_untracked(skb)) statebit = IPT_STATE_UNTRACKED; - else if (!ip_conntrack_get(skb, &ctinfo)) + else if (!nf_ct_get_ctinfo(skb, &ctinfo)) statebit = IPT_STATE_INVALID; else statebit = IPT_STATE_BIT(ctinfo); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c new file mode 100644 index 00000000000..8202c1c0afa --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -0,0 +1,571 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - move L3 protocol dependent part to this file. + * 23 Mar 2004: Yasuyuki Kozakai @USAGI + * - add get_features() to support various size of conntrack + * structures. + * + * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); + +static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + u_int32_t _addrs[2], *ap; + ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), + sizeof(u_int32_t) * 2, _addrs); + if (ap == NULL) + return 0; + + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + + return 1; +} + +static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u3.ip = orig->dst.u3.ip; + tuple->dst.u3.ip = orig->src.u3.ip; + + return 1; +} + +static int ipv4_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.u3.ip), + NIPQUAD(tuple->dst.u3.ip)); +} + +static int ipv4_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns new sk_buff, or NULL */ +static struct sk_buff * +nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + skb_orphan(skb); + + local_bh_disable(); + skb = ip_defrag(skb, user); + local_bh_enable(); + + if (skb) + ip_send_check(skb->nh.iph); + + return skb; +} + +static int +ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, + u_int8_t *protonum) +{ + /* Never happen */ + if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) { + printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", + (*pskb)->nh.iph->protocol, hooknum); + } + return -NF_DROP; + } + + *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; + *protonum = (*pskb)->nh.iph->protocol; + + return NF_ACCEPT; +} + +int nat_module_is_loaded = 0; +static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple) +{ + if (nat_module_is_loaded) + return NF_CT_F_NAT; + + return NF_CT_F_BASIC; +} + +static unsigned int ipv4_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(pskb); +} + +static unsigned int ipv4_conntrack_help(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret; + ret = ct->helper->help(pskb, + (*pskb)->nh.raw - (*pskb)->data + + (*pskb)->nh.iph->ihl*4, + ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. */ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = nf_ct_ipv4_gather_frags(*pskb, + hooknum == NF_IP_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT); + if (!*pskb) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* We've seen it coming out the other side: confirm */ + if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; + + /* Local packets are never produced too large for their + interface. We degfragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > dst_mtu(&rt->u.dst) && + !skb_shinfo(*pskb)->tso_size) { + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +static unsigned int ipv4_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ipv4_conntrack_defrag_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_in_ops = { + .hook = ipv4_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv4_conntrack_defrag_local_out_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_local_out_ops = { + .hook = ipv4_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, +}; + +/* helpers */ +static struct nf_hook_ops ipv4_conntrack_helper_out_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + +static struct nf_hook_ops ipv4_conntrack_helper_in_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ipv4_conntrack_out_ops = { + .hook = ipv4_refrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +static struct nf_hook_ops ipv4_conntrack_local_in_ops = { + .hook = ipv4_confirm, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +#ifdef CONFIG_SYSCTL +/* From nf_conntrack_proto_icmp.c */ +extern unsigned long nf_ct_icmp_timeout; +static struct ctl_table_header *nf_ct_ipv4_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT, + .procname = "nf_conntrack_icmp_timeout", + .data = &nf_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +#endif + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct inet_sock *inet = inet_sk(sk); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + NF_CT_TUPLE_U_BLANK(&tuple); + tuple.src.u3.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; + tuple.dst.u3.ip = inet->daddr; + tuple.dst.u.tcp.port = inet->dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = IPPROTO_TCP; + + /* We only do TCP at the moment: is there a better way? */ + if (strcmp(sk->sk_prot->name, "TCP")) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, +}; + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { + .l3proto = PF_INET, + .name = "ipv4", + .pkt_to_tuple = ipv4_pkt_to_tuple, + .invert_tuple = ipv4_invert_tuple, + .print_tuple = ipv4_print_tuple, + .print_conntrack = ipv4_print_conntrack, + .prepare = ipv4_prepare, + .get_features = ipv4_get_features, + .me = THIS_MODULE, +}; + +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_icmp; +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + goto cleanup_nothing; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register tcp.\n"); + goto cleanup_sockopt; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmp); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register icmp.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register ipv4\n"); + goto cleanup_icmp; + } + + ret = nf_register_hook(&ipv4_conntrack_defrag_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing defrag hook.\n"); + goto cleanup_ipv4; + } + ret = nf_register_hook(&ipv4_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } + + ret = nf_register_hook(&ipv4_conntrack_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local out hook.\n"); + goto cleanup_inops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local helper hook.\n"); + goto cleanup_inandlocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register postrouting helper hook.\n"); + goto cleanup_helperinops; + } + + ret = nf_register_hook(&ipv4_conntrack_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register post-routing hook.\n"); + goto cleanup_helperoutops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + +#ifdef CONFIG_SYSCTL + nf_ct_ipv4_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_ipv4_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + + /* For use by REJECT target */ + ip_ct_attach = __nf_conntrack_attach; + + return ret; + + cleanup: + synchronize_net(); + ip_ct_attach = NULL; +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_ipv4_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ipv4_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ipv4_conntrack_out_ops); + cleanup_helperoutops: + nf_unregister_hook(&ipv4_conntrack_helper_out_ops); + cleanup_helperinops: + nf_unregister_hook(&ipv4_conntrack_helper_in_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ipv4_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ipv4_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ipv4_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ipv4_conntrack_defrag_ops); + cleanup_ipv4: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + cleanup_icmp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmp); + cleanup_udp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp4); + cleanup_tcp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp4); + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst); + cleanup_nothing: + return ret; +} + +MODULE_LICENSE("GPL"); + +static int __init init(void) +{ + need_nf_conntrack(); + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +void need_ip_conntrack(void) +{ +} + +EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(nf_ct_ipv4_gather_frags); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 00000000000..7ddb5c08f7b --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -0,0 +1,301 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI + * - enable working with Layer 3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long nf_ct_icmp_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return 1; +} + +static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct nf_conn *conntrack, + const struct sk_buff *skb, unsigned int dataoff) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } _in, *inside; + struct nf_conntrack_protocol *innerproto; + struct nf_conntrack_tuple_hash *h; + int dataoff; + + NF_CT_ASSERT(skb->nfct == NULL); + + /* Not enough header? */ + inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + if (inside == NULL) + return -NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside->ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_message: fragment of proto %u\n", + inside->ip.protocol); + return -NF_ACCEPT; + } + + innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, + inside->ip.protocol, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: ! get_tuple p=%u", + inside->ip.protocol); + return -NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = nf_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + /* Reverse direction from that found */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +{ + struct icmphdr _ih, *icmph; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, + "nf_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH + && icmph->type != ICMP_SOURCE_QUENCH + && icmph->type != ICMP_TIME_EXCEEDED + && icmph->type != ICMP_PARAMETERPROB + && icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_icmp = +{ + .list = { NULL, NULL }, + .l3proto = PF_INET, + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, + .destroy = NULL, + .me = NULL +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_icmp); -- cgit v1.2.3 From a8f74b228826eef1cbe04a05647d61e896f5fd63 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 10 Nov 2005 02:25:52 +0100 Subject: [NETLINK]: Make netlink_callback->done() optional Most netlink families make no use of the done() callback, making it optional gets rid of all unnecessary dummy implementations. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 71f3c7350c6..39061ed53cf 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -724,12 +724,6 @@ done: return skb->len; } -static int inet_diag_dump_done(struct netlink_callback *cb) -{ - return 0; -} - - static __inline__ int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { @@ -760,8 +754,7 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) goto err_inval; } return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, - inet_diag_dump_done); + inet_diag_dump, NULL); } else { return inet_diag_get_exact(skb, nlh); } -- cgit v1.2.3 From fb286bb2990a107009dbf25f6ffebeb7df77f9be Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 10 Nov 2005 13:01:24 -0800 Subject: [NET]: Detect hardware rx checksum faults correctly Here is the patch that introduces the generic skb_checksum_complete which also checks for hardware RX checksum faults. If that happens, it'll call netdev_rx_csum_fault which currently prints out a stack trace with the device name. In future it can turn off RX checksum. I've converted every spot under net/ that does RX checksum checks to use skb_checksum_complete or __skb_checksum_complete with the exceptions of: * Those places where checksums are done bit by bit. These will call netdev_rx_csum_fault directly. * The following have not been completely checked/converted: ipmr ip_vs netfilter dccp This patch is based on patches and suggestions from Stephen Hemminger and David S. Miller. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 6 +++--- net/ipv4/igmp.c | 19 ++++++++++++++----- net/ipv4/ip_gre.c | 15 ++++++++------- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 11 ++++------- net/ipv4/tcp_ipv4.c | 24 +++++++++--------------- net/ipv4/udp.c | 7 ++----- 6 files changed, 40 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 175e093ec56..e3eceecd049 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + skb->csum = 0; + if (__skb_checksum_complete(skb)) goto error; - default:; } if (!pskb_pull(skb, sizeof(struct icmphdr))) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c6247fc8406..c04607b4921 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb) return 0; } - if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || - (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { - in_dev_put(in_dev); - kfree_skb(skb); - return 0; + if (!pskb_may_pull(skb, sizeof(struct igmphdr))) + goto drop; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto drop; } ih = skb->h.igmph; @@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb) default: NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } + +drop: in_dev_put(in_dev); kfree_skb(skb); return 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f8f53..4e9c74b54b1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb) goto drop_nolock; if (flags&GRE_CSUM) { - if (skb->ip_summed == CHECKSUM_HW) { + switch (skb->ip_summed) { + case CHECKSUM_HW: csum = (u16)csum_fold(skb->csum); - if (csum) - skb->ip_summed = CHECKSUM_NONE; - } - if (skb->ip_summed == CHECKSUM_NONE) { - skb->csum = skb_checksum(skb, 0, skb->len, 0); + if (!csum) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); skb->ip_summed = CHECKSUM_HW; - csum = (u16)csum_fold(skb->csum); } offset += 4; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 5198f3a1e2c..e4d6b268e8c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad HW ICMP checksum "); - return -NF_ACCEPT; + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + skb->csum = 0; + if (__skb_checksum_complete(skb)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } - default: - break; } checksum_skipped: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 634dabb558f..ac1fcf5b4eb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v4_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) + skb->nh.iph->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); - skb->ip_summed = CHECKSUM_NONE; + } } + + skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len, IPPROTO_TCP, 0); + if (skb->len <= 76) { - if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v4_check(skb->h.th, skb->len, - skb->nh.iph->saddr, - skb->nh.iph->daddr, 0); + return __skb_checksum_complete(skb); } return 0; } @@ -1219,7 +1213,7 @@ int tcp_v4_rcv(struct sk_buff *skb) * provided case of th->doff==0 is elimineted. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb) < 0)) + tcp_v4_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0..2422a5f7195 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) static __inline__ int __udp_checksum_complete(struct sk_buff *skb) { - return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } static __inline__ int udp_checksum_complete(struct sk_buff *skb) @@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) - return 0; - LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); -- cgit v1.2.3 From f4805eded7d38c4e42bf473dc5eb2f34853beb06 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 16:53:30 -0800 Subject: [TCP]: fix congestion window update when using TSO deferal TCP peformance with TSO over networks with delay is awful. On a 100Mbit link with 150ms delay, we get 4Mbits/sec with TSO and 50Mbits/sec without TSO. The problem is with TSO, we intentionally do not keep the maximum number of packets in flight to fill the window, we hold out to until we can send a MSS chunk. But, we also don't update the congestion window unless we have filled, as per RFC2861. This patch replaces the check for the congestion window being full with something smarter that accounts for TSO. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_highspeed.c | 4 ++-- net/ipv4/tcp_htcp.c | 2 +- net/ipv4/tcp_hybla.c | 6 +++--- net/ipv4/tcp_output.c | 1 + net/ipv4/tcp_scalable.c | 3 ++- 7 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ae35e060904..5af99b3ef5d 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -217,7 +217,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, bictcp_low_utilization(sk, data_acked); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e8..0705b496c6b 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -186,7 +186,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde08..5e56ad368dd 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,12 +111,12 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, int good) + u32 in_flight, u32 pkts_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e9..404a326ba34 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -207,7 +207,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623d..40dbb387751 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->minrtt = tp->srtt; } + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + if (!ca->hybla_en) return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); - if (in_flight < tp->snd_cwnd) - return; - if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f..998f6416ef8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2058,3 +2058,4 @@ EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf552..a2fd25617d2 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -20,7 +20,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { -- cgit v1.2.3 From 2d2abbab63f6726a147ae61ada39bf2c9ee0db9a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 16:56:12 -0800 Subject: [TCP]: simplify microsecond rtt sampling Simplify the code that comuputes microsecond rtt estimate used by TCP Vegas. Move the callback out of the RTT sampler and into the end of the ack cleanup. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 62 +++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578d..e4306565493 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -548,10 +548,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -610,9 +609,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } - - if (icsk->icsk_ca_ops->rtt_sample) - icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1921,7 +1917,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -1940,13 +1936,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1960,21 +1956,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt, u32 *usrtt) + const s32 seq_rtt) { const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, usrtt, flag); + tcp_ack_saw_tstamp(sk, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, flag); } static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, @@ -2054,20 +2050,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } +static inline u32 tcp_usrtt(const struct sk_buff *skb) +{ + struct timeval tv, now; + + do_gettimeofday(&now); + skb_get_timestamp(skb, &tv); + return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); +} /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; - struct timeval usnow; u32 pkts_acked = 0; - - if (seq_usrtt) - do_gettimeofday(&usnow); + void (*rtt_sample)(struct sock *sk, u32 usrtt) + = icsk->icsk_ca_ops->rtt_sample; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2107,16 +2110,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (seq_usrtt) { - struct timeval tv; - - skb_get_timestamp(skb, &tv); - *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 - + (usnow.tv_usec - tv.tv_usec); + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); } - if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2126,8 +2124,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt !before(scb->end_seq, tp->snd_up)) tp->urg_mode = 0; } - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); + } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); @@ -2135,8 +2136,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { - const struct inet_connection_sock *icsk = inet_csk(sk); - tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk, tp); if (icsk->icsk_ca_ops->pkts_acked) @@ -2299,7 +2299,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; - s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2352,8 +2351,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); @@ -4242,7 +4240,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, NULL, 0); + tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; -- cgit v1.2.3 From 7faffa1c7fb9b8e8917e3225d4e2638270c0a48b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:07:24 -0800 Subject: [TCP]: add tcp_slow_start helper Move all the code that does linear TCP slowstart to one inline function to ease later patch to add ABC support. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 10 ++++------ net/ipv4/tcp_cong.c | 11 +++++------ net/ipv4/tcp_highspeed.c | 7 +++---- net/ipv4/tcp_htcp.c | 11 +++++------ net/ipv4/tcp_scalable.c | 11 +++++------ net/ipv4/tcp_vegas.c | 42 +++++++++++------------------------------- 6 files changed, 33 insertions(+), 59 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 5af99b3ef5d..1d0cd86621b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -220,14 +220,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { bictcp_update(ca, tp->snd_cwnd); - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= ca->cnt) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0705b496c6b..6d3e883b48f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,12 +189,11 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 5e56ad368dd..82b3c189bd7 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -119,10 +119,9 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 404a326ba34..3284cfb993e 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -210,11 +210,10 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ @@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, htcp_alpha_update(ca); } - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a2fd25617d2..26d7486ee50 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -24,17 +24,16 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { tp->snd_cwnd_cnt++; if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f..4376814d29f 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt); } else { u32 rtt, target_cwnd, diff; @@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, */ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ if (diff > gamma) { /* Going too fast. Time to slow down @@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, V_PARAM_SHIFT)+1); } + tcp_slow_start(tp); } else { /* Congestion avoidance. */ u32 next_snd_cwnd; @@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (next_snd_cwnd < tp->snd_cwnd) tp->snd_cwnd--; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } } - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; } /* Extract info for Tcp socket info provided via netlink. */ -- cgit v1.2.3 From 9772efb970780aeed488c19d8b4afd46c3b484af Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:09:53 -0800 Subject: [TCP]: Appropriate Byte Count support This is an updated version of the RFC3465 ABC patch originally for Linux 2.6.11-rc4 by Yee-Ting Li. ABC is a way of counting bytes ack'd rather than packets when updating congestion control. The orignal ABC described in the RFC applied to a Reno style algorithm. For advanced congestion control there is little change after leaving slow start. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 31 ++++++++++++++++++++----------- net/ipv4/tcp_input.c | 7 +++++++ net/ipv4/tcp_minisocks.c | 1 + 5 files changed, 37 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 65268562351..01444a02b48 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_ABC, + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 72b7c22e1ea..cfaf7613375 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6d3e883b48f..c7cc62c8dc1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -192,17 +192,26 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e4306565493..4cb5e6f408d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,6 +89,7 @@ int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; +int sysctl_tcp_abc = 1; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -1247,6 +1248,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + tp->bytes_acked = 0; tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing @@ -1904,6 +1906,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, TCP_ECN_queue_cwr(tp); } + tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -2310,6 +2313,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) + tp->bytes_acked += ack - prior_snd_una; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -4370,6 +4376,7 @@ discard: EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4..9203a21e299 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; + newtp->bytes_acked = 0; newtp->frto_counter = 0; newtp->frto_highmark = 0; -- cgit v1.2.3 From 326f36e9e7de362e09745ce6f84b65e7ccac33ba Mon Sep 17 00:00:00 2001 From: John Heffner Date: Thu, 10 Nov 2005 17:11:48 -0800 Subject: [TCP]: receive buffer growth limiting with mixed MTU This is a patch for discussion addressing some receive buffer growing issues. This is partially related to the thread "Possible BUG in IPv4 TCP window handling..." last week. Specifically it addresses the problem of an interaction between rcvbuf moderation (receiver autotuning) and rcv_ssthresh. The problem occurs when sending small packets to a receiver with a larger MTU. (A very common case I have is a host with a 1500 byte MTU sending to a host with a 9k MTU.) In such a case, the rcv_ssthresh code is targeting a window size corresponding to filling up the current rcvbuf, not taking into account that the new rcvbuf moderation may increase the rcvbuf size. One hunk makes rcv_ssthresh use tcp_rmem[2] as the size target rather than rcvbuf. The other changes the behavior when it overflows its memory bounds with in-order data so that it tries to grow rcvbuf (the same as with out-of-order data). These changes should help my problem of mixed MTUs, and should also help the case from last week's thread I think. (In both cases though you still need tcp_rmem[2] to be set much larger than the TCP window.) One question is if this is too aggressive at trying to increase rcvbuf if it's under memory stress. Orignally-from: John Heffner Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cb5e6f408d..827cd4b9e86 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -234,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_full_space(sk)/2; + int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -327,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - unsigned int app_win = tp->rcv_nxt - tp->copied_seq; - int ofo_win = 0; icsk->icsk_ack.quick = 0; - skb_queue_walk(&tp->out_of_order_queue, skb) { - ofo_win += skb->len; - } - - /* If overcommit is due to out of order segments, - * do not clamp window. Try to expand rcvbuf instead. - */ - if (ofo_win) { - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - app_win += ofo_win; - if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) - app_win >>= 1; - if (app_win > icsk->icsk_ack.rcv_mss) - app_win -= icsk->icsk_ack.rcv_mss; - app_win = max(app_win, 2U*tp->advmss); - + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); - } } /* Receiver "autotuning" code. -- cgit v1.2.3 From caa20d9abe810be2ede9612b6c9db6ce7d6edf80 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:13:47 -0800 Subject: [TCP]: spelling fixes Minor spelling fixes for TCP code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 40 ++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 4 ++-- 6 files changed, 31 insertions(+), 31 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cfaf7613375..9ac7a4f46bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { - /* The last check adjusts for discrepance of Linux wrt. RFC + /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 827cd4b9e86..34cfa58eab7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -42,7 +42,7 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of + * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming @@ -224,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening - * window and then starts to feed us spagetti. But it should work + * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ @@ -278,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and correspoding skbs will fit to our rcvbuf. + * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) @@ -287,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } -/* 4. Try to fixup all. It is made iimediately after connection enters +/* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) @@ -367,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoothe things out - * else with timestamps disabled convergance takes too + * non-timestamp case, we do not smoother things out + * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { @@ -377,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) } else if (m < new_sample) new_sample = m << 3; } else { - /* No previous mesaure. */ + /* No previous measure. */ new_sample = m << 3; } @@ -506,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { - /* Too long gap. Apparently sender falled to + /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); @@ -546,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be incresed fastly, decrease too fastly + * too slowly, when it should be increased fastly, decrease too fastly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) @@ -607,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic - * ACKs in some curcumstances. + * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced - * with correct one. It is exaclty, which we pretend to do. + * with correct one. It is exactly, which we pretend to do. */ } @@ -772,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal curcumstances sending small + * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever @@ -1899,7 +1899,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) + * with this code. (Supersedes RFC1323) */ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { @@ -1912,7 +1912,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) * 1998/04/10 Andrey V. Savochkin * * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overstimated rto. With timestamps + * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments @@ -2268,7 +2268,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) } /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavor is back to normal. + * At latest on third ACK the TCP behavior is back to normal. */ tp->frto_counter = (tp->frto_counter + 1) % 3; } @@ -2344,7 +2344,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(sk, flag)) { - /* Advanve CWND, if state allows this. */ + /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); @@ -3133,7 +3133,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, { struct sk_buff *skb; - /* First, check that queue is collapsable and find + /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ for (skb = head; skb != tail; ) { /* No new bits? It is possible on ofo queue. */ @@ -3441,7 +3441,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) /* * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be + * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. @@ -3486,7 +3486,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); @@ -3631,7 +3631,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rx_opt.saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made + * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac1fcf5b4eb..4d5021e1929 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -39,7 +39,7 @@ * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. - * Added new listen sematics. + * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. @@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb) /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, - * provided case of th->doff==0 is elimineted. + * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && tcp_v4_checksum_init(skb))) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9203a21e299..1b66a2ac432 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -158,7 +158,7 @@ kill_with_rst: /* I am shamed, but failed to make it more elegant. * Yes, it is direct reference to IP, which is impossible * to generalize to IPv6. Taking into account that IPv6 - * do not undertsnad recycling in any case, it not + * do not understand recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && @@ -194,7 +194,7 @@ kill_with_rst: /* In window segment, it may be only reset or bare ack. */ if (th->rst) { - /* This is TIME_WAIT assasination, in two flavors. + /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ @@ -551,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet - * sent (the segment carries an unaccaptable ACK) ... + * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 998f6416ef8..602e7057e43 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -599,7 +599,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. - It is minumum of user_mss and mss received with SYN. + It is minimum of user_mss and mss received with SYN. It also does not include TCP options. tp->pmtu_cookie is last pmtu, seen by this function. @@ -1171,7 +1171,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - /* MSS for the peer's data. Previous verions used mss_clamp + /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss @@ -1361,7 +1361,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err; /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: frgagmentation, tunneling, mangling etc. + * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c..e1880959614 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * - * Criterium is still not confirmed experimentally and may change. + * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. @@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) hole detection. :-( It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any + to make it. It is disgusting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: -- cgit v1.2.3 From 6a438bbe68c7013a42d9c5aee5a40d7dafdbe6ec Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:14:59 -0800 Subject: [TCP]: speed up SACK processing Use "hints" to speed up the SACK processing. Various forms of this have been used by TCP developers (Web100, STCP, BIC) to avoid the 2x linear search of outstanding segments. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 144 ++++++++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_output.c | 54 +++++++++++++++---- 2 files changed, 172 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 34cfa58eab7..40a26b7157b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -897,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; - for (i=0; istart_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for (i = 0; i< num_sacks; i++) { + __u32 start_seq = ntohl(sp[i].start_seq); + __u32 end_seq = ntohl(sp[i].end_seq); + + if (i == 0){ + if (tp->recv_sack_cache[i].start_seq != start_seq) + flag = 0; + } else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -940,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(ack, prior_snd_una - tp->max_window)) return 0; } + } + + if (flag) + num_sacks = 1; + else { + int j; + tp->fastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = num_sacks-1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(ntohl(sp[j].start_seq), + ntohl(sp[j+1].start_seq))){ + sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; istart_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if (tp->fastpath_skb_hint) { + skb = tp->fastpath_skb_hint; + fack_count = tp->fastpath_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + fack_count = 0; + } /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue(skb, sk) { + sk_stream_for_retrans_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ @@ -1023,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } else { /* New sack for not retransmitted frame, @@ -1035,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } @@ -1058,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; } } } @@ -1085,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1192,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1258,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1482,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { struct sk_buff *skb; - int cnt = packets; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + if (tp->lost_skb_hint) { + skb = tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + cnt = 0; + } - sk_stream_for_retrans_queue(skb, sk) { - cnt -= tcp_skb_pcount(skb); - if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + sk_stream_for_retrans_queue_from(skb, sk) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + cnt += tcp_skb_pcount(skb); + if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { + + tp->retransmit_skb_hint = NULL; + } } } tcp_sync_left_out(tp); @@ -1519,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(sk, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint + : sk->sk_write_queue.next; + + sk_stream_for_retrans_queue_from(skb, sk) { + if (!tcp_skb_timedout(sk, skb)) + break; + + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retrans hint */ + if (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + + tp->retransmit_skb_hint = NULL; } } + + tp->scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1605,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1688,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) sk_stream_for_retrans_queue(skb, sk) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -2117,6 +2230,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); + clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 602e7057e43..029c70dfb58 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss u16 flags; BUG_ON(len > skb->len); + + clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ + /* changing transmit queue under us so clear hints */ + clear_all_retrans_hints(tp); + + /* Ok. We will be able to collapse the packet. */ __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) } } + clear_all_retrans_hints(tp); + if (!lost) return; @@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + packet_cnt = tp->retransmit_cnt_hint; + }else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } /* First pass: retransmit lost packets. */ - if (packet_cnt) { - sk_stream_for_retrans_queue(skb, sk) { + if (tp->lost_out) { + sk_stream_for_retrans_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + tp->retransmit_cnt_hint = packet_cnt; + /* Assume this retransmit will generate * only one packet for congestion window * calculation purposes. This works because @@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (sacked&TCPCB_LOST) { + if (sacked & TCPCB_LOST) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; return; + } if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else @@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) TCP_RTO_MAX); } - packet_cnt -= tcp_skb_pcount(skb); - if (packet_cnt <= 0) + packet_cnt += tcp_skb_pcount(skb); + if (packet_cnt >= tp->lost_out) break; } } @@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_may_send_now(sk, tp)) return; - packet_cnt = 0; + if (tp->forward_skb_hint) { + skb = tp->forward_skb_hint; + packet_cnt = tp->forward_cnt_hint; + } else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } + + sk_stream_for_retrans_queue_from(skb, sk) { + tp->forward_cnt_hint = packet_cnt; + tp->forward_skb_hint = skb; - sk_stream_for_retrans_queue(skb, sk) { /* Similar to the retransmit loop above we * can pretend that the retransmitted SKB * we send out here will be composed of one @@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->forward_skb_hint = NULL; break; + } if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, -- cgit v1.2.3 From c050970a257a4060e927e497a12323e961fcbadc Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Fri, 11 Nov 2005 04:43:47 -0500 Subject: [PATCH] TCP: fix vegas build Recent TCP changes broke the build. Signed-off-by: Jeff Garzik Signed-off-by: Linus Torvalds --- net/ipv4/tcp_vegas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 4376814d29f..b7d296a8ac6 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,7 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt); + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); } else { u32 rtt, target_cwnd, diff; -- cgit v1.2.3 From a2d7222f0f5861ce13b9308c30bd18f28ebeb583 Mon Sep 17 00:00:00 2001 From: Vlad Drukker Date: Sat, 12 Nov 2005 12:13:14 -0800 Subject: [NETFILTER] {ip,nf}_conntrack TCP: Accept SYN+PUSH like SYN Some devices (e.g. Qlogic iSCSI HBA hardware like QLA4010 up to firmware 3.0.0.4) initiates TCP with SYN and PUSH flags set. The Linux TCP/IP stack deals fine with that, but the connection tracking code doesn't. This patch alters TCP connection tracking to accept SYN+PUSH as a valid flag combination. Signed-off-by: Vlad Drukker Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 468c6003b4c..5b3f5220f28 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -814,6 +814,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, -- cgit v1.2.3 From dbd36ea496726460299842fdbeaaa7fff2f0c5c7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2005 15:21:01 -0800 Subject: [NETFILTER] ctnetlink: use size_t to make gcc-4.x happy Make gcc-4.x happy. Use size_t instead of int. Thanks to Patrick McHardy for the hint. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d2a4fec2286..853d0ac5534 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -467,7 +467,7 @@ out: } #endif -static const int cta_min_ip[CTA_IP_MAX] = { +static const size_t cta_min_ip[CTA_IP_MAX] = { [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), [CTA_IP_V4_DST-1] = sizeof(u_int32_t), }; @@ -497,7 +497,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) return 0; } -static const int cta_min_proto[CTA_PROTO_MAX] = { +static const size_t cta_min_proto[CTA_PROTO_MAX] = { [CTA_PROTO_NUM-1] = sizeof(u_int16_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), @@ -576,7 +576,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, } #ifdef CONFIG_IP_NF_NAT_NEEDED -static const int cta_min_protonat[CTA_PROTONAT_MAX] = { +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), }; -- cgit v1.2.3 From 56558208521729fa6b2a0f12df22e1569dee297a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2005 15:22:11 -0800 Subject: [NETFILTER] ctnetlink: More thorough size checking of attributes Add missing size checks. Thanks Patrick McHardy for the hint. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 39 +++++++++++++++++++++++++++++ net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 7 ++++++ 2 files changed, 46 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 853d0ac5534..f5e5e315867 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -614,6 +614,11 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, return 0; } +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + static inline int ctnetlink_parse_nat(struct nfattr *cda[], const struct ip_conntrack *ct, struct ip_nat_range *range) @@ -627,6 +632,9 @@ ctnetlink_parse_nat(struct nfattr *cda[], nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; + if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -667,6 +675,14 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) return 0; } +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -678,6 +694,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -760,6 +779,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return 0; } + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -1047,6 +1069,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); if (err < 0) @@ -1252,6 +1277,11 @@ out: return skb->len; } +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -1263,6 +1293,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct nfgenmsg *msg = NLMSG_DATA(nlh); u32 rlen; @@ -1333,6 +1366,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (cda[CTA_EXPECT_TUPLE-1]) { /* delete a single expect by tuple */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); @@ -1462,6 +1498,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1] || !cda[CTA_EXPECT_MASTER-1]) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 5b3f5220f28..ee3b7d6c4d2 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -357,6 +357,10 @@ nfattr_failure: return -1; } +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) { struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; @@ -369,6 +373,9 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; + if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; -- cgit v1.2.3 From 37d2e7a20d745035b600f1a6be56cbb9c7259419 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 14 Nov 2005 15:24:59 -0800 Subject: [NETFILTER] nfnetlink: unconditionally require CAP_NET_ADMIN This patch unconditionally requires CAP_NET_ADMIN for all nfnetlink messages. It also removes the per-message cap_required field, since all existing subsystems use CAP_NET_ADMIN for all their messages anyway. Patrick McHardy owes me a beer if we ever need to re-introduce this. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index f5e5e315867..de9f4464438 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1543,29 +1543,22 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, }; static struct nfnetlink_subsystem ctnl_subsys = { -- cgit v1.2.3 From 31f3426904e066f17e3f88c468a2f7c869ad4aac Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 15 Nov 2005 15:17:10 -0800 Subject: [TCP]: More spelling fixes. From Joe Perches Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 40a26b7157b..bf2e23086bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -367,7 +367,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoother things out + * non-timestamp case, we do not smooth things out * else with timestamps disabled convergence takes too * long. */ @@ -546,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be increased fastly, decrease too fastly + * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) -- cgit v1.2.3 From e7c8a41e817f381ac5c2a59ecc81b483bd68a7df Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 16 Nov 2005 12:55:37 -0800 Subject: [IPV4,IPV6]: replace handmade list with hlist in IPv{4,6} reassembly Both of ipq and frag_queue have *next and **prev, and they can be replaced with hlist. Thanks Arnaldo Carvalho de Melo for the suggestion. Signed-off-by: Yasuyuki Kozakai Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e7d26d9943c..8ce0ce2ee48 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -71,7 +71,7 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct ipq *next; /* linked list pointers */ + struct hlist_node list; struct list_head lru_list; /* lru list member */ u32 user; u32 saddr; @@ -89,7 +89,6 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; int iif; struct timeval stamp; }; @@ -99,7 +98,7 @@ struct ipq { #define IPQ_HASHSZ 64 /* Per-bucket lock is easy to add now. */ -static struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct hlist_head ipq_hash[IPQ_HASHSZ]; static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); @@ -107,9 +106,7 @@ int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) { - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; + hlist_del(&qp->list); list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); for (i = 0; i < IPQ_HASHSZ; i++) { struct ipq *q; + struct hlist_node *p, *n; - q = ipq_hash[i]; - while (q) { - struct ipq *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) { unsigned int hval = ipqhashfn(q->id, q->saddr, q->daddr, q->protocol); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ipq_hash[hval]) != NULL) - q->next->pprev = &q->next; - ipq_hash[hval] = q; - q->pprev = &ipq_hash[hval]; + hlist_add_head(&q->list, &ipq_hash[hval]); } - - q = next; } } write_unlock(&ipfrag_lock); @@ -310,14 +298,16 @@ out: static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) { struct ipq *qp; - +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ipfrag_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == qp_in->id && qp->saddr == qp_in->saddr && qp->daddr == qp_in->daddr && @@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; + hlist_add_head(&qp->list, &ipq_hash[hash]); INIT_LIST_HEAD(&qp->lru_list); list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) __u8 protocol = iph->protocol; unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; + struct hlist_node *n; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == id && qp->saddr == saddr && qp->daddr == daddr && -- cgit v1.2.3 From bd6af700a7191f483f41706467033588f28c8877 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 17 Nov 2005 14:11:18 -0800 Subject: [TCP]: TCP highspeed build error There is a compile error that crept in with the last patch of TCP patches. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_highspeed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 82b3c189bd7..63cf7e54084 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,7 +111,7 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, u32 pkts_acked) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); -- cgit v1.2.3 From 2fce76afdb067fa3e7f8ee33c9fe366bd65887ea Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 17 Nov 2005 15:06:47 -0800 Subject: [NETFILTER] ip_conntrack: fix ftp/irc/tftp helpers on ports >= 32768 Since we've converted the ftp/irc/tftp helpers to use the new module_parm_array() some time ago, we ware accidentially using signed data types - thus preventing those modules from being used on ports >= 32768. This patch fixes it by using 'ushort' module parameters. Thanks to Jan Nijs for reporting this bug. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_ftp.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_irc.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_tftp.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index d77d6b3f5f8..59e12b02b22 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -29,9 +29,9 @@ static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); static int loose; module_param(loose, int, 0600); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 15457415a4f..2dea1db1440 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -34,7 +34,7 @@ #include #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; @@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); module_param(max_dcc_channels, int, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index a78736b8525..d3c5a371f99 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper"); MODULE_LICENSE("GPL"); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 -- cgit v1.2.3