From 28f6aeea3f12d37bd258b2c0d5ba891bff4ec479 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 25 Dec 2009 17:30:22 -0800 Subject: net: restore ip source validation when using policy routing and the skb mark: there are cases where a back path validation requires us to use a different routing table for src ip validation than the one used for mapping ingress dst ip. One such a case is transparent proxying where we pretend to be the destination system and therefore the local table is used for incoming packets but possibly a main table would be used on outbound. Make the default behavior to allow the above and if users need to turn on the symmetry via sysctl src_valid_mark Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 1 + net/ipv4/fib_frontend.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5cdbc102a41..040c4f05b65 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1397,6 +1397,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), + DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3323168ee52..82dbf711d6d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -252,6 +252,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, no_addr = in_dev->ifa_list == NULL; rpf = IN_DEV_RPFILTER(in_dev); accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); + if (mark && !IN_DEV_SRC_VMARK(in_dev)) + fl.mark = 0; } rcu_read_unlock(); -- cgit v1.2.3 From 7ad6848c7e81a603605fad3f3575841aab004eea Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 6 Jan 2010 20:37:01 -0800 Subject: ip: fix mc_loop checks for tunnels with multicast outer addresses When we have L3 tunnels with different inner/outer families (i.e. IPV4/IPV6) which use a multicast address as the outer tunnel destination address, multicast packets will be loopbacked back to the sending socket even if IP*_MULTICAST_LOOP is set to disabled. The mc_loop flag is present in the family specific part of the socket (e.g. the IPv4 or IPv4 specific part). setsockopt sets the inner family mc_loop flag. When the packet is pushed through the L3 tunnel it will eventually be processed by the outer family which if different will check the flag in a different part of the socket then it was set. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e34013a78ef..3451799e3db 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -254,7 +254,7 @@ int ip_mc_output(struct sk_buff *skb) */ if (rt->rt_flags&RTCF_MULTICAST) { - if ((!sk || inet_sk(sk)->mc_loop) + if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped -- cgit v1.2.3 From 65324144b50bc7022cc9b6ca8f4a536a957019e3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 Jan 2010 05:50:47 +0000 Subject: net: RFC3069, private VLAN proxy arp support This is to be used together with switch technologies, like RFC3069, that where the individual ports are not allowed to communicate with each other, but they are allowed to talk to the upstream router. As described in RFC 3069, it is possible to allow these hosts to communicate through the upstream router by proxy_arp'ing. This patch basically allow proxy arp replies back to the same interface (from which the ARP request/solicitation was received). Tunable per device via proc "proxy_arp_pvlan": /proc/sys/net/ipv4/conf/*/proxy_arp_pvlan This switch technology is known by different vendor names: - In RFC 3069 it is called VLAN Aggregation. - Cisco and Allied Telesyn call it Private VLAN. - Hewlett-Packard call it Source-Port filtering or port-isolation. - Ericsson call it MAC-Forced Forwarding (RFC Draft). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/ipv4/arp.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- net/ipv4/devinet.c | 1 + net/ipv4/route.c | 7 ++++++- 3 files changed, 55 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c95cd93acf2..078709233bc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -70,6 +70,7 @@ * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash + * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #include @@ -524,12 +525,15 @@ int arp_bind_neighbour(struct dst_entry *dst) /* * Check if we can use proxy ARP for this path */ - -static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) +static inline int arp_fwd_proxy(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; + if (rt->u.dst.dev == dev) + return 0; + if (!IN_DEV_PROXY_ARP(in_dev)) return 0; @@ -547,6 +551,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) return (omi != imi && omi != -1); } +/* + * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) + * + * RFC3069 supports proxy arp replies back to the same interface. This + * is done to support (ethernet) switch features, like RFC 3069, where + * the individual ports are not allowed to communicate with each + * other, BUT they are allowed to talk to the upstream router. As + * described in RFC 3069, it is possible to allow these hosts to + * communicate through the upstream router, by proxy_arp'ing. + * + * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" + * + * This technology is known by different names: + * In RFC 3069 it is called VLAN Aggregation. + * Cisco and Allied Telesyn call it Private VLAN. + * Hewlett-Packard call it Source-Port filtering or port-isolation. + * Ericsson call it MAC-Forced Forwarding (RFC Draft). + * + */ +static inline int arp_fwd_pvlan(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt, + __be32 sip, __be32 tip) +{ + /* Private VLAN is only concerned about the same ethernet segment */ + if (rt->u.dst.dev != dev) + return 0; + + /* Don't reply on self probes (often done by windowz boxes)*/ + if (sip == tip) + return 0; + + if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) + return 1; + else + return 0; +} + /* * Interface to link layer: send routine and receive handler. */ @@ -833,8 +874,11 @@ static int arp_process(struct sk_buff *skb) } goto out; } else if (IN_DEV_FORWARD(in_dev)) { - if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + if (addr_type == RTN_UNICAST && + (arp_fwd_proxy(in_dev, dev, rt) || + arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) + { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5cdbc102a41..0715f4cac39 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1407,6 +1407,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e446496f564..1cc339441e7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1988,8 +1988,13 @@ static int __mkroute_input(struct sk_buff *skb, if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. */ - if (out_dev == in_dev) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } -- cgit v1.2.3 From d218d11133d888f9745802146a50255a4781d37a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jan 2010 16:28:01 -0800 Subject: tcp: Generalized TTL Security Mechanism This patch adds the kernel portions needed to implement RFC 5082 Generalized TTL Security Mechanism (GTSM). It is a lightweight security measure against forged packets causing DoS attacks (for BGP). This is already implemented the same way in BSD kernels. For the necessary Quagga patch http://www.gossamer-threads.com/lists/quagga/dev/17389 Description from Cisco http://www.cisco.com/en/US/docs/ios/12_3t/12_3t7/feature/guide/gt_btsh.html It does add one byte to each socket structure, but I did a little rearrangement to reuse a hole (on 64 bit), but it does grow the structure on 32 bit This should be documented on ip(4) man page and the Glibc in.h file also needs update. IPV6_MINHOPLIMIT should also be added (although BSD doesn't support that). Only TCP is supported, but could also be added to UDP, DCCP, SCTP if desired. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 14 +++++++++++++- net/ipv4/tcp_ipv4.c | 3 +++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cafad9baff0..644dc43a55d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -451,7 +451,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<transparent = !!val; break; + case IP_MINTTL: + if (optlen < 1) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + inet->min_ttl = val; + break; + default: err = -ENOPROTOOPT; break; @@ -1198,6 +1207,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TRANSPARENT: val = inet->transparent; break; + case IP_MINTTL: + val = inet->min_ttl; + break; default: release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65b8ebfd078..382f667238e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1649,6 +1649,9 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!sk) goto no_tcp_socket; + if (iph->ttl < inet_sk(sk)->min_ttl) + goto discard_and_relse; + process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; -- cgit v1.2.3 From 71fceff0ea36d5a6cffecb272b8b3970535fe905 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 15 Jan 2010 01:16:40 -0800 Subject: ipv4: Use less conflicting local var name in change_nexthops() loop macro. As noticed by H Hartley Sweeten, since change_nexthops() uses 'nh' as it's iterator variable, it can conflict with other existing local vars. Use "nexthop_nh" to avoid the conflict and make it easier to figure out where this magic variable comes from. Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 76 +++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ed19aa6919c..96b21011a3e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -62,8 +62,8 @@ static DEFINE_SPINLOCK(fib_multipath_lock); #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) -#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ -for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) +#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ +for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -72,7 +72,7 @@ for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, #define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ for (nhsel=0; nhsel < 1; nhsel++) -#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \ +#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ for (nhsel=0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -145,9 +145,9 @@ void free_fib_info(struct fib_info *fi) return; } change_nexthops(fi) { - if (nh->nh_dev) - dev_put(nh->nh_dev); - nh->nh_dev = NULL; + if (nexthop_nh->nh_dev) + dev_put(nexthop_nh->nh_dev); + nexthop_nh->nh_dev = NULL; } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); @@ -162,9 +162,9 @@ void fib_release_info(struct fib_info *fi) if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); change_nexthops(fi) { - if (!nh->nh_dev) + if (!nexthop_nh->nh_dev) continue; - hlist_del(&nh->nh_hash); + hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) fi->fib_dead = 1; fib_info_put(fi); @@ -395,19 +395,20 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; - nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; - nh->nh_oif = rtnh->rtnh_ifindex; - nh->nh_weight = rtnh->rtnh_hops + 1; + nexthop_nh->nh_flags = + (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; + nexthop_nh->nh_oif = rtnh->rtnh_ifindex; + nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - nh->nh_gw = nla ? nla_get_be32(nla) : 0; + nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; #ifdef CONFIG_NET_CLS_ROUTE nla = nla_find(attrs, attrlen, RTA_FLOW); - nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; + nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif } @@ -738,7 +739,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_nhs = nhs; change_nexthops(fi) { - nh->nh_parent = fi; + nexthop_nh->nh_parent = fi; } endfor_nexthops(fi) if (cfg->fc_mx) { @@ -808,7 +809,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } else { change_nexthops(fi) { - if ((err = fib_check_nh(cfg, fi, nh)) != 0) + if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) goto failure; } endfor_nexthops(fi) } @@ -843,11 +844,11 @@ link_it: struct hlist_head *head; unsigned int hash; - if (!nh->nh_dev) + if (!nexthop_nh->nh_dev) continue; - hash = fib_devindex_hashfn(nh->nh_dev->ifindex); + hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex); head = &fib_info_devhash[hash]; - hlist_add_head(&nh->nh_hash, head); + hlist_add_head(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) spin_unlock_bh(&fib_info_lock); return fi; @@ -1080,21 +1081,21 @@ int fib_sync_down_dev(struct net_device *dev, int force) prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) + if (nexthop_nh->nh_flags&RTNH_F_DEAD) dead++; - else if (nh->nh_dev == dev && - nh->nh_scope != scope) { - nh->nh_flags |= RTNH_F_DEAD; + else if (nexthop_nh->nh_dev == dev && + nexthop_nh->nh_scope != scope) { + nexthop_nh->nh_flags |= RTNH_F_DEAD; #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nh->nh_power; - nh->nh_power = 0; + fi->fib_power -= nexthop_nh->nh_power; + nexthop_nh->nh_power = 0; spin_unlock_bh(&fib_multipath_lock); #endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nh->nh_dev == dev) { + if (force > 1 && nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } @@ -1144,18 +1145,20 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { alive++; continue; } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + if (nexthop_nh->nh_dev == NULL || + !(nexthop_nh->nh_dev->flags&IFF_UP)) continue; - if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) + if (nexthop_nh->nh_dev != dev || + !__in_dev_get_rtnl(dev)) continue; alive++; spin_lock_bh(&fib_multipath_lock); - nh->nh_power = 0; - nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_power = 0; + nexthop_nh->nh_flags &= ~RTNH_F_DEAD; spin_unlock_bh(&fib_multipath_lock); } endfor_nexthops(fi) @@ -1182,9 +1185,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { - power += nh->nh_weight; - nh->nh_power = nh->nh_weight; + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; } } endfor_nexthops(fi); fi->fib_power = power; @@ -1204,9 +1207,10 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) w = jiffies % fi->fib_power; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { - if ((w -= nh->nh_power) <= 0) { - nh->nh_power--; + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && + nexthop_nh->nh_power) { + if ((w -= nexthop_nh->nh_power) <= 0) { + nexthop_nh->nh_power--; fi->fib_power--; res->nh_sel = nhsel; spin_unlock_bh(&fib_multipath_lock); -- cgit v1.2.3 From 72659ecce68588b74f6c46862c2b4cec137d7a5a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Sun, 17 Jan 2010 19:09:39 -0800 Subject: tcp: account SYN-ACK timeouts & retransmissions Currently we don't increment SYN-ACK timeouts & retransmissions although we do increment the same stats for SYN. We seem to have lost the SYN-ACK accounting with the introduction of tcp_syn_recv_timer (commit 2248761e in the netdev-vger-cvs tree). This patch fixes this issue. In the process we also rename the v4/v6 syn/ack retransmit functions for clarity. We also add a new request_socket operations (syn_ack_timeout) so we can keep code in inet_connection_sock.c protocol agnostic. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 2 ++ net/ipv4/tcp_ipv4.c | 18 ++++++++++-------- net/ipv4/tcp_timer.c | 6 ++++++ 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ee16475f8fc..8da6429269d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -529,6 +529,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, syn_ack_recalc(req, thresh, max_retries, queue->rskq_defer_accept, &expire, &resend); + if (req->rsk_ops->syn_ack_timeout) + req->rsk_ops->syn_ack_timeout(parent, req); if (!expire && (!resend || !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 382f667238e..356f544c4c1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -742,9 +742,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req, - struct request_values *rvp) +static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -775,10 +775,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, return err; } -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { - return __tcp_v4_send_synack(sk, NULL, req, rvp); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + return tcp_v4_send_synack(sk, NULL, req, rvp); } /* @@ -1192,10 +1193,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_v4_send_synack, + .rtx_syn_ack = tcp_v4_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, }; #ifdef CONFIG_TCP_MD5SIG @@ -1373,8 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } tcp_rsk(req)->snt_isn = isn; - if (__tcp_v4_send_synack(sk, dst, req, - (struct request_values *)&tmp_ext) || + if (tcp_v4_send_synack(sk, dst, req, + (struct request_values *)&tmp_ext) || want_cookie) goto drop_and_free; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8816a20c259..de7d1bf9114 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -474,6 +474,12 @@ static void tcp_synack_timer(struct sock *sk) TCP_TIMEOUT_INIT, TCP_RTO_MAX); } +void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) +{ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); +} +EXPORT_SYMBOL(tcp_syn_ack_timeout); + void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) -- cgit v1.2.3 From 2c8c1e7297e19bdef3c178c3ea41d898a7716e3e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 17 Jan 2010 03:35:32 +0000 Subject: net: spread __net_init, __net_exit __net_init/__net_exit are apparently not going away, so use them to full extent. In some cases __net_init was removed, because it was called from __net_exit code. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 4 ++-- net/ipv4/igmp.c | 4 ++-- net/ipv4/ip_fragment.c | 8 ++++---- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ipip.c | 7 +++---- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/udp.c | 4 ++-- net/ipv4/udplite.c | 4 ++-- 8 files changed, 19 insertions(+), 20 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 82dbf711d6d..9b3e28ed524 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -883,7 +883,7 @@ static void nl_fib_input(struct sk_buff *skb) netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); } -static int nl_fib_lookup_init(struct net *net) +static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, @@ -1004,7 +1004,7 @@ fail: return err; } -static void __net_exit ip_fib_net_exit(struct net *net) +static void ip_fib_net_exit(struct net *net) { unsigned int i; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 76c08402c93..8f5468393f0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2603,7 +2603,7 @@ static const struct file_operations igmp_mcf_seq_fops = { .release = seq_release_net, }; -static int igmp_net_init(struct net *net) +static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; @@ -2621,7 +2621,7 @@ out_igmp: return -ENOMEM; } -static void igmp_net_exit(struct net *net) +static void __net_exit igmp_net_exit(struct net *net) { proc_net_remove(net, "mcfilter"); proc_net_remove(net, "igmp"); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 86964b353c3..891c72aea52 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -646,7 +646,7 @@ static struct ctl_table ip4_frags_ctl_table[] = { { } }; -static int ip4_frags_ns_ctl_register(struct net *net) +static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -676,7 +676,7 @@ err_alloc: return -ENOMEM; } -static void ip4_frags_ns_ctl_unregister(struct net *net) +static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; @@ -704,7 +704,7 @@ static inline void ip4_frags_ctl_register(void) } #endif -static int ipv4_frags_init_net(struct net *net) +static int __net_init ipv4_frags_init_net(struct net *net) { /* * Fragment cache limits. We will commit 256K at one time. Should we @@ -726,7 +726,7 @@ static int ipv4_frags_init_net(struct net *net) return ip4_frags_ns_ctl_register(net); } -static void ipv4_frags_exit_net(struct net *net) +static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f36ce156cac..7631b20490f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1307,7 +1307,7 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) } } -static int ipgre_init_net(struct net *net) +static int __net_init ipgre_init_net(struct net *net) { struct ipgre_net *ign = net_generic(net, ipgre_net_id); int err; @@ -1334,7 +1334,7 @@ err_alloc_dev: return err; } -static void ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_net(struct net *net) { struct ipgre_net *ign; LIST_HEAD(list); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index eda04fed337..95db732e542 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -130,7 +130,6 @@ struct ipip_net { struct net_device *fb_tunnel_dev; }; -static void ipip_fb_tunnel_init(struct net_device *dev); static void ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); @@ -730,7 +729,7 @@ static void ipip_tunnel_init(struct net_device *dev) ipip_tunnel_bind_dev(dev); } -static void ipip_fb_tunnel_init(struct net_device *dev) +static void __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -773,7 +772,7 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) } } -static int ipip_init_net(struct net *net) +static int __net_init ipip_init_net(struct net *net) { struct ipip_net *ipn = net_generic(net, ipip_net_id); int err; @@ -806,7 +805,7 @@ err_alloc_dev: return err; } -static void ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_net(struct net *net) { struct ipip_net *ipn = net_generic(net, ipip_net_id); LIST_HEAD(list); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 356f544c4c1..c3588b4fd97 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2430,12 +2430,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = { }, }; -static int tcp4_proc_init_net(struct net *net) +static int __net_init tcp4_proc_init_net(struct net *net) { return tcp_proc_register(net, &tcp4_seq_afinfo); } -static void tcp4_proc_exit_net(struct net *net) +static void __net_exit tcp4_proc_exit_net(struct net *net) { tcp_proc_unregister(net, &tcp4_seq_afinfo); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f0126fdd7e0..4f7d2122d81 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2027,12 +2027,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = { }, }; -static int udp4_proc_init_net(struct net *net) +static int __net_init udp4_proc_init_net(struct net *net) { return udp_proc_register(net, &udp4_seq_afinfo); } -static void udp4_proc_exit_net(struct net *net) +static void __net_exit udp4_proc_exit_net(struct net *net) { udp_proc_unregister(net, &udp4_seq_afinfo); } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 66f79513f4a..6610bf76369 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -81,12 +81,12 @@ static struct udp_seq_afinfo udplite4_seq_afinfo = { }, }; -static int udplite4_proc_init_net(struct net *net) +static int __net_init udplite4_proc_init_net(struct net *net) { return udp_proc_register(net, &udplite4_seq_afinfo); } -static void udplite4_proc_exit_net(struct net *net) +static void __net_exit udplite4_proc_exit_net(struct net *net) { udp_proc_unregister(net, &udplite4_seq_afinfo); } -- cgit v1.2.3 From 0a931acfd19faf13129a22a46c06f330ecc2a4a3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 17 Jan 2010 03:32:50 +0000 Subject: ipv4: don't remove /proc/net/rt_acct /proc/net/rt_acct is not created if NET_CLS_ROUTE=n. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e446496f564..d62b05d3338 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -586,7 +586,9 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); +#ifdef CONFIG_NET_CLS_ROUTE remove_proc_entry("rt_acct", net->proc_net); +#endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { -- cgit v1.2.3 From 6d955180b2f9ccff444df06265160868cabb289a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Mon, 18 Jan 2010 12:58:44 +0000 Subject: ipv4: allow warming up the ARP cache with request type gratuitous ARP If the per device ARP_ACCEPT option is enable, currently we only allow creating new ARP cache entries for response type gratuitous ARP. Allowing gratuitous ARP to create new ARP entries (not only to update existing ones) is useful when we want to avoid unnecessary delays for the first packet of a stream. This patch allows request type gratuitous ARP to create new ARP cache entries as well. This is useful when we want to populate the ARP cache entries for a large number of hosts on the same LAN. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 078709233bc..1940b4df769 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -907,7 +907,8 @@ static int arp_process(struct sk_buff *skb) devices (strip is candidate) */ if (n == NULL && - arp->ar_op == htons(ARPOP_REPLY) && + (arp->ar_op == htons(ARPOP_REPLY) || + (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && inet_addr_type(net, sip) == RTN_UNICAST) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } -- cgit v1.2.3 From b4ced2b768ab6c580148d1163c82a655fe147edc Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 19 Jan 2010 14:12:20 -0800 Subject: netlink: With opcode INET_DIAG_BC_S_LE dport was compared in inet_diag_bc_run() The s-port should be compared. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index bdb78dd180c..1aaa8110d84 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -368,7 +368,7 @@ static int inet_diag_bc_run(const void *bc, int len, yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: - yes = entry->dport <= op[1].no; + yes = entry->sport <= op[1].no; break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; -- cgit v1.2.3 From 5833929cc2ad2b3064b4fac8c44e293972d240d8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 22 Jan 2010 10:17:26 +0000 Subject: net: constify MIB name tables Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f25542c48b7..1b09a6dde7c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -127,8 +127,8 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_SENTINEL }; -static struct { - char *name; +static const struct { + const char *name; int index; } icmpmibmap[] = { { "DestUnreachs", ICMP_DEST_UNREACH }, -- cgit v1.2.3 From e754834e65220b2b674c55c3b6dfb2fb1a2804d0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 22 Jan 2010 10:18:25 +0000 Subject: icmp: move icmp_err_convert[] to .rodata Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fe11f60ce41..4b4c2bcd15d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -114,7 +114,7 @@ struct icmp_bxm { /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ -struct icmp_err icmp_err_convert[] = { +const struct icmp_err icmp_err_convert[] = { { .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ .fatal = 0, -- cgit v1.2.3 From e9017b55189355e9e6569990a18919e83f35bccb Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Sat, 23 Jan 2010 01:57:42 -0800 Subject: IP: Send an ICMP "Fragment Reassembly Timeout" message when enabling connection track No matter whether connection track is enabled, an end host should send an ICMPv4 "Fragment Reassembly Timeout" message when defrag timeout. The reasons are following two points: 1. RFC 792 says: >>>> >> > > If a host reassembling a fragmented datagram cannot complete the >>>> >> > > reassembly due to missing fragments within its time limit it >>>> >> > > discards the datagram, and it may send a time exceeded message. >>>> >> > > >>>> >> > > If fragment zero is not available then no time exceeded need be >>>> >> > > sent at all. >>>> >> > > >>>> >> > > Read more: http://www.faqs.org/rfcs/rfc792.html#ixzz0aOXRD7Wp 2. Patrick McHardy also agrees with this opinion. :-) About the discussion of this opinion, refer to http://patchwork.ozlabs.org/patch/41649 The patch fixed the problem like this: When enabling connection track, fragments are received at PRE_ROUTING HOOK. If they are failed to reassemble, ip_expire() will be called. Before sending an ICMP "Fragment Reassembly Timeout" message, the patch searches router table to get the destination entry only for host type. The patch has been tested on both host type and route type. Signed-off-by: Shan Wei Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 891c72aea52..9f41bd31175 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -205,13 +207,37 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); - if (head->dev) - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - rcu_read_unlock(); + if (!head->dev) + goto out_rcu_unlock; + + /* + * Only search router table for the head fragment, + * when defraging timeout at PRE_ROUTING HOOK. + */ + if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { + const struct iphdr *iph = ip_hdr(head); + int err = ip_route_input(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (unlikely(err)) + goto out_rcu_unlock; + + /* + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; + + } + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); } + +out_rcu_unlock: + rcu_read_unlock(); out: spin_unlock(&qp->q.lock); ipq_put(qp); -- cgit v1.2.3 From d7c7544c3d5f59033d1bf3236bc7b289f5f26b75 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 24 Jan 2010 22:47:53 -0800 Subject: netns xfrm: deal with dst entries in netns GC is non-existent in netns, so after you hit GC threshold, no new dst entries will be created until someone triggers cleanup in init_net. Make xfrm4_dst_ops and xfrm6_dst_ops per-netns. This is not done in a generic way, because it woule waste (AF_MAX - 2) * sizeof(struct dst_ops) bytes per-netns. Reorder GC threshold initialization so it'd be done before registering XFRM policies. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/xfrm4_policy.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8c08a28d8f8..67107d63c1c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,7 +15,6 @@ #include #include -static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, @@ -190,8 +189,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) static inline int xfrm4_garbage_collect(struct dst_ops *ops) { - xfrm4_policy_afinfo.garbage_collect(&init_net); - return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); + struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); + + xfrm4_policy_afinfo.garbage_collect(net); + return (atomic_read(&ops->entries) > ops->gc_thresh * 2); } static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -268,7 +269,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { static struct ctl_table xfrm4_policy_table[] = { { .procname = "xfrm4_gc_thresh", - .data = &xfrm4_dst_ops.gc_thresh, + .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -295,8 +296,6 @@ static void __exit xfrm4_policy_fini(void) void __init xfrm4_init(int rt_max_size) { - xfrm4_state_init(); - xfrm4_policy_init(); /* * Select a default value for the gc_thresh based on the main route * table hash size. It seems to me the worst case scenario is when @@ -308,6 +307,9 @@ void __init xfrm4_init(int rt_max_size) * and start cleaning when were 1/2 full */ xfrm4_dst_ops.gc_thresh = rt_max_size/2; + + xfrm4_state_init(); + xfrm4_policy_init(); #ifdef CONFIG_SYSCTL sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, xfrm4_policy_table); -- cgit v1.2.3 From f81074f86176605bfbfafb9944e27628a4e26ce6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 25 Jan 2010 15:47:50 -0800 Subject: tcp_probe: avoid modulus operation and wrap fix By rounding up the buffer size to power of 2, several expensive modulus operations can be avoided. This patch also solves a bug where the gap need when ring gets full was not being accounted for. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index bb110c5ce1d..9bc805df95d 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -39,9 +39,9 @@ static int port __read_mostly = 0; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); -static int bufsize __read_mostly = 4096; +static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); -module_param(bufsize, int, 0); +module_param(bufsize, uint, 0); static int full __read_mostly; MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); @@ -75,12 +75,12 @@ static struct { static inline int tcp_probe_used(void) { - return (tcp_probe.head - tcp_probe.tail) % bufsize; + return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); } static inline int tcp_probe_avail(void) { - return bufsize - tcp_probe_used(); + return bufsize - tcp_probe_used() - 1; } /* @@ -116,7 +116,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, p->ssthresh = tcp_current_ssthresh(sk); p->srtt = tp->srtt >> 3; - tcp_probe.head = (tcp_probe.head + 1) % bufsize; + tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } tcp_probe.lastcwnd = tp->snd_cwnd; spin_unlock(&tcp_probe.lock); @@ -149,7 +149,7 @@ static int tcpprobe_open(struct inode * inode, struct file * file) static int tcpprobe_sprint(char *tbuf, int n) { const struct tcp_log *p - = tcp_probe.log + tcp_probe.tail % bufsize; + = tcp_probe.log + tcp_probe.tail; struct timespec tv = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); @@ -192,7 +192,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, width = tcpprobe_sprint(tbuf, sizeof(tbuf)); if (cnt + width < len) - tcp_probe.tail = (tcp_probe.tail + 1) % bufsize; + tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); spin_unlock_bh(&tcp_probe.lock); @@ -222,9 +222,10 @@ static __init int tcpprobe_init(void) init_waitqueue_head(&tcp_probe.wait); spin_lock_init(&tcp_probe.lock); - if (bufsize < 0) + if (bufsize == 0) return -EINVAL; + bufsize = roundup_pow_of_two(bufsize); tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); if (!tcp_probe.log) goto err0; @@ -236,7 +237,7 @@ static __init int tcpprobe_init(void) if (ret) goto err1; - pr_info("TCP probe registered (port=%d)\n", port); + pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize); return 0; err1: proc_net_remove(&init_net, procname); -- cgit v1.2.3 From a92df2545402c1a08e7a158f4477a52dea0eeeed Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 25 Jan 2010 10:38:34 +0000 Subject: netns xfrm: ipcomp support Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/ipcomp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 38fbf04150a..b55a0c3df82 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -25,6 +25,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) { + struct net *net = dev_net(skb->dev); __be32 spi; struct iphdr *iph = (struct iphdr *)skb->data; struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); @@ -35,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) return; spi = htonl(ntohs(ipch->cpi)); - x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, + x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; @@ -47,9 +48,10 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) /* We always hold one tunnel user reference to indicate a tunnel */ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { + struct net *net = xs_net(x); struct xfrm_state *t; - t = xfrm_state_alloc(&init_net); + t = xfrm_state_alloc(net); if (t == NULL) goto out; @@ -82,10 +84,11 @@ error: */ static int ipcomp_tunnel_attach(struct xfrm_state *x) { + struct net *net = xs_net(x); int err = 0; struct xfrm_state *t; - t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4, + t = xfrm_state_lookup(net, (xfrm_address_t *)&x->id.daddr.a4, x->props.saddr.a4, IPPROTO_IPIP, AF_INET); if (!t) { t = ipcomp_tunnel_create(x); -- cgit v1.2.3 From c85bb41e93184bf5494dde6d8fe5a81b564c84c8 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Tue, 2 Feb 2010 07:32:29 -0800 Subject: igmp: fix ip_mc_sf_allow race [v5] Almost all igmp functions accessing inet->mc_list are protected by rtnl_lock(), but there is one exception which is ip_mc_sf_allow(), so there is a chance of either ip_mc_drop_socket or ip_mc_leave_group remove an entry while ip_mc_sf_allow is running causing a crash. Signed-off-by: Flavio Leitner Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 83 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 21 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8f5468393f0..d2836399874 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1799,7 +1799,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml->next = inet->mc_list; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; - inet->mc_list = iml; + rcu_assign_pointer(inet->mc_list, iml); ip_mc_inc_group(in_dev, addr); err = 0; done: @@ -1807,24 +1807,46 @@ done: return err; } +static void ip_sf_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_sf_socklist *psf; + + psf = container_of(rp, struct ip_sf_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(psf); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { + struct ip_sf_socklist *psf = iml->sflist; int err; - if (iml->sflist == NULL) { + if (psf == NULL) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, - iml->sfmode, iml->sflist->sl_count, - iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + iml->sfmode, psf->sl_count, psf->sl_addr, 0); + rcu_assign_pointer(iml->sflist, NULL); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + call_rcu(&psf->rcu, ip_sf_socklist_reclaim); return err; } + +static void ip_mc_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_mc_socklist *iml; + + iml = container_of(rp, struct ip_mc_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(iml); +} + + /* * Ask a socket to leave a group. */ @@ -1854,12 +1876,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) (void) ip_mc_leave_src(sk, iml, in_dev); - *imlp = iml->next; + rcu_assign_pointer(*imlp, iml->next); if (in_dev) ip_mc_dec_group(in_dev, group); rtnl_unlock(); - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); return 0; } if (!in_dev) @@ -1974,9 +1998,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) { for (i=0; isl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } - pmc->sflist = psl = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); + psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; isl_count; i++) { @@ -2072,11 +2099,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); - pmc->sflist = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = msf->imsf_fmode; err = 0; done: @@ -2209,30 +2238,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; + int ret; + ret = 1; if (!ipv4_is_multicast(loc_addr)) - return 1; + goto out; - for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + rcu_read_lock(); + for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && pmc->multi.imr_ifindex == dif) break; } + ret = inet->mc_all; if (!pmc) - return inet->mc_all; + goto unlock; psl = pmc->sflist; + ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) - return pmc->sfmode == MCAST_EXCLUDE; + goto unlock; for (i=0; isl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } + ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) - return 0; + goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) - return 0; - return 1; + goto unlock; + ret = 1; +unlock: + rcu_read_unlock(); +out: + return ret; } /* @@ -2251,7 +2290,7 @@ void ip_mc_drop_socket(struct sock *sk) rtnl_lock(); while ((iml = inet->mc_list) != NULL) { struct in_device *in_dev; - inet->mc_list = iml->next; + rcu_assign_pointer(inet->mc_list, iml->next); in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); @@ -2259,7 +2298,9 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); in_dev_put(in_dev); } - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); } rtnl_unlock(); } -- cgit v1.2.3 From d1c9ae6d1e7b95cedc8e39e8949e795379a0669e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 2 Feb 2010 11:46:50 -0800 Subject: ipv4: ip_fragment: fix unbalanced rcu_read_unlock() Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9f41bd31175..b59430bc041 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -234,10 +234,9 @@ static void ip_expire(unsigned long arg) /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - } - out_rcu_unlock: - rcu_read_unlock(); + rcu_read_unlock(); + } out: spin_unlock(&qp->q.lock); ipq_put(qp); -- cgit v1.2.3 From d088dde7b19628f386c486f16cd471f5b5682ca8 Mon Sep 17 00:00:00 2001 From: Christoph Egger Date: Wed, 3 Feb 2010 06:05:54 +0000 Subject: ipv4: obsolete config in kernel source (IP_ROUTE_PERVASIVE) CONFIG_IP_ROUTE_PERVASIVE is missing a corresponding config IP_ROUTE_PERVASIVE somewhere in KConfig (and missing it for ages already) so it looks like some aging artefact no longer needed. Therefor this patch kills of the only remaining reference to that config Item removing the already unrechable code snipet. Signed-off-by: Christoph Egger Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 96b21011a3e..1af0ea0fb6a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -528,10 +528,6 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (nh->nh_gw) { struct fib_result res; -#ifdef CONFIG_IP_ROUTE_PERVASIVE - if (nh->nh_flags&RTNH_F_PERVASIVE) - return 0; -#endif if (nh->nh_flags&RTNH_F_ONLINK) { struct net_device *dev; -- cgit v1.2.3 From 14c7dbe043d01a83a30633ab6b109ba2ac61d9f7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 8 Feb 2010 11:17:43 -0800 Subject: netfilter: xtables: compat out of scope fix As per C99 6.2.4(2) when temporary table data goes out of scope, the behaviour is undefined: if (compat) { struct foo tmp; ... private = &tmp; } [dereference private] Signed-off-by: Alexey Dobriyan Cc: stable@kernel.org Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/arp_tables.c | 4 ++-- net/ipv4/netfilter/ip_tables.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 06632762ba5..90203e1b918 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -925,10 +925,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 572330a552e..3ce53cf13d5 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1132,10 +1132,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; -- cgit v1.2.3 From d696c7bdaa55e2208e56c6f98e6bc1599f34286d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 8 Feb 2010 11:18:07 -0800 Subject: netfilter: nf_conntrack: fix hash resizing with namespaces As noticed by Jon Masters , the conntrack hash size is global and not per namespace, but modifiable at runtime through /sys/module/nf_conntrack/hashsize. Changing the hash size will only resize the hash in the current namespace however, so other namespaces will use an invalid hash size. This can cause crashes when enlarging the hashsize, or false negative lookups when shrinking it. Move the hash size into the per-namespace data and only use the global hash size to initialize the per-namespace value when instanciating a new namespace. Additionally restrict hash resizing to init_net for now as other namespaces are not handled currently. Cc: stable@kernel.org Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 4 ++-- net/ipv4/netfilter/nf_nat_core.c | 22 +++++++++------------- 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index d171b123a65..d1ea38a7c49 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -210,7 +210,7 @@ static ctl_table ip_ct_sysctl_table[] = { }, { .procname = "ip_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 8668a3defda..2fb7b76da94 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < net->ct.htable_size; st->bucket++) { n = rcu_dereference(net->ct.hash[st->bucket].first); if (!is_a_nulls(n)) @@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(head->next); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= net->ct.htable_size) return NULL; } head = rcu_dereference(net->ct.hash[st->bucket].first); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index fe1a64479dd..26066a2327a 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -35,9 +35,6 @@ static DEFINE_SPINLOCK(nf_nat_lock); static struct nf_conntrack_l3proto *l3proto __read_mostly; -/* Calculated at init based on memory size */ -static unsigned int nf_nat_htable_size __read_mostly; - #define MAX_IP_NAT_PROTO 256 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; @@ -72,7 +69,7 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) { unsigned int hash; @@ -80,7 +77,7 @@ hash_by_src(const struct nf_conntrack_tuple *tuple) hash = jhash_3words((__force u32)tuple->src.u3.ip, (__force u32)tuple->src.u.all, tuple->dst.protonum, 0); - return ((u64)hash * nf_nat_htable_size) >> 32; + return ((u64)hash * net->ipv4.nat_htable_size) >> 32; } /* Is this tuple already taken? (not by us) */ @@ -147,7 +144,7 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(tuple); + unsigned int h = hash_by_src(net, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; const struct hlist_node *n; @@ -330,7 +327,7 @@ nf_nat_setup_info(struct nf_conn *ct, if (have_to_hash) { unsigned int srchash; - srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); @@ -679,8 +676,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int __net_init nf_nat_net_init(struct net *net) { - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + /* Leave them the same for the moment. */ + net->ipv4.nat_htable_size = net->ct.htable_size; + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, + &net->ipv4.nat_vmalloced, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -703,7 +702,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - nf_nat_htable_size); + net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { @@ -724,9 +723,6 @@ static int __init nf_nat_init(void) return ret; } - /* Leave them the same for the moment. */ - nf_nat_htable_size = nf_conntrack_htable_size; - ret = register_pernet_subsys(&nf_nat_net_ops); if (ret < 0) goto cleanup_extend; -- cgit v1.2.3