From e6f1cebf71c4e7aae7dfa43414ce2631291def9f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Mar 2008 22:44:53 -0700 Subject: [NET] endianness noise: INADDR_ANY Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ipconfig.c | 9 +++++---- net/sctp/protocol.c | 10 +++++----- net/sunrpc/svc_xprt.c | 2 +- 5 files changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 09ca5293d08..0d109504ed8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -458,7 +458,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && !inet->freebind && - addr->sin_addr.s_addr != INADDR_ANY && + addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index de0572c8885..f72457b4b0a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -583,7 +583,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } if (!mreq.imr_ifindex) { - if (mreq.imr_address.s_addr == INADDR_ANY) { + if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { inet->mc_index = 0; inet->mc_addr = 0; err = 0; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 5dd938579ee..7c992fbbc2c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -103,6 +103,7 @@ - '3' from resolv.h */ #define NONE __constant_htonl(INADDR_NONE) +#define ANY __constant_htonl(INADDR_ANY) /* * Public IP configuration @@ -1479,19 +1480,19 @@ static int __init ip_auto_config_setup(char *addrs) DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); switch (num) { case 0: - if ((ic_myaddr = in_aton(ip)) == INADDR_ANY) + if ((ic_myaddr = in_aton(ip)) == ANY) ic_myaddr = NONE; break; case 1: - if ((ic_servaddr = in_aton(ip)) == INADDR_ANY) + if ((ic_servaddr = in_aton(ip)) == ANY) ic_servaddr = NONE; break; case 2: - if ((ic_gateway = in_aton(ip)) == INADDR_ANY) + if ((ic_gateway = in_aton(ip)) == ANY) ic_gateway = NONE; break; case 3: - if ((ic_netmask = in_aton(ip)) == INADDR_ANY) + if ((ic_netmask = in_aton(ip)) == ANY) ic_netmask = NONE; break; case 4: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ad0a4069b95..7a7646a9565 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -337,14 +337,14 @@ static int sctp_v4_cmp_addr(const union sctp_addr *addr1, static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port) { addr->v4.sin_family = AF_INET; - addr->v4.sin_addr.s_addr = INADDR_ANY; + addr->v4.sin_addr.s_addr = htonl(INADDR_ANY); addr->v4.sin_port = port; } /* Is this a wildcard address? */ static int sctp_v4_is_any(const union sctp_addr *addr) { - return INADDR_ANY == addr->v4.sin_addr.s_addr; + return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr; } /* This function checks if the address is a valid address to be used for @@ -375,7 +375,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) int ret = inet_addr_type(&init_net, addr->v4.sin_addr.s_addr); - if (addr->v4.sin_addr.s_addr != INADDR_ANY && + if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && !sysctl_ip_nonlocal_bind) @@ -785,8 +785,8 @@ static int sctp_inet_cmp_addr(const union sctp_addr *addr1, /* PF_INET only supports AF_INET addresses. */ if (addr1->sa.sa_family != addr2->sa.sa_family) return 0; - if (INADDR_ANY == addr1->v4.sin_addr.s_addr || - INADDR_ANY == addr2->v4.sin_addr.s_addr) + if (htonl(INADDR_ANY) == addr1->v4.sin_addr.s_addr || + htonl(INADDR_ANY) == addr2->v4.sin_addr.s_addr) return 1; if (addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr) return 1; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ea377e06afa..332eb47539e 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -185,7 +185,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, struct svc_xprt_class *xcl; struct sockaddr_in sin = { .sin_family = AF_INET, - .sin_addr.s_addr = INADDR_ANY, + .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; dprintk("svc: creating transport %s[%d]\n", xprt_name, port); -- cgit v1.2.3 From 0382b9c35469be273ed10fa374496a924055a3c8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Mar 2008 22:46:46 -0700 Subject: [PKT_SCHED]: annotate cls_u32 Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 8 ++++---- net/sched/em_u32.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b18fa95ef24..c5c16b4b6e9 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -89,7 +89,7 @@ static const struct tcf_ext_map u32_ext_map = { static struct tc_u_common *u32_list; -static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift) +static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift) { unsigned h = ntohl(key & sel->hmask)>>fshift; @@ -137,7 +137,7 @@ next_knode: for (i = n->sel.nkeys; i>0; i--, key++) { - if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + if ((*(__be32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { n = n->next; goto next_knode; } @@ -182,7 +182,7 @@ check_terminal: ht = n->ht_down; sel = 0; if (ht->divisor) - sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift); + sel = ht->divisor&u32_hash_fold(*(__be32*)(ptr+n->sel.hoff), &n->sel,n->fshift); if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) goto next_ht; @@ -190,7 +190,7 @@ check_terminal: if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; if (n->sel.flags&TC_U32_VAROFFSET) - off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + off2 += ntohs(n->sel.offmask & *(__be16*)(ptr+n->sel.offoff)) >>n->sel.offshift; off2 &= ~3; } if (n->sel.flags&TC_U32_EAT) { diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index 112796e4a7c..953f1479f7d 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, if (!tcf_valid_offset(skb, ptr, sizeof(u32))) return 0; - return !(((*(u32*) ptr) ^ key->val) & key->mask); + return !(((*(__be32*) ptr) ^ key->val) & key->mask); } static struct tcf_ematch_ops em_u32_ops = { -- cgit v1.2.3 From bc92dd194d05e8334b210552fbc0ac5711d72ea9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Mar 2008 22:47:32 -0700 Subject: [SCTP]: fix misannotated __sctp_rcv_asconf_lookup() Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 57fe2f81eca..812ff1756c3 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -944,7 +944,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb, static struct sctp_association *__sctp_rcv_asconf_lookup( sctp_chunkhdr_t *ch, const union sctp_addr *laddr, - __be32 peer_port, + __be16 peer_port, struct sctp_transport **transportp) { sctp_addip_chunk_t *asconf = (struct sctp_addip_chunk *)ch; -- cgit v1.2.3 From 27724426a9000086993a8107a11cff276c4bd4d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Mar 2008 22:48:03 -0700 Subject: [SUNRPC]: net/* NULL noise Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/sunrpc/auth_gss/gss_mech_switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 61801a069ff..bce9d527af0 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -317,7 +317,7 @@ gss_delete_sec_context(struct gss_ctx **context_handle) if (!*context_handle) return(GSS_S_NO_CONTEXT); - if ((*context_handle)->internal_ctx_id != 0) + if ((*context_handle)->internal_ctx_id) (*context_handle)->mech_type->gm_ops ->gss_delete_sec_context((*context_handle) ->internal_ctx_id); -- cgit v1.2.3 From 9534f035ee67630688447e8aac91b1d365a51913 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Mar 2008 22:49:48 -0700 Subject: [8021Q]: vlan_dev misannotations Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 77f04e49a1a..8fbcefe10c9 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -382,7 +382,7 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) vlan_dev_info(dev)->cnt_encap_on_xmit++; pr_debug("%s: proto to encap: 0x%hx\n", - __FUNCTION__, htons(veth->h_vlan_proto)); + __FUNCTION__, ntohs(veth->h_vlan_proto)); /* Construct the second two bytes. This field looks something * like: * usr_priority: 3 bits (high bits) -- cgit v1.2.3 From 5e226e4d9016daee170699f8a4188a5505021756 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 17 Mar 2008 22:50:23 -0700 Subject: [IPV4]: esp_output() misannotations Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 091e6709f83..f3ceca31aa4 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -168,7 +168,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_encap_tmpl *encap = x->encap; struct udphdr *uh; __be32 *udpdata32; - unsigned int sport, dport; + __be16 sport, dport; int encap_type; spin_lock_bh(&x->lock); -- cgit v1.2.3 From 6f3d09291b4982991680b61763b2541e53e2a95f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Mar 2008 01:44:24 +0100 Subject: sched, net: socket wakeups are sync 'sync' wakeups are a hint towards the scheduler that (certain) networking related wakeups likely create coupling between tasks. Signed-off-by: Ingo Molnar --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 09cb3a74de7..2654c147c00 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1621,7 +1621,7 @@ static void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->sk_callback_lock); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); read_unlock(&sk->sk_callback_lock); } @@ -1635,7 +1635,7 @@ static void sock_def_write_space(struct sock *sk) */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_sync(sk->sk_sleep); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) -- cgit v1.2.3 From 6aebb9b280e5662ece41cf570e25e61795443985 Mon Sep 17 00:00:00 2001 From: Roel Kluin <12o3l@tiscali.nl> Date: Thu, 20 Mar 2008 15:06:23 -0700 Subject: [NETFILTER]: nf_conntrack_h323: logical-bitwise & confusion in process_setup() logical-bitwise & confusion Signed-off-by: Roel Kluin <12o3l@tiscali.nl> Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_h323_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 62137879e6a..898f1922b5b 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -842,7 +842,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct, set_h225_addr = rcu_dereference(set_h225_addr_hook); if ((setup->options & eSetup_UUIE_destCallSignalAddress) && - (set_h225_addr) && ct->status && IPS_NAT_MASK && + (set_h225_addr) && ct->status & IPS_NAT_MASK && get_h225_addr(ct, *data, &setup->destCallSignalAddress, &addr, &port) && memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) { -- cgit v1.2.3 From d0ebf133590abdc035af6e19a6568667af0ab3b0 Mon Sep 17 00:00:00 2001 From: Daniel Hokka Zakrisson Date: Thu, 20 Mar 2008 15:07:10 -0700 Subject: [NETFILTER]: ipt_recent: sanity check hit count If a rule using ipt_recent is created with a hit count greater than ip_pkt_list_tot, the rule will never match as it cannot keep track of enough timestamps. This patch makes ipt_recent refuse to create such rules. With ip_pkt_list_tot's default value of 20, the following can be used to reproduce the problem. nc -u -l 0.0.0.0 1234 & for i in `seq 1 100`; do echo $i | nc -w 1 -u 127.0.0.1 1234; done This limits it to 20 packets: iptables -A OUTPUT -p udp --dport 1234 -m recent --set --name test \ --rsource iptables -A OUTPUT -p udp --dport 1234 -m recent --update --seconds \ 60 --hitcount 20 --name test --rsource -j DROP While this is unlimited: iptables -A OUTPUT -p udp --dport 1234 -m recent --set --name test \ --rsource iptables -A OUTPUT -p udp --dport 1234 -m recent --update --seconds \ 60 --hitcount 21 --name test --rsource -j DROP With the patch the second rule-set will throw an EINVAL. Reported-by: Sean Kennedy Signed-off-by: Daniel Hokka Zakrisson Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 68cbe3ca01c..8e8f0425a8e 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -252,6 +252,8 @@ recent_mt_check(const char *tablename, const void *ip, if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && (info->seconds || info->hit_count)) return false; + if (info->hit_count > ip_pkt_list_tot) + return false; if (info->name[0] == '\0' || strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) return false; -- cgit v1.2.3 From 270637abff0cdf848b910b9f96ad342e1da61c66 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Thu, 20 Mar 2008 15:17:14 -0700 Subject: [SCTP]: Fix a race between module load and protosw access There is a race is SCTP between the loading of the module and the access by the socket layer to the protocol functions. In particular, a list of addresss that SCTP maintains is not initialized prior to the registration with the protosw. Thus it is possible for a user application to gain access to SCTP functions before everything has been initialized. The problem shows up as odd crashes during connection initializtion when we try to access the SCTP address list. The solution is to refactor how we do registration and initialize the lists prior to registering with the protosw. Care must be taken since the address list initialization depends on some other pieces of SCTP initialization. Also the clean-up in case of failure now also needs to be refactored. Signed-off-by: Vlad Yasevich Acked-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 32 ++++++++----- net/sctp/protocol.c | 129 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 104 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 9aa0733aee8..b1e05d719f9 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1014,15 +1014,24 @@ static struct sctp_pf sctp_pf_inet6 = { }; /* Initialize IPv6 support and register with socket layer. */ -int sctp_v6_init(void) +void sctp_v6_pf_init(void) { - int rc; - /* Register the SCTP specific PF_INET6 functions. */ sctp_register_pf(&sctp_pf_inet6, PF_INET6); /* Register the SCTP specific AF_INET6 functions. */ sctp_register_af(&sctp_af_inet6); +} + +void sctp_v6_pf_exit(void) +{ + list_del(&sctp_af_inet6.list); +} + +/* Initialize IPv6 support and register with socket layer. */ +int sctp_v6_protosw_init(void) +{ + int rc; rc = proto_register(&sctpv6_prot, 1); if (rc) @@ -1035,6 +1044,14 @@ int sctp_v6_init(void) return 0; } +void sctp_v6_protosw_exit(void) +{ + inet6_unregister_protosw(&sctpv6_seqpacket_protosw); + inet6_unregister_protosw(&sctpv6_stream_protosw); + proto_unregister(&sctpv6_prot); +} + + /* Register with inet6 layer. */ int sctp_v6_add_protocol(void) { @@ -1047,15 +1064,6 @@ int sctp_v6_add_protocol(void) return 0; } -/* IPv6 specific exit support. */ -void sctp_v6_exit(void) -{ - inet6_unregister_protosw(&sctpv6_seqpacket_protosw); - inet6_unregister_protosw(&sctpv6_stream_protosw); - proto_unregister(&sctpv6_prot); - list_del(&sctp_af_inet6.list); -} - /* Unregister with inet6 layer. */ void sctp_v6_del_protocol(void) { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 7a7646a9565..f90091a1b9c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -992,6 +992,58 @@ static void cleanup_sctp_mibs(void) free_percpu(sctp_statistics[1]); } +static void sctp_v4_pf_init(void) +{ + /* Initialize the SCTP specific PF functions. */ + sctp_register_pf(&sctp_pf_inet, PF_INET); + sctp_register_af(&sctp_af_inet); +} + +static void sctp_v4_pf_exit(void) +{ + list_del(&sctp_af_inet.list); +} + +static int sctp_v4_protosw_init(void) +{ + int rc; + + rc = proto_register(&sctp_prot, 1); + if (rc) + return rc; + + /* Register SCTP(UDP and TCP style) with socket layer. */ + inet_register_protosw(&sctp_seqpacket_protosw); + inet_register_protosw(&sctp_stream_protosw); + + return 0; +} + +static void sctp_v4_protosw_exit(void) +{ + inet_unregister_protosw(&sctp_stream_protosw); + inet_unregister_protosw(&sctp_seqpacket_protosw); + proto_unregister(&sctp_prot); +} + +static int sctp_v4_add_protocol(void) +{ + /* Register notifier for inet address additions/deletions. */ + register_inetaddr_notifier(&sctp_inetaddr_notifier); + + /* Register SCTP with inet layer. */ + if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) + return -EAGAIN; + + return 0; +} + +static void sctp_v4_del_protocol(void) +{ + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); + unregister_inetaddr_notifier(&sctp_inetaddr_notifier); +} + /* Initialize the universe into something sensible. */ SCTP_STATIC __init int sctp_init(void) { @@ -1035,8 +1087,6 @@ SCTP_STATIC __init int sctp_init(void) /* Initialize object count debugging. */ sctp_dbg_objcnt_init(); - /* Initialize the SCTP specific PF functions. */ - sctp_register_pf(&sctp_pf_inet, PF_INET); /* * 14. Suggested SCTP Protocol Parameter Values */ @@ -1194,19 +1244,22 @@ SCTP_STATIC __init int sctp_init(void) sctp_sysctl_register(); INIT_LIST_HEAD(&sctp_address_families); - sctp_register_af(&sctp_af_inet); + sctp_v4_pf_init(); + sctp_v6_pf_init(); - status = proto_register(&sctp_prot, 1); - if (status) - goto err_proto_register; + /* Initialize the local address list. */ + INIT_LIST_HEAD(&sctp_local_addr_list); + spin_lock_init(&sctp_local_addr_lock); + sctp_get_local_addr_list(); - /* Register SCTP(UDP and TCP style) with socket layer. */ - inet_register_protosw(&sctp_seqpacket_protosw); - inet_register_protosw(&sctp_stream_protosw); + status = sctp_v4_protosw_init(); - status = sctp_v6_init(); if (status) - goto err_v6_init; + goto err_protosw_init; + + status = sctp_v6_protosw_init(); + if (status) + goto err_v6_protosw_init; /* Initialize the control inode/socket for handling OOTB packets. */ if ((status = sctp_ctl_sock_init())) { @@ -1215,19 +1268,9 @@ SCTP_STATIC __init int sctp_init(void) goto err_ctl_sock_init; } - /* Initialize the local address list. */ - INIT_LIST_HEAD(&sctp_local_addr_list); - spin_lock_init(&sctp_local_addr_lock); - sctp_get_local_addr_list(); - - /* Register notifier for inet address additions/deletions. */ - register_inetaddr_notifier(&sctp_inetaddr_notifier); - - /* Register SCTP with inet layer. */ - if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) { - status = -EAGAIN; + status = sctp_v4_add_protocol(); + if (status) goto err_add_protocol; - } /* Register SCTP with inet6 layer. */ status = sctp_v6_add_protocol(); @@ -1238,18 +1281,18 @@ SCTP_STATIC __init int sctp_init(void) out: return status; err_v6_add_protocol: - inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); - unregister_inetaddr_notifier(&sctp_inetaddr_notifier); + sctp_v6_del_protocol(); err_add_protocol: - sctp_free_local_addr_list(); + sctp_v4_del_protocol(); sock_release(sctp_ctl_socket); err_ctl_sock_init: - sctp_v6_exit(); -err_v6_init: - inet_unregister_protosw(&sctp_stream_protosw); - inet_unregister_protosw(&sctp_seqpacket_protosw); - proto_unregister(&sctp_prot); -err_proto_register: + sctp_v6_protosw_exit(); +err_v6_protosw_init: + sctp_v4_protosw_exit(); +err_protosw_init: + sctp_free_local_addr_list(); + sctp_v4_pf_exit(); + sctp_v6_pf_exit(); sctp_sysctl_unregister(); list_del(&sctp_af_inet.list); free_pages((unsigned long)sctp_port_hashtable, @@ -1282,23 +1325,21 @@ SCTP_STATIC __exit void sctp_exit(void) /* Unregister with inet6/inet layers. */ sctp_v6_del_protocol(); - inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); - - /* Unregister notifier for inet address additions/deletions. */ - unregister_inetaddr_notifier(&sctp_inetaddr_notifier); - - /* Free the local address list. */ - sctp_free_local_addr_list(); + sctp_v4_del_protocol(); /* Free the control endpoint. */ sock_release(sctp_ctl_socket); - /* Cleanup v6 initializations. */ - sctp_v6_exit(); + /* Free protosw registrations */ + sctp_v6_protosw_exit(); + sctp_v4_protosw_exit(); + + /* Free the local address list. */ + sctp_free_local_addr_list(); /* Unregister with socket layer. */ - inet_unregister_protosw(&sctp_stream_protosw); - inet_unregister_protosw(&sctp_seqpacket_protosw); + sctp_v6_pf_exit(); + sctp_v4_pf_exit(); sctp_sysctl_unregister(); list_del(&sctp_af_inet.list); @@ -1317,8 +1358,6 @@ SCTP_STATIC __exit void sctp_exit(void) kmem_cache_destroy(sctp_chunk_cachep); kmem_cache_destroy(sctp_bucket_cachep); - - proto_unregister(&sctp_prot); } module_init(sctp_init); -- cgit v1.2.3 From 2bec008ca9fd009aa503b75344d1c22da9256141 Mon Sep 17 00:00:00 2001 From: Fabio Checconi Date: Thu, 20 Mar 2008 15:54:58 -0700 Subject: bridge: use time_before() in br_fdb_cleanup() In br_fdb_cleanup() next_timer and this_timer are in jiffies, so they should be compared using the time_after() macro. Signed-off-by: Fabio Checconi Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index bc40377136a..9326c377822 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -136,7 +136,7 @@ void br_fdb_cleanup(unsigned long _data) this_timer = f->ageing_timer + delay; if (time_before_eq(this_timer, jiffies)) fdb_delete(f); - else if (this_timer < next_timer) + else if (time_before(this_timer, next_timer)) next_timer = this_timer; } } -- cgit v1.2.3 From 8a455b087c9629b3ae3b521b4f1ed16672f978cc Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Thu, 20 Mar 2008 16:07:27 -0700 Subject: netpoll: zap_completion_queue: adjust skb->users counter zap_completion_queue() retrieves skbs from completion_queue where they have zero skb->users counter. Before dev_kfree_skb_any() it should be non-zero yet, so it's increased now. Reported-and-tested-by: Andrew Morton Signed-off-by: Jarek Poplawski Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/core/netpoll.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4b7e756181c..c635de52526 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -215,10 +215,12 @@ static void zap_completion_queue(void) while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; - if (skb->destructor) + if (skb->destructor) { + atomic_inc(&skb->users); dev_kfree_skb_any(skb); /* put this one back */ - else + } else { __kfree_skb(skb); + } } } -- cgit v1.2.3 From 607bfbf2d55dd1cfe5368b41c2a81a8c9ccf4723 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 20 Mar 2008 16:11:27 -0700 Subject: [TCP]: Fix shrinking windows with window scaling When selecting a new window, tcp_select_window() tries not to shrink the offered window by using the maximum of the remaining offered window size and the newly calculated window size. The newly calculated window size is always a multiple of the window scaling factor, the remaining window size however might not be since it depends on rcv_wup/rcv_nxt. This means we're effectively shrinking the window when scaling it down. The dump below shows the problem (scaling factor 2^7): - Window size of 557 (71296) is advertised, up to 3111907257: IP 172.2.2.3.33000 > 172.2.2.2.33000: . ack 3111835961 win 557 <...> - New window size of 514 (65792) is advertised, up to 3111907217, 40 bytes below the last end: IP 172.2.2.3.33000 > 172.2.2.2.33000: . 3113575668:3113577116(1448) ack 3111841425 win 514 <...> The number 40 results from downscaling the remaining window: 3111907257 - 3111841425 = 65832 65832 / 2^7 = 514 65832 % 2^7 = 40 If the sender uses up the entire window before it is shrunk, this can have chaotic effects on the connection. When sending ACKs, tcp_acceptable_seq() will notice that the window has been shrunk since tcp_wnd_end() is before tp->snd_nxt, which makes it choose tcp_wnd_end() as sequence number. This will fail the receivers checks in tcp_sequence() however since it is before it's tp->rcv_wup, making it respond with a dupack. If both sides are in this condition, this leads to a constant flood of ACKs until the connection times out. Make sure the window is never shrunk by aligning the remaining window to the window scaling factor. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 01578f544ad..72b9350006f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -255,7 +255,7 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ - new_win = cur_win; + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; -- cgit v1.2.3 From 38fe999e2286139cccdaa500a81bd49a16a81158 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 20 Mar 2008 16:13:58 -0700 Subject: [IPV6] KCONFIG: Fix description about IPV6_TUNNEL. Based on notice from "Colin" . Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 58219dfffef..47263e45bac 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -179,11 +179,12 @@ config IPV6_SIT Saying M here will produce a module called sit.ko. If unsure, say Y. config IPV6_TUNNEL - tristate "IPv6: IPv6-in-IPv6 tunnel" + tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" select INET6_TUNNEL depends on IPV6 ---help--- - Support for IPv6-in-IPv6 tunnels described in RFC 2473. + Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in + RFC 2473. If unsure, say N. -- cgit v1.2.3 From 12b101555f4a67db67a66966a516075bd477741f Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Fri, 21 Mar 2008 15:01:50 -0700 Subject: [IPV4]: Fix null dereference in ip_defrag Been seeing occasional panics in my testing of 2.6.25-rc in ip_defrag. Offending line in ip_defrag is here: net = skb->dev->nd_net where dev is NULL. Bisected the problem down to commit ac18e7509e7df327e30d6e073a787d922eaf211d ([NETNS][FRAGS]: Make the inet_frag_queue lookup work in namespaces). Below patch (idea from Patrick McHardy) fixes the problem for me. Signed-off-by: Phil Oester Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a2e92f9709d..3b2e5adca83 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -568,7 +568,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); - net = skb->dev->nd_net; + net = skb->dev ? skb->dev->nd_net : skb->dst->dev->nd_net; /* Start by cleaning up the memory. */ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) ip_evictor(net); -- cgit v1.2.3 From 7512cbf6efc97644812f137527a54b8e92b6a90a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 21 Mar 2008 15:58:52 -0700 Subject: [DLCI]: Fix tiny race between module unload and sock_ioctl. This is a narrow pedantry :) but the dlci_ioctl_hook check and call should not be parted with the mutex lock. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/socket.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index b6d35cd72a5..9d3fbfbc853 100644 --- a/net/socket.c +++ b/net/socket.c @@ -909,11 +909,10 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!dlci_ioctl_hook) request_module("dlci"); - if (dlci_ioctl_hook) { - mutex_lock(&dlci_ioctl_mutex); + mutex_lock(&dlci_ioctl_mutex); + if (dlci_ioctl_hook) err = dlci_ioctl_hook(cmd, argp); - mutex_unlock(&dlci_ioctl_mutex); - } + mutex_unlock(&dlci_ioctl_mutex); break; default: err = sock->ops->ioctl(sock, cmd, arg); -- cgit v1.2.3 From 69d1506731168d6845a76a303b2c45f7c05f3f2c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 22 Mar 2008 15:47:05 -0700 Subject: [TCP]: Let skbs grow over a page on fast peers While testing the virtio-net driver on KVM with TSO I noticed that TSO performance with a 1500 MTU is significantly worse compared to the performance of non-TSO with a 16436 MTU. The packet dump shows that most of the packets sent are smaller than a page. Looking at the code this actually is quite obvious as it always stop extending the packet if it's the first packet yet to be sent and if it's larger than the MSS. Since each extension is bound by the page size, this means that (given a 1500 MTU) we're very unlikely to construct packets greater than a page, provided that the receiver and the path is fast enough so that packets can always be sent immediately. The fix is also quite obvious. The push calls inside the loop is just an optimisation so that we don't end up doing all the sending at the end of the loop. Therefore there is no specific reason why it has to do so at MSS boundaries. For TSO, the most natural extension of this optimisation is to do the pushing once the skb exceeds the TSO size goal. This is what the patch does and testing with KVM shows that the TSO performance with a 1500 MTU easily surpasses that of a 16436 MTU and indeed the packet sizes sent are generally larger than 16436. I don't see any obvious downsides for slower peers or connections, but it would be prudent to test this extensively to ensure that those cases don't regress. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 071e83a894a..39b629ac240 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -735,7 +735,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -981,7 +981,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { -- cgit v1.2.3 From 6440cc9e0f48ade57af7be28008cbfa6a991f287 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 22 Mar 2008 17:59:58 -0700 Subject: [IPV4] fib_trie: fix warning from rcu_assign_poinger This gets rid of a warning caused by the test in rcu_assign_pointer. I tried to fix rcu_assign_pointer, but that devolved into a long set of discussions about doing it right that came to no real solution. Since the test in rcu_assign_pointer for constant NULL would never succeed in fib_trie, just open code instead. Signed-off-by: Stephen Hemminger Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1ff446d0fa8..f6cdc012eec 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -177,10 +177,13 @@ static inline struct tnode *node_parent_rcu(struct node *node) return rcu_dereference(ret); } +/* Same as rcu_assign_pointer + * but that macro() assumes that value is a pointer. + */ static inline void node_set_parent(struct node *node, struct tnode *ptr) { - rcu_assign_pointer(node->parent, - (unsigned long)ptr | NODE_TYPE(node)); + smp_wmb(); + node->parent = (unsigned long)ptr | NODE_TYPE(node); } static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) -- cgit v1.2.3 From 421f099bc555c5f1516fdf5060de1d6bb5f51002 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 22 Mar 2008 18:04:16 -0700 Subject: [IPV6] net/ipv6/ndisc.c: remove unused variable The variable hlen is initialized but never used otherwise. The semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @@ type T; identifier i; constant C; @@ ( extern T i; | - T i; <+... when != i - i = C; ...+> ) // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0d33a7d3212..51557c27a0c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1420,7 +1420,6 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, u8 *opt; int rd_len; int err; - int hlen; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; dev = skb->dev; @@ -1491,7 +1490,6 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, return; } - hlen = 0; skb_reserve(buff, LL_RESERVED_SPACE(dev)); ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, -- cgit v1.2.3 From 53a6201fdfa04accc91ea1a7accce8e8bc37ef8e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 22 Mar 2008 18:05:33 -0700 Subject: [9P] net/9p/trans_fd.c: remove unused variable The variable cb is initialized but never used otherwise. The semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @@ type T; identifier i; constant C; @@ ( extern T i; | - T i; <+... when != i - i = C; ...+> ) // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/9p/trans_fd.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 1aa9d517539..4e8d4e724b9 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -861,7 +861,6 @@ static void p9_mux_free_request(struct p9_conn *m, struct p9_req *req) static void p9_mux_flush_cb(struct p9_req *freq, void *a) { - p9_conn_req_callback cb; int tag; struct p9_conn *m; struct p9_req *req, *rreq, *rptr; @@ -872,7 +871,6 @@ static void p9_mux_flush_cb(struct p9_req *freq, void *a) freq->tcall->params.tflush.oldtag); spin_lock(&m->lock); - cb = NULL; tag = freq->tcall->params.tflush.oldtag; req = NULL; list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { -- cgit v1.2.3 From dbee0d3f4603b9d0e56234a0743321fe4dad31ca Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Sun, 23 Mar 2008 21:45:36 -0700 Subject: [ATM]: When proc_create() fails, do some error handling work and return -ENOMEM. Signed-off-by: Wang Chen Signed-off-by: David S. Miller --- net/atm/clip.c | 19 ++++++++++++++++--- net/atm/lec.c | 4 ++++ 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index d30167c0b48..2ab1e36098f 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -947,6 +947,8 @@ static const struct file_operations arp_seq_fops = { }; #endif +static void atm_clip_exit_noproc(void); + static int __init atm_clip_init(void) { neigh_table_init_no_netlink(&clip_tbl); @@ -963,18 +965,22 @@ static int __init atm_clip_init(void) struct proc_dir_entry *p; p = proc_create("arp", S_IRUGO, atm_proc_root, &arp_seq_fops); + if (!p) { + printk(KERN_ERR "Unable to initialize " + "/proc/net/atm/arp\n"); + atm_clip_exit_noproc(); + return -ENOMEM; + } } #endif return 0; } -static void __exit atm_clip_exit(void) +static void atm_clip_exit_noproc(void) { struct net_device *dev, *next; - remove_proc_entry("arp", atm_proc_root); - unregister_inetaddr_notifier(&clip_inet_notifier); unregister_netdevice_notifier(&clip_dev_notifier); @@ -1005,6 +1011,13 @@ static void __exit atm_clip_exit(void) clip_tbl_hook = NULL; } +static void __exit atm_clip_exit(void) +{ + remove_proc_entry("arp", atm_proc_root); + + atm_clip_exit_noproc(); +} + module_init(atm_clip_init); module_exit(atm_clip_exit); MODULE_AUTHOR("Werner Almesberger"); diff --git a/net/atm/lec.c b/net/atm/lec.c index 0e450d12f03..a2efa7ff41f 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1250,6 +1250,10 @@ static int __init lane_module_init(void) struct proc_dir_entry *p; p = proc_create("lec", S_IRUGO, atm_proc_root, &lec_seq_fops); + if (!p) { + printk(KERN_ERR "Unable to initialize /proc/net/atm/lec\n"); + return -ENOMEM; + } #endif register_atm_ioctl(&lane_ioctl_ops); -- cgit v1.2.3 From 8f3ea33a5078a09eba12bfe57424507809367756 Mon Sep 17 00:00:00 2001 From: Martin Devera Date: Sun, 23 Mar 2008 22:00:38 -0700 Subject: sch_htb: fix "too many events" situation HTB is event driven algorithm and part of its work is to apply scheduled events at proper times. It tried to defend itself from livelock by processing only limited number of events per dequeue. Because of faster computers some users already hit this hardcoded limit. This patch limits processing up to 2 jiffies (why not 1 jiffie ? because it might stop prematurely when only fraction of jiffie remains). Signed-off-by: Martin Devera Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 795c761ad99..66148cc4759 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -711,9 +711,11 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, */ static psched_time_t htb_do_events(struct htb_sched *q, int level) { - int i; - - for (i = 0; i < 500; i++) { + /* don't run for longer than 2 jiffies; 2 is used instead of + 1 to simplify things when jiffy is going to be incremented + too soon */ + unsigned long stop_at = jiffies + 2; + while (time_before(jiffies, stop_at)) { struct htb_class *cl; long diff; struct rb_node *p = rb_first(&q->wait_pq[level]); @@ -731,9 +733,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level) if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); } - if (net_ratelimit()) - printk(KERN_WARNING "htb: too many events !\n"); - return q->now + PSCHED_TICKS_PER_SEC / 10; + /* too much load - let's continue on next jiffie */ + return q->now + PSCHED_TICKS_PER_SEC / HZ; } /* Returns class->node+prio from id-tree where classe's id is >= id. NULL -- cgit v1.2.3 From d3073779f8362d64b804882f5f41c208c4a5e11e Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 24 Mar 2008 12:03:03 -0400 Subject: SVCRDMA: Use only 1 RDMA read scatter entry for iWARP adapters The iWARP protocol limits RDMA read requests to a single scatter entry. NFS/RDMA has code in rdma_read_max_sge() that is supposed to limit the sge_count for RDMA read requests to 1, but the code to do that is inside an #ifdef RDMA_TRANSPORT_IWARP block. In the mainline kernel at least, RDMA_TRANSPORT_IWARP is an enum and not a preprocessor #define, so the #ifdef'ed code is never compiled. In my test of a kernel build with -j8 on an NFS/RDMA mount, this problem eventually leads to trouble starting with: svcrdma: Error posting send = -22 svcrdma : RDMA_READ error = -22 and things go downhill from there. The trivial fix is to delete the #ifdef guard. The check seems to be a remnant of when the NFS/RDMA code was not merged and needed to compile against multiple kernel versions, although I don't think it ever worked as intended. In any case now that the code is upstream there's no need to test whether the RDMA_TRANSPORT_IWARP constant is defined or not. Without this patch, my kernel build on an NFS/RDMA mount using NetEffect adapters quickly and 100% reproducibly failed with an error like: ld: final link failed: Software caused connection abort With the patch applied I was able to complete a kernel build on the same setup. (Tom Tucker says this is "actually an _ancient_ remnant when it had to compile against iWARP vs. non-iWARP enabled OFA trees.") Signed-off-by: Roland Dreier Acked-by: Tom Tucker Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ab54a736486..971271602dd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -237,14 +237,12 @@ static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) { -#ifdef RDMA_TRANSPORT_IWARP if ((RDMA_TRANSPORT_IWARP == rdma_node_get_transport(xprt->sc_cm_id-> device->node_type)) && sge_count > 1) return 1; else -#endif return min_t(int, sge_count, xprt->sc_max_sge); } -- cgit v1.2.3