From 5a1b5898ee9e0bf68a86609ecb9775457b1857a5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 28 Apr 2007 21:04:03 -0700 Subject: [NET]: Remove NETIF_F_INTERNAL_STATS, default to internal stats. Herbert Xu conviced me that a new flag was overkill; every driver currently overrides get_stats, so we might as well make the internal one the default. If someone did fail to set get_stats, they would now get all 0 stats instead of "No statistics available". Signed-off-by: Rusty Russell Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d5e42d13bd6..eb999003bbb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2101,26 +2101,23 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { struct net_device_stats *stats = dev->get_stats(dev); - if (stats) { - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); - } else - seq_printf(seq, "%6s: No statistics available.\n", dev->name); + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); } /* @@ -3257,11 +3254,9 @@ out: mutex_unlock(&net_todo_run_mutex); } -static struct net_device_stats *maybe_internal_stats(struct net_device *dev) +static struct net_device_stats *internal_stats(struct net_device *dev) { - if (dev->features & NETIF_F_INTERNAL_STATS) - return &dev->stats; - return NULL; + return &dev->stats; } /** @@ -3299,7 +3294,7 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name, if (sizeof_priv) dev->priv = netdev_priv(dev); - dev->get_stats = maybe_internal_stats; + dev->get_stats = internal_stats; setup(dev); strcpy(dev->name, name); return dev; -- cgit v1.2.3 From aad97f38b71dd2ecd730b3a3dce8264d13fbcd56 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 28 Apr 2007 21:09:04 -0700 Subject: [SCTP]: Fix sctp_getsockopt_local_addrs_old() to use local storage. sctp_getsockopt_local_addrs_old() in net/sctp/socket.c calls copy_to_user() while the spinlock addr_lock is held. this should not be done as copy_to_user() might sleep. the call to sctp_copy_laddrs_to_user() while holding the lock is also problematic as it calls copy_to_user() Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/sctp/socket.c | 96 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 11938fb2039..2fc0a92caa7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3987,7 +3987,7 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, memcpy(&temp, &from->ipaddr, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len; - if(space_left < addrlen) + if (space_left < addrlen) return -ENOMEM; if (copy_to_user(to, &temp, addrlen)) return -EFAULT; @@ -4076,8 +4076,9 @@ done: /* Helper function that copies local addresses to user and returns the number * of addresses copied. */ -static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_addrs, - void __user *to) +static int sctp_copy_laddrs_old(struct sock *sk, __u16 port, + int max_addrs, void *to, + int *bytes_copied) { struct list_head *pos, *next; struct sctp_sockaddr_entry *addr; @@ -4094,10 +4095,10 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if (copy_to_user(to, &temp, addrlen)) - return -EFAULT; + memcpy(to, &temp, addrlen); to += addrlen; + *bytes_copied += addrlen; cnt ++; if (cnt >= max_addrs) break; } @@ -4105,8 +4106,8 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add return cnt; } -static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, - void __user **to, size_t space_left) +static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, + size_t space_left, int *bytes_copied) { struct list_head *pos, *next; struct sctp_sockaddr_entry *addr; @@ -4123,14 +4124,14 @@ static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if(space_leftaddress_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(&addr->a)) { - cnt = sctp_copy_laddrs_to_user_old(sk, bp->port, - getaddrs.addr_num, - to); - if (cnt < 0) { - err = cnt; - goto unlock; - } + cnt = sctp_copy_laddrs_old(sk, bp->port, + getaddrs.addr_num, + addrs, &bytes_copied); goto copy_getaddrs; } } @@ -4206,22 +4214,29 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, memcpy(&temp, &addr->a, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if (copy_to_user(to, &temp, addrlen)) { - err = -EFAULT; - goto unlock; - } + memcpy(addrs, &temp, addrlen); to += addrlen; + bytes_copied += addrlen; cnt ++; if (cnt >= getaddrs.addr_num) break; } copy_getaddrs: + sctp_read_unlock(addr_lock); + + /* copy the entire address list into the user provided space */ + if (copy_to_user(to, addrs, bytes_copied)) { + err = -EFAULT; + goto error; + } + + /* copy the leading structure back to user */ getaddrs.addr_num = cnt; if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs_old))) err = -EFAULT; -unlock: - sctp_read_unlock(addr_lock); +error: + kfree(addrs); return err; } @@ -4241,7 +4256,8 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, rwlock_t *addr_lock; int err = 0; size_t space_left; - int bytes_copied; + int bytes_copied = 0; + void *addrs; if (len <= 
sizeof(struct sctp_getaddrs)) return -EINVAL; @@ -4269,6 +4285,9 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, to = optval + offsetof(struct sctp_getaddrs,addrs); space_left = len - sizeof(struct sctp_getaddrs) - offsetof(struct sctp_getaddrs,addrs); + addrs = kmalloc(space_left, GFP_KERNEL); + if (!addrs) + return -ENOMEM; sctp_read_lock(addr_lock); @@ -4279,11 +4298,11 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(&addr->a)) { - cnt = sctp_copy_laddrs_to_user(sk, bp->port, - &to, space_left); + cnt = sctp_copy_laddrs(sk, bp->port, addrs, + space_left, &bytes_copied); if (cnt < 0) { err = cnt; - goto unlock; + goto error; } goto copy_getaddrs; } @@ -4294,26 +4313,31 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, memcpy(&temp, &addr->a, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if(space_left < addrlen) - return -ENOMEM; /*fixme: right error?*/ - if (copy_to_user(to, &temp, addrlen)) { - err = -EFAULT; - goto unlock; + if (space_left < addrlen) { + err = -ENOMEM; /*fixme: right error?*/ + goto error; } + memcpy(addrs, &temp, addrlen); to += addrlen; + bytes_copied += addrlen; cnt ++; space_left -= addrlen; } copy_getaddrs: + sctp_read_unlock(addr_lock); + + if (copy_to_user(to, addrs, bytes_copied)) { + err = -EFAULT; + goto error; + } if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) return -EFAULT; - bytes_copied = ((char __user *)to) - optval; if (put_user(bytes_copied, optlen)) return -EFAULT; -unlock: - sctp_read_unlock(addr_lock); +error: + kfree(addrs); return err; } -- cgit v1.2.3 From 5632c5152aa621885d87ea0b8fdd5a6bb9f69c6f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 28 Apr 2007 21:16:39 -0700 Subject: [IPV6]: Track device renames in snmp6. When network device's are renamed, the IPV6 snmp6 code gets confused. It doesn't track name changes so it will OOPS when network device's are removed. The fix is trivial, just unregister/re-register in notify handler. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 6 ++++-- net/ipv6/proc.c | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e04e4937350..3452433cbc9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2359,8 +2359,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; case NETDEV_CHANGENAME: -#ifdef CONFIG_SYSCTL if (idev) { + snmp6_unregister_dev(idev); +#ifdef CONFIG_SYSCTL addrconf_sysctl_unregister(&idev->cnf); neigh_sysctl_unregister(idev->nd_parms); neigh_sysctl_register(dev, idev->nd_parms, @@ -2368,8 +2369,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, &ndisc_ifinfo_sysctl_change, NULL); addrconf_sysctl_register(idev, &idev->cnf); - } #endif + snmp6_register_dev(idev); + } break; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index acb306a5dd5..920dc9cf6a8 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -223,6 +223,7 @@ int snmp6_unregister_dev(struct inet6_dev *idev) return -EINVAL; remove_proc_entry(idev->stats.proc_dir_entry->name, proc_net_devsnmp6); + idev->stats.proc_dir_entry = NULL; return 0; } -- cgit v1.2.3 From ecfd6b183780c6d9e85873693b3ce6c5f4d08b58 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 28 Apr 2007 21:20:32 -0700 Subject: [XFRM]: Export SPD info With this patch you can use iproute2 in user space to efficiently see how many policies exist in different directions. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 16 ++++++++++- net/xfrm/xfrm_user.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 762926009c0..dbf9d96a2f0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -579,8 +579,22 @@ static inline int xfrm_byidx_should_resize(int total) return 0; } -static DEFINE_MUTEX(hash_resize_mutex); +void xfrm_spd_getinfo(struct xfrm_spdinfo *si) +{ + read_lock_bh(&xfrm_policy_lock); + si->incnt = xfrm_policy_count[XFRM_POLICY_IN]; + si->outcnt = xfrm_policy_count[XFRM_POLICY_OUT]; + si->fwdcnt = xfrm_policy_count[XFRM_POLICY_FWD]; + si->inscnt = xfrm_policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; + si->outscnt = xfrm_policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; + si->fwdscnt = xfrm_policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; + si->spdhcnt = xfrm_idx_hmask; + si->spdhmcnt = xfrm_policy_hashmax; + read_unlock_bh(&xfrm_policy_lock); +} +EXPORT_SYMBOL(xfrm_spd_getinfo); +static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *__unused) { int dir, total; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 69110fed64b..4210d91624c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -672,6 +672,81 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, return skb; } +static int build_spdinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags) +{ + struct xfrm_spdinfo si; + struct nlmsghdr *nlh; + u32 *f; + + nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); + if (nlh == NULL) /* shouldnt really happen ... 
*/ + return -EMSGSIZE; + + f = nlmsg_data(nlh); + *f = flags; + xfrm_spd_getinfo(&si); + + if (flags & XFRM_SPD_HMASK) + NLA_PUT_U32(skb, XFRMA_SPDHMASK, si.spdhcnt); + if (flags & XFRM_SPD_HMAX) + NLA_PUT_U32(skb, XFRMA_SPDHMAX, si.spdhmcnt); + if (flags & XFRM_SPD_ICNT) + NLA_PUT_U32(skb, XFRMA_SPDICNT, si.incnt); + if (flags & XFRM_SPD_OCNT) + NLA_PUT_U32(skb, XFRMA_SPDOCNT, si.outcnt); + if (flags & XFRM_SPD_FCNT) + NLA_PUT_U32(skb, XFRMA_SPDFCNT, si.fwdcnt); + if (flags & XFRM_SPD_ISCNT) + NLA_PUT_U32(skb, XFRMA_SPDISCNT, si.inscnt); + if (flags & XFRM_SPD_OSCNT) + NLA_PUT_U32(skb, XFRMA_SPDOSCNT, si.inscnt); + if (flags & XFRM_SPD_FSCNT) + NLA_PUT_U32(skb, XFRMA_SPDFSCNT, si.inscnt); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct rtattr **xfrma) +{ + struct sk_buff *r_skb; + u32 *flags = NLMSG_DATA(nlh); + u32 spid = NETLINK_CB(skb).pid; + u32 seq = nlh->nlmsg_seq; + int len = NLMSG_LENGTH(sizeof(u32)); + + + if (*flags & XFRM_SPD_HMASK) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_HMAX) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_ICNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_OCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_FCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_ISCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_OSCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_FSCNT) + len += RTA_SPACE(sizeof(u32)); + + r_skb = alloc_skb(len, GFP_ATOMIC); + if (r_skb == NULL) + return -ENOMEM; + + if (build_spdinfo(r_skb, spid, seq, *flags) < 0) + BUG(); + + return nlmsg_unicast(xfrm_nl, r_skb, spid); +} + static int build_sadinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags) { struct xfrm_sadinfo si; @@ -1879,6 +1954,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)), + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)), }; #undef XMSGSIZE @@ -1907,6 +1983,7 @@ static struct xfrm_link { [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -- cgit v1.2.3 From 65bb723c9502b7ba0a3aad13bdac8832e213ba74 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sat, 28 Apr 2007 21:21:46 -0700 Subject: [TCP]: Update references in two old comments This updates references to drafts in comments which must be about 10 years old. Internet draft draft-ietf-tcpimpl-prob-03.txt expired in 1998 and was replaced by RFC 2525 in March 1999. Section 3.10 of the draft maps almost identically into section 2.17 of RFC 2525: both are entitled "Failure to RST on close with data pending", the differences in text body amount to a typo and minor sentence change. Signed-off-by: Gerrit Renker Signed-off-by: David S. 
Miller --- net/ipv4/tcp.c | 14 ++++++-------- net/ipv4/tcp_output.c | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2cf9a898ce5..d6e48866817 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1573,14 +1573,12 @@ void tcp_close(struct sock *sk, long timeout) sk_stream_mem_reclaim(sk); - /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section - * 3.10, we send a RST here because data was lost. To - * witness the awful effects of the old behavior of always - * doing a FIN, run an older 2.1.x kernel or 2.0.x, start - * a bulk GET in an FTP client, suspend the process, wait - * for the client to advertise a zero window, then kill -9 - * the FTP client, wheee... Note: timeout is always zero - * in such a case. + /* As outlined in RFC 2525, section 2.17, we send a RST here because + * data was lost. To witness the awful effects of the old behavior of + * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk + * GET in an FTP client, suspend the process, wait for the client to + * advertise a zero window, then kill -9 the FTP client, wheee... + * Note: timeout is always zero in such a case. */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e70a6840cb6..b5fa3c19afe 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2035,7 +2035,7 @@ void tcp_send_fin(struct sock *sk) /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended - * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority) { -- cgit v1.2.3 From d0772b70faaf8e9f2013b6c4273d94d5eac8047a Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 28 Apr 2007 21:26:23 -0700 Subject: [IPV6]: Fix slab corruption running ip6sic From: Eric Sesterhenn Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv6/xfrm6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 538499a8997..5502cc948df 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -261,7 +261,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, spi); + return xfrm6_rcv_spi(skb, spi) > 0 ? : 0; } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, -- cgit v1.2.3 From 04b090d50c88ac8e5ec9c2e985bb65bd153893aa Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sat, 28 Apr 2007 23:03:59 -0700 Subject: [AF_IUCV/IUCV]: smp_call_function deadlock Calling smp_call_function can lead to a deadlock if it is called from tasklet context. Fixing this deadlock requires to move the smp_call_function from the tasklet context to a work queue. To do that queue the path pending interrupts to a separate list and move the path cleanup out of iucv_path_sever to iucv_path_connect and iucv_path_pending. This creates a new requirement for iucv_path_connect: it may not be called from tasklet context anymore. Also fixed compile problem for CONFIG_HOTPLUG_CPU=n and another one when walking the cpu_online mask. When doing this, we must disable cpu hotplug. 
Signed-off-by: Frank Pavlic Signed-off-by: Martin Schwidefsky Signed-off-by: David S. Miller --- net/iucv/iucv.c | 205 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 132 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 60f293842a3..903bdb6eaaa 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -90,20 +90,43 @@ struct iucv_irq_data { u32 res2[8]; }; -struct iucv_work { +struct iucv_irq_list { struct list_head list; struct iucv_irq_data data; }; -static LIST_HEAD(iucv_work_queue); -static DEFINE_SPINLOCK(iucv_work_lock); - static struct iucv_irq_data *iucv_irq_data; static cpumask_t iucv_buffer_cpumask = CPU_MASK_NONE; static cpumask_t iucv_irq_cpumask = CPU_MASK_NONE; -static void iucv_tasklet_handler(unsigned long); -static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_handler,0); +/* + * Queue of interrupt buffers lock for delivery via the tasklet + * (fast but can't call smp_call_function). + */ +static LIST_HEAD(iucv_task_queue); + +/* + * The tasklet for fast delivery of iucv interrupts. + */ +static void iucv_tasklet_fn(unsigned long); +static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0); + +/* + * Queue of interrupt buffers for delivery via a work queue + * (slower but can call smp_call_function). + */ +static LIST_HEAD(iucv_work_queue); + +/* + * The work element to deliver path pending interrupts. + */ +static void iucv_work_fn(struct work_struct *work); +static DECLARE_WORK(iucv_work, iucv_work_fn); + +/* + * Spinlock protecting task and work queue. + */ +static DEFINE_SPINLOCK(iucv_queue_lock); enum iucv_command_codes { IUCV_QUERY = 0, @@ -147,10 +170,10 @@ static unsigned long iucv_max_pathid; static DEFINE_SPINLOCK(iucv_table_lock); /* - * iucv_tasklet_cpu: contains the number of the cpu executing the tasklet. - * Needed for iucv_path_sever called from tasklet. + * iucv_active_cpu: contains the number of the cpu executing the tasklet + * or the work handler. Needed for iucv_path_sever called from tasklet. */ -static int iucv_tasklet_cpu = -1; +static int iucv_active_cpu = -1; /* * Mutex and wait queue for iucv_register/iucv_unregister. @@ -449,17 +472,19 @@ static void iucv_setmask_mp(void) { int cpu; + preempt_disable(); for_each_online_cpu(cpu) /* Enable all cpus with a declared buffer. */ if (cpu_isset(cpu, iucv_buffer_cpumask) && !cpu_isset(cpu, iucv_irq_cpumask)) smp_call_function_on(iucv_allow_cpu, NULL, 0, 1, cpu); + preempt_enable(); } /** * iucv_setmask_up * - * Allow iucv interrupts on a single cpus. + * Allow iucv interrupts on a single cpu. */ static void iucv_setmask_up(void) { @@ -493,8 +518,10 @@ static int iucv_enable(void) goto out; /* Declare per cpu buffers. */ rc = -EIO; + preempt_disable(); for_each_online_cpu(cpu) smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu); + preempt_enable(); if (cpus_empty(iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. */ goto out_path; @@ -584,48 +611,49 @@ static int iucv_sever_pathid(u16 pathid, u8 userdata[16]) return iucv_call_b2f0(IUCV_SEVER, parm); } +#ifdef CONFIG_SMP /** - * __iucv_cleanup_pathid + * __iucv_cleanup_queue * @dummy: unused dummy argument * * Nop function called via smp_call_function to force work items from * pending external iucv interrupts to the work queue. 
*/ -static void __iucv_cleanup_pathid(void *dummy) +static void __iucv_cleanup_queue(void *dummy) { } +#endif /** - * iucv_cleanup_pathid - * @pathid: 16 bit pathid + * iucv_cleanup_queue * * Function called after a path has been severed to find all remaining * work items for the now stale pathid. The caller needs to hold the * iucv_table_lock. */ -static void iucv_cleanup_pathid(u16 pathid) +static void iucv_cleanup_queue(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; /* - * Path is severed, the pathid can be reused immediatly on - * a iucv connect or a connection pending interrupt. - * iucv_path_connect and connection pending interrupt will - * wait until the iucv_table_lock is released before the - * recycled pathid enters the system. - * Force remaining interrupts to the work queue, then - * scan the work queue for items of this path. + * When a path is severed, the pathid can be reused immediatly + * on a iucv connect or a connection pending interrupt. Remove + * all entries from the task queue that refer to a stale pathid + * (iucv_path_table[ix] == NULL). Only then do the iucv connect + * or deliver the connection pending interrupt. To get all the + * pending interrupts force them to the work queue by calling + * an empty function on all cpus. */ - smp_call_function(__iucv_cleanup_pathid, NULL, 0, 1); - spin_lock_irq(&iucv_work_lock); - list_for_each_entry_safe(p, n, &iucv_work_queue, list) { - /* Remove work items for pathid except connection pending */ - if (p->data.ippathid == pathid && p->data.iptype != 0x01) { + smp_call_function(__iucv_cleanup_queue, NULL, 0, 1); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) { + /* Remove stale work items from the task queue. */ + if (iucv_path_table[p->data.ippathid] == NULL) { list_del(&p->list); kfree(p); } } - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); } /** @@ -684,7 +712,6 @@ void iucv_unregister(struct iucv_handler *handler, int smp) iucv_sever_pathid(p->pathid, NULL); iucv_path_table[p->pathid] = NULL; list_del(&p->list); - iucv_cleanup_pathid(p->pathid); iucv_path_free(p); } spin_unlock_bh(&iucv_table_lock); @@ -757,9 +784,9 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, union iucv_param *parm; int rc; - preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) - spin_lock_bh(&iucv_table_lock); + BUG_ON(in_atomic()); + spin_lock_bh(&iucv_table_lock); + iucv_cleanup_queue(); parm = percpu_ptr(iucv_param, smp_processor_id()); memset(parm, 0, sizeof(union iucv_param)); parm->ctrl.ipmsglim = path->msglim; @@ -794,9 +821,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, rc = -EIO; } } - if (iucv_tasklet_cpu != smp_processor_id()) - spin_unlock_bh(&iucv_table_lock); - preempt_enable(); + spin_unlock_bh(&iucv_table_lock); return rc; } @@ -867,15 +892,14 @@ int iucv_path_sever(struct iucv_path *path, u8 userdata[16]) preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_lock_bh(&iucv_table_lock); rc = iucv_sever_pathid(path->pathid, userdata); if (!rc) { iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); } - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_unlock_bh(&iucv_table_lock); preempt_enable(); return rc; @@ -1244,8 +1268,7 @@ static void iucv_path_complete(struct iucv_irq_data *data) struct iucv_path_complete *ipc = 
(void *) data; struct iucv_path *path = iucv_path_table[ipc->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_complete) + if (path && path->handler && path->handler->path_complete) path->handler->path_complete(path, ipc->ipuser); } @@ -1273,14 +1296,14 @@ static void iucv_path_severed(struct iucv_irq_data *data) struct iucv_path_severed *ips = (void *) data; struct iucv_path *path = iucv_path_table[ips->ippathid]; - BUG_ON(!path || !path->handler); + if (!path || !path->handler) /* Already severed */ + return; if (path->handler->path_severed) path->handler->path_severed(path, ips->ipuser); else { iucv_sever_pathid(path->pathid, NULL); iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); iucv_path_free(path); } } @@ -1309,8 +1332,7 @@ static void iucv_path_quiesced(struct iucv_irq_data *data) struct iucv_path_quiesced *ipq = (void *) data; struct iucv_path *path = iucv_path_table[ipq->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_quiesced) + if (path && path->handler && path->handler->path_quiesced) path->handler->path_quiesced(path, ipq->ipuser); } @@ -1338,8 +1360,7 @@ static void iucv_path_resumed(struct iucv_irq_data *data) struct iucv_path_resumed *ipr = (void *) data; struct iucv_path *path = iucv_path_table[ipr->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_resumed) + if (path && path->handler && path->handler->path_resumed) path->handler->path_resumed(path, ipr->ipuser); } @@ -1371,8 +1392,7 @@ static void iucv_message_complete(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imc->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_complete) { + if (path && path->handler && path->handler->message_complete) { msg.flags = imc->ipflags1; msg.id = imc->ipmsgid; msg.audit = imc->ipaudit; @@ -1417,8 +1437,7 @@ static void iucv_message_pending(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imp->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_pending) { + if (path && path->handler && path->handler->message_pending) { msg.flags = imp->ipflags1; msg.id = imp->ipmsgid; msg.class = imp->iptrgcls; @@ -1433,17 +1452,16 @@ static void iucv_message_pending(struct iucv_irq_data *data) } /** - * iucv_tasklet_handler: + * iucv_tasklet_fn: * * This tasklet loops over the queue of irq buffers created by * iucv_external_interrupt, calls the appropriate action handler * and then frees the buffer. */ -static void iucv_tasklet_handler(unsigned long ignored) +static void iucv_tasklet_fn(unsigned long ignored) { typedef void iucv_irq_fn(struct iucv_irq_data *); static iucv_irq_fn *irq_fn[] = { - [0x01] = iucv_path_pending, [0x02] = iucv_path_complete, [0x03] = iucv_path_severed, [0x04] = iucv_path_quiesced, @@ -1453,38 +1471,70 @@ static void iucv_tasklet_handler(unsigned long ignored) [0x08] = iucv_message_pending, [0x09] = iucv_message_pending, }; - struct iucv_work *p; + struct list_head task_queue = LIST_HEAD_INIT(task_queue); + struct iucv_irq_list *p, *n; /* Serialize tasklet, iucv_path_sever and iucv_path_connect. 
*/ spin_lock(&iucv_table_lock); - iucv_tasklet_cpu = smp_processor_id(); + iucv_active_cpu = smp_processor_id(); - spin_lock_irq(&iucv_work_lock); - while (!list_empty(&iucv_work_queue)) { - p = list_entry(iucv_work_queue.next, struct iucv_work, list); + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_task_queue, &task_queue); + spin_unlock_irq(&iucv_queue_lock); + + list_for_each_entry_safe(p, n, &task_queue, list) { list_del_init(&p->list); - spin_unlock_irq(&iucv_work_lock); irq_fn[p->data.iptype](&p->data); kfree(p); - spin_lock_irq(&iucv_work_lock); } - spin_unlock_irq(&iucv_work_lock); - iucv_tasklet_cpu = -1; + iucv_active_cpu = -1; spin_unlock(&iucv_table_lock); } +/** + * iucv_work_fn: + * + * This work function loops over the queue of path pending irq blocks + * created by iucv_external_interrupt, calls the appropriate action + * handler and then frees the buffer. + */ +static void iucv_work_fn(struct work_struct *work) +{ + typedef void iucv_irq_fn(struct iucv_irq_data *); + struct list_head work_queue = LIST_HEAD_INIT(work_queue); + struct iucv_irq_list *p, *n; + + /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ + spin_lock_bh(&iucv_table_lock); + iucv_active_cpu = smp_processor_id(); + + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_work_queue, &work_queue); + spin_unlock_irq(&iucv_queue_lock); + + iucv_cleanup_queue(); + list_for_each_entry_safe(p, n, &work_queue, list) { + list_del_init(&p->list); + iucv_path_pending(&p->data); + kfree(p); + } + + iucv_active_cpu = -1; + spin_unlock_bh(&iucv_table_lock); +} + /** * iucv_external_interrupt * @code: irq code * * Handles external interrupts coming in from CP. - * Places the interrupt buffer on a queue and schedules iucv_tasklet_handler(). + * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). */ static void iucv_external_interrupt(u16 code) { struct iucv_irq_data *p; - struct iucv_work *work; + struct iucv_irq_list *work; p = percpu_ptr(iucv_irq_data, smp_processor_id()); if (p->ippathid >= iucv_max_pathid) { @@ -1498,16 +1548,23 @@ static void iucv_external_interrupt(u16 code) printk(KERN_ERR "iucv_do_int: unknown iucv interrupt\n"); return; } - work = kmalloc(sizeof(struct iucv_work), GFP_ATOMIC); + work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); if (!work) { printk(KERN_WARNING "iucv_external_interrupt: out of memory\n"); return; } memcpy(&work->data, p, sizeof(work->data)); - spin_lock(&iucv_work_lock); - list_add_tail(&work->list, &iucv_work_queue); - spin_unlock(&iucv_work_lock); - tasklet_schedule(&iucv_tasklet); + spin_lock(&iucv_queue_lock); + if (p->iptype == 0x01) { + /* Path pending interrupt. */ + list_add_tail(&work->list, &iucv_work_queue); + schedule_work(&iucv_work); + } else { + /* The other interrupts. 
*/ + list_add_tail(&work->list, &iucv_task_queue); + tasklet_schedule(&iucv_tasklet); + } + spin_unlock(&iucv_queue_lock); } /** @@ -1577,12 +1634,14 @@ out: */ static void iucv_exit(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; - spin_lock_irq(&iucv_work_lock); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) + kfree(p); list_for_each_entry_safe(p, n, &iucv_work_queue, list) kfree(p); - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); unregister_hotcpu_notifier(&iucv_cpu_notifier); percpu_free(iucv_param); percpu_free(iucv_irq_data); -- cgit v1.2.3 From 6aaf47fa48d3c44280810b1b470261d340e4ed87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Apr 2007 00:26:00 -0700 Subject: [PATCH] INET : IPV4 UDP lookups converted to a 2 pass algo Some people want to have many UDP sockets, binded to a single port but many different addresses. We currently hash all those sockets into a single chain. Processing of incoming packets is very expensive, because the whole chain must be examined to find the best match. I chose in this patch to hash UDP sockets with a hash function that take into account both their port number and address : This has a drawback because we need two lookups : one with a given address, one with a wildcard (null) address. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 171 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 114 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cec0f2cc49b..144970704c2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock); static int udp_port_rover; -static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) +/* + * Note about this hash function : + * Typical use is probably daddr = 0, only dport is going to vary hash + */ +static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr) +{ + addr ^= addr >> 16; + addr ^= addr >> 8; + return port ^ addr; +} + +static inline int __udp_lib_port_inuse(unsigned int hash, int port, + __be32 daddr, struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; + struct inet_sock *inet; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (sk->sk_hash == num) + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { + if (sk->sk_hash != hash) + continue; + inet = inet_sk(sk); + if (inet->num != port) + continue; + if (inet->rcv_saddr == daddr) return 1; + } return 0; } @@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_node *node; struct hlist_head *head; struct sock *sk2; + unsigned int hash; int error = 1; write_lock_bh(&udp_hash_lock); @@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { int size; - head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(head)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - if (! __udp_lib_lport_inuse(result, udptable)) + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + if (! 
__udp_lib_port_inuse(hash, result, + inet_sk(sk)->rcv_saddr, udptable)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: *port_rover = snum = result; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_for_each(sk2, node, head) - if (sk2->sk_hash == snum && + if (sk2->sk_hash == hash && sk2 != sk && + inet_sk(sk2)->num == snum && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && @@ -201,9 +228,9 @@ gotit: goto fail; } inet_sk(sk)->num = snum; - sk->sk_hash = snum; + sk->sk_hash = hash; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); sock_prot_inc_use(sk->sk_prot); } @@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, { struct sock *sk, *result = NULL; struct hlist_node *node; - unsigned short hnum = ntohs(dport); - int badness = -1; + unsigned int hash, hashwild; + int score, best = -1; + + hash = hash_port_and_addr(ntohs(dport), daddr); + hashwild = hash_port_and_addr(ntohs(dport), 0); read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + +lookup: + + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { - int score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) - continue; - score+=2; - } - if (inet->daddr) { - if (inet->daddr != saddr) - continue; - score+=2; - } - if (inet->dport) { - if (inet->dport != sport) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 9) { - result = sk; - break; - } else if (score > badness) { - result = sk; - badness = score; - } + if (sk->sk_hash != hash || ipv6_only_sock(sk) || + inet->num != dport) + continue; + + score = (sk->sk_family == PF_INET ? 
1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; + } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 9) { + result = sk; + goto found; + } else if (score > best) { + result = sk; + best = score; } } + + if (hash != hashwild) { + hash = hashwild; + goto lookup; + } +found: if (result) sock_hold(result); read_unlock(&udp_hash_lock); return result; } -static inline struct sock *udp_v4_mcast_next(struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) +static inline struct sock *udp_v4_mcast_next( + struct sock *sk, + unsigned int hnum, __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) { struct hlist_node *node; struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); sk_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (s->sk_hash != hnum || + inet->num != loc_port || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || @@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) { - struct sock *sk; + struct sock *sk, *skw, *sknext; int dif; + unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr); + unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0); - read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; + read_lock(&udp_hash_lock); + + sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]); + skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]); + + sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif); + if (!sk) { + hash = hashwild; + sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source, + saddr, dif); + } + if (sk) { do { struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, - uh->source, saddr, dif); + sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest, + daddr, uh->source, saddr, dif); + if (!sknext && hash != hashwild) { + hash = hashwild; + sknext = udp_v4_mcast_next(skw, hash, uh->dest, + daddr, uh->source, saddr, dif); + } if (sknext) skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ + /* + * we should probably re-process + * instead of dropping packets here. + */ kfree_skb(skb1); } sk = sknext; -- cgit v1.2.3 From 157bfc25020f7eb731f94140e099307ade47299e Mon Sep 17 00:00:00 2001 From: Masahide NAKAMURA Date: Mon, 30 Apr 2007 00:33:35 -0700 Subject: [XFRM]: Restrict upper layer information by bundle. On MIPv6 usage, XFRM sub policy is enabled. When main (IPsec) and sub (MIPv6) policy selectors have the same address set but different upper layer information (i.e. protocol number and its ports or type/code), multiple bundle should be created. However, currently we have issue to use the same bundle created for the first time with all flows covered by the case. It is useful for the bundle to have the upper layer information to be restructured correctly if it does not match with the flow. 1. 
Bundle was created by two policies Selector from another policy is added to xfrm_dst. If the flow does not match the selector, it goes to slow path to restructure new bundle by single policy. 2. Bundle was created by one policy Flow cache is added to xfrm_dst as originated one. If the flow does not match the cache, it goes to slow path to try searching another policy. Signed-off-by: Masahide NAKAMURA Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dbf9d96a2f0..263e34e4526 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1344,6 +1344,40 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } +static int inline +xfrm_dst_alloc_copy(void **target, void *src, int size) +{ + if (!*target) { + *target = kmalloc(size, GFP_ATOMIC); + if (!*target) + return -ENOMEM; + } + memcpy(*target, src, size); + return 0; +} + +static int inline +xfrm_dst_update_parent(struct dst_entry *dst, struct xfrm_selector *sel) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->partner), + sel, sizeof(*sel)); +#else + return 0; +#endif +} + +static int inline +xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); +#else + return 0; +#endif +} static int stale_bundle(struct dst_entry *dst); @@ -1532,6 +1566,18 @@ restart: err = -EHOSTUNREACH; goto error; } + + if (npols > 1) + err = xfrm_dst_update_parent(dst, &pols[1]->selector); + else + err = xfrm_dst_update_origin(dst, fl); + if (unlikely(err)) { + write_unlock_bh(&policy->lock); + if (dst) + dst_free(dst); + goto error; + } + dst->next = policy->bundles; policy->bundles = dst; dst_hold(dst); @@ -1947,6 +1993,15 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; +#ifdef CONFIG_XFRM_SUB_POLICY + if (fl) { + if (first->origin && !flow_cache_uli_match(first->origin, fl)) + return 0; + if (first->partner && + !xfrm_selector_match(first->partner, fl, family)) + return 0; + } +#endif last = NULL; -- cgit v1.2.3 From 575ee7140dabe9b9c4f66f4f867039b97e548867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 30 Apr 2007 00:39:55 -0700 Subject: [TCP] FRTO: Delay skb available check until it's mandatory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No new data is needed until the first ACK comes, so no need to check for application limitedness until then. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 051f0f815f1..6b669898b19 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1265,20 +1265,15 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* F-RTO can only be used if these conditions are satisfied: - * - there must be some unsent new data - * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head (SACK enhanced - * variant from Appendix B of RFC4138 is more robust here) +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !tcp_send_head(sk) || - after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) + if (!sysctl_tcp_frto) return 0; if (IsSackFrto()) @@ -2710,6 +2705,14 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } if (tp->frto_counter == 1) { + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; -- cgit v1.2.3 From d551e4541dd60ae53459f77a971f2d6043431f5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 30 Apr 2007 00:42:20 -0700 Subject: [TCP] FRTO: RFC4138 allows Nagle override when new data must be sent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a corner case where less than MSS sized new data thingie is awaiting in the send queue. For F-RTO to work correctly, a new data segment must be sent at certain point or F-RTO cannot be used at all. RFC4138 allows overriding of Nagle at that point. Implementation uses frto_counter states 2 and 3 to distinguish when Nagle override is needed. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 13 ++++++++----- net/ipv4/tcp_output.c | 6 ++++-- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6b669898b19..7641b2761a1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2637,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. * * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. 
@@ -2666,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; } @@ -2692,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - if ((tp->frto_counter == 2) && + if ((tp->frto_counter >= 2) && (!(flag&FLAG_FORWARD_PROGRESS) || ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ @@ -2709,14 +2711,15 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (!tcp_send_head(sk) || after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); return 1; } tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; - } else /* frto_counter == 2 */ { + } else { switch (sysctl_tcp_frto_response) { case 2: tcp_undo_spur_to_response(sk, flag); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b5fa3c19afe..0faacf9c419 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1035,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, if (nonagle & TCP_NAGLE_PUSH) return 1; - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (see RFC4138). + */ + if (tp->urg_mode || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; -- cgit v1.2.3 From e91a47ebb130b90790c7a8c625ade4dcea246842 Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:45:49 -0700 Subject: [IPV4] SNMP: Support InNoRoutes An IP datagram which is being discarded because of no routes in the forwarding path should be counted as InNoRoutes. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 324e7e0fdb2..63ab5230c61 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,6 +340,8 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); goto drop; } } -- cgit v1.2.3 From 704aed53b4e43bebfbd425cf95b66794a9cfa2c2 Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:46:30 -0700 Subject: [IPV4] SNMP: Support InTruncatedPkts An IP datagram which is being discarded because the datagram frame didn't carry enough data should be counted as InTruncatedPkts. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 63ab5230c61..c8c455dd9ca 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -416,7 +416,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto inhdr_error; len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl*4)) + if (skb->len < len) { + IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it -- cgit v1.2.3 From 5506b54b36f067b9776935085c9f8e607b026b23 Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:48:10 -0700 Subject: [IPV4] SNMP: Support InMcastPkts and InBcastPkts A received IP multicast datagram should be counted as InMcastPkts. By the same token, a received IP broadcast datagram should be counted as InBcastPkts. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c8c455dd9ca..97069399d86 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -329,6 +329,7 @@ drop: static inline int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; /* * Initialise the virtual path cache for the packet. It describes @@ -360,6 +361,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; + rt = (struct rtable*)skb->dst; + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + return dst_input(skb); drop: -- cgit v1.2.3 From 80787ebc2bbd8e675d8b9ff8cfa40f15134feebe Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:48:20 -0700 Subject: [IPV4] SNMP: Support OutMcastPkts and OutBcastPkts A transmitted IP multicast datagram should be counted as OutMcastPkts. By the same token, a transmitted IP broadcast datagram should be counted as OutBcastPkts. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 534650cad3a..d6427d91851 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -160,9 +160,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; + struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; int hh_len = LL_RESERVED_SPACE(dev); + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { struct sk_buff *skb2; -- cgit v1.2.3
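
The UDP commit above ("IPV4 UDP lookups converted to a 2 pass algo") hashes sockets on both port and local address, so each incoming packet is looked up twice: once with its destination address and once with the wildcard (zero) address, keeping the better-scoring match. Below is a minimal userspace sketch of that idea, not the kernel code itself; the structure names and the toy scoring are invented for illustration, and only the hash mixing mirrors hash_port_and_addr() from the patch.

/* Toy two-pass hash lookup in the spirit of the UDP patch above. */
#include <stdio.h>
#include <stdint.h>

#define HTABLE_SIZE 128			/* power of two, like UDP_HTABLE_SIZE */

struct fake_sock {
	uint16_t port;
	uint32_t rcv_saddr;		/* 0 means bound to the wildcard address */
	struct fake_sock *next;
	const char *name;
};

static struct fake_sock *htable[HTABLE_SIZE];

/* Same mixing as hash_port_and_addr(): fold the address into the port. */
static unsigned int hash_port_and_addr(uint16_t port, uint32_t addr)
{
	addr ^= addr >> 16;
	addr ^= addr >> 8;
	return port ^ addr;
}

static void add_sock(struct fake_sock *sk)
{
	unsigned int h = hash_port_and_addr(sk->port, sk->rcv_saddr) & (HTABLE_SIZE - 1);

	sk->next = htable[h];
	htable[h] = sk;
}

/* Pass 1: chain for (port, daddr).  Pass 2: chain for (port, wildcard). */
static struct fake_sock *lookup(uint16_t dport, uint32_t daddr)
{
	unsigned int hashes[2] = {
		hash_port_and_addr(dport, daddr),
		hash_port_and_addr(dport, 0),
	};
	struct fake_sock *best = NULL;
	int best_score = -1;

	for (int pass = 0; pass < 2; pass++) {
		struct fake_sock *sk;

		for (sk = htable[hashes[pass] & (HTABLE_SIZE - 1)]; sk; sk = sk->next) {
			int score;

			if (sk->port != dport)
				continue;
			if (sk->rcv_saddr && sk->rcv_saddr != daddr)
				continue;
			/* A specific bind beats a wildcard bind. */
			score = sk->rcv_saddr ? 2 : 1;
			if (score > best_score) {
				best = sk;
				best_score = score;
			}
		}
		if (hashes[0] == hashes[1])
			break;		/* both passes would scan the same chain */
	}
	return best;
}

int main(void)
{
	struct fake_sock wild = { .port = 53, .rcv_saddr = 0,          .name = "wildcard" };
	struct fake_sock spec = { .port = 53, .rcv_saddr = 0x7f000001, .name = "127.0.0.1" };
	struct fake_sock *r1, *r2;

	add_sock(&wild);
	add_sock(&spec);

	r1 = lookup(53, 0x7f000001);
	r2 = lookup(53, 0x0a000001);
	printf("to 127.0.0.1:53 -> %s\n", r1 ? r1->name : "no match");
	printf("to 10.0.0.1:53  -> %s\n", r2 ? r2->name : "no match");
	return 0;
}

The second pass is needed because a socket bound to the wildcard address sits on a different chain than one bound to a specific address; both chains must be consulted, and when both match, the specific bind wins on score.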
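
The four SNMP patches above add InNoRoutes, InTruncatedPkts, InMcastPkts, InBcastPkts, OutMcastPkts and OutBcastPkts increments on the IPv4 input and output paths. A quick way to watch those counters from userspace is sketched below; it assumes they are exported on the "IpExt:" lines of /proc/net/netstat, which is how later kernels expose them -- the file name and prefix are an assumption of this sketch, not something the patches above establish.

/* Sketch: print the IpExt counter names and values side by side. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char names[1024], values[1024];
	FILE *f = fopen("/proc/net/netstat", "r");

	if (!f) {
		perror("/proc/net/netstat");
		return 1;
	}
	/* IpExt appears as two lines: one with field names, one with values. */
	while (fgets(names, sizeof(names), f)) {
		char *np, *vp, *n, *v;

		if (strncmp(names, "IpExt:", 6) != 0)
			continue;
		if (!fgets(values, sizeof(values), f))
			break;
		n = strtok_r(names + 6, " \n", &np);
		v = strtok_r(values + 6, " \n", &vp);
		while (n && v) {
			printf("%-20s %s\n", n, v);
			n = strtok_r(NULL, " \n", &np);
			v = strtok_r(NULL, " \n", &vp);
		}
		break;
	}
	fclose(f);
	return 0;
}

Run it before and after sending multicast or broadcast traffic, or traffic with no matching route, to see the corresponding counters move.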