From a9828ec6bc0b7e19a65f7e13daa8bd35a926a753 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 1 Oct 2009 11:33:03 +0000 Subject: ethtool: Remove support for obsolete string query operations The in-tree implementations have all been converted to get_sset_count(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 58 ++++++++++-------------------------------------------- 1 file changed, 10 insertions(+), 48 deletions(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4c12ddb5f5e..e1951084b97 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -198,13 +198,6 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) info.n_priv_flags = rc; - } else { - /* code path for obsolete hooks */ - - if (ops->self_test_count) - info.testinfo_len = ops->self_test_count(dev); - if (ops->get_stats_count) - info.n_stats = ops->get_stats_count(dev); } if (ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); @@ -684,16 +677,10 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) u64 *data; int ret, test_len; - if (!ops->self_test) - return -EOPNOTSUPP; - if (!ops->get_sset_count && !ops->self_test_count) + if (!ops->self_test || !ops->get_sset_count) return -EOPNOTSUPP; - if (ops->get_sset_count) - test_len = ops->get_sset_count(dev, ETH_SS_TEST); - else - /* code path for obsolete hook */ - test_len = ops->self_test_count(dev); + test_len = ops->get_sset_count(dev, ETH_SS_TEST); if (test_len < 0) return test_len; WARN_ON(test_len == 0); @@ -728,36 +715,17 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) u8 *data; int ret; - if (!ops->get_strings) + if (!ops->get_strings || !ops->get_sset_count) return -EOPNOTSUPP; if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; - if (ops->get_sset_count) { - ret = ops->get_sset_count(dev, gstrings.string_set); - if (ret < 0) - return ret; - - gstrings.len = ret; - } else { - /* code path for obsolete hooks */ - - switch (gstrings.string_set) { - case ETH_SS_TEST: - if (!ops->self_test_count) - return -EOPNOTSUPP; - gstrings.len = ops->self_test_count(dev); - break; - case ETH_SS_STATS: - if (!ops->get_stats_count) - return -EOPNOTSUPP; - gstrings.len = ops->get_stats_count(dev); - break; - default: - return -EINVAL; - } - } + ret = ops->get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); if (!data) @@ -798,16 +766,10 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) u64 *data; int ret, n_stats; - if (!ops->get_ethtool_stats) - return -EOPNOTSUPP; - if (!ops->get_sset_count && !ops->get_stats_count) + if (!ops->get_ethtool_stats || !ops->get_sset_count) return -EOPNOTSUPP; - if (ops->get_sset_count) - n_stats = ops->get_sset_count(dev, ETH_SS_STATS); - else - /* code path for obsolete hook */ - n_stats = ops->get_stats_count(dev); + n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; WARN_ON(n_stats == 0); -- cgit v1.2.3 From 0835acfe72e43b2f9bd46ec8c0d219e94c3525e0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Sep 2009 13:03:33 +0000 Subject: pktgen: Avoid dirtying skb->users when txq is full We can avoid two atomic ops on skb->users if packet is not going to be sent to the device (because hardware txqueue is full) Signed-off-by: Eric Dumazet 
Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/pktgen.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b69455217ed..e856ab0d074 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3441,12 +3441,14 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) txq = netdev_get_tx_queue(odev, queue_map); __netif_tx_lock_bh(txq); - atomic_inc(&(pkt_dev->skb->users)); - if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) + if (unlikely(netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))) { ret = NETDEV_TX_BUSY; - else - ret = (*xmit)(pkt_dev->skb, odev); + pkt_dev->last_ok = 0; + goto unlock; + } + atomic_inc(&(pkt_dev->skb->users)); + ret = (*xmit)(pkt_dev->skb, odev); switch (ret) { case NETDEV_TX_OK: @@ -3468,6 +3470,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) atomic_dec(&(pkt_dev->skb->users)); pkt_dev->last_ok = 0; } +unlock: __netif_tx_unlock_bh(txq); /* If pkt_dev->count is zero, then run forever */ -- cgit v1.2.3 From 7ffbe3fdace0bdfcdab8dc6c77506feda0871f79 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Oct 2009 05:15:27 +0000 Subject: net: introduce NETDEV_POST_INIT notifier For various purposes including a wireless extensions bugfix, we need to hook into the netdev creation before before netdev_register_kobject(). This will also ease doing the dev type assignment that Marcel was working on for cfg80211 drivers w/o touching them all. Signed-off-by: Johannes Berg Signed-off-by: Marcel Holtmann Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b8f74cfb1bf..a74c8fd6955 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4836,6 +4836,12 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_GSO; netdev_initialize_kobject(dev); + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + ret = netdev_register_kobject(dev); if (ret) goto err_uninit; -- cgit v1.2.3 From d519e17e2d01a0ee9abe083019532061b4438065 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Fri, 2 Oct 2009 09:26:12 +0000 Subject: net: export device speed and duplex via sysfs This patch exports the link-speed (in Mbps) and duplex of an interface via sysfs. This eliminates the need to use ethtool just to check the link-speed. Not requiring 'ethtool' and not relying on the SIOCETHTOOL ioctl should be helpful in an embedded environment where space is at a premium as well. NOTE: This patch also intentionally allows non-root users to check the link speed and duplex -- something not possible with ethtool. Here's some sample output: # cat /sys/class/net/eth0/speed 100 # cat /sys/class/net/eth0/duplex half # ethtool eth0 Settings for eth0: Supported ports: [ TP ] Supported link modes: 10baseT/Half 10baseT/Full 100baseT/Half 100baseT/Full 1000baseT/Half 1000baseT/Full Supports auto-negotiation: Yes Advertised link modes: Not reported Advertised auto-negotiation: No Speed: 100Mb/s Duplex: Half Port: Twisted Pair PHYAD: 1 Transceiver: internal Auto-negotiation: off Supports Wake-on: g Wake-on: g Current message level: 0x000000ff (255) Link detected: yes Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 821d30918cf..effb78410eb 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -130,6 +130,44 @@ static ssize_t show_carrier(struct device *dev, return -EINVAL; } +static ssize_t show_speed(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + + if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) + ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); + } + rtnl_unlock(); + return ret; +} + +static ssize_t show_duplex(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + + if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) + ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half"); + } + rtnl_unlock(); + return ret; +} + static ssize_t show_dormant(struct device *dev, struct device_attribute *attr, char *buf) { @@ -259,6 +297,8 @@ static struct device_attribute net_class_attributes[] = { __ATTR(address, S_IRUGO, show_address, NULL), __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(speed, S_IRUGO, show_speed, NULL), + __ATTR(duplex, S_IRUGO, show_duplex, NULL), __ATTR(dormant, S_IRUGO, show_dormant, NULL), __ATTR(operstate, S_IRUGO, show_operstate, NULL), __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), -- cgit v1.2.3 From d250a5f90e53f5e150618186230795352d154c88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Oct 2009 10:32:18 +0000 Subject: pkt_sched: gen_estimator: Dont report fake rate estimators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jarek Poplawski a écrit : > > > Hmm... So you made me to do some "real" work here, and guess what?: > there is one serious checkpatch warning! ;-) Plus, this new parameter > should be added to the function description. Otherwise: > Signed-off-by: Jarek Poplawski > > Thanks, > Jarek P. > > PS: I guess full "Don't" would show we really mean it... Okay :) Here is the last round, before the night ! Thanks again [RFC] pkt_sched: gen_estimator: Don't report fake rate estimators We currently send TCA_STATS_RATE_EST elements to netlink users, even if no estimator is running. # tc -s -d qdisc qdisc pfifo_fast 0: dev eth0 root bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 Sent 112833764978 bytes 1495081739 pkt (dropped 0, overlimits 0 requeues 0) rate 0bit 0pps backlog 0b 0p requeues 0 User has no way to tell if the "rate 0bit 0pps" is a real estimation, or a fake one (because no estimator is active) After this patch, tc command output is : $ tc -s -d qdisc qdisc pfifo_fast 0: dev eth0 root bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 Sent 561075 bytes 1196 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 We add a parameter to gnet_stats_copy_rate_est() function so that it can use gen_estimator_active(bstats, r), as suggested by Jarek. 
This parameter can be NULL if check is not necessary, (htb for example has a mandatory rate estimator) Signed-off-by: Eric Dumazet Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/gen_stats.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 8569310268a..393b1d8618e 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -127,6 +127,7 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle + * @b: basic statistics * @r: rate estimator statistics * * Appends the rate estimator statistics to the top level TLV created by @@ -136,8 +137,13 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r) +gnet_stats_copy_rate_est(struct gnet_dump *d, + const struct gnet_stats_basic_packed *b, + struct gnet_stats_rate_est *r) { + if (b && !gen_estimator_active(b, r)) + return 0; + if (d->compat_tc_stats) { d->tc_stats.bps = r->bps; d->tc_stats.pps = r->pps; -- cgit v1.2.3 From d73d3a8cb4723e161589864741d8528d70b350eb Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 5 Oct 2009 10:59:58 +0000 Subject: ethtool: Add reset operation After updating firmware stored in flash, users may wish to reset the relevant hardware and start the new firmware immediately. This should not be completely automatic as it may be disruptive. A selective reset may also be useful for debugging or diagnostics. This adds a separate reset operation which takes flags indicating the components to be reset. Drivers are allowed to reset only a subset of those requested, and must indicate the actual subset. This allows the use of generic component masks and some future expansion. Signed-off-by: Ben Hutchings Signed-off-by: David S. 
Miller --- net/core/ethtool.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e1951084b97..d8aee584e8d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -302,6 +302,26 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) return ret; } +static int ethtool_reset(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value reset; + int ret; + + if (!dev->ethtool_ops->reset) + return -EOPNOTSUPP; + + if (copy_from_user(&reset, useraddr, sizeof(reset))) + return -EFAULT; + + ret = dev->ethtool_ops->reset(dev, &reset.data); + if (ret) + return ret; + + if (copy_to_user(useraddr, &reset, sizeof(reset))) + return -EFAULT; + return 0; +} + static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; @@ -1089,6 +1109,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, useraddr); break; + case ETHTOOL_RESET: + rc = ethtool_reset(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From 3d23e349d807177eaf519d444677cee86b1a04cf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 29 Sep 2009 23:27:28 +0200 Subject: wext: refactor Refactor wext to * split out iwpriv handling * split out iwspy handling * split out procfs support * allow cfg80211 to have wireless extensions compat code w/o CONFIG_WIRELESS_EXT After this, drivers need to - select WIRELESS_EXT - for wext support - select WEXT_PRIV - for iwpriv support - select WEXT_SPY - for iwspy support except cfg80211 -- which gets new hooks in wext-core.c and can then get wext handlers without CONFIG_WIRELESS_EXT. Wireless extensions procfs support is auto-selected based on PROC_FS and anything that requires the wext core (i.e. WIRELESS_EXT or CFG80211_WEXT). Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/core/net-sysfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index effb78410eb..9b07535c288 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -543,8 +543,12 @@ int netdev_register_kobject(struct net_device *net) *groups++ = &netstat_group; #ifdef CONFIG_WIRELESS_EXT_SYSFS - if (net->wireless_handlers || net->ieee80211_ptr) + if (net->ieee80211_ptr) *groups++ = &wireless_group; +#ifdef CONFIG_WIRELESS_EXT + else if (net->wireless_handlers) + *groups++ = &wireless_group; +#endif #endif #endif /* CONFIG_SYSFS */ -- cgit v1.2.3 From d9f5950f90292f7cc42834338dfd5f44dc4cc4ca Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Wed, 7 Oct 2009 12:24:25 +0000 Subject: net: Make UFO on master device independent of attached devices Now that software UFO is supported, UFO can be enabled on master devices like bridge, bond even though the attached device doesn't support this feature in hardware. This allows UFO to be used between KVM host and guest even when a physical interface attached to the bridge doesn't support UFO. Signed-off-by: Sridhar Samudrala Acked-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a74c8fd6955..510ff205d5d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5489,7 +5489,7 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one, one |= NETIF_F_ALL_CSUM; one |= all & NETIF_F_ONE_FOR_ALL; - all &= one | NETIF_F_LLTX | NETIF_F_GSO; + all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; all |= one & mask & NETIF_F_ONE_FOR_ALL; return all; -- cgit v1.2.3 From 3b885787ea4112eaa80945999ea0901bf742707f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 12 Oct 2009 13:26:31 -0700 Subject: net: Generalize socket rx gap / receive queue overflow cmsg Create a new socket level option to report number of queue overflows Recently I augmented the AF_PACKET protocol to report the number of frames lost on the socket receive queue between any two enqueued frames. This value was exported via a SOL_PACKET level cmsg. AFter I completed that work it was requested that this feature be generalized so that any datagram oriented socket could make use of this option. As such I've created this patch, It creates a new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue overflowed between any two given frames. It also augments the AF_PACKET protocol to take advantage of this new feature (as it previously did not touch sk->sk_drops, which this patch uses to record the overflow count). Tested successfully by me. Notes: 1) Unlike my previous patch, this patch simply records the sk_drops value, which is not a number of drops between packets, but rather a total number of drops. Deltas must be computed in user space. 2) While this patch currently works with datagram oriented protocols, it will also be accepted by non-datagram oriented protocols. I'm not sure if thats agreeable to everyone, but my argument in favor of doing so is that, for those protocols which aren't applicable to this option, sk_drops will always be zero, and reporting no drops on a receive queue that isn't used for those non-participating protocols seems reasonable to me. This also saves us having to code in a per-protocol opt in mechanism. 3) This applies cleanly to net-next assuming that commit 977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted Signed-off-by: Neil Horman Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 7626b6aacd6..43ca2c99539 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -276,6 +276,8 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err = 0; int skb_len; + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; /* Cast sk->rcvbuf to unsigned... 
It's pointless, but reduces number of warnings when compiling with -W --ANK @@ -305,7 +307,10 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ skb_len = skb->len; - skb_queue_tail(&sk->sk_receive_queue, skb); + spin_lock_irqsave(&list->lock, flags); + skb->dropcount = atomic_read(&sk->sk_drops); + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, skb_len); @@ -702,6 +707,12 @@ set_rcvbuf: /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ + case SO_RXQ_OVFL: + if (valbool) + sock_set_flag(sk, SOCK_RXQ_OVFL); + else + sock_reset_flag(sk, SOCK_RXQ_OVFL); + break; default: ret = -ENOPROTOOPT; break; @@ -901,6 +912,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_mark; break; + case SO_RXQ_OVFL: + v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3 From 89d71a66c40d629e3b1285def543ab1425558cd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Oct 2009 05:34:20 +0000 Subject: net: Use netdev_alloc_skb_ip_align() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 510ff205d5d..28b0b9e992a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2604,20 +2604,13 @@ EXPORT_SYMBOL(napi_reuse_skb); struct sk_buff *napi_get_frags(struct napi_struct *napi) { - struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; if (!skb) { - skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); - if (!skb) - goto out; - - skb_reserve(skb, NET_IP_ALIGN); - - napi->skb = skb; + skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + if (skb) + napi->skb = skb; } - -out: return skb; } EXPORT_SYMBOL(napi_get_frags); -- cgit v1.2.3 From 766e9037cc139ee25ed93ee5ad11e1450c4b99f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Oct 2009 20:40:11 -0700 Subject: net: sk_drops consolidation sock_queue_rcv_skb() can update sk_drops itself, removing need for callers to take care of it. This is more consistent since sock_queue_rcv_skb() also reads sk_drops when queueing a skb. This adds sk_drops managment to many protocols that not cared yet. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/sock.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 43ca2c99539..38713aa3faf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -274,7 +274,7 @@ static void sock_disable_timestamp(struct sock *sk, int flag) int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err = 0; + int err; int skb_len; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -284,17 +284,17 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf) { - err = -ENOMEM; - goto out; + atomic_inc(&sk->sk_drops); + return -ENOMEM; } err = sk_filter(sk, skb); if (err) - goto out; + return err; if (!sk_rmem_schedule(sk, skb->truesize)) { - err = -ENOBUFS; - goto out; + atomic_inc(&sk->sk_drops); + return -ENOBUFS; } skb->dev = NULL; @@ -314,8 +314,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, skb_len); -out: - return err; + return 0; } EXPORT_SYMBOL(sock_queue_rcv_skb); -- cgit v1.2.3 From 8edf19c2fe028563fc6ea9cb1995b8ee4172d4b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Oct 2009 00:12:40 +0000 Subject: net: sk_drops consolidation part 2 - skb_kill_datagram() can increment sk->sk_drops itself, not callers. - UDP on IPV4 & IPV6 dropped frames (because of bad checksum or policy checks) increment sk_drops Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 1c6cf3a1a4f..4d57f5e12b0 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -262,6 +262,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } kfree_skb(skb); + atomic_inc(&sk->sk_drops); sk_mem_reclaim_partial(sk); return err; -- cgit v1.2.3 From 7e75f93eda027d9f9e0203ee6ffd210ea92e98f3 Mon Sep 17 00:00:00 2001 From: jamal Date: Mon, 19 Oct 2009 02:17:56 +0000 Subject: pkt_sched: ingress socket filter by mark Allow bpf to set a filter to drop packets that dont match a specific mark Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index d1d779ca096..e3987e1d483 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -303,6 +303,9 @@ load_b: case SKF_AD_IFINDEX: A = skb->dev->ifindex; continue; + case SKF_AD_MARK: + A = skb->mark; + continue; case SKF_AD_NLATTR: { struct nlattr *nla; -- cgit v1.2.3 From d19742fb1c68e6db83b76e06dea5a374c99e104f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Oct 2009 01:06:22 -0700 Subject: filter: Add SKF_AD_QUEUE instruction It can help being able to filter packets on their queue_mapping. If filter performance is not good, we could add a "numqueue" field in struct packet_type, so that netif_nit_deliver() and other functions can directly ignore packets with not expected queue number. Lets experiment this simple filter extension first. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/filter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e3987e1d483..08db7b9143a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -306,6 +306,9 @@ load_b: case SKF_AD_MARK: A = skb->mark; continue; + case SKF_AD_QUEUE: + A = skb->queue_mapping; + continue; case SKF_AD_NLATTR: { struct nlattr *nla; -- cgit v1.2.3 From e022f0b4a03f4fff9323b509df023b8af635716e Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 19 Oct 2009 23:46:20 +0000 Subject: net: Introduce sk_tx_queue_mapping Introduce sk_tx_queue_mapping; and functions that set, test and get this value. Reset sk_tx_queue_mapping to -1 whenever the dst cache is set/reset, and in socket alloc. Setting txq to -1 and using valid txq=<0 to n-1> allows the tx path to use the value of sk_tx_queue_mapping directly instead of subtracting 1 on every tx. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 38713aa3faf..934d9673f08 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -357,6 +357,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) struct dst_entry *dst = sk->sk_dst_cache; if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_tx_queue_clear(sk); sk->sk_dst_cache = NULL; dst_release(dst); return NULL; @@ -953,7 +954,8 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) void *sptr = nsk->sk_security; #endif BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) != - sizeof(osk->sk_node) + sizeof(osk->sk_refcnt)); + sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) + + sizeof(osk->sk_tx_queue_mapping)); memcpy(&nsk->sk_copy_start, &osk->sk_copy_start, osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start)); #ifdef CONFIG_SECURITY_NETWORK @@ -997,6 +999,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; + sk_tx_queue_clear(sk); } return sk; -- cgit v1.2.3 From ea94ff3b55188df157a8740bdf3976a87563d705 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 19 Oct 2009 23:46:45 +0000 Subject: net: Fix for dst_negative_advice dst_negative_advice() should check for changed dst and reset sk_tx_queue_mapping accordingly. Pass sock to the callers of dst_negative_advice. (sk_reset_txq is defined just for use by dst_negative_advice. The only way I could find to get around this is to move dst_negative_() from dst.h to dst.c, include sock.h in dst.c, etc) Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/sock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 934d9673f08..5a51512f638 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -352,6 +352,12 @@ discard_and_relse: } EXPORT_SYMBOL(sk_receive_skb); +void sk_reset_txq(struct sock *sk) +{ + sk_tx_queue_clear(sk); +} +EXPORT_SYMBOL(sk_reset_txq); + struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk->sk_dst_cache; -- cgit v1.2.3 From a4ee3ce3293dc931fab19beb472a8bde1295aebe Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 19 Oct 2009 23:50:07 +0000 Subject: net: Use sk_tx_queue_mapping for connected sockets For connected sockets, the first run of dev_pick_tx saves the calculated txq in sk_tx_queue_mapping. 
This is not saved if either the device has a queue select or the socket is not connected. Next iterations of dev_pick_tx uses the cached value of sk_tx_queue_mapping. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/dev.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 28b0b9e992a..fa88dcd476d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1791,13 +1791,25 @@ EXPORT_SYMBOL(skb_tx_hash); static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { - const struct net_device_ops *ops = dev->netdev_ops; - u16 queue_index = 0; + u16 queue_index; + struct sock *sk = skb->sk; + + if (sk_tx_queue_recorded(sk)) { + queue_index = sk_tx_queue_get(sk); + } else { + const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb); - else if (dev->real_num_tx_queues > 1) - queue_index = skb_tx_hash(dev, skb); + if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb); + } else { + queue_index = 0; + if (dev->real_num_tx_queues > 1) + queue_index = skb_tx_hash(dev, skb); + + if (sk && sk->sk_dst_cache) + sk_tx_queue_set(sk, queue_index); + } + } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); -- cgit v1.2.3 From a3d1289126e7b14307074b76bf1677015ea5036f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 21 Oct 2009 10:59:31 +0000 Subject: rtnetlink: rtnl_setlink() and rtnl_getlink() changes rtnl_getlink() & rtnl_setlink() run with RTNL held, we can use __dev_get_by_index() and __dev_get_by_name() variants and avoid dev_hold()/dev_put() Adds to rtnl_getlink() the capability to find a device by its name, not only by its index. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eb42873f2a3..ba13b0974a7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -910,9 +910,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = dev_get_by_name(net, ifname); + dev = __dev_get_by_name(net, ifname); else goto errout; @@ -922,11 +922,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } if ((err = validate_linkmsg(dev, tb)) < 0) - goto errout_dev; + goto errout; err = do_setlink(dev, ifm, tb, ifname, 0); -errout_dev: - dev_put(dev); errout: return err; } @@ -1154,6 +1152,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -1163,19 +1162,23 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (err < 0) return err; + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) { - dev = dev_get_by_index(net, ifm->ifi_index); - if (dev == NULL) - return -ENODEV; - } else + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else return -EINVAL; + if (dev == NULL) + return -ENODEV; + nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); - if (nskb == NULL) { - err = -ENOBUFS; - goto errout; - } + if (nskb == NULL) + return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, 0); @@ -1183,11 +1186,8 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); - goto errout; - } - err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); -errout: - dev_put(dev); + } else + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); return err; } -- cgit v1.2.3 From 7c28bd0b8ec4d128bd7660671d1b626b0abc471f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 24 Oct 2009 06:13:17 -0700 Subject: rtnetlink: speedup rtnl_dump_ifinfo() When handling large number of netdevice, rtnl_dump_ifinfo() is very slow because it has O(N^2) complexity. Instead of scanning one single list, we can use the 256 sub lists of the dev_index hash table. This considerably speedups "ip link" operations Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 7 ++----- net/core/rtnetlink.c | 37 ++++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index fa88dcd476d..e7bada1d5ed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -193,18 +193,15 @@ static struct list_head ptype_all __read_mostly; /* Taps */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); -#define NETDEV_HASHBITS 8 -#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) - static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { - return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } /* Device list insertion */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ba13b0974a7..52ea418d530 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -682,22 +682,33 @@ nla_put_failure: static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int idx; - int s_idx = cb->args[0]; + int h, s_h; + int idx = 0, s_idx; struct net_device *dev; - - idx = 0; - for_each_netdev(net, dev) { - if (idx < s_idx) - goto cont; - if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, 0, NLM_F_MULTI) <= 0) - break; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI) <= 0) + goto out; cont: - idx++; + idx++; + } } - cb->args[0] = idx; +out: + cb->args[1] = idx; + cb->args[0] = h; return skb->len; } -- cgit v1.2.3 From 05423b241311c9380b7280179295bac7794281b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 26 Oct 2009 18:40:35 -0700 Subject: vlan: allow null VLAN ID to be used We currently use a 16 bit field (vlan_tci) to store VLAN ID/PRIO on a skb. Null value is used as a special value, meaning vlan tagging not enabled. This forbids use of null vlan ID. As pointed by David, some drivers use the 3 high order bits (PRIO) As VLAN ID is 12 bits, we can use the remaining bit (CFI) as a flag, and allow null VLAN ID. In case future code really wants to use VLAN_CFI_MASK, we'll have to use a bit outside of vlan_tci. #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */ #define VLAN_TAG_PRESENT VLAN_CFI_MASK #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ Reported-by: Gertjan Hofman Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e7bada1d5ed..950c13fa60d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2300,7 +2300,7 @@ int netif_receive_skb(struct sk_buff *skb) if (!skb->tstamp.tv64) net_timestamp(skb); - if (skb->vlan_tci && vlan_hwaccel_do_receive(skb)) + if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; /* if we've gotten here through NAPI, check netpoll */ -- cgit v1.2.3 From 44a0873d52282f24b1894c58c0f157e0f626ddc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:03:04 +0000 Subject: net: Introduce unregister_netdevice_queue() This patchs adds an unreg_list anchor to struct net_device, and introduces an unregister_netdevice_queue() function, able to queue a net_device to a list instead of immediately unregister it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 950c13fa60d..ff94e2b8df7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5245,25 +5245,31 @@ void synchronize_net(void) EXPORT_SYMBOL(synchronize_net); /** - * unregister_netdevice - remove device from the kernel + * unregister_netdevice_queue - remove device from the kernel * @dev: device - * + * @head: list + * This function shuts down a device interface and removes it * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ -void unregister_netdevice(struct net_device *dev) +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + if (head) { + list_add_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); + } } -EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdev - remove device from the kernel -- cgit v1.2.3 From 9b5e383c11b08784eb0087617f880077982ef769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:04:19 +0000 Subject: net: Introduce unregister_netdevice_many() Introduce rollback_registered_many() and unregister_netdevice_many() rollback_registered_many() is able to perform necessary steps at device dismantle time, factorizing two expensive synchronize_net() calls. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 97 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ff94e2b8df7..04d3e301402 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4637,59 +4637,76 @@ static void net_set_todo(struct net_device *dev) list_add_tail(&dev->todo_list, &net_todo_list); } -static void rollback_registered(struct net_device *dev) +static void rollback_registered_many(struct list_head *head) { + struct net_device *dev; + BUG_ON(dev_boot_phase); ASSERT_RTNL(); - /* Some devices call without registering for initialization unwind. 
*/ - if (dev->reg_state == NETREG_UNINITIALIZED) { - printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " - "was registered\n", dev->name, dev); + list_for_each_entry(dev, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); - WARN_ON(1); - return; - } + WARN_ON(1); + return; + } - BUG_ON(dev->reg_state != NETREG_REGISTERED); + BUG_ON(dev->reg_state != NETREG_REGISTERED); - /* If device is running, close it first. */ - dev_close(dev); + /* If device is running, close it first. */ + dev_close(dev); - /* And unlink it from device chain. */ - unlist_netdevice(dev); + /* And unlink it from device chain. */ + unlist_netdevice(dev); - dev->reg_state = NETREG_UNREGISTERING; + dev->reg_state = NETREG_UNREGISTERING; + } synchronize_net(); - /* Shutdown queueing discipline. */ - dev_shutdown(dev); + list_for_each_entry(dev, head, unreg_list) { + /* Shutdown queueing discipline. */ + dev_shutdown(dev); - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* - * Flush the unicast and multicast chains - */ - dev_unicast_flush(dev); - dev_addr_discard(dev); + /* + * Flush the unicast and multicast chains + */ + dev_unicast_flush(dev); + dev_addr_discard(dev); - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); - /* Notifier chain MUST detach us from master device. */ - WARN_ON(dev->master); + /* Notifier chain MUST detach us from master device. */ + WARN_ON(dev->master); - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); + } synchronize_net(); - dev_put(dev); + list_for_each_entry(dev, head, unreg_list) + dev_put(dev); +} + +static void rollback_registered(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + rollback_registered_many(&single); } static void __netdev_init_queue_locks_one(struct net_device *dev, @@ -5271,6 +5288,22 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + * + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev; + + if (!list_empty(head)) { + rollback_registered_many(head); + list_for_each_entry(dev, head, unreg_list) + net_set_todo(dev); + } +} + /** * unregister_netdev - remove device from the kernel * @dev: device -- cgit v1.2.3 From 23289a37e2b127dfc4de1313fba15bb4c9f0cd5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:06:36 +0000 Subject: net: add a list_head parameter to dellink() method Adding a list_head parameter to rtnl_link_ops->dellink() methods allow us to queue devices on a list, in order to dismantle them all at once. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- net/core/rtnetlink.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 04d3e301402..4513dfd5718 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5629,7 +5629,7 @@ restart: /* Delete virtual devices */ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { - dev->rtnl_link_ops->dellink(dev); + dev->rtnl_link_ops->dellink(dev, NULL); goto restart; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 52ea418d530..391a62cd9df 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -248,7 +248,7 @@ static LIST_HEAD(link_ops); int __rtnl_link_register(struct rtnl_link_ops *ops) { if (!ops->dellink) - ops->dellink = unregister_netdevice; + ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); return 0; @@ -277,13 +277,13 @@ EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; -restart: + LIST_HEAD(list_kill); + for_each_netdev(net, dev) { - if (dev->rtnl_link_ops == ops) { - ops->dellink(dev); - goto restart; - } + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); } + unregister_netdevice_many(&list_kill); } void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) @@ -972,7 +972,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (!ops) return -EOPNOTSUPP; - ops->dellink(dev); + ops->dellink(dev, NULL); return 0; } -- cgit v1.2.3 From 63c8099d90096db56ee1c66c31f05d4fcfbc1c69 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:06:49 +0000 Subject: vlan: Optimize multiple unregistration Use unregister_netdevice_many() to speedup master device unregister. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4513dfd5718..09551cc143a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5303,6 +5303,7 @@ void unregister_netdevice_many(struct list_head *head) net_set_todo(dev); } } +EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel -- cgit v1.2.3 From ac5e3af9996fb911d4fdff910a8ac3cb7fc63a94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 26 Oct 2009 01:23:33 +0000 Subject: net: sysfs: ethtool_ops can be NULL commit d519e17e2d01a0ee9abe083019532061b4438065 (net: export device speed and duplex via sysfs) made the wrong assumption that netdev->ethtool_ops was always set. This makes possible to crash kernel and let rtnl in locked state. modprobe dummy ip link set dummy0 up (udev runs and crash) Signed-off-by: Eric Dumazet Acked-by: Andy Gospodarek Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 753c420060d..89de182353b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -139,7 +139,9 @@ static ssize_t show_speed(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + if (netif_running(netdev) && + netdev->ethtool_ops && + netdev->ethtool_ops->get_settings) { struct ethtool_cmd cmd = { ETHTOOL_GSET }; if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) @@ -158,7 +160,9 @@ static ssize_t show_duplex(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && netdev->ethtool_ops->get_settings) { + if (netif_running(netdev) && + netdev->ethtool_ops && + netdev->ethtool_ops->get_settings) { struct ethtool_cmd cmd = { ETHTOOL_GSET }; if (!netdev->ethtool_ops->get_settings(netdev, &cmd)) -- cgit v1.2.3 From fb699dfd426a189fe33b91586c15176a75c8aed0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Oct 2009 19:18:49 +0000 Subject: net: Introduce dev_get_by_index_rcu() Some workloads hit dev_base_lock rwlock pretty hard. We can use RCU lookups to avoid touching this rwlock. netdevices are already freed after a RCU grace period, so this patch adds no penalty at device dismantle time. dev_ifname() converted to dev_get_by_index_rcu() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 09551cc143a..68a1bb68b5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -214,12 +214,15 @@ static int list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail(&dev->dev_list, &net->dev_base_head); hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); - hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); return 0; } -/* Device list removal */ +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ static void unlist_netdevice(struct net_device *dev) { ASSERT_RTNL(); @@ -228,7 +231,7 @@ static void unlist_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_del(&dev->dev_list); hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); + hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); } @@ -646,6 +649,31 @@ struct net_device *__dev_get_by_index(struct net *net, int ifindex) } EXPORT_SYMBOL(__dev_get_by_index); +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. 
+ */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_index_rcu); + /** * dev_get_by_index - find a device by its ifindex @@ -662,11 +690,11 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); @@ -2939,15 +2967,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifr.ifr_ifindex); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); if (!dev) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); return -ENODEV; } strcpy(ifr.ifr_name, dev->name); - read_unlock(&dev_base_lock); + rcu_read_unlock(); if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; -- cgit v1.2.3 From 5b252f0c2f98df21fadf0f6cf189b87a0b938228 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 29 Oct 2009 07:17:09 +0000 Subject: gro: Name the GRO result enumeration type This clarifies which return and parameter types are GRO result codes and not RX result codes. Signed-off-by: Ben Hutchings Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 68a1bb68b5a..1dc13744684 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2476,7 +2476,7 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct packet_type *ptype; @@ -2484,7 +2484,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; int same_flow; int mac_len; - int ret; + enum gro_result ret; if (!(skb->dev->features & NETIF_F_GRO)) goto normal; @@ -2568,7 +2568,8 @@ normal: } EXPORT_SYMBOL(dev_gro_receive); -static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static gro_result_t +__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; @@ -2585,7 +2586,7 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return dev_gro_receive(napi, skb); } -int napi_skb_finish(int ret, struct sk_buff *skb) +int napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { int err = NET_RX_SUCCESS; @@ -2600,6 +2601,10 @@ int napi_skb_finish(int ret, struct sk_buff *skb) case GRO_MERGED_FREE: kfree_skb(skb); break; + + case GRO_HELD: + case GRO_MERGED: + break; } return err; @@ -2652,7 +2657,8 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) +int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) { int err = NET_RX_SUCCESS; @@ -2674,6 +2680,9 @@ int 
napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; + + case GRO_MERGED: + break; } return err; -- cgit v1.2.3 From c7c4b3b6e976b95facbb723951bdcd554a3530a4 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 29 Oct 2009 21:36:53 -0700 Subject: gro: Change all receive functions to return GRO result codes This will allow drivers to adjust their receive path dynamically based on whether GRO is being applied successfully. Currently all in-tree callers ignore the return values of these functions and do not need to be changed. Signed-off-by: Ben Hutchings Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1dc13744684..631cc40da19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2586,18 +2586,15 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return dev_gro_receive(napi, skb); } -int napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: - return netif_receive_skb(skb); + if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: kfree_skb(skb); break; @@ -2607,7 +2604,7 @@ int napi_skb_finish(gro_result_t ret, struct sk_buff *skb) break; } - return err; + return ret; } EXPORT_SYMBOL(napi_skb_finish); @@ -2627,7 +2624,7 @@ void skb_gro_reset_offset(struct sk_buff *skb) } EXPORT_SYMBOL(skb_gro_reset_offset); -int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); @@ -2657,26 +2654,21 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, - gro_result_t ret) +gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) { - int err = NET_RX_SUCCESS; - switch (ret) { case GRO_NORMAL: case GRO_HELD: skb->protocol = eth_type_trans(skb, napi->dev); - if (ret == GRO_NORMAL) - return netif_receive_skb(skb); - - skb_gro_pull(skb, -ETH_HLEN); + if (ret == GRO_HELD) + skb_gro_pull(skb, -ETH_HLEN); + else if (netif_receive_skb(skb)) + ret = GRO_DROP; break; case GRO_DROP: - err = NET_RX_DROP; - /* fall through */ - case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; @@ -2685,7 +2677,7 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, break; } - return err; + return ret; } EXPORT_SYMBOL(napi_frags_finish); @@ -2726,12 +2718,12 @@ out: } EXPORT_SYMBOL(napi_frags_skb); -int napi_gro_frags(struct napi_struct *napi) +gro_result_t napi_gro_frags(struct napi_struct *napi) { struct sk_buff *skb = napi_frags_skb(napi); if (!skb) - return NET_RX_DROP; + return GRO_DROP; return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); } -- cgit v1.2.3 From 0bd8d53656da72bc065766b5f2a05ca738020b8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 30 Oct 2009 01:40:11 -0700 Subject: net: use hlist_for_each_entry() Small cleanup of __dev_get_by_name() and __dev_get_by_index() to use hlist_for_each_entry() : They'll look like their _rcu variant. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 631cc40da19..94f42a15fff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -587,13 +587,13 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); - hlist_for_each(p, dev_name_hash(net, name)) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); + hlist_for_each_entry(dev, p, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; - } + return NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -638,13 +638,13 @@ EXPORT_SYMBOL(dev_get_by_name); struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); - hlist_for_each(p, dev_index_hash(net, ifindex)) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); + hlist_for_each_entry(dev, p, head, index_hlist) if (dev->ifindex == ifindex) return dev; - } + return NULL; } EXPORT_SYMBOL(__dev_get_by_index); -- cgit v1.2.3 From 0c509a6c9393b27a8c5a01acd4a72616206cfc24 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 29 Oct 2009 14:18:21 +0000 Subject: net: Allow devices to specify a device specific sysfs group. This isn't beautifully abstracted, but it is simple, simplifies uses and so far is only needed for the bonding driver. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 89de182353b..157645c0da7 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -544,8 +544,11 @@ int netdev_register_kobject(struct net_device *net) dev_set_name(dev, "%s", net->name); #ifdef CONFIG_SYSFS - *groups++ = &netstat_group; + /* Allow for a device specific group */ + if (*groups) + groups++; + *groups++ = &netstat_group; #ifdef CONFIG_WIRELESS_EXT_SYSFS if (net->ieee80211_ptr) *groups++ = &wireless_group; -- cgit v1.2.3 From 72c9528bab94cc052d00ce241b8e85f5d71e45f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 30 Oct 2009 07:11:27 +0000 Subject: net: Introduce dev_get_by_name_rcu() Some workloads hit dev_base_lock rwlock pretty hard. We can use RCU lookups to avoid touching this rwlock (and avoid touching netdevice refcount) netdevices are already freed after a RCU grace period, so this patch adds no penalty at device dismantle time. However, it adds a synchronize_rcu() call in dev_change_name() Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 94f42a15fff..f54d8b8a434 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -213,7 +213,7 @@ static int list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail(&dev->dev_list, &net->dev_base_head); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -230,7 +230,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del(&dev->dev_list); - hlist_del(&dev->name_hlist); + hlist_del_rcu(&dev->name_hlist); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); } @@ -598,6 +598,32 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name) } EXPORT_SYMBOL(__dev_get_by_name); +/** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); + /** * dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -614,11 +640,11 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); if (dev) dev_hold(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); @@ -960,7 +986,12 @@ rollback: write_lock_bh(&dev_base_lock); hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + synchronize_rcu(); + + write_lock_bh(&dev_base_lock); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -1062,9 +1093,9 @@ void dev_load(struct net *net, const char *name) { struct net_device *dev; - read_lock(&dev_base_lock); - dev = __dev_get_by_name(net, name); - read_unlock(&dev_base_lock); + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); if (!dev && capable(CAP_NET_ADMIN)) request_module("%s", name); -- cgit v1.2.3 From 9fdce099bb72df534daa6193318feaec177998fc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Oct 2009 14:51:13 +0000 Subject: veth: Fix unregister_netdevice_queue for veth I tested the recent unregister many changes and got a weird, nasty and seemingly unrelasted kernel oops. Changing unregister_netdevice_queue to use list_move_tail fixes the problem for me. 
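As a quick illustration of why list_move_tail() is the safer call here (a generic sketch, not part of the patch): list_add_tail() assumes the entry is not linked anywhere yet, while list_move_tail() unlinks it first, which is a no-op for an entry set up with INIT_LIST_HEAD() and harmless for one that is already queued on another list.

#include <linux/list.h>
#include <linux/netdevice.h>

/* Hypothetical helper, not from the patch: queue a device for later
 * unregistration. list_move_tail() is correct whether dev->unreg_list
 * is freshly initialised or already sitting on some other list;
 * list_add_tail() in the same spot would corrupt both lists in the
 * latter case.
 */
static void queue_for_unregister(struct net_device *dev,
				 struct list_head *head)
{
	list_move_tail(&dev->unreg_list, head);
}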
ip link add type veth rmmod veth ls /sys/class/net/ showed one of the veth devices still present. A subsequent ip link oopsed the box. Signed-off-by: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f54d8b8a434..3c40d545a03 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5258,6 +5258,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); @@ -5339,7 +5340,7 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) ASSERT_RTNL(); if (head) { - list_add_tail(&dev->unreg_list, head); + list_move_tail(&dev->unreg_list, head); } else { rollback_registered(dev); /* Finish processing unregister after unlock */ -- cgit v1.2.3 From 3710becf8a58a5c6c4e797e3a3c968c161abdb41 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 1 Nov 2009 19:42:09 +0000 Subject: net: RCU locking for simple ioctl() All ioctls() implemented by dev_ifsioc_locked() : SIOCGIFFLAGS, SIOCGIFMETRIC, SIOCGIFMTU, SIOCGIFHWADDR, SIOCGIFSLAVE, SIOCGIFMAP, SIOCGIFINDEX & SIOCGIFTXQLEN can use RCU lock instead of dev_base_lock rwlock Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3c40d545a03..76a1502efe6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4315,12 +4315,12 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) EXPORT_SYMBOL(dev_set_mac_address); /* - * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); if (!dev) return -ENODEV; @@ -4552,9 +4552,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGIFINDEX: case SIOCGIFTXQLEN: dev_load(net, ifr.ifr_name); - read_lock(&dev_base_lock); + rcu_read_lock(); ret = dev_ifsioc_locked(net, &ifr, cmd); - read_unlock(&dev_base_lock); + rcu_read_unlock(); if (!ret) { if (colon) *colon = ':'; -- cgit v1.2.3 From c6d14c84566d6b70ad9dc1618db0dec87cca9300 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Nov 2009 05:43:23 -0800 Subject: net: Introduce for_each_netdev_rcu() iterator Adds RCU management to the list of netdevices. Convert some for_each_netdev() users to RCU version, if it can avoid read_lock-ing dev_base_lock Ie: read_lock(&dev_base_loack); for_each_netdev(net, dev) some_action(); read_unlock(&dev_base_lock); becomes : rcu_read_lock(); for_each_netdev_rcu(net, dev) some_action(); rcu_read_unlock(); Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 76a1502efe6..bf629ac08b8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -175,7 +175,7 @@ static struct list_head ptype_all __read_mostly; /* Taps */ * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. 
* - * Pure readers hold dev_base_lock for reading. + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() * * Writers must hold the rtnl semaphore while they loop through the * dev_base_head list, and hold dev_base_lock for writing when they do the @@ -212,7 +212,7 @@ static int list_netdevice(struct net_device *dev) ASSERT_RTNL(); write_lock_bh(&dev_base_lock); - list_add_tail(&dev->dev_list, &net->dev_base_head); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); @@ -229,7 +229,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); - list_del(&dev->dev_list); + list_del_rcu(&dev->dev_list); hlist_del_rcu(&dev->name_hlist); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -799,15 +799,15 @@ struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, struct net_device *dev, *ret; ret = NULL; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { dev_hold(dev); ret = dev; break; } } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_get_by_flags); @@ -3077,18 +3077,18 @@ static int dev_ifconf(struct net *net, char __user *arg) * in detail. */ void *dev_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(RCU) { struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; - read_lock(&dev_base_lock); + rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; off = 1; - for_each_netdev(net, dev) + for_each_netdev_rcu(net, dev) if (off++ == *pos) return dev; @@ -3097,16 +3097,18 @@ void *dev_seq_start(struct seq_file *seq, loff_t *pos) void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net *net = seq_file_net(seq); + struct net_device *dev = (v == SEQ_START_TOKEN) ? + first_net_device(seq_file_net(seq)) : + next_net_device((struct net_device *)v); + ++*pos; - return v == SEQ_START_TOKEN ? - first_net_device(net) : next_net_device((struct net_device *)v); + return rcu_dereference(dev); } void dev_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(RCU) { - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) -- cgit v1.2.3 From bf8e56bfc4fcfcef9f08e6233dc619706807893a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Nov 2009 21:03:39 -0800 Subject: net: sock_bindtodevice() RCU-ification Avoid dev_hold()/dev_put() in sock_bindtodevice() Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/sock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 5a51512f638..38820eaecd4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -420,14 +420,16 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) if (devname[0] == '\0') { index = 0; } else { - struct net_device *dev = dev_get_by_name(net, devname); + struct net_device *dev; + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, devname); + if (dev) + index = dev->ifindex; + rcu_read_unlock(); ret = -ENODEV; if (!dev) goto out; - - index = dev->ifindex; - dev_put(dev); } lock_sock(sk); -- cgit v1.2.3 From baac8564547ac7f944af1c2e8cc6fdd57f2836a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Nov 2009 21:04:32 -0800 Subject: pktgen: tx_bytes might be slightly wrong cur_pkt_size can be changed in proc fs while pktgen is running, we better use a private field to get precise tx-bytes counter. Signed-off-by: Ben Greear Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5ce017bf4af..d38470a3279 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -340,6 +340,7 @@ struct pktgen_dev { __u16 cur_udp_src; __u16 cur_queue_map; __u32 cur_pkt_size; + __u32 last_pkt_size; __u8 hh[14]; /* = { @@ -3434,7 +3435,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->clone_count--; /* back out increment, OOM */ return; } - + pkt_dev->last_pkt_size = pkt_dev->skb->len; pkt_dev->allocated_skbs++; pkt_dev->clone_count = 0; /* reset counter */ } @@ -3461,7 +3462,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; - pkt_dev->tx_bytes += pkt_dev->cur_pkt_size; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; break; default: /* Drivers are not supposed to return other values! */ if (net_ratelimit()) -- cgit v1.2.3 From 000ba2e43f33901859fd794bb33c885909d53b3b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 5 Nov 2009 22:37:11 -0800 Subject: net: Fix build warning in sock_bindtodevice(). net/core/sock.c: In function 'sock_setsockopt': net/core/sock.c:396: warning: 'index' may be used uninitialized in this function net/core/sock.c:396: note: 'index' was declared here GCC can't see that all paths initialize index, so just set it to the default (0) and eliminate the specific code block that handles the null device name string. Signed-off-by: David S. Miller --- net/core/sock.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 38820eaecd4..76ff58d43e2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -417,9 +417,8 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) if (copy_from_user(devname, optval, optlen)) goto out; - if (devname[0] == '\0') { - index = 0; - } else { + index = 0; + if (devname[0] != '\0') { struct net_device *dev; rcu_read_lock(); -- cgit v1.2.3 From d3bcfefaca27c1bfc8f740f5fff5b25d52ed1211 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 6 Nov 2009 22:17:25 -0800 Subject: net: Replace old style lock initializer SPIN_LOCK_UNLOCKED is deprecated. Use DEFINE_SPINLOCK instead. Signed-off-by: Thomas Gleixner Signed-off-by: David S. 
Miller --- net/core/drop_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 0a113f26bc9..b8e9d3a8688 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -41,7 +41,7 @@ static void send_dm_alert(struct work_struct *unused); * netlink alerts */ static int trace_state = TRACE_OFF; -static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(trace_state_lock); struct per_cpu_dm_data { struct work_struct dm_alert_work; -- cgit v1.2.3 From e0d087af725b09358336098a6b57bb7f90f96175 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Nov 2009 01:26:17 -0800 Subject: rtnetlink: Cleanups Pure cleanups patch Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 54 ++++++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 391a62cd9df..e2f3317f290 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -38,7 +38,6 @@ #include #include -#include #include #include @@ -53,8 +52,7 @@ #include #include -struct rtnl_link -{ +struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; }; @@ -65,6 +63,7 @@ void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_lock); void __rtnl_unlock(void) { @@ -76,16 +75,19 @@ void rtnl_unlock(void) /* This fellow will unlock it for us. */ netdev_run_todo(); } +EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } +EXPORT_SYMBOL(rtnl_is_locked); static struct rtnl_link *rtnl_msg_handlers[NPROTO]; @@ -168,7 +170,6 @@ int __rtnl_register(int protocol, int msgtype, return 0; } - EXPORT_SYMBOL_GPL(__rtnl_register); /** @@ -188,7 +189,6 @@ void rtnl_register(int protocol, int msgtype, "protocol = %d, message type = %d\n", protocol, msgtype); } - EXPORT_SYMBOL_GPL(rtnl_register); /** @@ -213,7 +213,6 @@ int rtnl_unregister(int protocol, int msgtype) return 0; } - EXPORT_SYMBOL_GPL(rtnl_unregister); /** @@ -230,7 +229,6 @@ void rtnl_unregister_all(int protocol) kfree(rtnl_msg_handlers[protocol]); rtnl_msg_handlers[protocol] = NULL; } - EXPORT_SYMBOL_GPL(rtnl_unregister_all); static LIST_HEAD(link_ops); @@ -253,7 +251,6 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) list_add_tail(&ops->list, &link_ops); return 0; } - EXPORT_SYMBOL_GPL(__rtnl_link_register); /** @@ -271,7 +268,6 @@ int rtnl_link_register(struct rtnl_link_ops *ops) rtnl_unlock(); return err; } - EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) @@ -309,7 +305,6 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) } list_del(&ops->list); } - EXPORT_SYMBOL_GPL(__rtnl_link_unregister); /** @@ -322,7 +317,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops) __rtnl_link_unregister(ops); rtnl_unlock(); } - EXPORT_SYMBOL_GPL(rtnl_link_unregister); static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) @@ -427,12 +421,13 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data struct rtattr *rta; int size = RTA_LENGTH(attrlen); - rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); rta->rta_type = attrtype; rta->rta_len = size; memcpy(RTA_DATA(rta), data, attrlen); memset(RTA_DATA(rta) 
+ attrlen, 0, RTA_ALIGN(size) - size); } +EXPORT_SYMBOL(__rta_fill); int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) { @@ -454,6 +449,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) return nlmsg_unicast(rtnl, skb, pid); } +EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) @@ -466,6 +462,7 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, nlmsg_notify(rtnl, skb, pid, group, report, flags); } +EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { @@ -473,6 +470,7 @@ void rtnl_set_sk_err(struct net *net, u32 group, int error) netlink_set_err(rtnl, 0, group, error); } +EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { @@ -501,6 +499,7 @@ nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } +EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, u32 ts, u32 tsage, long expires, u32 error) @@ -520,14 +519,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } - EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = dev->operstate; - switch(transition) { + switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_UNKNOWN) && @@ -728,6 +726,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, }; +EXPORT_SYMBOL(ifla_policy); static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, @@ -932,7 +931,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) goto errout; } - if ((err = validate_linkmsg(dev, tb)) < 0) + err = validate_linkmsg(dev, tb); + if (err < 0) goto errout; err = do_setlink(dev, ifm, tb, ifname, 0); @@ -985,7 +985,8 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, unsigned int real_num_queues = 1; if (ops->get_tx_queues) { - err = ops->get_tx_queues(net, tb, &num_queues, &real_num_queues); + err = ops->get_tx_queues(net, tb, &num_queues, + &real_num_queues); if (err) goto err; } @@ -1026,6 +1027,7 @@ err_free: err: return ERR_PTR(err); } +EXPORT_SYMBOL(rtnl_create_link); static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { @@ -1059,7 +1061,8 @@ replay: else dev = NULL; - if ((err = validate_linkmsg(dev, tb)) < 0) + err = validate_linkmsg(dev, tb); + if (err < 0) return err; if (tb[IFLA_LINKINFO]) { @@ -1210,7 +1213,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) if (s_idx == 0) s_idx = 1; - for (idx=1; idxnlh->nlmsg_type-RTM_BASE; if (idx < s_idx || idx == PF_PACKET) continue; @@ -1277,7 +1280,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) return 0; - family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; if (family >= NPROTO) return -EAFNOSUPPORT; @@ -1310,7 +1313,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlh->nlmsg_len > min_len) { int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); - struct rtattr 
*attr = (void*)nlh + NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); while (RTA_OK(attr, attrlen)) { unsigned flavor = attr->rta_type; @@ -1416,14 +1419,3 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all); } -EXPORT_SYMBOL(__rta_fill); -EXPORT_SYMBOL(rtnetlink_put_metrics); -EXPORT_SYMBOL(rtnl_lock); -EXPORT_SYMBOL(rtnl_trylock); -EXPORT_SYMBOL(rtnl_unlock); -EXPORT_SYMBOL(rtnl_is_locked); -EXPORT_SYMBOL(rtnl_unicast); -EXPORT_SYMBOL(rtnl_notify); -EXPORT_SYMBOL(rtnl_set_sk_err); -EXPORT_SYMBOL(rtnl_create_link); -EXPORT_SYMBOL(ifla_policy); -- cgit v1.2.3 From 81adee47dfb608df3ad0b91d230fb3cef75f0060 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 8 Nov 2009 00:53:51 -0800 Subject: net: Support specifying the network namespace upon device creation. There is no good reason to not support userspace specifying the network namespace during device creation, and it makes it easier to create a network device and pass it to a child network namespace with a well known name. We have to be careful to ensure that the target network namespace for the new device exists through the life of the call. To keep that logic clear I have factored out the network namespace grabbing logic into rtnl_link_get_net. In addtion we need to continue to pass the source network namespace to the rtnl_link_ops.newlink method so that we can find the base device source network namespace. Signed-off-by: Eric W. Biederman Acked-by: Eric Dumazet --- net/core/rtnetlink.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e2f3317f290..33148a56819 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -733,6 +733,20 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_DATA] = { .type = NLA_NESTED }, }; +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net; + /* Examine the link attributes and figure out which + * network namespace we are talking about. 
+ */ + if (tb[IFLA_NET_NS_PID]) + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else + net = get_net(src_net); + return net; +} +EXPORT_SYMBOL(rtnl_link_get_net); + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { if (dev) { @@ -756,8 +770,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, int err; if (tb[IFLA_NET_NS_PID]) { - struct net *net; - net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + struct net *net = rtnl_link_get_net(dev_net(dev), tb); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; @@ -976,8 +989,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return 0; } -struct net_device *rtnl_create_link(struct net *net, char *ifname, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *src_net, struct net *net, + char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; @@ -985,7 +998,7 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, unsigned int real_num_queues = 1; if (ops->get_tx_queues) { - err = ops->get_tx_queues(net, tb, &num_queues, + err = ops->get_tx_queues(src_net, tb, &num_queues, &real_num_queues); if (err) goto err; @@ -995,16 +1008,16 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, if (!dev) goto err; + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; dev->real_num_tx_queues = real_num_queues; + if (strchr(dev->name, '%')) { err = dev_alloc_name(dev, dev->name); if (err < 0) goto err_free; } - dev_net_set(dev, net); - dev->rtnl_link_ops = ops; - if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); if (tb[IFLA_ADDRESS]) @@ -1083,6 +1096,7 @@ replay: if (1) { struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct net *dest_net; if (ops) { if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { @@ -1147,17 +1161,19 @@ replay: if (!ifname[0]) snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - dev = rtnl_create_link(net, ifname, ops, tb); + dest_net = rtnl_link_get_net(net, tb); + dev = rtnl_create_link(net, dest_net, ifname, ops, tb); if (IS_ERR(dev)) err = PTR_ERR(dev); else if (ops->newlink) - err = ops->newlink(dev, tb, data); + err = ops->newlink(net, dev, tb, data); else err = register_netdevice(dev); - if (err < 0 && !IS_ERR(dev)) free_netdev(dev); + + put_net(dest_net); return err; } } -- cgit v1.2.3 From e84af6ddef0e447c56b256a2bb5a90af11bdac2e Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 10 Nov 2009 14:11:01 +0000 Subject: skbuff: Do not allow skb recycling with disabled IRQs NAPI drivers try to recycle SKBs in their polling routine, but we generally don't know the context in which the polling will be called, and the skb recycling itself may require IRQs to be enabled. This patch adds irqs_disabled() test to the skb_recycle_check() routine, so that we'll not let the drivers hit the skb recycling path with IRQs disabled. As a side effect, this patch actually disables skb recycling for some [broken] drivers. E.g. gianfar driver grabs an irqsave spinlock during TX ring processing, and then tries to recycle an skb, and that caused the following badness: nf_conntrack version 0.5.0 (1008 buckets, 4032 max) ------------[ cut here ]------------ Badness at kernel/softirq.c:143 NIP: c003e3c4 LR: c423a528 CTR: c003e344 ... 
NIP [c003e3c4] local_bh_enable+0x80/0xc4 LR [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack] Call Trace: [c15d1b60] [c003e32c] local_bh_disable+0x1c/0x34 (unreliable) [c15d1b70] [c423a528] destroy_conntrack+0xd4/0x13c [nf_conntrack] [c15d1b80] [c02c6370] nf_conntrack_destroy+0x3c/0x70 Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 80a96166df3..941bac90748 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -493,6 +493,9 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size) { struct skb_shared_info *shinfo; + if (irqs_disabled()) + return 0; + if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) return 0; -- cgit v1.2.3 From 08e9897d512fe7a67e46209543b3815b57a36dc7 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 10 Nov 2009 07:20:34 +0000 Subject: netdev: fold name hash properly (v3) The full_name_hash function does not produce well distributed values in the lower bits, so most code uses hash_32() to fold it. This is really a bug introduced when name hashing was added, back in 2.5 when I added name hashing. hash_32 is all that is needed since full_name_hash returns unsigned int which is only 32 bits on 64 bit platforms. Also, there is no point in using hash_32 on ifindex, because the is naturally sequential and usually well distributed. Signed-off-by: Stephen Hemminger Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bf629ac08b8..ad8e320ceba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -196,7 +197,7 @@ EXPORT_SYMBOL(dev_base_lock); static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)]; + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) -- cgit v1.2.3 From 572a9d7b6fc7f20f573664063324c086be310c42 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 10 Nov 2009 06:14:14 +0000 Subject: net: allow to propagate errors through ->ndo_hard_start_xmit() Currently the ->ndo_hard_start_xmit() callbacks are only permitted to return one of the NETDEV_TX codes. This prevents any kind of error propagation for virtual devices, like queue congestion of the underlying device in case of layered devices, or unreachability in case of tunnels. This patches changes the NET_XMIT codes to avoid clashes with the NETDEV_TX codes and changes the two callers of dev_hard_start_xmit() to expect either errno codes, NET_XMIT codes or NETDEV_TX codes as return value. In case of qdisc_restart(), all non NETDEV_TX codes are mapped to NETDEV_TX_OK since no error propagation is possible when using qdiscs. In case of dev_queue_xmit(), the error is propagated upwards. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/core/dev.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ad8e320ceba..548340b5729 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1757,7 +1757,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { const struct net_device_ops *ops = dev->netdev_ops; - int rc; + int rc = NETDEV_TX_OK; if (likely(!skb->next)) { if (!list_empty(&ptype_all)) @@ -1805,6 +1805,8 @@ gso: nskb->next = NULL; rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc != NETDEV_TX_OK)) { + if (rc & ~NETDEV_TX_MASK) + goto out_kfree_gso_skb; nskb->next = skb->next; skb->next = nskb; return rc; @@ -1814,11 +1816,12 @@ gso: return NETDEV_TX_BUSY; } while (skb->next); - skb->destructor = DEV_GSO_CB(skb)->destructor; - +out_kfree_gso_skb: + if (likely(skb->next == NULL)) + skb->destructor = DEV_GSO_CB(skb)->destructor; out_kfree_skb: kfree_skb(skb); - return NETDEV_TX_OK; + return rc; } static u32 skb_tx_hashrnd; @@ -1906,6 +1909,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +static inline bool dev_xmit_complete(int rc) +{ + /* successful transmission */ + if (rc == NETDEV_TX_OK) + return true; + + /* error while transmitting, driver consumed skb */ + if (rc < 0) + return true; + + /* error while queueing to a different device, driver consumed skb */ + if (rc & NET_XMIT_MASK) + return true; + + return false; +} + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2003,8 +2023,8 @@ gso: HARD_TX_LOCK(dev, txq, cpu); if (!netif_tx_queue_stopped(txq)) { - rc = NET_XMIT_SUCCESS; - if (!dev_hard_start_xmit(skb, dev, txq)) { + rc = dev_hard_start_xmit(skb, dev, txq); + if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } -- cgit v1.2.3 From ed04642f753b1240fc65c2978b7365e93209972a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Nov 2009 21:54:04 +0000 Subject: net: check the return value of ndo_select_queue() Check the return value of ndo_select_queue(). If the value isn't smaller than the real_num_tx_queues, print a warning message, and reset it to zero. Signed-off-by: Changli Gao Signed-off-by: Eric Dumazet ---- Signed-off-by: David S. 
Miller --- net/core/dev.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 548340b5729..32045df1da5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1848,6 +1848,20 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) } EXPORT_SYMBOL(skb_tx_hash); +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + if (net_ratelimit()) { + WARN(1, "%s selects TX queue %d, but " + "real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + } + return 0; + } + return queue_index; +} + static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { @@ -1861,6 +1875,7 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev, if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); } else { queue_index = 0; if (dev->real_num_tx_queues > 1) -- cgit v1.2.3 From 9a1654ba0b50402a6bd03c7b0fe9b0200a5ea7b1 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sun, 15 Nov 2009 07:20:12 +0000 Subject: net: Optimize hard_start_xmit() return checking Recent changes in the TX error propagation require additional checking and masking of values returned from hard_start_xmit(), mainly to separate cases where skb was consumed. This aim can be simplified by changing the order of NETDEV_TX and NET_XMIT codes, because the latter are treated similarly to negative (ERRNO) values. After this change much simpler dev_xmit_complete() is also used in sch_direct_xmit(), so it is moved to netdevice.h. Additionally NET_RX definitions in netdevice.h are moved up from between TX codes to avoid confusion while reading the TX comment. Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/dev.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 32045df1da5..4b24d79414e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1924,23 +1924,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } -static inline bool dev_xmit_complete(int rc) -{ - /* successful transmission */ - if (rc == NETDEV_TX_OK) - return true; - - /* error while transmitting, driver consumed skb */ - if (rc < 0) - return true; - - /* error while queueing to a different device, driver consumed skb */ - if (rc & NET_XMIT_MASK) - return true; - - return false; -} - /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit -- cgit v1.2.3 From d83345adf96bc13a5e360f4649a2e68ef968dec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Nov 2009 03:36:51 +0000 Subject: net: add dev_txq_stats_fold() helper Some drivers ndo_get_stats() method need to perform txqueue stats folding. Move folding from dev_get_stats() to a new dev_txq_stats_fold() function Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d867522290b..c3e0578d29d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5168,6 +5168,32 @@ void netdev_run_todo(void) } } +/** + * dev_txq_stats_fold - fold tx_queues stats + * @dev: device to get statistics from + * @stats: struct net_device_stats to hold results + */ +void dev_txq_stats_fold(const struct net_device *dev, + struct net_device_stats *stats) +{ + unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } +} +EXPORT_SYMBOL(dev_txq_stats_fold); + /** * dev_get_stats - get network device statistics * @dev: device to get statistics from @@ -5182,25 +5208,9 @@ const struct net_device_stats *dev_get_stats(struct net_device *dev) if (ops->ndo_get_stats) return ops->ndo_get_stats(dev); - else { - unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; - struct net_device_stats *stats = &dev->stats; - unsigned int i; - struct netdev_queue *txq; - - for (i = 0; i < dev->num_tx_queues; i++) { - txq = netdev_get_tx_queue(dev, i); - tx_bytes += txq->tx_bytes; - tx_packets += txq->tx_packets; - tx_dropped += txq->tx_dropped; - } - if (tx_bytes || tx_packets || tx_dropped) { - stats->tx_bytes = tx_bytes; - stats->tx_packets = tx_packets; - stats->tx_dropped = tx_dropped; - } - return stats; - } + + dev_txq_stats_fold(dev, &dev->stats); + return &dev->stats; } EXPORT_SYMBOL(dev_get_stats); -- cgit v1.2.3 From 395264d509aec45149745843d9a737140a1ece16 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Mon, 16 Nov 2009 13:49:35 +0000 Subject: net: introduce NETDEV_UNREGISTER_PERNET This new event is called once for each unique net namespace in batched unregister operations (with the argument set to a random device from that namespace) and once per device in non-batched unregister operations. It allows us to factorize some device unregister work such as clearing the routing cache. Signed-off-by: Octavian Purdila Signed-off-by: David S. 
Miller --- net/core/dev.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c3e0578d29d..e25fe5d9343 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1344,6 +1344,7 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev); } } @@ -4721,7 +4722,8 @@ static void net_set_todo(struct net_device *dev) static void rollback_registered_many(struct list_head *head) { - struct net_device *dev; + struct net_device *dev, *aux, *fdev; + LIST_HEAD(pernet_list); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -4779,8 +4781,24 @@ static void rollback_registered_many(struct list_head *head) synchronize_net(); - list_for_each_entry(dev, head, unreg_list) + list_for_each_entry_safe(dev, aux, head, unreg_list) { + int new_net = 1; + list_for_each_entry(fdev, &pernet_list, unreg_list) { + if (dev_net(dev) == dev_net(fdev)) { + new_net = 0; + dev_put(dev); + break; + } + } + if (new_net) + list_move(&dev->unreg_list, &pernet_list); + } + + list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) { + call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); + list_move(&dev->unreg_list, head); dev_put(dev); + } } static void rollback_registered(struct net_device *dev) @@ -5074,6 +5092,8 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users + * should have already handle it the first time */ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -5385,6 +5405,10 @@ EXPORT_SYMBOL(unregister_netdevice_queue); * unregister_netdevice_many - unregister many devices * @head: list of devices * + * WARNING: Calling this modifies the given list + * (in rollback_registered_many). It may change the order of the elements + * in the list. However, you can assume it does not add or delete elements + * to/from the list. */ void unregister_netdevice_many(struct list_head *head) { @@ -5504,6 +5528,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); /* * Flush the unicast and multicast chains -- cgit v1.2.3 From e014debecd3ee3832e6476b3a9c948edfcfd1250 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Nov 2009 05:59:21 +0000 Subject: linkwatch: linkwatch_forget_dev() to speedup device dismantle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Herbert Xu a écrit : > On Tue, Nov 17, 2009 at 04:26:04AM -0800, David Miller wrote: >> Really, the link watch stuff is just due for a redesign. I don't >> think a simple hack is going to cut it this time, sorry Eric :-) > > I have no objections against any redesigns, but since the only > caller of linkwatch_forget_dev runs in process context with the > RTNL, it could also legally emit those events. Thanks guys, here an updated version then, before linkwatch surgery ? In this version, I force the event to be sent synchronously. 
[PATCH net-next-2.6] linkwatch: linkwatch_forget_dev() to speedup device dismantle time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105 real 0m0.266s user 0m0.000s sys 0m0.001s real 0m0.770s user 0m0.000s sys 0m0.000s real 0m1.022s user 0m0.000s sys 0m0.000s One problem of current schem in vlan dismantle phase is the holding of device done by following chain : vlan_dev_stop() -> netif_carrier_off(dev) -> linkwatch_fire_event(dev) -> dev_hold() ... And __linkwatch_run_queue() runs up to one second later... A generic fix to this problem is to add a linkwatch_forget_dev() method to unlink the device from the list of watched devices. dev->link_watch_next becomes dev->link_watch_list (and use a bit more memory), to be able to unlink device in O(1). After patch : time ip link del eth3.103 ; time ip link del eth3.104 ; time ip link del eth3.105 real 0m0.024s user 0m0.000s sys 0m0.000s real 0m0.032s user 0m0.000s sys 0m0.001s real 0m0.033s user 0m0.000s sys 0m0.000s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++ net/core/link_watch.c | 94 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e25fe5d9343..c128af708eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5085,6 +5085,8 @@ static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + linkwatch_forget_dev(dev); + rebroadcast_time = warning_time = jiffies; while (atomic_read(&dev->refcnt) != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { @@ -5311,6 +5313,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->link_watch_list); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index bf8f7af699d..5910b555a54 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -35,7 +35,7 @@ static unsigned long linkwatch_nextevent; static void linkwatch_event(struct work_struct *dummy); static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); -static struct net_device *lweventlist; +static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) @@ -89,8 +89,10 @@ static void linkwatch_add_event(struct net_device *dev) unsigned long flags; spin_lock_irqsave(&lweventlist_lock, flags); - dev->link_watch_next = lweventlist; - lweventlist = dev; + if (list_empty(&dev->link_watch_list)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + dev_hold(dev); + } spin_unlock_irqrestore(&lweventlist_lock, flags); } @@ -133,9 +135,35 @@ static void linkwatch_schedule_work(int urgent) } +static void linkwatch_do_dev(struct net_device *dev) +{ + /* + * Make sure the above read is complete since it can be + * rewritten as soon as we clear the bit below. 
+ */ + smp_mb__before_clear_bit(); + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + rfc2863_policy(dev); + if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) + dev_activate(dev); + else + dev_deactivate(dev); + + netdev_state_change(dev); + } + dev_put(dev); +} + static void __linkwatch_run_queue(int urgent_only) { - struct net_device *next; + struct net_device *dev; + LIST_HEAD(wrk); /* * Limit the number of linkwatch events to one @@ -153,46 +181,40 @@ static void __linkwatch_run_queue(int urgent_only) clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); - next = lweventlist; - lweventlist = NULL; - spin_unlock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &wrk); - while (next) { - struct net_device *dev = next; + while (!list_empty(&wrk)) { - next = dev->link_watch_next; + dev = list_first_entry(&wrk, struct net_device, link_watch_list); + list_del_init(&dev->link_watch_list); if (urgent_only && !linkwatch_urgent_event(dev)) { - linkwatch_add_event(dev); + list_add_tail(&dev->link_watch_list, &lweventlist); continue; } - - /* - * Make sure the above read is complete since it can be - * rewritten as soon as we clear the bit below. - */ - smp_mb__before_clear_bit(); - - /* We are about to handle this device, - * so new events can be accepted - */ - clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); - - rfc2863_policy(dev); - if (dev->flags & IFF_UP) { - if (netif_carrier_ok(dev)) - dev_activate(dev); - else - dev_deactivate(dev); - - netdev_state_change(dev); - } - - dev_put(dev); + spin_unlock_irq(&lweventlist_lock); + linkwatch_do_dev(dev); + spin_lock_irq(&lweventlist_lock); } - if (lweventlist) + if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); + spin_unlock_irq(&lweventlist_lock); +} + +void linkwatch_forget_dev(struct net_device *dev) +{ + unsigned long flags; + int clean = 0; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (!list_empty(&dev->link_watch_list)) { + list_del_init(&dev->link_watch_list); + clean = 1; + } + spin_unlock_irqrestore(&lweventlist_lock, flags); + if (clean) + linkwatch_do_dev(dev); } @@ -216,8 +238,6 @@ void linkwatch_fire_event(struct net_device *dev) bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { - dev_hold(dev); - linkwatch_add_event(dev); } else if (!urgent) return; -- cgit v1.2.3 From d90310243fd750240755e217c5faa13e24f41536 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 18 Nov 2009 02:36:59 +0000 Subject: net: device name allocation cleanups Signed-off-by: Octavian Purdila Signed-off-by: David S. 
Miller --- net/core/dev.c | 69 ++++++++++++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 45 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c128af708eb..9977288583b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -893,7 +893,8 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) free_page((unsigned long) inuse); } - snprintf(buf, IFNAMSIZ, name, i); + if (buf != name) + snprintf(buf, IFNAMSIZ, name, i); if (!__dev_get_by_name(net, buf)) return i; @@ -933,6 +934,21 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); +static int dev_get_valid_name(struct net *net, const char *name, char *buf, + bool fmt) +{ + if (!dev_valid_name(name)) + return -EINVAL; + + if (fmt && strchr(name, '%')) + return __dev_alloc_name(net, name, buf); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (buf != name) + strlcpy(buf, name, IFNAMSIZ); + + return 0; +} /** * dev_change_name - change name of a device @@ -956,22 +972,14 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - if (!dev_valid_name(newname)) - return -EINVAL; - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) return 0; memcpy(oldname, dev->name, IFNAMSIZ); - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - } else if (__dev_get_by_name(net, newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); + err = dev_get_valid_name(net, newname, dev->name, 1); + if (err < 0) + return err; rollback: /* For now only devices in the initial network namespace @@ -4883,8 +4891,6 @@ EXPORT_SYMBOL(netdev_fix_features); int register_netdevice(struct net_device *dev) { - struct hlist_head *head; - struct hlist_node *p; int ret; struct net *net = dev_net(dev); @@ -4913,26 +4919,14 @@ int register_netdevice(struct net_device *dev) } } - if (!dev_valid_name(dev->name)) { - ret = -EINVAL; + ret = dev_get_valid_name(net, dev->name, dev->name, 0); + if (ret) goto err_uninit; - } dev->ifindex = dev_new_index(net); if (dev->iflink == -1) dev->iflink = dev->ifindex; - /* Check for existence of name */ - head = dev_name_hash(net, dev->name); - hlist_for_each(p, head) { - struct net_device *d - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(d->name, dev->name, IFNAMSIZ)) { - ret = -EEXIST; - goto err_uninit; - } - } - /* Fix illegal checksum combinations */ if ((dev->features & NETIF_F_HW_CSUM) && (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { @@ -5460,8 +5454,6 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - char buf[IFNAMSIZ]; - const char *destname; int err; ASSERT_RTNL(); @@ -5494,20 +5486,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char * we can use it in the destination network namespace. 
*/ err = -EEXIST; - destname = dev->name; - if (__dev_get_by_name(net, destname)) { + if (__dev_get_by_name(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; - if (!dev_valid_name(pat)) - goto out; - if (strchr(pat, '%')) { - if (__dev_alloc_name(net, pat, buf) < 0) - goto out; - destname = buf; - } else - destname = pat; - if (__dev_get_by_name(net, destname)) + if (dev_get_valid_name(net, pat, dev->name, 1)) goto out; } @@ -5544,10 +5527,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Actually switch the network namespace */ dev_net_set(dev, net); - /* Assign the new device name */ - if (destname != dev->name) - strcpy(dev->name, destname); - /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) { int iflink = (dev->iflink == dev->ifindex); -- cgit v1.2.3 From 8964be4a9a5ca8cab1219bb046db2f6d1936227c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Nov 2009 15:35:04 -0800 Subject: net: rename skb->iif to skb->skb_iif To help grep games, rename iif to skb_iif Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/skbuff.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9977288583b..09f3d6b9c0c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2287,7 +2287,7 @@ static int ing_filter(struct sk_buff *skb) if (MAX_RED_LOOP < ttl++) { printk(KERN_WARNING "Redir loop detected Dropping packet (%d->%d)\n", - skb->iif, dev->ifindex); + skb->skb_iif, dev->ifindex); return TC_ACT_SHOT; } @@ -2395,8 +2395,8 @@ int netif_receive_skb(struct sk_buff *skb) if (netpoll_receive_skb(skb)) return NET_RX_DROP; - if (!skb->iif) - skb->iif = skb->dev->ifindex; + if (!skb->skb_iif) + skb->skb_iif = skb->dev->ifindex; null_or_orig = NULL; orig_dev = skb->dev; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 739b8f4dd32..bfa3e7865a8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -549,7 +549,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->protocol = old->protocol; new->mark = old->mark; - new->iif = old->iif; + new->skb_iif = old->skb_iif; __nf_copy(new, old); #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) -- cgit v1.2.3 From 6ebfbc065624790772398f5b327ac33a7ae3880b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 22 Nov 2009 20:43:13 -0800 Subject: net: Fix missing kernel-doc notation Fix the following htmldocs warning: Warning(net/core/dev.c:5378): bad line: Signed-off-by: Jaswinder Singh Rajput Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 09f3d6b9c0c..ccefa2473c3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5375,7 +5375,7 @@ EXPORT_SYMBOL(synchronize_net); * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list - + * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. 
-- cgit v1.2.3 From 09ad9bc752519cc167d0a573e1acf69b5c707c67 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 25 Nov 2009 15:14:13 -0800 Subject: net: use net_eq to compare nets Generated with the following semantic patch @@ struct net *n1; struct net *n2; @@ - n1 == n2 + net_eq(n1, n2) @@ struct net *n1; struct net *n2; @@ - n1 != n2 + !net_eq(n1, n2) applied over {include,net,drivers/net}. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/neighbour.c | 2 +- net/core/net-sysfs.c | 4 ++-- net/core/net_namespace.c | 2 +- net/core/sysctl_net_core.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ccefa2473c3..e65af604141 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -985,7 +985,7 @@ rollback: /* For now only devices in the initial network namespace * are in sysfs. */ - if (net == &init_net) { + if (net_eq(net, &init_net)) { ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); @@ -4792,7 +4792,7 @@ static void rollback_registered_many(struct list_head *head) list_for_each_entry_safe(dev, aux, head, unreg_list) { int new_net = 1; list_for_each_entry(fdev, &pernet_list, unreg_list) { - if (dev_net(dev) == dev_net(fdev)) { + if (net_eq(dev_net(dev), dev_net(fdev))) { new_net = 0; dev_put(dev); break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e587e681969..a08a35bf0a7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2092,7 +2092,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (h > s_h) s_idx = 0; for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { - if (dev_net(n->dev) != net) + if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) goto next; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 157645c0da7..fbc1c7472c5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -525,7 +525,7 @@ void netdev_unregister_kobject(struct net_device * net) kobject_get(&dev->kobj); - if (dev_net(net) != &init_net) + if (!net_eq(dev_net(net), &init_net)) return; device_del(dev); @@ -559,7 +559,7 @@ int netdev_register_kobject(struct net_device *net) #endif #endif /* CONFIG_SYSFS */ - if (dev_net(net) != &init_net) + if (!net_eq(dev_net(net), &init_net)) return 0; return device_add(dev); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1c1af2756f3..86ed7f44d08 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -280,7 +280,7 @@ out_undo: list_del(&ops->list); if (ops->exit) { for_each_net(undo_net) { - if (undo_net == net) + if (net_eq(undo_net, net)) goto undone; ops->exit(undo_net); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7db1de0497c..fcfc5458c39 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -134,7 +134,7 @@ static __net_init int sysctl_core_net_init(struct net *net) net->core.sysctl_somaxconn = SOMAXCONN; tbl = netns_core_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; -- cgit v1.2.3 From 445409602c09219767c06497c0dc2285eac244ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Nov 2009 06:07:08 +0000 Subject: veth: move loopback logic to common location The veth driver contains code to forward an skb from the start_xmit function of one network device into the receive path of another device. 
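To show how the helper introduced in the diff below (dev_forward_skb()) is meant to be consumed, here is a hedged sketch of the transmit routine of a paired virtual device; the driver, get_peer() and the stats handling are hypothetical, and freeing of the skb on the failure paths is elided — see the real veth driver for the complete treatment.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical ndo_start_xmit: hand every frame straight to the peer
 * device's receive path instead of to real hardware.
 */
static netdev_tx_t pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = get_peer(dev);	/* hypothetical peer lookup */
	unsigned int len = skb->len;			/* sample before handing the skb off */

	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}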
Moving that code into a common location lets us reuse the code for direct forwarding of data between macvlan ports, and possibly in other drivers. Signed-off-by: Arnd Bergmann Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e65af604141..7775e8b48de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include #include @@ -1419,6 +1420,45 @@ static inline void net_timestamp(struct sk_buff *skb) skb->tstamp.tv64 = 0; } +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. + */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + skb_orphan(skb); + + if (!(dev->flags & IFF_UP)) + return NET_RX_DROP; + + if (skb->len > (dev->mtu + dev->hard_header_len)) + return NET_RX_DROP; + + skb_dst_drop(skb); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, dev); + skb->mark = 0; + secpath_reset(skb); + nf_reset(skb); + return netif_rx(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + /* * Support routine. Sends outgoing frames to any network * taps currently in use. -- cgit v1.2.3 From 3291b9db567e1ec38362024049cbd02987b1e277 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Nov 2009 00:44:33 -0800 Subject: pktgen: NUMA aware pktgen threads are bound to given CPU, we can allocate memory for these threads in a NUMA aware way. After a pktgen session on two threads, we can check flows memory was allocated on right node, instead of a not related one. # grep pktgen_thread_write /proc/vmallocinfo 0xffffc90007204000-0xffffc90007385000 1576960 pktgen_thread_write+0x3a4/0x6b0 [pktgen] pages=384 vmalloc N0=384 0xffffc90007386000-0xffffc90007507000 1576960 pktgen_thread_write+0x3a4/0x6b0 [pktgen] pages=384 vmalloc N1=384 Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b44104d3e6b..19ee493ec17 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3626,6 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) { struct pktgen_dev *pkt_dev; int err; + int node = cpu_to_node(t->cpu); /* We don't allow a device to be on several threads */ @@ -3635,12 +3636,13 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) return -EBUSY; } - pkt_dev = kzalloc(sizeof(struct pktgen_dev), GFP_KERNEL); + pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node); if (!pkt_dev) return -ENOMEM; strcpy(pkt_dev->odevname, ifname); - pkt_dev->flows = vmalloc(MAX_CFLOWS * sizeof(struct flow_state)); + pkt_dev->flows = vmalloc_node(MAX_CFLOWS * sizeof(struct flow_state), + node); if (pkt_dev->flows == NULL) { kfree(pkt_dev); return -ENOMEM; @@ -3702,7 +3704,8 @@ static int __init pktgen_create_thread(int cpu) struct proc_dir_entry *pe; struct task_struct *p; - t = kzalloc(sizeof(struct pktgen_thread), GFP_KERNEL); + t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL, + cpu_to_node(cpu)); if (!t) { printk(KERN_ERR "pktgen: ERROR: out of memory, can't " "create new thread.\n"); -- cgit v1.2.3 From f64f9e719261a87818dd192a3a2352e5b20fbd0f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 29 Nov 2009 16:55:45 -0800 Subject: net: Move && and || to end of previous line Not including net/atm/ Compiled tested x86 allyesconfig only Added a > 80 column line or two, which I ignored. Existing checkpatch plaints willfully, cheerfully ignored. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++--- net/core/pktgen.c | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7775e8b48de..5d131c2f84c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2677,9 +2677,10 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return GRO_NORMAL; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev) - && !compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = + (p->dev == skb->dev) && + !compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 19ee493ec17..a23b45f08ec 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2052,9 +2052,8 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) read_lock_bh(&idev->lock); for (ifp = idev->addr_list; ifp; ifp = ifp->if_next) { - if (ifp->scope == IFA_LINK - && !(ifp-> - flags & IFA_F_TENTATIVE)) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & IFA_F_TENTATIVE)) { ipv6_addr_copy(&pkt_dev-> cur_in6_saddr, &ifp->addr); -- cgit v1.2.3 From a5ee155136b4a8f4ab0e4c9c064b661da475e298 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 29 Nov 2009 15:45:58 +0000 Subject: net: NETDEV_UNREGISTER_PERNET -> NETDEV_UNREGISTER_BATCH The motivation for an additional notifier in batched netdevice notification (rt_do_flush) only needs to be called once per batch not once per namespace. 
For further batching improvements I need a guarantee that the netdevices are unregistered in order, allowing me to unregister all of the network devices in a network namespace at the same time, with the guarantee that the loopback device is really and truly unregistered last. Additionally it appears that we moved the route cache flush after the final synchronize_net, which seems wrong and there was no explanation. So I have restored the original location of the final synchronize_net. Cc: Octavian Purdila Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++--------------------- 1 file changed, 9 insertions(+), 27 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5d131c2f84c..bb37ee1e090 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1353,7 +1353,7 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_PERNET, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } @@ -4771,8 +4771,7 @@ static void net_set_todo(struct net_device *dev) static void rollback_registered_many(struct list_head *head) { - struct net_device *dev, *aux, *fdev; - LIST_HEAD(pernet_list); + struct net_device *dev; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -4828,26 +4827,14 @@ static void rollback_registered_many(struct list_head *head) netdev_unregister_kobject(dev); } - synchronize_net(); + /* Process any work delayed until the end of the batch */ + dev = list_entry(head->next, struct net_device, unreg_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - list_for_each_entry_safe(dev, aux, head, unreg_list) { - int new_net = 1; - list_for_each_entry(fdev, &pernet_list, unreg_list) { - if (net_eq(dev_net(dev), dev_net(fdev))) { - new_net = 0; - dev_put(dev); - break; - } - } - if (new_net) - list_move(&dev->unreg_list, &pernet_list); - } + synchronize_net(); - list_for_each_entry_safe(dev, aux, &pernet_list, unreg_list) { - call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); - list_move(&dev->unreg_list, head); + list_for_each_entry(dev, head, unreg_list) dev_put(dev); - } } static void rollback_registered(struct net_device *dev) @@ -5129,7 +5116,7 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* don't resend NETDEV_UNREGISTER_PERNET, _PERNET users + /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users * should have already handle it the first time */ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, @@ -5442,11 +5429,6 @@ EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdevice_many - unregister many devices * @head: list of devices - * - * WARNING: Calling this modifies the given list - * (in rollback_registered_many). It may change the order of the elements - * in the list. However, you can assume it does not add or delete elements - * to/from the list. */ void unregister_netdevice_many(struct list_head *head) { @@ -5555,7 +5537,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - call_netdevice_notifiers(NETDEV_UNREGISTER_PERNET, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); /* * Flush the unicast and multicast chains -- cgit v1.2.3
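As an illustrative aside on the renamed notifier, here is a rough sketch of how a subsystem might defer its expensive flush to NETDEV_UNREGISTER_BATCH so it runs once per unregister batch rather than once per device; the subsystem name, flush routine and debug message are invented for illustration, and the notifier signature follows the 2.6.32-era convention of passing the net_device pointer directly:

#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

static void sample_flush_cache(void)
{
	/* Batch-wide, expensive flush (rt_do_flush-style work would go here). */
}

static int sample_netdev_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UNREGISTER:
		/* Cheap per-device teardown only; this fires once per device. */
		pr_debug("sample: dropping state for %s\n", dev->name);
		break;
	case NETDEV_UNREGISTER_BATCH:
		/* Fires once per batch, with dev pointing at the first device. */
		sample_flush_cache();
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block sample_netdev_notifier = {
	.notifier_call = sample_netdev_event,
};

static int __init sample_init(void)
{
	return register_netdevice_notifier(&sample_netdev_notifier);
}
device_initcall(sample_init);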
Biederman" Date: Sun, 29 Nov 2009 22:25:27 +0000 Subject: net: Batch network namespace destruction. It is fairly common to kill several network namespaces at once. Either because they are nested one inside the other or because they are cooperating in multiple machine networking experiments. As the network stack control logic does not parallelize easily batch up multiple network namespaces existing together. To get the full benefit of batching the virtual network devices to be removed must be all removed in one batch. For that purpose I have added a loop after the last network device operations have run that batches up all remaining network devices and deletes them. An extra benefit is that the reorganization slightly shrinks the size of the per network namespace data structures replaceing a work_struct with a list_head. In a trivial test with 4K namespaces this change reduced the cost of a destroying 4K namespaces from 7+ minutes (at 12% cpu) to 44 seconds (at 60% cpu). The bulk of that 44s was spent in inet_twsk_purge. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 66 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 86ed7f44d08..a42caa2b909 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include /* * Our network namespace constructor/destructor lists @@ -27,6 +29,20 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static void unregister_netdevices(struct net *net, struct list_head *list) +{ + struct net_device *dev; + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registeration. + */ + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, list); + else + unregister_netdevice_queue(dev, list); + } +} + /* * setup_net runs the initializers for the network namespace object. */ @@ -59,6 +75,13 @@ out_undo: list_for_each_entry_continue_reverse(ops, &pernet_list, list) { if (ops->exit) ops->exit(net); + if (&ops->list == first_device) { + LIST_HEAD(dev_kill_list); + rtnl_lock(); + unregister_netdevices(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } } rcu_barrier(); @@ -147,18 +170,26 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) return net_create(); } +static DEFINE_SPINLOCK(cleanup_list_lock); +static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ + static void cleanup_net(struct work_struct *work) { struct pernet_operations *ops; - struct net *net; + struct net *net, *tmp; + LIST_HEAD(net_kill_list); - net = container_of(work, struct net, work); + /* Atomically snapshot the list of namespaces to cleanup */ + spin_lock_irq(&cleanup_list_lock); + list_replace_init(&cleanup_list, &net_kill_list); + spin_unlock_irq(&cleanup_list_lock); mutex_lock(&net_mutex); /* Don't let anyone else find us. 
*/ rtnl_lock(); - list_del_rcu(&net->list); + list_for_each_entry(net, &net_kill_list, cleanup_list) + list_del_rcu(&net->list); rtnl_unlock(); /* @@ -170,8 +201,18 @@ static void cleanup_net(struct work_struct *work) /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); + if (ops->exit) { + list_for_each_entry(net, &net_kill_list, cleanup_list) + ops->exit(net); + } + if (&ops->list == first_device) { + LIST_HEAD(dev_kill_list); + rtnl_lock(); + list_for_each_entry(net, &net_kill_list, cleanup_list) + unregister_netdevices(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } } mutex_unlock(&net_mutex); @@ -182,14 +223,23 @@ static void cleanup_net(struct work_struct *work) rcu_barrier(); /* Finally it is safe to free my network namespace structure */ - net_free(net); + list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) { + list_del_init(&net->cleanup_list); + net_free(net); + } } +static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { /* Cleanup the network namespace in process context */ - INIT_WORK(&net->work, cleanup_net); - queue_work(netns_wq, &net->work); + unsigned long flags; + + spin_lock_irqsave(&cleanup_list_lock, flags); + list_add(&net->cleanup_list, &cleanup_list); + spin_unlock_irqrestore(&cleanup_list_lock, flags); + + queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); -- cgit v1.2.3 From f875bae065334907796da12523f9df85c89f5712 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 29 Nov 2009 22:25:28 +0000 Subject: net: Automatically allocate per namespace data. To get the full benefit of batched network namespace cleanup netowrk device deletion needs to be performed by the generic code. When using register_pernet_gen_device and freeing the data in exit_net it is impossible to delay allocation until after exit_net has called as the device uninit methods are no longer safe. To correct this, and to simplify working with per network namespace data I have moved allocation and deletion of per network namespace data into the network namespace core. The core now frees the data only after all of the network namespace exit routines have run. Now it is only required to set the new fields .id and .size in the pernet_operations structure if you want network namespace data to be managed for you automatically. This makes the current register_pernet_gen_device and register_pernet_gen_subsys routines unnecessary. For the moment I have left them as compatibility wrappers in net_namespace.h They will be removed once all of the users have been updated. Signed-off-by: Eric W. Biederman Signed-off-by: David S. 
Miller --- net/core/net_namespace.c | 188 +++++++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 86 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a42caa2b909..9679ad292da 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -43,13 +43,40 @@ static void unregister_netdevices(struct net *net, struct list_head *list) } } +static int ops_init(const struct pernet_operations *ops, struct net *net) +{ + int err; + if (ops->id && ops->size) { + void *data = kzalloc(ops->size, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = net_assign_generic(net, *ops->id, data); + if (err) { + kfree(data); + return err; + } + } + if (ops->init) + return ops->init(net); + return 0; +} + +static void ops_free(const struct pernet_operations *ops, struct net *net) +{ + if (ops->id && ops->size) { + int id = *ops->id; + kfree(net_generic(net, id)); + } +} + /* * setup_net runs the initializers for the network namespace object. */ static __net_init int setup_net(struct net *net) { /* Must be called with net_mutex held */ - struct pernet_operations *ops; + const struct pernet_operations *ops, *saved_ops; int error = 0; atomic_set(&net->count, 1); @@ -59,11 +86,9 @@ static __net_init int setup_net(struct net *net) #endif list_for_each_entry(ops, &pernet_list, list) { - if (ops->init) { - error = ops->init(net); - if (error < 0) - goto out_undo; - } + error = ops_init(ops, net); + if (error < 0) + goto out_undo; } out: return error; @@ -72,6 +97,7 @@ out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ + saved_ops = ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) { if (ops->exit) ops->exit(net); @@ -83,6 +109,9 @@ out_undo: rtnl_unlock(); } } + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_free(ops, net); rcu_barrier(); goto out; @@ -175,7 +204,7 @@ static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { - struct pernet_operations *ops; + const struct pernet_operations *ops; struct net *net, *tmp; LIST_HEAD(net_kill_list); @@ -214,6 +243,13 @@ static void cleanup_net(struct work_struct *work) rtnl_unlock(); } } + /* Free the net generic variables */ + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->size && ops->id) { + list_for_each_entry(net, &net_kill_list, cleanup_list) + ops_free(ops, net); + } + } mutex_unlock(&net_mutex); @@ -309,16 +345,16 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); #ifdef CONFIG_NET_NS -static int register_pernet_operations(struct list_head *list, - struct pernet_operations *ops) +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) { struct net *net, *undo_net; int error; list_add_tail(&ops->list, list); - if (ops->init) { + if (ops->init || (ops->id && ops->size)) { for_each_net(net) { - error = ops->init(net); + error = ops_init(ops, net); if (error) goto out_undo; } @@ -336,10 +372,18 @@ out_undo: } } undone: + if (ops->size && ops->id) { + for_each_net(undo_net) { + if (net_eq(undo_net, net)) + goto freed; + ops_free(ops, undo_net); + } + } +freed: return error; } -static void unregister_pernet_operations(struct pernet_operations *ops) +static void __unregister_pernet_operations(struct pernet_operations *ops) { struct net *net; @@ -347,27 +391,66 @@ static void 
unregister_pernet_operations(struct pernet_operations *ops) if (ops->exit) for_each_net(net) ops->exit(net); + if (ops->id && ops->size) + for_each_net(net) + ops_free(ops, net); } #else -static int register_pernet_operations(struct list_head *list, - struct pernet_operations *ops) +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) { - if (ops->init == NULL) - return 0; - return ops->init(&init_net); + int err = 0; + err = ops_init(ops, &init_net); + if (err) + ops_free(ops, &init_net); + return err; + } -static void unregister_pernet_operations(struct pernet_operations *ops) +static void __unregister_pernet_operations(struct pernet_operations *ops) { if (ops->exit) ops->exit(&init_net); + ops_free(ops, &init_net); } -#endif + +#endif /* CONFIG_NET_NS */ static DEFINE_IDA(net_generic_ids); +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + int error; + + if (ops->id) { +again: + error = ida_get_new_above(&net_generic_ids, 1, ops->id); + if (error < 0) { + if (error == -EAGAIN) { + ida_pre_get(&net_generic_ids, GFP_KERNEL); + goto again; + } + return error; + } + } + error = __register_pernet_operations(list, ops); + if (error && ops->id) + ida_remove(&net_generic_ids, *ops->id); + + return error; +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + + __unregister_pernet_operations(ops); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); +} + /** * register_pernet_subsys - register a network namespace subsystem * @ops: pernet operations structure for the subsystem @@ -414,38 +497,6 @@ void unregister_pernet_subsys(struct pernet_operations *module) } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); -int register_pernet_gen_subsys(int *id, struct pernet_operations *ops) -{ - int rv; - - mutex_lock(&net_mutex); -again: - rv = ida_get_new_above(&net_generic_ids, 1, id); - if (rv < 0) { - if (rv == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - rv = register_pernet_operations(first_device, ops); - if (rv < 0) - ida_remove(&net_generic_ids, *id); -out: - mutex_unlock(&net_mutex); - return rv; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_subsys); - -void unregister_pernet_gen_subsys(int id, struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_gen_subsys); - /** * register_pernet_device - register a network namespace device * @ops: pernet operations structure for the subsystem @@ -477,30 +528,6 @@ int register_pernet_device(struct pernet_operations *ops) } EXPORT_SYMBOL_GPL(register_pernet_device); -int register_pernet_gen_device(int *id, struct pernet_operations *ops) -{ - int error; - mutex_lock(&net_mutex); -again: - error = ida_get_new_above(&net_generic_ids, 1, id); - if (error) { - if (error == -EAGAIN) { - ida_pre_get(&net_generic_ids, GFP_KERNEL); - goto again; - } - goto out; - } - error = register_pernet_operations(&pernet_list, ops); - if (error) - ida_remove(&net_generic_ids, *id); - else if (first_device == &pernet_list) - first_device = &ops->list; -out: - mutex_unlock(&net_mutex); - return error; -} -EXPORT_SYMBOL_GPL(register_pernet_gen_device); - /** * unregister_pernet_device - unregister a network namespace netdevice * @ops: pernet operations structure to manipulate @@ -520,17 +547,6 @@ void unregister_pernet_device(struct pernet_operations *ops) } 
EXPORT_SYMBOL_GPL(unregister_pernet_device); -void unregister_pernet_gen_device(int id, struct pernet_operations *ops) -{ - mutex_lock(&net_mutex); - if (&ops->list == first_device) - first_device = first_device->next; - unregister_pernet_operations(ops); - ida_remove(&net_generic_ids, id); - mutex_unlock(&net_mutex); -} -EXPORT_SYMBOL_GPL(unregister_pernet_gen_device); - static void net_generic_release(struct rcu_head *rcu) { struct net_generic *ng; -- cgit v1.2.3 From e008b5fc8dc7f46d9904001c7a2155eb1e7d35ab Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 29 Nov 2009 22:25:30 +0000 Subject: net: Simplify default_device_exit and improve batching. - Defer dellink to net_cleanup() allowing for batching. - Fix comment. - Use for_each_netdev_safe again as dev_change_net_namespace touches at most one network device (unlike veth dellink). Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bb37ee1e090..e3e18dee0bd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5736,14 +5736,13 @@ static struct pernet_operations __net_initdata netdev_net_ops = { static void __net_exit default_device_exit(struct net *net) { - struct net_device *dev; + struct net_device *dev, *aux; /* - * Push all migratable of the network devices back to the + * Push all migratable network devices back to the * initial network namespace */ rtnl_lock(); -restart: - for_each_netdev(net, dev) { + for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -5751,11 +5750,9 @@ restart: if (dev->features & NETIF_F_NETNS_LOCAL) continue; - /* Delete virtual devices */ - if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { - dev->rtnl_link_ops->dellink(dev, NULL); - goto restart; - } + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops) + continue; /* Push remaing network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); @@ -5765,7 +5762,6 @@ restart: __func__, dev->name, err); BUG(); } - goto restart; } rtnl_unlock(); } -- cgit v1.2.3 From c81c2d95449cd218c2022ce6014c52fef1eb1f66 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 2 Dec 2009 16:49:02 +0000 Subject: skbuff: remove skb_dma_map/unmap The two functions skb_dma_map/unmap are unsafe to use as they cause problems when packets are cloned and sent to multiple devices while a HW IOMMU is enabled. Due to this it is best to remove the code so it is not used by any other network driver maintainers. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S.
Miller --- net/core/Makefile | 1 - net/core/skb_dma_map.c | 65 -------------------------------------------------- 2 files changed, 66 deletions(-) delete mode 100644 net/core/skb_dma_map.c (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 796f46eece5..08791ac3e05 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -6,7 +6,6 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-$(CONFIG_HAS_DMA) += skb_dma_map.o obj-y += dev.o ethtool.o dev_mcast.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c deleted file mode 100644 index 79687dfd695..00000000000 --- a/net/core/skb_dma_map.c +++ /dev/null @@ -1,65 +0,0 @@ -/* skb_dma_map.c: DMA mapping helpers for socket buffers. - * - * Copyright (C) David S. Miller - */ - -#include -#include -#include -#include - -int skb_dma_map(struct device *dev, struct sk_buff *skb, - enum dma_data_direction dir) -{ - struct skb_shared_info *sp = skb_shinfo(skb); - dma_addr_t map; - int i; - - map = dma_map_single(dev, skb->data, - skb_headlen(skb), dir); - if (dma_mapping_error(dev, map)) - goto out_err; - - sp->dma_head = map; - for (i = 0; i < sp->nr_frags; i++) { - skb_frag_t *fp = &sp->frags[i]; - - map = dma_map_page(dev, fp->page, fp->page_offset, - fp->size, dir); - if (dma_mapping_error(dev, map)) - goto unwind; - sp->dma_maps[i] = map; - } - - return 0; - -unwind: - while (--i >= 0) { - skb_frag_t *fp = &sp->frags[i]; - - dma_unmap_page(dev, sp->dma_maps[i], - fp->size, dir); - } - dma_unmap_single(dev, sp->dma_head, - skb_headlen(skb), dir); -out_err: - return -ENOMEM; -} -EXPORT_SYMBOL(skb_dma_map); - -void skb_dma_unmap(struct device *dev, struct sk_buff *skb, - enum dma_data_direction dir) -{ - struct skb_shared_info *sp = skb_shinfo(skb); - int i; - - dma_unmap_single(dev, sp->dma_head, - skb_headlen(skb), dir); - for (i = 0; i < sp->nr_frags; i++) { - skb_frag_t *fp = &sp->frags[i]; - - dma_unmap_page(dev, sp->dma_maps[i], - fp->size, dir); - } -} -EXPORT_SYMBOL(skb_dma_unmap); -- cgit v1.2.3 From 491deb24bf5bf7124141287aaf02c3219783ceab Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Dec 2009 01:25:54 +0000 Subject: net 02/05: fib_rules: rename ifindex/ifname/FRA_IFNAME to iifindex/iifname/FRA_IIFNAME commit 229e77eec406ad68662f18e49fda8b5d366768c5 Author: Patrick McHardy Date: Thu Dec 3 12:05:23 2009 +0100 net: fib_rules: rename ifindex/ifname/FRA_IFNAME to iifindex/iifname/FRA_IIFNAME The next patch will add oif classification, rename interface related members and attributes to reflect that they're used for iif classification. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/core/fib_rules.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bd309384f8b..8e8028cdc87 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -135,7 +135,7 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, { int ret = 0; - if (rule->ifindex && (rule->ifindex != fl->iif)) + if (rule->iifindex && (rule->iifindex != fl->iif)) goto out; if ((rule->mark ^ fl->mark) & rule->mark_mask) @@ -248,14 +248,14 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (tb[FRA_PRIORITY]) rule->pref = nla_get_u32(tb[FRA_PRIORITY]); - if (tb[FRA_IFNAME]) { + if (tb[FRA_IIFNAME]) { struct net_device *dev; - rule->ifindex = -1; - nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->ifname); + rule->iifindex = -1; + nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->iifname); if (dev) - rule->ifindex = dev->ifindex; + rule->iifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { @@ -388,8 +388,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) continue; - if (tb[FRA_IFNAME] && - nla_strcmp(tb[FRA_IFNAME], rule->ifname)) + if (tb[FRA_IIFNAME] && + nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) continue; if (tb[FRA_FWMARK] && @@ -447,7 +447,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) - + nla_total_size(IFNAMSIZ) /* FRA_IFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_FWMARK */ @@ -481,11 +481,11 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL) frh->flags |= FIB_RULE_UNRESOLVED; - if (rule->ifname[0]) { - NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname); + if (rule->iifname[0]) { + NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname); - if (rule->ifindex == -1) - frh->flags |= FIB_RULE_DEV_DETACHED; + if (rule->iifindex == -1) + frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->pref) @@ -600,9 +600,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) { - if (rule->ifindex == -1 && - strcmp(dev->name, rule->ifname) == 0) - rule->ifindex = dev->ifindex; + if (rule->iifindex == -1 && + strcmp(dev->name, rule->iifname) == 0) + rule->iifindex = dev->ifindex; } } @@ -611,8 +611,8 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) - if (rule->ifindex == dev->ifindex) - rule->ifindex = -1; + if (rule->iifindex == dev->ifindex) + rule->iifindex = -1; } -- cgit v1.2.3 From 1b038a5e60c7812f19818e8a5df96d029e49c38f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Dec 2009 01:25:56 +0000 Subject: net 03/05: fib_rules: add oif classification commit 68144d350f4f6c348659c825cde6a82b34c27a91 Author: Patrick McHardy Date: Thu Dec 3 12:05:25 2009 +0100 net: fib_rules: add oif classification Support routing table lookup based on the flow's oif. This is useful to classify packets originating from sockets bound to interfaces differently. The route cache already includes the oif and needs no changes. 
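As a usage note on the new match (an illustration, not part of the change): the oif such a rule sees is typically set by binding a socket to a device, e.g. with SO_BINDTODEVICE, which on kernels of this era requires CAP_NET_RAW. A minimal user-space sketch, with the interface name assumed to be eth0:

#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	const char ifname[] = "eth0";	/* assumed interface name */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Flows from this socket now carry eth0's ifindex as their oif,
	 * which an "oif eth0" fib rule can match. */
	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
		       ifname, sizeof(ifname)) < 0) {
		perror("setsockopt(SO_BINDTODEVICE)");
		return 1;
	}
	/* ... connect()/sendto() as usual; route lookups consult the oif rule ... */
	return 0;
}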
Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/fib_rules.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 8e8028cdc87..d1a70ad4b54 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -138,6 +138,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->iifindex && (rule->iifindex != fl->iif)) goto out; + if (rule->oifindex && (rule->oifindex != fl->oif)) + goto out; + if ((rule->mark ^ fl->mark) & rule->mark_mask) goto out; @@ -258,6 +261,16 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->iifindex = dev->ifindex; } + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + rule->oifindex = -1; + nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->oifname); + if (dev) + rule->oifindex = dev->ifindex; + } + if (tb[FRA_FWMARK]) { rule->mark = nla_get_u32(tb[FRA_FWMARK]); if (rule->mark) @@ -392,6 +405,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) continue; + if (tb[FRA_OIFNAME] && + nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) + continue; + if (tb[FRA_FWMARK] && (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) continue; @@ -448,6 +465,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_FWMARK */ @@ -488,6 +506,13 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->flags |= FIB_RULE_IIF_DETACHED; } + if (rule->oifname[0]) { + NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname); + + if (rule->oifindex == -1) + frh->flags |= FIB_RULE_OIF_DETACHED; + } + if (rule->pref) NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref); @@ -603,6 +628,9 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; + if (rule->oifindex == -1 && + strcmp(dev->name, rule->oifname) == 0) + rule->oifindex = dev->ifindex; } } @@ -610,9 +638,12 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; - list_for_each_entry(rule, rules, list) + list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; + if (rule->oifindex == dev->ifindex) + rule->oifindex = -1; + } } -- cgit v1.2.3 From 5adef1809147a9c39119ffd5a13a1ca4fe23a411 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Dec 2009 01:25:57 +0000 Subject: net 04/05: fib_rules: allow to delete local rule commit d124356ce314fff22a047ea334379d5105b2d834 Author: Patrick McHardy Date: Thu Dec 3 12:16:35 2009 +0100 net: fib_rules: allow to delete local rule Allow to delete the local rule and recreate it with a higher priority. This can be used to force packets with a local destination out on the wire instead of routing them to loopback. Additionally this patch allows to recreate rules with a priority of 0. 
Combined with the previous patch to allow oif classification, a socket can be bound to the desired interface and packets routed to the wire like this: # move local rule to lower priority ip rule add pref 1000 lookup local ip rule del pref 0 # route packets of sockets bound to eth0 to the wire independent # of the destination address ip rule add pref 100 oif eth0 lookup 100 ip route add default dev eth0 table 100 Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index d1a70ad4b54..ef0e7d9e664 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -287,7 +287,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); - if (!rule->pref && ops->default_pref) + if (!tb[FRA_PRIORITY] && ops->default_pref) rule->pref = ops->default_pref(ops); err = -EINVAL; -- cgit v1.2.3 From 72ad937abd0a43b7cf2c557ba1f2ec75e608c516 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 02:29:03 +0000 Subject: net: Add support for batching network namespace cleanups - Add exit_list to struct net to support building lists of network namespaces to clean up. - Add exit_batch to pernet_operations to allow running operations only once during a network namespace exit, instead of once per network namespace. - Factor out ops_exit_list and ops_exit_free so the logic for cleaning up a network namespace does not need to be duplicated. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 122 +++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 61 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9679ad292da..6c7f6e04dbb 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -70,6 +70,36 @@ static void ops_free(const struct pernet_operations *ops, struct net *net) } } +static void ops_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->exit(net); + } + if (&ops->list == first_device) { + LIST_HEAD(dev_kill_list); + rtnl_lock(); + list_for_each_entry(net, net_exit_list, exit_list) + unregister_netdevices(net, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } + if (ops->exit_batch) + ops->exit_batch(net_exit_list); +} + +static void ops_free_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->size && ops->id) { + list_for_each_entry(net, net_exit_list, exit_list) + ops_free(ops, net); + } +} + /* * setup_net runs the initializers for the network namespace object. */ @@ -78,6 +108,7 @@ static __net_init int setup_net(struct net *net) /* Must be called with net_mutex held */ const struct pernet_operations *ops, *saved_ops; int error = 0; + LIST_HEAD(net_exit_list); atomic_set(&net->count, 1); @@ -97,21 +128,14 @@ out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail.
*/ + list_add(&net->exit_list, &net_exit_list); saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit) - ops->exit(net); - if (&ops->list == first_device) { - LIST_HEAD(dev_kill_list); - rtnl_lock(); - unregister_netdevices(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - } + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free(ops, net); + ops_free_list(ops, &net_exit_list); rcu_barrier(); goto out; @@ -207,6 +231,7 @@ static void cleanup_net(struct work_struct *work) const struct pernet_operations *ops; struct net *net, *tmp; LIST_HEAD(net_kill_list); + LIST_HEAD(net_exit_list); /* Atomically snapshot the list of namespaces to cleanup */ spin_lock_irq(&cleanup_list_lock); @@ -217,8 +242,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) + list_for_each_entry(net, &net_kill_list, cleanup_list) { list_del_rcu(&net->list); + list_add_tail(&net->exit_list, &net_exit_list); + } rtnl_unlock(); /* @@ -229,27 +256,12 @@ static void cleanup_net(struct work_struct *work) synchronize_rcu(); /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit) { - list_for_each_entry(net, &net_kill_list, cleanup_list) - ops->exit(net); - } - if (&ops->list == first_device) { - LIST_HEAD(dev_kill_list); - rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) - unregister_netdevices(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - } + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->size && ops->id) { - list_for_each_entry(net, &net_kill_list, cleanup_list) - ops_free(ops, net); - } - } + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); mutex_unlock(&net_mutex); @@ -259,8 +271,8 @@ static void cleanup_net(struct work_struct *work) rcu_barrier(); /* Finally it is safe to free my network namespace structure */ - list_for_each_entry_safe(net, tmp, &net_kill_list, cleanup_list) { - list_del_init(&net->cleanup_list); + list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + list_del_init(&net->exit_list); net_free(net); } } @@ -348,8 +360,9 @@ pure_initcall(net_ns_init); static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - struct net *net, *undo_net; + struct net *net; int error; + LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { @@ -357,6 +370,7 @@ static int __register_pernet_operations(struct list_head *list, error = ops_init(ops, net); if (error) goto out_undo; + list_add_tail(&net->exit_list, &net_exit_list); } } return 0; @@ -364,36 +378,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - if (ops->exit) { - for_each_net(undo_net) { - if (net_eq(undo_net, net)) - goto undone; - ops->exit(undo_net); - } - } -undone: - if (ops->size && ops->id) { - for_each_net(undo_net) { - if (net_eq(undo_net, net)) - goto freed; - ops_free(ops, undo_net); - } - } 
-freed: + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { struct net *net; + LIST_HEAD(net_exit_list); list_del(&ops->list); - if (ops->exit) - for_each_net(net) - ops->exit(net); - if (ops->id && ops->size) - for_each_net(net) - ops_free(ops, net); + for_each_net(net) + list_add_tail(&net->exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); } #else @@ -411,9 +410,10 @@ static int __register_pernet_operations(struct list_head *list, static void __unregister_pernet_operations(struct pernet_operations *ops) { - if (ops->exit) - ops->exit(&init_net); - ops_free(ops, &init_net); + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); } #endif /* CONFIG_NET_NS */ -- cgit v1.2.3 From 04dc7f6be3a7b308f8545bb45772c9fb75f71aca Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 02:29:04 +0000 Subject: net: Move network device exit batching Move network device exit batching from a special case in net_namespace.c to using common mechanisms in dev.c Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 25 +++++++++++++++++++++++++ net/core/net_namespace.c | 24 ------------------------ 2 files changed, 25 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e3e18dee0bd..0913a08a87d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5766,8 +5766,33 @@ static void __net_exit default_device_exit(struct net *net) rtnl_unlock(); } +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registeration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); +} + static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, + .exit_batch = default_device_exit_batch, }; /* diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6c7f6e04dbb..4026a4cff93 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -8,10 +8,8 @@ #include #include #include -#include #include #include -#include /* * Our network namespace constructor/destructor lists @@ -29,20 +27,6 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ -static void unregister_netdevices(struct net *net, struct list_head *list) -{ - struct net_device *dev; - /* At exit all network devices most be removed from a network - * namespace. Do this in the reverse order of registeration. 
- */ for_each_netdev_reverse(net, dev) { - if (dev->rtnl_link_ops) - dev->rtnl_link_ops->dellink(dev, list); - else - unregister_netdevice_queue(dev, list); - } -} - static int ops_init(const struct pernet_operations *ops, struct net *net) { int err; @@ -78,14 +62,6 @@ static void ops_exit_list(const struct pernet_operations *ops, list_for_each_entry(net, net_exit_list, exit_list) ops->exit(net); } - if (&ops->list == first_device) { - LIST_HEAD(dev_kill_list); - rtnl_lock(); - list_for_each_entry(net, net_exit_list, exit_list) - unregister_netdevices(net, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } if (ops->exit_batch) ops->exit_batch(net_exit_list); } -- cgit v1.2.3 From 3a765edadb28cc736d185f67d1ba6bedcc85f4b9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 02:29:06 +0000 Subject: netns: Add an explicit rcu_barrier to unregister_pernet_{device|subsys} This allows namespace exit methods to batch work that requires an rcu barrier using call_rcu, without having to treat the unregister_pernet_operations cases specially. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/net_namespace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4026a4cff93..bd8c4712ea2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -413,8 +413,11 @@ again: } } error = __register_pernet_operations(list, ops); - if (error && ops->id) - ida_remove(&net_generic_ids, *ops->id); + if (error) { + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); + } return error; } @@ -423,6 +426,7 @@ static void unregister_pernet_operations(struct pernet_operations *ops) { __unregister_pernet_operations(ops); + rcu_barrier(); if (ops->id) ida_remove(&net_generic_ids, *ops->id); } -- cgit v1.2.3 From e9c5158ac26affd5d8ce006521bdfb7148090e18 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 12:22:55 -0800 Subject: net: Allow fib_rule_unregister to batch Refactor the code so fib_rules_register always takes a template instead of the actual fib_rules_ops structure that will be used. This is required for network namespace support; 2 out of the 3 callers already do this, it allows the error handling to be made common, and it allows fib_rules_unregister to free the template for the caller. Modify fib_rules_unregister to use call_rcu instead of synchronize_rcu to allow multiple namespaces to be cleaned up in the same rcu grace period. Signed-off-by: Eric W. Biederman Signed-off-by: David S.
Miller --- net/core/fib_rules.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ef0e7d9e664..02a3b2c69c1 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -72,7 +72,7 @@ static void flush_route_cache(struct fib_rules_ops *ops) ops->flush_cache(ops); } -int fib_rules_register(struct fib_rules_ops *ops) +static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; @@ -102,6 +102,28 @@ errout: return err; } +struct fib_rules_ops * +fib_rules_register(struct fib_rules_ops *tmpl, struct net *net) +{ + struct fib_rules_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof (*ops), GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + err = __fib_rules_register(ops); + if (err) { + kfree(ops); + ops = ERR_PTR(err); + } + + return ops; +} + EXPORT_SYMBOL_GPL(fib_rules_register); void fib_rules_cleanup_ops(struct fib_rules_ops *ops) @@ -115,6 +137,15 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops) } EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops); +static void fib_rules_put_rcu(struct rcu_head *head) +{ + struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); + struct net *net = ops->fro_net; + + release_net(net); + kfree(ops); +} + void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; @@ -124,8 +155,7 @@ void fib_rules_unregister(struct fib_rules_ops *ops) fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); - synchronize_rcu(); - release_net(net); + call_rcu(&ops->rcu, fib_rules_put_rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); -- cgit v1.2.3 From fc4a7489663250360cd40d5adf06a08d1c5d54df Mon Sep 17 00:00:00 2001 From: Patrick Mullaney Date: Thu, 3 Dec 2009 15:59:22 -0800 Subject: netdevice: provide common routine for macvlan and vlan operstate management Provide common routine for the transition of operational state for a leaf device during a root device transition. Signed-off-by: Patrick Mullaney Acked-by: Arnd Bergmann Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/core/dev.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0913a08a87d..c36a17aafcf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4900,6 +4900,33 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +/** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). + */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + /** * register_netdevice - register a network device * @dev: device to register -- cgit v1.2.3
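A rough sketch of how a stacked driver in the vlan/macvlan mold might use the new helper from its netdevice notifier when the lower device changes state; the private structure, the global upper-device pointer and the chosen events are illustrative assumptions, not taken from any in-tree driver, and the notifier again receives the net_device pointer directly as was the convention at this point in the tree:

#include <linux/netdevice.h>
#include <linux/notifier.h>

struct stacked_priv {
	struct net_device *lower;	/* the device we are stacked on top of */
};

static struct net_device *stacked_upper_dev;	/* assumed to be created elsewhere */

static int stacked_device_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;
	struct stacked_priv *priv;

	if (!stacked_upper_dev)
		return NOTIFY_DONE;

	priv = netdev_priv(stacked_upper_dev);
	if (priv->lower != lower)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
	case NETDEV_DOWN:
	case NETDEV_CHANGE:
		/* Mirror the lower device's carrier and dormant state upward. */
		netif_stacked_transfer_operstate(lower, stacked_upper_dev);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block stacked_notifier = {
	.notifier_call = stacked_device_event,
};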