From 5b057c6b1a25d57edf2b4d1e956e50936480a9ff Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 23 Jun 2006 02:06:41 -0700 Subject: [NET]: Avoid allocating skb in skb_pad First of all it is unnecessary to allocate a new skb in skb_pad since the existing one is not shared. More importantly, our hard_start_xmit interface does not allow a new skb to be allocated since that breaks requeueing. This patch uses pskb_expand_head to expand the existing skb and linearize it if needed. Actually, someone should sift through every instance of skb_pad on a non-linear skb as they do not fit the reasons why this was originally created. Incidentally, this fixes a minor bug when the skb is cloned (tcpdump, TCP, etc.). As it is skb_pad will simply write over a cloned skb. Because of the position of the write it is unlikely to cause problems but still it's best if we don't do it. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb7210f4005..fe63d4efbd4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -781,24 +781,40 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return NULL in out of memory cases. + * May return error in out of memory cases. The skb is freed on error. */ -struct sk_buff *skb_pad(struct sk_buff *skb, int pad) +int skb_pad(struct sk_buff *skb, int pad) { - struct sk_buff *nskb; + int err; + int ntail; /* If the skbuff is non linear tailroom is always zero.. */ - if (skb_tailroom(skb) >= pad) { + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { memset(skb->data+skb->len, 0, pad); - return skb; + return 0; } - - nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); + + ntail = skb->data_len + pad - (skb->end - skb->tail); + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) + goto free_skb; + } + + /* FIXME: The use of this function with non-linear skb's really needs + * to be audited. + */ + err = skb_linearize(skb); + if (unlikely(err)) + goto free_skb; + + memset(skb->data + skb->len, 0, pad); + return 0; + +free_skb: kfree_skb(skb); - if (nskb) - memset(nskb->data+nskb->len, 0, pad); - return nskb; + return err; } /* Trims skb to length len. It can change skb pointers. -- cgit v1.2.3 From d4828d85d188dc70ed172802e798d3978bb6e29e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:28:18 -0700 Subject: [NET]: Prevent transmission after dev_deactivate The dev_deactivate function has bit-rotted since the introduction of lockless drivers. In particular, the spin_unlock_wait call at the end has no effect on the xmit routine of lockless drivers. With a little bit of work, we can make it much more useful by providing the guarantee that when it returns, no more calls to the xmit routine of the underlying driver will be made. The idea is simple. There are two entry points in to the xmit routine. The first comes from dev_queue_xmit. That one is easily stopped by using synchronize_rcu. This works because we set the qdisc to noop_qdisc before the synchronize_rcu call. That in turn causes all subsequent packets sent to dev_queue_xmit to be dropped. The synchronize_rcu call also ensures all outstanding calls leave their critical section. The other entry point is from qdisc_run. Since we now have a bit that indicates whether it's running, all we have to do is to wait until the bit is off. I've removed the loop to wait for __LINK_STATE_SCHED to clear. This is useless because netif_wake_queue can cause it to be set again. It is also harmless because we've disarmed qdisc_run. I've also removed the spin_unlock_wait on xmit_lock because its only purpose of making sure that all outstanding xmit_lock holders have exited is also given by dev_watchdog_down. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ab39fe17cb5..29e3888102b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1295,7 +1295,7 @@ int dev_queue_xmit(struct sk_buff *skb) /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ - local_bh_disable(); + rcu_read_lock_bh(); /* Updates of qdisc are serialized by queue_lock. * The struct Qdisc which is pointed to by qdisc is now a @@ -1369,13 +1369,13 @@ int dev_queue_xmit(struct sk_buff *skb) } rc = -ENETDOWN; - local_bh_enable(); + rcu_read_unlock_bh(); out_kfree_skb: kfree_skb(skb); return rc; out: - local_bh_enable(); + rcu_read_unlock_bh(); return rc; } -- cgit v1.2.3 From 7967168cefdbc63bf332d6b1548eca7cd65ebbcc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:40:14 -0700 Subject: [NET]: Merge TSO/UFO fields in sk_buff Having separate fields in sk_buff for TSO/UFO (tso_size/ufo_size) is not going to scale if we add any more segmentation methods (e.g., DCCP). So let's merge them. They were used to tell the protocol of a packet. This function has been subsumed by the new gso_type field. This is essentially a set of netdev feature bits (shifted by 16 bits) that are required to process a specific skb. As such it's easy to tell whether a given device can process a GSO skb: you just have to and the gso_type field and the netdev's features field. I've made gso_type a conjunction. The idea is that you have a base type (e.g., SKB_GSO_TCPV4) that can be modified further to support new features. For example, if we add a hardware TSO type that supports ECN, they would declare NETIF_F_TSO | NETIF_F_TSO_ECN. All TSO packets with CWR set would have a gso_type of SKB_GSO_TCPV4 | SKB_GSO_TCPV4_ECN while all other TSO packets would be SKB_GSO_TCPV4. This means that only the CWR packets need to be emulated in software. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe63d4efbd4..368d98578c1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -172,9 +172,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); atomic_set(&shinfo->dataref, 1); shinfo->nr_frags = 0; - shinfo->tso_size = 0; - shinfo->tso_segs = 0; - shinfo->ufo_size = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; shinfo->frag_list = NULL; @@ -238,8 +238,9 @@ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->frag_list = NULL; out: return skb; @@ -528,8 +529,9 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif skb_copy_secmark(new, old); atomic_set(&new->users, 1); - skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; - skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } /** -- cgit v1.2.3 From f6a78bfcb141f963187464bac838d46a81c3882a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 02:57:17 -0700 Subject: [NET]: Add generic segmentation offload This patch adds the infrastructure for generic segmentation offload. The idea is to tap into the potential savings of TSO without hardware support by postponing the allocation of segmented skb's until just before the entry point into the NIC driver. The same structure can be used to support software IPv6 TSO, as well as UFO and segmentation offload for other relevant protocols, e.g., DCCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 122 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 29e3888102b..d293e0f90a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -116,6 +116,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -1048,7 +1049,7 @@ static inline void net_timestamp(struct sk_buff *skb) * taps currently in use. */ -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; @@ -1186,6 +1187,40 @@ out: return ret; } +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather is supported on the target. + * + * This function segments the given skb and returns a list of segments. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + int type = skb->protocol; + + BUG_ON(skb_shinfo(skb)->frag_list); + BUG_ON(skb->ip_summed != CHECKSUM_HW); + + skb->mac.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->data; + __skb_pull(skb, skb->mac_len); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + segs = ptype->gso_segment(skb, sg); + break; + } + } + rcu_read_unlock(); + + return segs; +} + +EXPORT_SYMBOL(skb_gso_segment); + /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev) @@ -1222,6 +1257,86 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * + * This function segments the given skb and stores the list of segments + * in skb->next. + */ +static int dev_gso_segment(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct sk_buff *segs; + + segs = skb_gso_segment(skb, dev->features & NETIF_F_SG && + !illegal_highdma(dev, skb)); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; + + return 0; +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + if (likely(!skb->next)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + if (!netif_needs_gso(dev, skb)) + return dev->hard_start_xmit(skb, dev); + + if (unlikely(dev_gso_segment(skb))) + goto out_kfree_skb; + } + + do { + struct sk_buff *nskb = skb->next; + int rc; + + skb->next = nskb->next; + nskb->next = NULL; + rc = dev->hard_start_xmit(nskb, dev); + if (unlikely(rc)) { + skb->next = nskb; + return rc; + } + } while (skb->next); + + skb->destructor = DEV_GSO_CB(skb)->destructor; + +out_kfree_skb: + kfree_skb(skb); + return 0; +} + #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ @@ -1266,6 +1381,10 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; + /* GSO will handle the following emulations directly. */ + if (netif_needs_gso(dev, skb)) + goto gso; + if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && __skb_linearize(skb)) @@ -1290,6 +1409,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_checksum_help(skb, 0)) goto out_kfree_skb; +gso: spin_lock_prefetch(&dev->queue_lock); /* Disable soft irqs for various locks below. Also @@ -1346,11 +1466,8 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - rc = 0; - if (!dev->hard_start_xmit(skb, dev)) { + if (!dev_hard_start_xmit(skb, dev)) { HARD_TX_UNLOCK(dev); goto out; } -- cgit v1.2.3 From f4c50d990dcf11a296679dc05de3873783236711 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:02:40 -0700 Subject: [NET]: Add software TSOv4 This patch adds the GSO implementation for IPv4 TCP. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 368d98578c1..8e5044ba3ab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1842,6 +1842,132 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) EXPORT_SYMBOL_GPL(skb_pull_rcsum); +/** + * skb_segment - Perform protocol segmentation on skb. + * @skb: buffer to segment + * @sg: whether scatter-gather can be used for generated segments + * + * This function performs segmentation on the given skb. It returns + * the segment at the given position. It returns NULL if there are + * no more segments to generate, or when an error is encountered. + */ +struct sk_buff *skb_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int doffset = skb->data - skb->mac.raw; + unsigned int offset = doffset; + unsigned int headroom; + unsigned int len; + int nfrags = skb_shinfo(skb)->nr_frags; + int err = -ENOMEM; + int i = 0; + int pos; + + __skb_push(skb, doffset); + headroom = skb_headroom(skb); + pos = skb_headlen(skb); + + do { + struct sk_buff *nskb; + skb_frag_t *frag; + int hsize, nsize; + int k; + int size; + + len = skb->len - offset; + if (len > mss) + len = mss; + + hsize = skb_headlen(skb) - offset; + if (hsize < 0) + hsize = 0; + nsize = hsize + doffset; + if (nsize > len + doffset || !sg) + nsize = len + doffset; + + nskb = alloc_skb(nsize + headroom, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + nskb->dev = skb->dev; + nskb->priority = skb->priority; + nskb->protocol = skb->protocol; + nskb->dst = dst_clone(skb->dst); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + nskb->pkt_type = skb->pkt_type; + nskb->mac_len = skb->mac_len; + + skb_reserve(nskb, headroom); + nskb->mac.raw = nskb->data; + nskb->nh.raw = nskb->data + skb->mac_len; + nskb->h.raw = nskb->nh.raw + (skb->h.raw - skb->nh.raw); + memcpy(skb_put(nskb, doffset), skb->data, doffset); + + if (!sg) { + nskb->csum = skb_copy_and_csum_bits(skb, offset, + skb_put(nskb, len), + len, 0); + continue; + } + + frag = skb_shinfo(nskb)->frags; + k = 0; + + nskb->ip_summed = CHECKSUM_HW; + nskb->csum = skb->csum; + memcpy(skb_put(nskb, hsize), skb->data + offset, hsize); + + while (pos < offset + len) { + BUG_ON(i >= nfrags); + + *frag = skb_shinfo(skb)->frags[i]; + get_page(frag->page); + size = frag->size; + + if (pos < offset) { + frag->page_offset += offset - pos; + frag->size -= offset - pos; + } + + k++; + + if (pos + size <= offset + len) { + i++; + pos += size; + } else { + frag->size -= pos + size - (offset + len); + break; + } + + frag++; + } + + skb_shinfo(nskb)->nr_frags = k; + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + } while ((offset += len) < skb->len); + + return segs; + +err: + while ((skb = segs)) { + segs = skb->next; + kfree(skb); + } + return ERR_PTR(err); +} + +EXPORT_SYMBOL_GPL(skb_segment); + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", -- cgit v1.2.3 From 37c3185a02d4b85fbe134bf5204535405dd2c957 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 22 Jun 2006 03:07:29 -0700 Subject: [NET]: Added GSO toggle This patch adds a generic segmentation offload toggle that can be turned on/off for each net device. For now it only supports in TCPv4. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/ethtool.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 33ce7ed6afc..27ce1683caf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -614,6 +614,29 @@ static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_ufo(dev, edata.data); } +static int ethtool_get_gso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GGSO }; + + edata.data = dev->features & NETIF_F_GSO; + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_gso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + if (edata.data) + dev->features |= NETIF_F_GSO; + else + dev->features &= ~NETIF_F_GSO; + return 0; +} + static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; @@ -905,6 +928,12 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_SUFO: rc = ethtool_set_ufo(dev, useraddr); break; + case ETHTOOL_GGSO: + rc = ethtool_get_gso(dev, useraddr); + break; + case ETHTOOL_SGSO: + rc = ethtool_set_gso(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From f4b8ea7849544114e9d3d682df4d400180854677 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 22 Jun 2006 16:00:11 -0700 Subject: [NET]: fix net-core kernel-doc Warning(/var/linsrc/linux-2617-g4//include/linux/skbuff.h:304): No description found for parameter 'dma_cookie' Warning(/var/linsrc/linux-2617-g4//include/net/sock.h:1274): No description found for parameter 'copied_early' Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'chan' Warning(/var/linsrc/linux-2617-g4//net/core/dev.c:3309): No description found for parameter 'event' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d293e0f90a0..9b8f0f22c81 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3418,8 +3418,8 @@ static void net_dma_rebalance(void) /** * netdev_dma_event - event callback for the net_dma_client * @client: should always be net_dma_client - * @chan: - * @event: + * @chan: DMA channel for the event + * @event: event type */ static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, enum dma_event event) -- cgit v1.2.3