From f3781d2e89f12dd5afa046dc56032af6e39bd116 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 14 Jul 2008 23:48:44 -0700 Subject: RDMA: Remove subversion $Id tags They don't get updated by git and so they're worse than useless. Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 -- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 -- drivers/infiniband/ulp/ipoib/ipoib_fs.c | 2 -- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 2 -- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 -- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 -- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 2 -- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 2 -- 8 files changed, 16 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index ca126fc2b85..0dcbab3203c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib.h 1358 2004-12-17 22:00:11Z roland $ */ #ifndef _IPOIB_H diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 97e67d36378..91c95929991 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -28,8 +28,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id$ */ #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 8b882bbd1d0..961c585da21 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -28,8 +28,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * - * $Id: ipoib_fs.c 1389 2004-12-27 22:56:47Z roland $ */ #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index f429bce24c2..eca8518d79a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -31,8 +31,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $ */ #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 2442090ac8d..f217b1edd0a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $ */ #include "ipoib.h" diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 3f663fb852c..4a6538b9301 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -30,8 +30,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_multicast.c 1362 2004-12-18 15:56:29Z roland $ */ #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 8766d29ce3b..810790ae753 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -29,8 +29,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * - * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $ */ #include "ipoib.h" diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 1cdb5cfb0ff..b08eb56196d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -28,8 +28,6 @@ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * $Id: ipoib_vlan.c 1349 2004-12-16 21:09:43Z roland $ */ #include -- cgit v1.2.3 From f89271da32bc1a636cf4eb078e615930886cd013 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:44 -0700 Subject: IPoIB: Copy small received SKBs in connected mode The connected mode implementation in the IPoIB driver has a large overhead in the way SKBs are handled in the receive flow. It usually allocates an SKB as big as the one used for the currently received SKB and moves unused fragments from the old SKB to the new one. This involves a loop on all the remaining fragments and incurs overhead on the CPU. This patch, for small SKBs, allocates an SKB just large enough to contain the received data and copies to it the data from the received SKB. The newly allocated SKB is passed to the stack and the old SKB is reposted. When running netperf, UDP small messages, without this patch I get: UDP UNIDIRECTIONAL SEND TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 14.4.3.178 (14.4.3.178) port 0 AF_INET Socket Message Elapsed Messages Size Size Time Okay Errors Throughput bytes bytes secs # # 10^6bits/sec 114688 128 10.00 5142034 0 526.31 114688 10.00 1130489 115.71 With this patch I get both send and receive at ~315 mbps. The reason that send performance actually slows down is as follows: When using this patch, the overhead of the CPU for handling RX packets is dramatically reduced. 
As a result, we do not experience RNR NAK messages from the receiver which cause the connection to be closed and reopened again; when the patch is not used, the receiver cannot handle the packets fast enough so there is less time to post new buffers and hence the mentioned RNR NAKs. So what happens is that the application *thinks* it posted a certain number of packets for transmission but these packets are flushed and do not really get transmitted. Since the connection gets opened and closed many times, each time netperf gets the CPU time that otherwise would have been given to IPoIB to actually transmit the packets. This can be verified when looking at the port counters -- the output of ifconfig and the output of netperf (this is for the case without the patch): tx packets ========== port counter: 1,543,996 ifconfig: 1,581,426 netperf: 5,142,034 rx packets ========== netperf 1,130,489 Signed-off-by: Eli Cohen --- drivers/infiniband/ulp/ipoib/ipoib.h | 1 + drivers/infiniband/ulp/ipoib/ipoib_cm.c | 19 +++++++++++++++++++ drivers/infiniband/ulp/ipoib/ipoib_main.c | 6 ++++++ 3 files changed, 26 insertions(+) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 0dcbab3203c..8754b364f22 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -95,6 +95,7 @@ enum { IPOIB_MCAST_FLAG_ATTACHED = 3, MAX_SEND_CQE = 16, + IPOIB_CM_COPYBREAK = 256, }; #define IPOIB_OP_RECV (1ul << 31) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 91c95929991..6223fc39af7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -523,6 +523,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) u64 mapping[IPOIB_CM_RX_SG]; int frags; int has_srq; + struct sk_buff *small_skb; ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", wr_id, 
wc->status); @@ -577,6 +578,23 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } } + if (wc->byte_len < IPOIB_CM_COPYBREAK) { + int dlen = wc->byte_len; + + small_skb = dev_alloc_skb(dlen + 12); + if (small_skb) { + skb_reserve(small_skb, 12); + ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_copy_from_linear_data(skb, small_skb->data, dlen); + ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0], + dlen, DMA_FROM_DEVICE); + skb_put(small_skb, dlen); + skb = small_skb; + goto copied; + } + } + frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; @@ -599,6 +617,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb); +copied: skb->protocol = ((struct ipoib_header *) skb->data)->proto; skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index f217b1edd0a..bfe1dbf9920 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1302,6 +1302,12 @@ static int __init ipoib_init_module(void) ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif + /* + * When copying small received packets, we only copy from the + * linear data part of the SKB, so we rely on this condition. 
+ */ + BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); + ret = ipoib_register_debugfs(); if (ret) return ret; -- cgit v1.2.3 From a7d834c4bc6be73e8f83eaa5072fac3c5549f7f2 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 14 Jul 2008 23:48:47 -0700 Subject: IPoIB/cm: Fix racy use of receive WR/SGL in ipoib_cm_post_receive_nonsrq() For devices that don't support SRQs, ipoib_cm_post_receive_nonsrq() is called from both ipoib_cm_handle_rx_wc() and ipoib_cm_nonsrq_init_rx(), and these two callers are not synchronized against each other. However, ipoib_cm_post_receive_nonsrq() always reuses the same receive work request and scatter list structures, so multiple callers can end up stepping on each other, which leads to posting garbled work requests. Fix this by having the caller pass in the ib_recv_wr and ib_sge structures to use, and allocating new local structures in ipoib_cm_nonsrq_init_rx(). Based on a patch by Pradeep Satyanarayana and David Wilder, with debugging help from Hoang-Nam Nguyen. 
Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 63 ++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 6223fc39af7..37bf67b2a26 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -111,18 +111,20 @@ static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) } static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, - struct ipoib_cm_rx *rx, int id) + struct ipoib_cm_rx *rx, + struct ib_recv_wr *wr, + struct ib_sge *sge, int id) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_recv_wr *bad_wr; int i, ret; - priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; + wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; for (i = 0; i < IPOIB_CM_RX_SG; ++i) - priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i]; + sge[i].addr = rx->rx_ring[id].mapping[i]; - ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr); + ret = ib_post_recv(rx->qp, wr, &bad_wr); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, @@ -320,10 +322,33 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev, return 0; } +static void ipoib_cm_init_rx_wr(struct net_device *dev, + struct ib_recv_wr *wr, + struct ib_sge *sge) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < priv->cm.num_frags; ++i) + sge[i].lkey = priv->mr->lkey; + + sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < priv->cm.num_frags; ++i) + sge[i].length = PAGE_SIZE; + + wr->next = NULL; + wr->sg_list = priv->cm.rx_sge; + wr->num_sge = priv->cm.num_frags; +} + static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx) { struct ipoib_dev_priv *priv = netdev_priv(dev); + struct { + struct ib_recv_wr 
wr; + struct ib_sge sge[IPOIB_CM_RX_SG]; + } *t; int ret; int i; @@ -331,6 +356,14 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i if (!rx->rx_ring) return -ENOMEM; + t = kmalloc(sizeof *t, GFP_KERNEL); + if (!t) { + ret = -ENOMEM; + goto err_free; + } + + ipoib_cm_init_rx_wr(dev, &t->wr, t->sge); + spin_lock_irq(&priv->lock); if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) { @@ -349,8 +382,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; - } - ret = ipoib_cm_post_receive_nonsrq(dev, rx, i); + } + ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i); if (ret) { ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq " "failed for buf %d\n", i); @@ -361,6 +394,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i rx->recv_count = ipoib_recvq_size; + kfree(t); + return 0; err_count: @@ -369,6 +404,7 @@ err_count: spin_unlock_irq(&priv->lock); err_free: + kfree(t); ipoib_cm_free_rx_ring(dev, rx->rx_ring); return ret; @@ -637,7 +673,10 @@ repost: ipoib_warn(priv, "ipoib_cm_post_receive_srq failed " "for buf %d\n", wr_id); } else { - if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) { + if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, + &priv->cm.rx_wr, + priv->cm.rx_sge, + wr_id))) { --p->recv_count; ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed " "for buf %d\n", wr_id); @@ -1502,15 +1541,7 @@ int ipoib_cm_dev_init(struct net_device *dev) priv->cm.num_frags = IPOIB_CM_RX_SG; } - for (i = 0; i < priv->cm.num_frags; ++i) - priv->cm.rx_sge[i].lkey = priv->mr->lkey; - - priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE; - for (i = 1; i < priv->cm.num_frags; ++i) - priv->cm.rx_sge[i].length = PAGE_SIZE; - priv->cm.rx_wr.next = NULL; - priv->cm.rx_wr.sg_list = priv->cm.rx_sge; - priv->cm.rx_wr.num_sge = priv->cm.num_frags; + ipoib_cm_init_rx_wr(dev, 
&priv->cm.rx_wr, priv->cm.rx_sge); if (ipoib_cm_has_srq(dev)) { for (i = 0; i < ipoib_recvq_size; ++i) { -- cgit v1.2.3 From 12406734051a26e9fe4c8568e931dfddbb72d431 Mon Sep 17 00:00:00 2001 From: Ron Livne Date: Mon, 14 Jul 2008 23:48:48 -0700 Subject: IPoIB: Use multicast loopback blocking if available Set IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK for IPoIB's UD QPs if supported by the underlying device. This creates an improvement of up to 39% in bandwidth when sending multicast packets with IPoIB, and an improvement of 12% in CPU usage. Signed-off-by: Ron Livne Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 810790ae753..7b8fa36f509 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -199,7 +199,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) init_attr.recv_cq = priv->recv_cq; if (priv->hca_caps & IB_DEVICE_UD_TSO) - init_attr.create_flags = IB_QP_CREATE_IPOIB_UD_LSO; + init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + + if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; if (dev->features & NETIF_F_SG) init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1; -- cgit v1.2.3 From af40da894e96d5c826d38be3ea53ee00d9de0367 Mon Sep 17 00:00:00 2001 From: Vladimir Sokolovsky Date: Mon, 14 Jul 2008 23:48:48 -0700 Subject: IPoIB: add LRO support Add an "lro" module parameter to enable LRO and an "lro_max_aggr" module parameter to set the max number of packets to be aggregated. Make LRO controllable and LRO statistics accessible through ethtool. 
Signed-off-by: Vladimir Sokolovsky Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/Kconfig | 1 + drivers/infiniband/ulp/ipoib/ipoib.h | 11 +++++ drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 46 +++++++++++++++++++++ drivers/infiniband/ulp/ipoib/ipoib_ib.c | 8 +++- drivers/infiniband/ulp/ipoib/ipoib_main.c | 62 ++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index 1f76bad020f..691525cf394 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_IPOIB tristate "IP-over-InfiniBand" depends on NETDEVICES && INET && (IPV6 || IPV6=n) + select INET_LRO ---help--- Support for the IP-over-InfiniBand protocol (IPoIB). This transports IP packets over InfiniBand so you can use your IB diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 8754b364f22..2c522572e3c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -50,6 +50,7 @@ #include #include #include +#include /* constants */ @@ -94,6 +95,9 @@ enum { IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MAX_LRO_DESCRIPTORS = 8, + IPOIB_LRO_MAX_AGGR = 64, + MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, }; @@ -248,6 +252,11 @@ struct ipoib_ethtool_st { u16 max_coalesced_frames; }; +struct ipoib_lro { + struct net_lro_mgr lro_mgr; + struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; +}; + /* * Device private locking: tx_lock protects members used in TX fast * path (and we use LLTX so upper layers don't do extra locking). 
@@ -334,6 +343,8 @@ struct ipoib_dev_priv { int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; + + struct ipoib_lro lro; }; struct ipoib_ah { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 10279b79c44..66af5c1a76e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -86,11 +86,57 @@ static int ipoib_set_coalesce(struct net_device *dev, return 0; } +static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = { + "LRO aggregated", "LRO flushed", + "LRO avg aggr", "LRO no desc" +}; + +static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +{ + switch (stringset) { + case ETH_SS_STATS: + memcpy(data, *ipoib_stats_keys, sizeof(ipoib_stats_keys)); + break; + } +} + +static int ipoib_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return ARRAY_SIZE(ipoib_stats_keys); + default: + return -EOPNOTSUPP; + } +} + +static void ipoib_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int index = 0; + + /* Get LRO statistics */ + data[index++] = priv->lro.lro_mgr.stats.aggregated; + data[index++] = priv->lro.lro_mgr.stats.flushed; + if (priv->lro.lro_mgr.stats.flushed) + data[index++] = priv->lro.lro_mgr.stats.aggregated / + priv->lro.lro_mgr.stats.flushed; + else + data[index++] = 0; + data[index++] = priv->lro.lro_mgr.stats.no_desc; +} + static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, .get_tso = ethtool_op_get_tso, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, + .get_flags = ethtool_op_get_flags, + .set_flags = ethtool_op_set_flags, + .get_strings = ipoib_get_strings, + .get_sset_count = ipoib_get_sset_count, + .get_ethtool_stats = ipoib_get_ethtool_stats, }; void ipoib_set_ethtool_ops(struct net_device *dev) 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index eca8518d79a..5d50e5261ee 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -288,7 +288,10 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) skb->ip_summed = CHECKSUM_UNNECESSARY; - netif_receive_skb(skb); + if (dev->features & NETIF_F_LRO) + lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); + else + netif_receive_skb(skb); repost: if (unlikely(ipoib_ib_post_receive(dev, wr_id))) @@ -440,6 +443,9 @@ poll_more: } if (done < budget) { + if (dev->features & NETIF_F_LRO) + lro_flush_all(&priv->lro.lro_mgr); + netif_rx_complete(dev, napi); if (unlikely(ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index bfe1dbf9920..fead88f7fb1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -60,6 +60,15 @@ MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); +static int lro; +module_param(lro, bool, 0444); +MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)"); + +static int lro_max_aggr = IPOIB_LRO_MAX_AGGR; +module_param(lro_max_aggr, int, 0644); +MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated " + "(default = 64)"); + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; @@ -936,6 +945,54 @@ static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; +static int get_skb_hdr(struct sk_buff *skb, void **iphdr, + void **tcph, u64 *hdr_flags, void *priv) +{ + unsigned int ip_len; + struct iphdr *iph; + + if (unlikely(skb->protocol != htons(ETH_P_IP))) + return -1; + + /* + 
* In the future we may add an else clause that verifies the + * checksum and allows devices which do not calculate checksum + * to use LRO. + */ + if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) + return -1; + + /* Check for non-TCP packet */ + skb_reset_network_header(skb); + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_TCP) + return -1; + + ip_len = ip_hdrlen(skb); + skb_set_transport_header(skb, ip_len); + *tcph = tcp_hdr(skb); + + /* check if IP header and TCP header are complete */ + if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) + return -1; + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + + return 0; +} + +static void ipoib_lro_setup(struct ipoib_dev_priv *priv) +{ + priv->lro.lro_mgr.max_aggr = lro_max_aggr; + priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS; + priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc; + priv->lro.lro_mgr.get_skb_header = get_skb_hdr; + priv->lro.lro_mgr.features = LRO_F_NAPI; + priv->lro.lro_mgr.dev = priv->dev; + priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; +} + static void ipoib_setup(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -975,6 +1032,8 @@ static void ipoib_setup(struct net_device *dev) priv->dev = dev; + ipoib_lro_setup(priv); + spin_lock_init(&priv->lock); spin_lock_init(&priv->tx_lock); @@ -1152,6 +1211,9 @@ static struct net_device *ipoib_add_port(const char *format, priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; } + if (lro) + priv->dev->features |= NETIF_F_LRO; + /* * Set the full membership bit, so that we join the right * broadcast group, etc. -- cgit v1.2.3 From ee1e2c82c245a5fb2864e9dbcdaab3390fde3fcc Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Mon, 14 Jul 2008 23:48:49 -0700 Subject: IPoIB: Refresh paths instead of flushing them on SM change events The patch tries to solve the problem of device going down and paths being flushed on an SM change event. 
The method is to mark the paths as candidates for refresh (by setting the new valid flag to 0), and wait for an ARP probe to trigger a new path record query. The solution requires a different and less intrusive handling of SM change events. For that, the second argument of the flush function changes its meaning from a boolean flag to a level. In most cases, SM failover doesn't cause LID change so traffic won't stop. In the rare cases of LID change, the remote host (the one that hadn't changed its LID) will lose connectivity until paths are refreshed. This is no worse than the current state. In fact, preventing the device from going down saves packets that otherwise would be lost. Signed-off-by: Moni Levy Signed-off-by: Moni Shoua Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 17 ++++++++++-- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 42 ++++++++++++++++++---------- drivers/infiniband/ulp/ipoib/ipoib_main.c | 44 +++++++++++++++++++++++++++--- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 18 ++++++------ 4 files changed, 91 insertions(+), 30 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 2c522572e3c..bb19587c5ea 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -54,6 +54,12 @@ /* constants */ +enum ipoib_flush_level { + IPOIB_FLUSH_LIGHT, + IPOIB_FLUSH_NORMAL, + IPOIB_FLUSH_HEAVY +}; + enum { IPOIB_ENCAP_LEN = 4, @@ -284,10 +290,11 @@ struct ipoib_dev_priv { struct delayed_work pkey_poll_task; struct delayed_work mcast_task; - struct work_struct flush_task; + struct work_struct flush_light; + struct work_struct flush_normal; + struct work_struct flush_heavy; struct work_struct restart_task; struct delayed_work ah_reap_task; - struct work_struct pkey_event_task; struct ib_device *ca; u8 port; @@ -369,6 +376,7 @@ struct ipoib_path { struct rb_node rb_node; struct list_head list; + int valid; }; struct 
ipoib_neigh { @@ -433,11 +441,14 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_ah *address, u32 qpn); void ipoib_reap_ah(struct work_struct *work); +void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); -void ipoib_ib_dev_flush(struct work_struct *work); +void ipoib_ib_dev_flush_light(struct work_struct *work); +void ipoib_ib_dev_flush_normal(struct work_struct *work); +void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 5d50e5261ee..66cafa20c24 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -902,7 +902,8 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) return 0; } -static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event) +static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, + enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; @@ -915,7 +916,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event) * the parent is down. 
*/ list_for_each_entry(cpriv, &priv->child_intfs, list) - __ipoib_ib_dev_flush(cpriv, pkey_event); + __ipoib_ib_dev_flush(cpriv, level); mutex_unlock(&priv->vlan_mutex); @@ -929,7 +930,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event) return; } - if (pkey_event) { + if (level == IPOIB_FLUSH_HEAVY) { if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ipoib_ib_dev_down(dev, 0); @@ -947,11 +948,15 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event) priv->pkey_index = new_index; } - ipoib_dbg(priv, "flushing\n"); + if (level == IPOIB_FLUSH_LIGHT) { + ipoib_mark_paths_invalid(dev); + ipoib_mcast_dev_flush(dev); + } - ipoib_ib_dev_down(dev, 0); + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_down(dev, 0); - if (pkey_event) { + if (level == IPOIB_FLUSH_HEAVY) { ipoib_ib_dev_stop(dev, 0); ipoib_ib_dev_open(dev); } @@ -961,27 +966,34 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event) * we get here, don't bring it back up if it's not configured up */ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { - ipoib_ib_dev_up(dev); + if (level >= IPOIB_FLUSH_NORMAL) + ipoib_ib_dev_up(dev); ipoib_mcast_restart_task(&priv->restart_task); } } -void ipoib_ib_dev_flush(struct work_struct *work) +void ipoib_ib_dev_flush_light(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, flush_light); + + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT); +} + +void ipoib_ib_dev_flush_normal(struct work_struct *work) { struct ipoib_dev_priv *priv = - container_of(work, struct ipoib_dev_priv, flush_task); + container_of(work, struct ipoib_dev_priv, flush_normal); - ipoib_dbg(priv, "Flushing %s\n", priv->dev->name); - __ipoib_ib_dev_flush(priv, 0); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL); } -void ipoib_pkey_event(struct work_struct *work) +void ipoib_ib_dev_flush_heavy(struct work_struct 
*work) { struct ipoib_dev_priv *priv = - container_of(work, struct ipoib_dev_priv, pkey_event_task); + container_of(work, struct ipoib_dev_priv, flush_heavy); - ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name); - __ipoib_ib_dev_flush(priv, 1); + __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY); } void ipoib_ib_dev_cleanup(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index fead88f7fb1..b3fd7e8333c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -357,6 +357,23 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ +void ipoib_mark_paths_invalid(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path, *tp; + + spin_lock_irq(&priv->lock); + + list_for_each_entry_safe(path, tp, &priv->path_list, list) { + ipoib_dbg(priv, "mark path LID 0x%04x GID " IPOIB_GID_FMT " invalid\n", + be16_to_cpu(path->pathrec.dlid), + IPOIB_GID_ARG(path->pathrec.dgid)); + path->valid = 0; + } + + spin_unlock_irq(&priv->lock); +} + void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -393,6 +410,7 @@ static void path_rec_completion(int status, struct net_device *dev = path->dev; struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_ah *ah = NULL; + struct ipoib_ah *old_ah; struct ipoib_neigh *neigh, *tn; struct sk_buff_head skqueue; struct sk_buff *skb; @@ -416,6 +434,7 @@ static void path_rec_completion(int status, spin_lock_irqsave(&priv->lock, flags); + old_ah = path->ah; path->ah = ah; if (ah) { @@ -428,6 +447,17 @@ static void path_rec_completion(int status, __skb_queue_tail(&skqueue, skb); list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { + if (neigh->ah) { + WARN_ON(neigh->ah != old_ah); + /* + * Dropping the ah reference inside + * priv->lock is safe here, because we + * 
will hold one more reference from + * the original value of path->ah (ie + * old_ah). + */ + ipoib_put_ah(neigh->ah); + } kref_get(&path->ah->ref); neigh->ah = path->ah; memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, @@ -450,6 +480,7 @@ static void path_rec_completion(int status, while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } + path->valid = 1; } path->query = NULL; @@ -457,6 +488,9 @@ static void path_rec_completion(int status, spin_unlock_irqrestore(&priv->lock, flags); + if (old_ah) + ipoib_put_ah(old_ah); + while ((skb = __skb_dequeue(&skqueue))) { skb->dev = dev; if (dev_queue_xmit(skb)) @@ -630,8 +664,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, spin_lock(&priv->lock); path = __path_find(dev, phdr->hwaddr + 4); - if (!path) { - path = path_rec_create(dev, phdr->hwaddr + 4); + if (!path || !path->valid) { + if (!path) + path = path_rec_create(dev, phdr->hwaddr + 4); if (path) { /* put pseudoheader back on for next time */ skb_push(skb, sizeof *phdr); @@ -1046,9 +1081,10 @@ static void ipoib_setup(struct net_device *dev) INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); - INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); - INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush); + INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); + INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); + INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 7b8fa36f509..96f9aa79cbb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -290,15 +290,17 @@ void ipoib_event(struct ib_event_handler *handler, if 
(record->element.port_num != priv->port) return; - if (record->event == IB_EVENT_PORT_ERR || - record->event == IB_EVENT_PORT_ACTIVE || - record->event == IB_EVENT_LID_CHANGE || - record->event == IB_EVENT_SM_CHANGE || + ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, + record->device->name, record->element.port_num); + + if (record->event == IB_EVENT_SM_CHANGE || record->event == IB_EVENT_CLIENT_REREGISTER) { - ipoib_dbg(priv, "Port state change event\n"); - queue_work(ipoib_workqueue, &priv->flush_task); + queue_work(ipoib_workqueue, &priv->flush_light); + } else if (record->event == IB_EVENT_PORT_ERR || + record->event == IB_EVENT_PORT_ACTIVE || + record->event == IB_EVENT_LID_CHANGE) { + queue_work(ipoib_workqueue, &priv->flush_normal); } else if (record->event == IB_EVENT_PKEY_CHANGE) { - ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port); - queue_work(ipoib_workqueue, &priv->pkey_event_task); + queue_work(ipoib_workqueue, &priv->flush_heavy); } } -- cgit v1.2.3 From c03d4731b5b6de45b95a10bf1d510dde423d6757 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:50 -0700 Subject: IPoIB: Remove unused IPOIB_MCAST_STARTED code The IPOIB_MCAST_STARTED flag is not used at all since commit b3e2749b ("IPoIB: Don't drop multicast sends when they can be queued"), so remove it. 
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 1 - drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index bb19587c5ea..66a897567ea 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -89,7 +89,6 @@ enum { IPOIB_FLAG_SUBINTERFACE = 5, IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, - IPOIB_MCAST_STARTED = 8, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, IPOIB_FLAG_CSUM = 11, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 4a6538b9301..0b7d129161e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -592,10 +592,6 @@ int ipoib_mcast_start_thread(struct net_device *dev) queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); mutex_unlock(&mcast_mutex); - spin_lock_irq(&priv->lock); - set_bit(IPOIB_MCAST_STARTED, &priv->flags); - spin_unlock_irq(&priv->lock); - return 0; } @@ -605,10 +601,6 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) ipoib_dbg_mcast(priv, "stopping multicast thread\n"); - spin_lock_irq(&priv->lock); - clear_bit(IPOIB_MCAST_STARTED, &priv->flags); - spin_unlock_irq(&priv->lock); - mutex_lock(&mcast_mutex); clear_bit(IPOIB_MCAST_RUN, &priv->flags); cancel_delayed_work(&priv->mcast_task); -- cgit v1.2.3 From 5892eff91ad60ba365ae7f75050ce464036c5396 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:50 -0700 Subject: IPoIB: Remove priv->mcast_mutex No need for a mutex around calls to ib_attach_mcast/ib_detach_mcast since these operations are synchronized at the HW driver layer. 
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 1 - drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 - drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 4 ---- 3 files changed, 6 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 66a897567ea..b8753222c87 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -277,7 +277,6 @@ struct ipoib_dev_priv { unsigned long flags; - struct mutex mcast_mutex; struct mutex vlan_mutex; struct rb_root path_tree; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b3fd7e8333c..8be9ea0436e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1072,7 +1072,6 @@ static void ipoib_setup(struct net_device *dev) spin_lock_init(&priv->lock); spin_lock_init(&priv->tx_lock); - mutex_init(&priv->mcast_mutex); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 96f9aa79cbb..f50ebe0643e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -61,9 +61,7 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid) } /* attach QP to multicast group */ - mutex_lock(&priv->mcast_mutex); ret = ib_attach_mcast(priv->qp, mgid, mlid); - mutex_unlock(&priv->mcast_mutex); if (ret) ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret); @@ -77,9 +75,7 @@ int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid) struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; - mutex_lock(&priv->mcast_mutex); ret = ib_detach_mcast(priv->qp, mgid, mlid); - mutex_unlock(&priv->mcast_mutex); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", 
ret); -- cgit v1.2.3 From d0de13622d5ac658efe7c51521dbdbe0752aa3dd Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:50 -0700 Subject: IPoIB: Only set Q_Key once: after joining broadcast group The current code will set the Q_Key for any join of a non-sendonly multicast group. The operation involves a modify QP operation, which is fairly heavyweight, and is only really required after the join of the broadcast group. Fix this by adding a parameter to ipoib_mcast_attach() to control when the Q_Key is set. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 4 +++- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 28 ++++++++++++++------------ 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index b8753222c87..7b46e2d7b3c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -485,7 +485,7 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, #endif int ipoib_mcast_attach(struct net_device *dev, u16 mlid, - union ib_gid *mgid); + union ib_gid *mgid, int set_qkey); int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 0b7d129161e..55ebd950bf2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -186,6 +186,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_ah *ah; int ret; + int set_qkey = 0; mcast->mcmember = *mcmember; @@ -200,6 +201,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); 
priv->tx_wr.wr.ud.remote_qkey = priv->qkey; + set_qkey = 1; } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -212,7 +214,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, } ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid), - &mcast->mcmember.mgid); + &mcast->mcmember.mgid, set_qkey); if (ret < 0) { ipoib_warn(priv, "couldn't attach QP to multicast group " IPOIB_GID_FMT "\n", diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index f50ebe0643e..ba7c8868e6f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -33,18 +33,13 @@ #include "ipoib.h" -int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid) +int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_qp_attr *qp_attr; + struct ib_qp_attr *qp_attr = NULL; int ret; u16 pkey_index; - ret = -ENOMEM; - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); - if (!qp_attr) - goto out; - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) { clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = -ENXIO; @@ -52,12 +47,19 @@ int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid) } set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - /* set correct QKey for QP */ - qp_attr->qkey = priv->qkey; - ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); - if (ret) { - ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); - goto out; + if (set_qkey) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) + goto out; + + /* set correct QKey for QP */ + qp_attr->qkey = priv->qkey; + ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY); + if (ret) { + ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret); + goto out; + } } /* attach QP to multicast group */ -- cgit v1.2.3 From 9eae554c171e086c89ab83da2a2d3c8bf958fcb5 
Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 14 Jul 2008 23:48:50 -0700 Subject: IPoIB: Get rid of ipoib_mcast_detach() wrapper ipoib_mcast_detach() does nothing except call ib_detach_mcast(), so just use the core API in the one place that does a multicast group detach. add/remove: 0/1 grow/shrink: 0/1 up/down: 0/-105 (-105) function old new delta ipoib_mcast_leave 357 319 -38 ipoib_mcast_detach 67 - -67 Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 -- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 +++--- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 12 ------------ 3 files changed, 3 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 7b46e2d7b3c..a89b9fbe1ef 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -486,8 +486,6 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey); -int ipoib_mcast_detach(struct net_device *dev, u16 mlid, - union ib_gid *mgid); int ipoib_init_qp(struct net_device *dev); int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 55ebd950bf2..71add7a8d53 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -627,10 +627,10 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) IPOIB_GID_ARG(mcast->mcmember.mgid)); /* Remove ourselves from the multicast group */ - ret = ipoib_mcast_detach(dev, be16_to_cpu(mcast->mcmember.mlid), - &mcast->mcmember.mgid); + ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid, + be16_to_cpu(mcast->mcmember.mlid)); if (ret) - ipoib_warn(priv, "ipoib_mcast_detach failed (result = %d)\n", ret); + 
ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); } return 0; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index ba7c8868e6f..68325119f74 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -72,18 +72,6 @@ out: return ret; } -int ipoib_mcast_detach(struct net_device *dev, u16 mlid, union ib_gid *mgid) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - int ret; - - ret = ib_detach_mcast(priv->qp, mgid, mlid); - if (ret) - ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); - - return ret; -} - int ipoib_init_qp(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); -- cgit v1.2.3 From c8c2afe360b7366f586f6bece1109a72ea334876 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:51 -0700 Subject: IPoIB: Use rtnl lock/unlock when changing device flags Use of this lock is required to synchronize changes to the netdevice's data structs. Also move the call to ipoib_flush_paths() after the modification of the netdevice flags in set_mode(). 
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 8 ++++++-- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 37bf67b2a26..b4269139135 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1440,7 +1440,9 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); + rtnl_lock(); dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); + rtnl_unlock(); priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; ipoib_flush_paths(dev); @@ -1449,14 +1451,16 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, if (!strcmp(buf, "datagram\n")) { clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); - dev->mtu = min(priv->mcast_mtu, dev->mtu); - ipoib_flush_paths(dev); + rtnl_lock(); if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) { dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; if (priv->hca_caps & IB_DEVICE_UD_TSO) dev->features |= NETIF_F_TSO; } + dev->mtu = min(priv->mcast_mtu, dev->mtu); + rtnl_unlock(); + ipoib_flush_paths(dev); return count; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 71add7a8d53..be1ed38cdcf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -575,8 +575,11 @@ void ipoib_mcast_join_task(struct work_struct *work) priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); - if (!ipoib_cm_admin_enabled(dev)) + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + rtnl_unlock(); + } ipoib_dbg_mcast(priv, "successfully joined all multicast 
groups\n"); -- cgit v1.2.3 From bd3606715effbf37df986548c43bbed0842b49d5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:51 -0700 Subject: IPoIB: Use dev_set_mtu() to change mtu When the driver sets the MTU of the net device outside of its change_mtu method, it should make use of dev_set_mtu() instead of directly setting the mtu field of struct netdevice. Otherwise functions registered to be called upon MTU change will not get called (this is done through call_netdevice_notifiers() in dev_set_mtu()). Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index b4269139135..87f9f3ef3b2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1458,7 +1458,7 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, if (priv->hca_caps & IB_DEVICE_UD_TSO) dev->features |= NETIF_F_TSO; } - dev->mtu = min(priv->mcast_mtu, dev->mtu); + dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); rtnl_unlock(); ipoib_flush_paths(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index be1ed38cdcf..1fcc9a898d8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -577,7 +577,7 @@ void ipoib_mcast_join_task(struct work_struct *work) if (!ipoib_cm_admin_enabled(dev)) { rtnl_lock(); - dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); rtnl_unlock(); } -- cgit v1.2.3 From e112373fd6aa280bd2cbc0d5cc3809115325a1be Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:52 -0700 Subject: IPoIB/cm: Reduce connected mode TX 
object size Since IPoIB connected mode does not use NETIF_F_SG, we only have one DMA mapping per send, so we don't need a mapping[] array. Define a new struct with a single u64 mapping member and use it for the CM tx_ring. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 7 ++++++- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 12 ++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index a89b9fbe1ef..0281c8fecc9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -157,6 +157,11 @@ struct ipoib_tx_buf { u64 mapping[MAX_SKB_FRAGS + 1]; }; +struct ipoib_cm_tx_buf { + struct sk_buff *skb; + u64 mapping; +}; + struct ib_cm_id; struct ipoib_cm_data { @@ -215,7 +220,7 @@ struct ipoib_cm_tx { struct net_device *dev; struct ipoib_neigh *neigh; struct ipoib_path *path; - struct ipoib_tx_buf *tx_ring; + struct ipoib_cm_tx_buf *tx_ring; unsigned tx_head; unsigned tx_tail; unsigned long flags; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 87f9f3ef3b2..0f2d3045061 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -703,7 +703,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_tx_buf *tx_req; + struct ipoib_cm_tx_buf *tx_req; u64 addr; if (unlikely(skb->len > tx->mtu)) { @@ -734,7 +734,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ return; } - tx_req->mapping[0] = addr; + tx_req->mapping = addr; if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), addr, skb->len))) { @@ -759,7 +759,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, 
struct ib_wc *wc) struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_tx *tx = wc->qp->qp_context; unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; - struct ipoib_tx_buf *tx_req; + struct ipoib_cm_tx_buf *tx_req; unsigned long flags; ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", @@ -773,7 +773,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) tx_req = &tx->tx_ring[wr_id]; - ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE); + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); /* FIXME: is this right? Shouldn't we only increment on success? */ ++dev->stats.tx_packets; @@ -1143,7 +1143,7 @@ err_tx: static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { struct ipoib_dev_priv *priv = netdev_priv(p->dev); - struct ipoib_tx_buf *tx_req; + struct ipoib_cm_tx_buf *tx_req; unsigned long flags; unsigned long begin; @@ -1171,7 +1171,7 @@ timeout: while ((int) p->tx_tail - (int) p->tx_head < 0) { tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; - ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); dev_kfree_skb_any(tx_req->skb); ++p->tx_tail; -- cgit v1.2.3 From bc3a290b51aaefc6a6af2d6e6d52ed32387c416c Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 14 Jul 2008 23:48:52 -0700 Subject: IPoIB: Double default RX/TX ring sizes Increase IPoIB ring sizes to twice their original sizes (RX: 128->256, TX: 64->128) to act as a shock absorber for high traffic peaks. With the current settings, we have seen cases that there are many calls to netif_stop_queue(), which causes degradation in throughput. Also, larger receive buffer sizes help IPoIB in CM mode to avoid experiencing RNR NAK conditions due to insufficient receive buffers at the SRQ. 
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 0281c8fecc9..b0ffc9abe8c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -70,8 +70,8 @@ enum { IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, - IPOIB_RX_RING_SIZE = 128, - IPOIB_TX_RING_SIZE = 64, + IPOIB_RX_RING_SIZE = 256, + IPOIB_TX_RING_SIZE = 128, IPOIB_MAX_QUEUE_SIZE = 8192, IPOIB_MIN_QUEUE_SIZE = 2, IPOIB_CM_MAX_CONN_QP = 4096, -- cgit v1.2.3