From 13aa6ecb47990cfc78e20e347fdd3f1df6189426 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 12 Jun 2006 16:57:51 +0300 Subject: IB/mthca: restore missing PCI registers after reset mthca does not restore the following PCI-X/PCI Express registers after reset: PCI-X device: PCI-X command register PCI-X bridge: upstream and downstream split transaction registers PCI Express : PCI Express device control and link control registers This causes instability and/or bad performance on systems where one of these registers is set to a non-default value by BIOS. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_reset.c | 59 +++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c index df5e494a9d3..f4fddd5327f 100644 --- a/drivers/infiniband/hw/mthca/mthca_reset.c +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -49,6 +49,12 @@ int mthca_reset(struct mthca_dev *mdev) u32 *hca_header = NULL; u32 *bridge_header = NULL; struct pci_dev *bridge = NULL; + int bridge_pcix_cap = 0; + int hca_pcie_cap = 0; + int hca_pcix_cap = 0; + + u16 devctl; + u16 linkctl; #define MTHCA_RESET_OFFSET 0xf0010 #define MTHCA_RESET_VALUE swab32(1) @@ -110,6 +116,9 @@ int mthca_reset(struct mthca_dev *mdev) } } + hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX); + hca_pcie_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP); + if (bridge) { bridge_header = kmalloc(256, GFP_KERNEL); if (!bridge_header) { @@ -129,6 +138,13 @@ int mthca_reset(struct mthca_dev *mdev) goto out; } } + bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX); + if (!bridge_pcix_cap) { + err = -ENODEV; + mthca_err(mdev, "Couldn't locate HCA bridge " + "PCI-X capability, aborting.\n"); + goto out; + } } /* actually hit reset */ @@ -178,6 +194,20 @@ int mthca_reset(struct mthca_dev *mdev) good: /* Now restore the PCI headers */ if (bridge) { + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8, + bridge_header[(bridge_pcix_cap + 0x8) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Upstream " + "split transaction control, aborting.\n"); + goto out; + } + if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc, + bridge_header[(bridge_pcix_cap + 0xc) / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA bridge Downstream " + "split transaction control, aborting.\n"); + goto out; + } /* * Bridge control register is at 0x3e, so we'll * naturally restore it last in this loop. 
@@ -203,6 +233,35 @@ good: } } + if (hca_pcix_cap) { + if (pci_write_config_dword(mdev->pdev, hca_pcix_cap, + hca_header[hca_pcix_cap / 4])) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI-X " + "command register, aborting.\n"); + goto out; + } + } + + if (hca_pcie_cap) { + devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4]; + if (pci_write_config_word(mdev->pdev, hca_pcie_cap + PCI_EXP_DEVCTL, + devctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Device Control register, aborting.\n"); + goto out; + } + linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4]; + if (pci_write_config_word(mdev->pdev, hca_pcie_cap + PCI_EXP_LNKCTL, + linkctl)) { + err = -ENODEV; + mthca_err(mdev, "Couldn't restore HCA PCI Express " + "Link control register, aborting.\n"); + goto out; + } + } + for (i = 0; i < 16; ++i) { if (i * 4 == PCI_COMMAND) continue; -- cgit v1.2.3 From 4e56ea794ec8636991e21942fc2e0d071ea8ee1d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 13 Jun 2006 17:19:42 +0300 Subject: IB/mthca: memfree completion with error FW bug workaround Memfree firmware is in rare cases reporting WQE index == base - 1 in receive completion with error, instead of (rq size - 1); base is 0 in mthca. Here is a patch to avoid kernel crash and report a correct WR id in this case. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cq.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index 205854e9c66..87a8f1166a3 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -540,8 +540,17 @@ static inline int mthca_poll_one(struct mthca_dev *dev, entry->wr_id = srq->wrid[wqe_index]; mthca_free_srq_wqe(srq, wqe); } else { + s32 wqe; wq = &(*cur_qp)->rq; - wqe_index = be32_to_cpu(cqe->wqe) >> wq->wqe_shift; + wqe = be32_to_cpu(cqe->wqe); + wqe_index = wqe >> wq->wqe_shift; + /* + * WQE addr == base - 1 might be reported in receive completion + * with error instead of (rq size - 1) by Sinai FW 1.0.800 and + * Arbel FW 5.1.400. This bug should be fixed in later FW revs. + */ + if (unlikely(wqe_index < 0)) + wqe_index = wq->max - 1; entry->wr_id = (*cur_qp)->wrid[wqe_index]; } -- cgit v1.2.3 From 6a9af2e18a5c6ebcf8283309d20ac0e9fa35e346 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:27 -0700 Subject: IB: common handling for marshalling parameters to/from userspace Provide common handling for marshalling data between userspace clients and kernel InfiniBand drivers. 
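For illustration only (not part of the patch): a kernel consumer such as ucm would call the new marshalling helpers roughly as sketched below. The header name <rdma/ib_marshall.h> and the example_* function are assumptions; ib_copy_qp_attr_to_user() and struct ib_uverbs_qp_attr are taken from the patch itself.

#include <asm/uaccess.h>
#include <rdma/ib_marshall.h>	/* assumed location of the helper declarations */

static int example_return_qp_attr(struct ib_qp_attr *qp_attr,
				  void __user *response)
{
	struct ib_uverbs_qp_attr resp;

	/* Flatten the kernel ib_qp_attr into the userspace ABI layout. */
	ib_copy_qp_attr_to_user(&resp, qp_attr);

	if (copy_to_user(response, &resp, sizeof resp))
		return -EFAULT;
	return 0;
}
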
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/ucm.c | 131 +++------------------------- drivers/infiniband/core/uverbs_marshall.c | 138 ++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 118 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_marshall.c (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index ec3353f24b2..a56bf9c994f 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -16,4 +16,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o -ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_mem.o +ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_mem.o \ + uverbs_marshall.o diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 9164a09b6cc..b396bf703f8 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -30,7 +30,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ucm.c 2594 2005-06-13 19:46:02Z libor $ + * $Id: ucm.c 4311 2005-12-05 18:42:01Z sean.hefty $ */ #include @@ -50,6 +50,7 @@ #include #include +#include MODULE_AUTHOR("Libor Michalek"); MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); @@ -205,36 +206,6 @@ error: return NULL; } -static void ib_ucm_event_path_get(struct ib_ucm_path_rec *upath, - struct ib_sa_path_rec *kpath) -{ - if (!kpath || !upath) - return; - - memcpy(upath->dgid, kpath->dgid.raw, sizeof *upath->dgid); - memcpy(upath->sgid, kpath->sgid.raw, sizeof *upath->sgid); - - upath->dlid = kpath->dlid; - upath->slid = kpath->slid; - upath->raw_traffic = kpath->raw_traffic; - upath->flow_label = kpath->flow_label; - upath->hop_limit = kpath->hop_limit; - upath->traffic_class = kpath->traffic_class; - upath->reversible = kpath->reversible; - upath->numb_path = kpath->numb_path; - upath->pkey = kpath->pkey; - upath->sl = kpath->sl; - upath->mtu_selector = kpath->mtu_selector; - upath->mtu = kpath->mtu; - upath->rate_selector = kpath->rate_selector; - upath->rate = kpath->rate; - upath->packet_life_time = kpath->packet_life_time; - upath->preference = kpath->preference; - - upath->packet_life_time_selector = - kpath->packet_life_time_selector; -} - static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, struct ib_cm_req_event_param *kreq) { @@ -253,8 +224,10 @@ static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, ureq->srq = kreq->srq; ureq->port = kreq->port; - ib_ucm_event_path_get(&ureq->primary_path, kreq->primary_path); - ib_ucm_event_path_get(&ureq->alternate_path, kreq->alternate_path); + ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path); + if (kreq->alternate_path) + ib_copy_path_rec_to_user(&ureq->alternate_path, + kreq->alternate_path); } static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, @@ -324,8 +297,8 @@ static int ib_ucm_event_process(struct ib_cm_event *evt, info = evt->param.rej_rcvd.ari; break; case IB_CM_LAP_RECEIVED: - ib_ucm_event_path_get(&uvt->resp.u.lap_resp.path, - evt->param.lap_rcvd.alternate_path); + ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path, + evt->param.lap_rcvd.alternate_path); uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE; uvt->resp.present = IB_UCM_PRES_ALTERNATE; break; @@ -637,65 +610,11 @@ static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, return result; } -static void ib_ucm_copy_ah_attr(struct ib_ucm_ah_attr *dest_attr, - struct ib_ah_attr 
*src_attr) -{ - memcpy(dest_attr->grh_dgid, src_attr->grh.dgid.raw, - sizeof src_attr->grh.dgid); - dest_attr->grh_flow_label = src_attr->grh.flow_label; - dest_attr->grh_sgid_index = src_attr->grh.sgid_index; - dest_attr->grh_hop_limit = src_attr->grh.hop_limit; - dest_attr->grh_traffic_class = src_attr->grh.traffic_class; - - dest_attr->dlid = src_attr->dlid; - dest_attr->sl = src_attr->sl; - dest_attr->src_path_bits = src_attr->src_path_bits; - dest_attr->static_rate = src_attr->static_rate; - dest_attr->is_global = (src_attr->ah_flags & IB_AH_GRH); - dest_attr->port_num = src_attr->port_num; -} - -static void ib_ucm_copy_qp_attr(struct ib_ucm_init_qp_attr_resp *dest_attr, - struct ib_qp_attr *src_attr) -{ - dest_attr->cur_qp_state = src_attr->cur_qp_state; - dest_attr->path_mtu = src_attr->path_mtu; - dest_attr->path_mig_state = src_attr->path_mig_state; - dest_attr->qkey = src_attr->qkey; - dest_attr->rq_psn = src_attr->rq_psn; - dest_attr->sq_psn = src_attr->sq_psn; - dest_attr->dest_qp_num = src_attr->dest_qp_num; - dest_attr->qp_access_flags = src_attr->qp_access_flags; - - dest_attr->max_send_wr = src_attr->cap.max_send_wr; - dest_attr->max_recv_wr = src_attr->cap.max_recv_wr; - dest_attr->max_send_sge = src_attr->cap.max_send_sge; - dest_attr->max_recv_sge = src_attr->cap.max_recv_sge; - dest_attr->max_inline_data = src_attr->cap.max_inline_data; - - ib_ucm_copy_ah_attr(&dest_attr->ah_attr, &src_attr->ah_attr); - ib_ucm_copy_ah_attr(&dest_attr->alt_ah_attr, &src_attr->alt_ah_attr); - - dest_attr->pkey_index = src_attr->pkey_index; - dest_attr->alt_pkey_index = src_attr->alt_pkey_index; - dest_attr->en_sqd_async_notify = src_attr->en_sqd_async_notify; - dest_attr->sq_draining = src_attr->sq_draining; - dest_attr->max_rd_atomic = src_attr->max_rd_atomic; - dest_attr->max_dest_rd_atomic = src_attr->max_dest_rd_atomic; - dest_attr->min_rnr_timer = src_attr->min_rnr_timer; - dest_attr->port_num = src_attr->port_num; - dest_attr->timeout = src_attr->timeout; - dest_attr->retry_cnt = src_attr->retry_cnt; - dest_attr->rnr_retry = src_attr->rnr_retry; - dest_attr->alt_port_num = src_attr->alt_port_num; - dest_attr->alt_timeout = src_attr->alt_timeout; -} - static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { - struct ib_ucm_init_qp_attr_resp resp; + struct ib_uverbs_qp_attr resp; struct ib_ucm_init_qp_attr cmd; struct ib_ucm_context *ctx; struct ib_qp_attr qp_attr; @@ -718,7 +637,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, if (result) goto out; - ib_ucm_copy_qp_attr(&resp, &qp_attr); + ib_copy_qp_attr_to_user(&resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) @@ -793,7 +712,7 @@ static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src) { - struct ib_ucm_path_rec ucm_path; + struct ib_user_path_rec upath; struct ib_sa_path_rec *sa_path; *path = NULL; @@ -805,36 +724,14 @@ static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src) if (!sa_path) return -ENOMEM; - if (copy_from_user(&ucm_path, (void __user *)(unsigned long)src, - sizeof(ucm_path))) { + if (copy_from_user(&upath, (void __user *)(unsigned long)src, + sizeof(upath))) { kfree(sa_path); return -EFAULT; } - memcpy(sa_path->dgid.raw, ucm_path.dgid, sizeof sa_path->dgid); - memcpy(sa_path->sgid.raw, ucm_path.sgid, sizeof sa_path->sgid); - - sa_path->dlid = ucm_path.dlid; - sa_path->slid = ucm_path.slid; - 
sa_path->raw_traffic = ucm_path.raw_traffic; - sa_path->flow_label = ucm_path.flow_label; - sa_path->hop_limit = ucm_path.hop_limit; - sa_path->traffic_class = ucm_path.traffic_class; - sa_path->reversible = ucm_path.reversible; - sa_path->numb_path = ucm_path.numb_path; - sa_path->pkey = ucm_path.pkey; - sa_path->sl = ucm_path.sl; - sa_path->mtu_selector = ucm_path.mtu_selector; - sa_path->mtu = ucm_path.mtu; - sa_path->rate_selector = ucm_path.rate_selector; - sa_path->rate = ucm_path.rate; - sa_path->packet_life_time = ucm_path.packet_life_time; - sa_path->preference = ucm_path.preference; - - sa_path->packet_life_time_selector = - ucm_path.packet_life_time_selector; - + ib_copy_path_rec_from_user(sa_path, &upath); *path = sa_path; return 0; } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c new file mode 100644 index 00000000000..ce46b13ae02 --- /dev/null +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +static void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, + struct ib_ah_attr *src) +{ + memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid); + dst->grh.flow_label = src->grh.flow_label; + dst->grh.sgid_index = src->grh.sgid_index; + dst->grh.hop_limit = src->grh.hop_limit; + dst->grh.traffic_class = src->grh.traffic_class; + dst->dlid = src->dlid; + dst->sl = src->sl; + dst->src_path_bits = src->src_path_bits; + dst->static_rate = src->static_rate; + dst->is_global = src->ah_flags & IB_AH_GRH ? 
1 : 0; + dst->port_num = src->port_num; +} + +void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src) +{ + dst->cur_qp_state = src->cur_qp_state; + dst->path_mtu = src->path_mtu; + dst->path_mig_state = src->path_mig_state; + dst->qkey = src->qkey; + dst->rq_psn = src->rq_psn; + dst->sq_psn = src->sq_psn; + dst->dest_qp_num = src->dest_qp_num; + dst->qp_access_flags = src->qp_access_flags; + + dst->max_send_wr = src->cap.max_send_wr; + dst->max_recv_wr = src->cap.max_recv_wr; + dst->max_send_sge = src->cap.max_send_sge; + dst->max_recv_sge = src->cap.max_recv_sge; + dst->max_inline_data = src->cap.max_inline_data; + + ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + + dst->pkey_index = src->pkey_index; + dst->alt_pkey_index = src->alt_pkey_index; + dst->en_sqd_async_notify = src->en_sqd_async_notify; + dst->sq_draining = src->sq_draining; + dst->max_rd_atomic = src->max_rd_atomic; + dst->max_dest_rd_atomic = src->max_dest_rd_atomic; + dst->min_rnr_timer = src->min_rnr_timer; + dst->port_num = src->port_num; + dst->timeout = src->timeout; + dst->retry_cnt = src->retry_cnt; + dst->rnr_retry = src->rnr_retry; + dst->alt_port_num = src->alt_port_num; + dst->alt_timeout = src->alt_timeout; +} +EXPORT_SYMBOL(ib_copy_qp_attr_to_user); + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct ib_sa_path_rec *src) +{ + memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid); + memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} +EXPORT_SYMBOL(ib_copy_path_rec_to_user); + +void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, + struct ib_user_path_rec *src) +{ + memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); + memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} +EXPORT_SYMBOL(ib_copy_path_rec_from_user); -- cgit v1.2.3 From 6e61d04f2d8c7ac4f67e1f498ed2a2a3ad8edaa3 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:28 -0700 Subject: IB/cm: Match connection requests based on private data Extend matching connection requests to listens in the InfiniBand CM to include private data checks. This allows applications to listen on the same service identifier, with private data directing the request to the appropriate application. 
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 91 +++++++++++++++++++++++++++++++++++++------ drivers/infiniband/core/ucm.c | 19 ++++++++- 2 files changed, 97 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 86fee43502c..490fd03766d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -32,7 +32,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: cm.c 2821 2005-07-08 17:07:28Z sean.hefty $ + * $Id: cm.c 4311 2005-12-05 18:42:01Z sean.hefty $ */ #include @@ -132,6 +132,7 @@ struct cm_id_private { /* todo: use alternate port on send failure */ struct cm_av av; struct cm_av alt_av; + struct ib_cm_compare_data *compare_data; void *private_data; __be64 tid; @@ -357,6 +358,41 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id) return cm_id_priv; } +static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask) +{ + int i; + + for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++) + ((unsigned long *) dst)[i] = ((unsigned long *) src)[i] & + ((unsigned long *) mask)[i]; +} + +static int cm_compare_data(struct ib_cm_compare_data *src_data, + struct ib_cm_compare_data *dst_data) +{ + u8 src[IB_CM_COMPARE_SIZE]; + u8 dst[IB_CM_COMPARE_SIZE]; + + if (!src_data || !dst_data) + return 0; + + cm_mask_copy(src, src_data->data, dst_data->mask); + cm_mask_copy(dst, dst_data->data, src_data->mask); + return memcmp(src, dst, IB_CM_COMPARE_SIZE); +} + +static int cm_compare_private_data(u8 *private_data, + struct ib_cm_compare_data *dst_data) +{ + u8 src[IB_CM_COMPARE_SIZE]; + + if (!dst_data) + return 0; + + cm_mask_copy(src, private_data, dst_data->mask); + return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE); +} + static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) { struct rb_node **link = &cm.listen_service_table.rb_node; @@ -364,14 +400,18 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) struct cm_id_private *cur_cm_id_priv; __be64 service_id = cm_id_priv->id.service_id; __be64 service_mask = cm_id_priv->id.service_mask; + int data_cmp; while (*link) { parent = *link; cur_cm_id_priv = rb_entry(parent, struct cm_id_private, service_node); + data_cmp = cm_compare_data(cm_id_priv->compare_data, + cur_cm_id_priv->compare_data); if ((cur_cm_id_priv->id.service_mask & service_id) == (service_mask & cur_cm_id_priv->id.service_id) && - (cm_id_priv->id.device == cur_cm_id_priv->id.device)) + (cm_id_priv->id.device == cur_cm_id_priv->id.device) && + !data_cmp) return cur_cm_id_priv; if (cm_id_priv->id.device < cur_cm_id_priv->id.device) @@ -380,6 +420,10 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) link = &(*link)->rb_right; else if (service_id < cur_cm_id_priv->id.service_id) link = &(*link)->rb_left; + else if (service_id > cur_cm_id_priv->id.service_id) + link = &(*link)->rb_right; + else if (data_cmp < 0) + link = &(*link)->rb_left; else link = &(*link)->rb_right; } @@ -389,16 +433,20 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv) } static struct cm_id_private * cm_find_listen(struct ib_device *device, - __be64 service_id) + __be64 service_id, + u8 *private_data) { struct rb_node *node = cm.listen_service_table.rb_node; struct cm_id_private *cm_id_priv; + int data_cmp; while (node) { cm_id_priv = rb_entry(node, struct cm_id_private, 
service_node); + data_cmp = cm_compare_private_data(private_data, + cm_id_priv->compare_data); if ((cm_id_priv->id.service_mask & service_id) == cm_id_priv->id.service_id && - (cm_id_priv->id.device == device)) + (cm_id_priv->id.device == device) && !data_cmp) return cm_id_priv; if (device < cm_id_priv->id.device) @@ -407,6 +455,10 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device, node = node->rb_right; else if (service_id < cm_id_priv->id.service_id) node = node->rb_left; + else if (service_id > cm_id_priv->id.service_id) + node = node->rb_right; + else if (data_cmp < 0) + node = node->rb_left; else node = node->rb_right; } @@ -730,15 +782,14 @@ retest: wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); - if (cm_id_priv->private_data && cm_id_priv->private_data_len) - kfree(cm_id_priv->private_data); + kfree(cm_id_priv->compare_data); + kfree(cm_id_priv->private_data); kfree(cm_id_priv); } EXPORT_SYMBOL(ib_destroy_cm_id); -int ib_cm_listen(struct ib_cm_id *cm_id, - __be64 service_id, - __be64 service_mask) +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, + struct ib_cm_compare_data *compare_data) { struct cm_id_private *cm_id_priv, *cur_cm_id_priv; unsigned long flags; @@ -752,7 +803,19 @@ int ib_cm_listen(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - BUG_ON(cm_id->state != IB_CM_IDLE); + if (cm_id->state != IB_CM_IDLE) + return -EINVAL; + + if (compare_data) { + cm_id_priv->compare_data = kzalloc(sizeof *compare_data, + GFP_KERNEL); + if (!cm_id_priv->compare_data) + return -ENOMEM; + cm_mask_copy(cm_id_priv->compare_data->data, + compare_data->data, compare_data->mask); + memcpy(cm_id_priv->compare_data->mask, compare_data->mask, + IB_CM_COMPARE_SIZE); + } cm_id->state = IB_CM_LISTEN; @@ -769,6 +832,8 @@ int ib_cm_listen(struct ib_cm_id *cm_id, if (cur_cm_id_priv) { cm_id->state = IB_CM_IDLE; + kfree(cm_id_priv->compare_data); + cm_id_priv->compare_data = NULL; ret = -EBUSY; } return ret; @@ -1241,7 +1306,8 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, /* Find matching listen request. */ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, - req_msg->service_id); + req_msg->service_id, + req_msg->private_data); if (!listen_cm_id_priv) { spin_unlock_irqrestore(&cm.lock, flags); cm_issue_rej(work->port, work->mad_recv_wc, @@ -2654,7 +2720,8 @@ static int cm_sidr_req_handler(struct cm_work *work) goto out; /* Duplicate message. 
*/ } cur_cm_id_priv = cm_find_listen(cm_id->device, - sidr_req_msg->service_id); + sidr_req_msg->service_id, + sidr_req_msg->private_data); if (!cur_cm_id_priv) { rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); spin_unlock_irqrestore(&cm.lock, flags); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index b396bf703f8..0136aee0faa 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -648,6 +648,17 @@ out: return result; } +static int ucm_validate_listen(__be64 service_id, __be64 service_mask) +{ + service_id &= service_mask; + + if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) || + ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)) + return -EINVAL; + + return 0; +} + static ssize_t ib_ucm_listen(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) @@ -663,7 +674,13 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file, if (IS_ERR(ctx)) return PTR_ERR(ctx); - result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); + result = ucm_validate_listen(cmd.service_id, cmd.service_mask); + if (result) + goto out; + + result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask, + NULL); +out: ib_ucm_ctx_put(ctx); return result; } -- cgit v1.2.3 From 7025fcd36bd62af2c6ca0ea3490c00b216c4d168 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:28 -0700 Subject: IB: address translation to map IP toIB addresses (GIDs) Add an address translation service that maps IP addresses to InfiniBand GID addresses using IPoIB. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/Kconfig | 5 + drivers/infiniband/core/Makefile | 6 +- drivers/infiniband/core/addr.c | 367 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 377 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/addr.c (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index afc612b8577..ba2d6505e9a 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -29,6 +29,11 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from . +config INFINIBAND_ADDR_TRANS + bool + depends on INFINIBAND && INET + default y + source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/ipath/Kconfig" diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index a56bf9c994f..05095919d16 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,5 +1,7 @@ +addr_trans-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o + obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o + ib_cm.o $(addr_trans-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o @@ -12,6 +14,8 @@ ib_sa-y := sa_query.o ib_cm-y := cm.o +ib_addr-y := addr.o + ib_umad-y := user_mad.o ib_ucm-y := ucm.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c new file mode 100644 index 00000000000..d294bbc42f0 --- /dev/null +++ b/drivers/infiniband/core/addr.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. 
+ * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("IB Address Translation"); +MODULE_LICENSE("Dual BSD/GPL"); + +struct addr_req { + struct list_head list; + struct sockaddr src_addr; + struct sockaddr dst_addr; + struct rdma_dev_addr *addr; + void *context; + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context); + unsigned long timeout; + int status; +}; + +static void process_req(void *data); + +static DEFINE_MUTEX(lock); +static LIST_HEAD(req_list); +static DECLARE_WORK(work, process_req, NULL); +static struct workqueue_struct *addr_wq; + +static int copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, + unsigned char *dst_dev_addr) +{ + switch (dev->type) { + case ARPHRD_INFINIBAND: + dev_addr->dev_type = IB_NODE_CA; + break; + default: + return -EADDRNOTAVAIL; + } + + memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); + if (dst_dev_addr) + memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); + return 0; +} + +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + struct net_device *dev; + u32 ip = ((struct sockaddr_in *) addr)->sin_addr.s_addr; + int ret; + + dev = ip_dev_find(ip); + if (!dev) + return -EADDRNOTAVAIL; + + ret = copy_addr(dev_addr, dev, NULL); + dev_put(dev); + return ret; +} +EXPORT_SYMBOL(rdma_translate_ip); + +static void set_timeout(unsigned long time) +{ + unsigned long delay; + + cancel_delayed_work(&work); + + delay = time - jiffies; + if ((long)delay <= 0) + delay = 1; + + queue_delayed_work(addr_wq, &work, delay); +} + +static void queue_req(struct addr_req *req) +{ + struct addr_req *temp_req; + + mutex_lock(&lock); + list_for_each_entry_reverse(temp_req, &req_list, list) { + if (time_after(req->timeout, temp_req->timeout)) + break; + } + + list_add(&req->list, &temp_req->list); + + if (req_list.next == &req->list) + set_timeout(req->timeout); + mutex_unlock(&lock); +} + +static void addr_send_arp(struct sockaddr_in *dst_in) +{ + struct rtable *rt; + struct flowi fl; + u32 dst_ip = dst_in->sin_addr.s_addr; + + memset(&fl, 0, sizeof fl); + fl.nl_u.ip4_u.daddr = dst_ip; + if (ip_route_output_key(&rt, &fl)) + return; + + arp_send(ARPOP_REQUEST, ETH_P_ARP, rt->rt_gateway, rt->idev->dev, + rt->rt_src, NULL, rt->idev->dev->dev_addr, NULL); + ip_rt_put(rt); +} + +static int addr_resolve_remote(struct sockaddr_in *src_in, 
+ struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) +{ + u32 src_ip = src_in->sin_addr.s_addr; + u32 dst_ip = dst_in->sin_addr.s_addr; + struct flowi fl; + struct rtable *rt; + struct neighbour *neigh; + int ret; + + memset(&fl, 0, sizeof fl); + fl.nl_u.ip4_u.daddr = dst_ip; + fl.nl_u.ip4_u.saddr = src_ip; + ret = ip_route_output_key(&rt, &fl); + if (ret) + goto out; + + /* If the device does ARP internally, return 'done' */ + if (rt->idev->dev->flags & IFF_NOARP) { + copy_addr(addr, rt->idev->dev, NULL); + goto put; + } + + neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev); + if (!neigh) { + ret = -ENODATA; + goto put; + } + + if (!(neigh->nud_state & NUD_VALID)) { + ret = -ENODATA; + goto release; + } + + if (!src_ip) { + src_in->sin_family = dst_in->sin_family; + src_in->sin_addr.s_addr = rt->rt_src; + } + + ret = copy_addr(addr, neigh->dev, neigh->ha); +release: + neigh_release(neigh); +put: + ip_rt_put(rt); +out: + return ret; +} + +static void process_req(void *data) +{ + struct addr_req *req, *temp_req; + struct sockaddr_in *src_in, *dst_in; + struct list_head done_list; + + INIT_LIST_HEAD(&done_list); + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->status) { + src_in = (struct sockaddr_in *) &req->src_addr; + dst_in = (struct sockaddr_in *) &req->dst_addr; + req->status = addr_resolve_remote(src_in, dst_in, + req->addr); + } + if (req->status && time_after(jiffies, req->timeout)) + req->status = -ETIMEDOUT; + else if (req->status == -ENODATA) + continue; + + list_del(&req->list); + list_add_tail(&req->list, &done_list); + } + + if (!list_empty(&req_list)) { + req = list_entry(req_list.next, struct addr_req, list); + set_timeout(req->timeout); + } + mutex_unlock(&lock); + + list_for_each_entry_safe(req, temp_req, &done_list, list) { + list_del(&req->list); + req->callback(req->status, &req->src_addr, req->addr, + req->context); + kfree(req); + } +} + +static int addr_resolve_local(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) +{ + struct net_device *dev; + u32 src_ip = src_in->sin_addr.s_addr; + u32 dst_ip = dst_in->sin_addr.s_addr; + int ret; + + dev = ip_dev_find(dst_ip); + if (!dev) + return -EADDRNOTAVAIL; + + if (ZERONET(src_ip)) { + src_in->sin_family = dst_in->sin_family; + src_in->sin_addr.s_addr = dst_ip; + ret = copy_addr(addr, dev, dev->dev_addr); + } else if (LOOPBACK(src_ip)) { + ret = rdma_translate_ip((struct sockaddr *)dst_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } else { + ret = rdma_translate_ip((struct sockaddr *)src_in, addr); + if (!ret) + memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN); + } + + dev_put(dev); + return ret; +} + +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context) +{ + struct sockaddr_in *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = kmalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + memset(req, 0, sizeof *req); + + if (src_addr) + memcpy(&req->src_addr, src_addr, ip_addr_size(src_addr)); + memcpy(&req->dst_addr, dst_addr, ip_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + + src_in = (struct sockaddr_in *) &req->src_addr; + dst_in = (struct sockaddr_in *) &req->dst_addr; + + req->status = addr_resolve_local(src_in, 
dst_in, addr); + if (req->status == -EADDRNOTAVAIL) + req->status = addr_resolve_remote(src_in, dst_in, addr); + + switch (req->status) { + case 0: + req->timeout = jiffies; + queue_req(req); + break; + case -ENODATA: + req->timeout = msecs_to_jiffies(timeout_ms) + jiffies; + queue_req(req); + addr_send_arp(dst_in); + break; + default: + ret = req->status; + kfree(req); + break; + } + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ip); + +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *temp_req; + + mutex_lock(&lock); + list_for_each_entry_safe(req, temp_req, &req_list, list) { + if (req->addr == addr) { + req->status = -ECANCELED; + req->timeout = jiffies; + list_del(&req->list); + list_add(&req->list, &req_list); + set_timeout(req->timeout); + break; + } + } + mutex_unlock(&lock); +} +EXPORT_SYMBOL(rdma_addr_cancel); + +static int addr_arp_recv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pkt, struct net_device *orig_dev) +{ + struct arphdr *arp_hdr; + + arp_hdr = (struct arphdr *) skb->nh.raw; + + if (arp_hdr->ar_op == htons(ARPOP_REQUEST) || + arp_hdr->ar_op == htons(ARPOP_REPLY)) + set_timeout(jiffies); + + kfree_skb(skb); + return 0; +} + +static struct packet_type addr_arp = { + .type = __constant_htons(ETH_P_ARP), + .func = addr_arp_recv, + .af_packet_priv = (void*) 1, +}; + +static int addr_init(void) +{ + addr_wq = create_singlethread_workqueue("ib_addr_wq"); + if (!addr_wq) + return -ENOMEM; + + dev_add_pack(&addr_arp); + return 0; +} + +static void addr_cleanup(void) +{ + dev_remove_pack(&addr_arp); + destroy_workqueue(addr_wq); +} + +module_init(addr_init); +module_exit(addr_cleanup); -- cgit v1.2.3 From e51060f08a61965c4dd91516d82fe90617152590 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:29 -0700 Subject: IB: IP address based RDMA connection manager Kernel connection management agent over InfiniBand that connects based on IP addresses. The agent defines a generic RDMA connection abstraction to support clients wanting to connect over different RDMA devices. The agent also handles RDMA device hotplug events on behalf of clients. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/Makefile | 6 +- drivers/infiniband/core/cma.c | 1927 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 1931 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/core/cma.c (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 05095919d16..68e73ec2d1f 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,7 +1,7 @@ -addr_trans-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o +infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o $(addr_trans-y) + ib_cm.o $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o @@ -14,6 +14,8 @@ ib_sa-y := sa_query.o ib_cm-y := cm.o +rdma_cm-y := cma.o + ib_addr-y := addr.o ib_umad-y := user_mad.o diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c new file mode 100644 index 00000000000..a76834edf60 --- /dev/null +++ b/drivers/infiniband/core/cma.c @@ -0,0 +1,1927 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. 
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("Generic RDMA CM Agent"); +MODULE_LICENSE("Dual BSD/GPL"); + +#define CMA_CM_RESPONSE_TIMEOUT 20 +#define CMA_MAX_CM_RETRIES 3 + +static void cma_add_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device); + +static struct ib_client cma_client = { + .name = "cma", + .add = cma_add_one, + .remove = cma_remove_one +}; + +static LIST_HEAD(dev_list); +static LIST_HEAD(listen_any_list); +static DEFINE_MUTEX(lock); +static struct workqueue_struct *cma_wq; +static DEFINE_IDR(sdp_ps); +static DEFINE_IDR(tcp_ps); + +struct cma_device { + struct list_head list; + struct ib_device *device; + __be64 node_guid; + struct completion comp; + atomic_t refcount; + struct list_head id_list; +}; + +enum cma_state { + CMA_IDLE, + CMA_ADDR_QUERY, + CMA_ADDR_RESOLVED, + CMA_ROUTE_QUERY, + CMA_ROUTE_RESOLVED, + CMA_CONNECT, + CMA_DISCONNECT, + CMA_ADDR_BOUND, + CMA_LISTEN, + CMA_DEVICE_REMOVAL, + CMA_DESTROYING +}; + +struct rdma_bind_list { + struct idr *ps; + struct hlist_head owners; + unsigned short port; +}; + +/* + * Device removal can occur at anytime, so we need extra handling to + * serialize notifying the user of device removal with other callbacks. + * We do this by disabling removal notification while a callback is in process, + * and reporting it after the callback completes. 
+ */ +struct rdma_id_private { + struct rdma_cm_id id; + + struct rdma_bind_list *bind_list; + struct hlist_node node; + struct list_head list; + struct list_head listen_list; + struct cma_device *cma_dev; + + enum cma_state state; + spinlock_t lock; + struct completion comp; + atomic_t refcount; + wait_queue_head_t wait_remove; + atomic_t dev_remove; + + int backlog; + int timeout_ms; + struct ib_sa_query *query; + int query_id; + union { + struct ib_cm_id *ib; + } cm_id; + + u32 seq_num; + u32 qp_num; + enum ib_qp_type qp_type; + u8 srq; +}; + +struct cma_work { + struct work_struct work; + struct rdma_id_private *id; + enum cma_state old_state; + enum cma_state new_state; + struct rdma_cm_event event; +}; + +union cma_ip_addr { + struct in6_addr ip6; + struct { + __u32 pad[3]; + __u32 addr; + } ip4; +}; + +struct cma_hdr { + u8 cma_version; + u8 ip_version; /* IP version: 7:4 */ + __u16 port; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; +}; + +struct sdp_hh { + u8 bsdh[16]; + u8 sdp_version; /* Major version: 7:4 */ + u8 ip_version; /* IP version: 7:4 */ + u8 sdp_specific1[10]; + __u16 port; + __u16 sdp_specific2; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; +}; + +struct sdp_hah { + u8 bsdh[16]; + u8 sdp_version; +}; + +#define CMA_VERSION 0x00 +#define SDP_MAJ_VERSION 0x2 + +static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + ret = (id_priv->state == comp); + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static int cma_comp_exch(struct rdma_id_private *id_priv, + enum cma_state comp, enum cma_state exch) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&id_priv->lock, flags); + if ((ret = (id_priv->state == comp))) + id_priv->state = exch; + spin_unlock_irqrestore(&id_priv->lock, flags); + return ret; +} + +static enum cma_state cma_exch(struct rdma_id_private *id_priv, + enum cma_state exch) +{ + unsigned long flags; + enum cma_state old; + + spin_lock_irqsave(&id_priv->lock, flags); + old = id_priv->state; + id_priv->state = exch; + spin_unlock_irqrestore(&id_priv->lock, flags); + return old; +} + +static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) +{ + return hdr->ip_version >> 4; +} + +static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); +} + +static inline u8 sdp_get_majv(u8 sdp_version) +{ + return sdp_version >> 4; +} + +static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) +{ + return hh->ip_version >> 4; +} + +static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) +{ + hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); +} + +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); + id_priv->cma_dev = cma_dev; + id_priv->id.device = cma_dev->device; + list_add_tail(&id_priv->list, &cma_dev->id_list); +} + +static inline void cma_deref_dev(struct cma_device *cma_dev) +{ + if (atomic_dec_and_test(&cma_dev->refcount)) + complete(&cma_dev->comp); +} + +static void cma_detach_from_dev(struct rdma_id_private *id_priv) +{ + list_del(&id_priv->list); + cma_deref_dev(id_priv->cma_dev); + id_priv->cma_dev = NULL; +} + +static int cma_acquire_ib_dev(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev; + union ib_gid *gid; + int ret = -ENODEV; + + gid = ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr); + + mutex_lock(&lock); + 
list_for_each_entry(cma_dev, &dev_list, list) { + ret = ib_find_cached_gid(cma_dev->device, gid, + &id_priv->id.port_num, NULL); + if (!ret) { + cma_attach_to_dev(id_priv, cma_dev); + break; + } + } + mutex_unlock(&lock); + return ret; +} + +static int cma_acquire_dev(struct rdma_id_private *id_priv) +{ + switch (id_priv->id.route.addr.dev_addr.dev_type) { + case IB_NODE_CA: + return cma_acquire_ib_dev(id_priv); + default: + return -ENODEV; + } +} + +static void cma_deref_id(struct rdma_id_private *id_priv) +{ + if (atomic_dec_and_test(&id_priv->refcount)) + complete(&id_priv->comp); +} + +static void cma_release_remove(struct rdma_id_private *id_priv) +{ + if (atomic_dec_and_test(&id_priv->dev_remove)) + wake_up(&id_priv->wait_remove); +} + +struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps) +{ + struct rdma_id_private *id_priv; + + id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); + if (!id_priv) + return ERR_PTR(-ENOMEM); + + id_priv->state = CMA_IDLE; + id_priv->id.context = context; + id_priv->id.event_handler = event_handler; + id_priv->id.ps = ps; + spin_lock_init(&id_priv->lock); + init_completion(&id_priv->comp); + atomic_set(&id_priv->refcount, 1); + init_waitqueue_head(&id_priv->wait_remove); + atomic_set(&id_priv->dev_remove, 0); + INIT_LIST_HEAD(&id_priv->listen_list); + get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); + + return &id_priv->id; +} +EXPORT_SYMBOL(rdma_create_id); + +static int cma_init_ib_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + struct rdma_dev_addr *dev_addr; + int ret; + + dev_addr = &id_priv->id.route.addr.dev_addr; + ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, + ib_addr_get_pkey(dev_addr), + &qp_attr.pkey_index); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = id_priv->id.port_num; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | IB_QP_PORT); +} + +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct rdma_id_private *id_priv; + struct ib_qp *qp; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device != pd->device) + return -EINVAL; + + qp = ib_create_qp(pd, qp_init_attr); + if (IS_ERR(qp)) + return PTR_ERR(qp); + + switch (id->device->node_type) { + case IB_NODE_CA: + ret = cma_init_ib_qp(id_priv, qp); + break; + default: + ret = -ENOSYS; + break; + } + + if (ret) + goto err; + + id->qp = qp; + id_priv->qp_num = qp->qp_num; + id_priv->qp_type = qp->qp_type; + id_priv->srq = (qp->srq != NULL); + return 0; +err: + ib_destroy_qp(qp); + return ret; +} +EXPORT_SYMBOL(rdma_create_qp); + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + ib_destroy_qp(id->qp); +} +EXPORT_SYMBOL(rdma_destroy_qp); + +static int cma_modify_qp_rtr(struct rdma_cm_id *id) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (!id->qp) + return 0; + + /* Need to update QP attributes from default values. 
*/ + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); +} + +static int cma_modify_qp_rts(struct rdma_cm_id *id) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IB_QPS_RTS; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); +} + +static int cma_modify_qp_err(struct rdma_cm_id *id) +{ + struct ib_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE); +} + +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + switch (id_priv->id.device->node_type) { + case IB_NODE_CA: + ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, + qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) + qp_attr->rq_psn = id_priv->seq_num; + break; + default: + ret = -ENOSYS; + break; + } + + return ret; +} +EXPORT_SYMBOL(rdma_init_qp_attr); + +static inline int cma_zero_addr(struct sockaddr *addr) +{ + struct in6_addr *ip6; + + if (addr->sa_family == AF_INET) + return ZERONET(((struct sockaddr_in *) addr)->sin_addr.s_addr); + else { + ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; + return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | + ip6->s6_addr32[3] | ip6->s6_addr32[4]) == 0; + } +} + +static inline int cma_loopback_addr(struct sockaddr *addr) +{ + return LOOPBACK(((struct sockaddr_in *) addr)->sin_addr.s_addr); +} + +static inline int cma_any_addr(struct sockaddr *addr) +{ + return cma_zero_addr(addr) || cma_loopback_addr(addr); +} + +static inline int cma_any_port(struct sockaddr *addr) +{ + return !((struct sockaddr_in *) addr)->sin_port; +} + +static int cma_get_net_info(void *hdr, enum rdma_port_space ps, + u8 *ip_ver, __u16 *port, + union cma_ip_addr **src, union cma_ip_addr **dst) +{ + switch (ps) { + case RDMA_PS_SDP: + if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != + SDP_MAJ_VERSION) + return -EINVAL; + + *ip_ver = sdp_get_ip_ver(hdr); + *port = ((struct sdp_hh *) hdr)->port; + *src = &((struct sdp_hh *) hdr)->src_addr; + *dst = &((struct sdp_hh *) hdr)->dst_addr; + break; + default: + if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) + return -EINVAL; + + *ip_ver = cma_get_ip_ver(hdr); + *port = ((struct cma_hdr *) hdr)->port; + *src = &((struct cma_hdr *) hdr)->src_addr; + *dst = &((struct cma_hdr *) hdr)->dst_addr; + break; + } + + if (*ip_ver != 4 && *ip_ver != 6) + return -EINVAL; + return 0; +} + +static void cma_save_net_info(struct rdma_addr *addr, + struct rdma_addr *listen_addr, + u8 ip_ver, __u16 port, + union cma_ip_addr *src, union cma_ip_addr *dst) +{ + struct sockaddr_in *listen4, *ip4; + struct sockaddr_in6 *listen6, *ip6; + + switch (ip_ver) { + case 4: + listen4 = (struct sockaddr_in *) &listen_addr->src_addr; + ip4 = (struct sockaddr_in *) &addr->src_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr = dst->ip4.addr; + ip4->sin_port = listen4->sin_port; + + ip4 = (struct sockaddr_in *) &addr->dst_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr 
= src->ip4.addr; + ip4->sin_port = port; + break; + case 6: + listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; + ip6 = (struct sockaddr_in6 *) &addr->src_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = dst->ip6; + ip6->sin6_port = listen6->sin6_port; + + ip6 = (struct sockaddr_in6 *) &addr->dst_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = src->ip6; + ip6->sin6_port = port; + break; + default: + break; + } +} + +static inline int cma_user_data_offset(enum rdma_port_space ps) +{ + switch (ps) { + case RDMA_PS_SDP: + return 0; + default: + return sizeof(struct cma_hdr); + } +} + +static int cma_notify_user(struct rdma_id_private *id_priv, + enum rdma_cm_event_type type, int status, + void *data, u8 data_len) +{ + struct rdma_cm_event event; + + event.event = type; + event.status = status; + event.private_data = data; + event.private_data_len = data_len; + + return id_priv->id.event_handler(&id_priv->id, &event); +} + +static void cma_cancel_route(struct rdma_id_private *id_priv) +{ + switch (id_priv->id.device->node_type) { + case IB_NODE_CA: + if (id_priv->query) + ib_sa_cancel_query(id_priv->query_id, id_priv->query); + break; + default: + break; + } +} + +static inline int cma_internal_listen(struct rdma_id_private *id_priv) +{ + return (id_priv->state == CMA_LISTEN) && id_priv->cma_dev && + cma_any_addr(&id_priv->id.route.addr.src_addr); +} + +static void cma_destroy_listen(struct rdma_id_private *id_priv) +{ + cma_exch(id_priv, CMA_DESTROYING); + + if (id_priv->cma_dev) { + switch (id_priv->id.device->node_type) { + case IB_NODE_CA: + if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) + ib_destroy_cm_id(id_priv->cm_id.ib); + break; + default: + break; + } + cma_detach_from_dev(id_priv); + } + list_del(&id_priv->listen_list); + + cma_deref_id(id_priv); + wait_for_completion(&id_priv->comp); + + kfree(id_priv); +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + struct rdma_id_private *dev_id_priv; + + mutex_lock(&lock); + list_del(&id_priv->list); + + while (!list_empty(&id_priv->listen_list)) { + dev_id_priv = list_entry(id_priv->listen_list.next, + struct rdma_id_private, listen_list); + cma_destroy_listen(dev_id_priv); + } + mutex_unlock(&lock); +} + +static void cma_cancel_operation(struct rdma_id_private *id_priv, + enum cma_state state) +{ + switch (state) { + case CMA_ADDR_QUERY: + rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); + break; + case CMA_ROUTE_QUERY: + cma_cancel_route(id_priv); + break; + case CMA_LISTEN: + if (cma_any_addr(&id_priv->id.route.addr.src_addr) && + !id_priv->cma_dev) + cma_cancel_listens(id_priv); + break; + default: + break; + } +} + +static void cma_release_port(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + + if (!bind_list) + return; + + mutex_lock(&lock); + hlist_del(&id_priv->node); + if (hlist_empty(&bind_list->owners)) { + idr_remove(bind_list->ps, bind_list->port); + kfree(bind_list); + } + mutex_unlock(&lock); +} + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + enum cma_state state; + + id_priv = container_of(id, struct rdma_id_private, id); + state = cma_exch(id_priv, CMA_DESTROYING); + cma_cancel_operation(id_priv, state); + + if (id_priv->cma_dev) { + switch (id->device->node_type) { + case IB_NODE_CA: + if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) + ib_destroy_cm_id(id_priv->cm_id.ib); + break; + default: + break; + } + mutex_lock(&lock); + cma_detach_from_dev(id_priv); 
+ mutex_unlock(&lock); + } + + cma_release_port(id_priv); + cma_deref_id(id_priv); + wait_for_completion(&id_priv->comp); + + kfree(id_priv->id.route.path_rec); + kfree(id_priv); +} +EXPORT_SYMBOL(rdma_destroy_id); + +static int cma_rep_recv(struct rdma_id_private *id_priv) +{ + int ret; + + ret = cma_modify_qp_rtr(&id_priv->id); + if (ret) + goto reject; + + ret = cma_modify_qp_rts(&id_priv->id); + if (ret) + goto reject; + + ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(&id_priv->id); + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +} + +static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) +{ + if (id_priv->id.ps == RDMA_PS_SDP && + sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != + SDP_MAJ_VERSION) + return -EINVAL; + + return 0; +} + +static int cma_rtu_recv(struct rdma_id_private *id_priv) +{ + int ret; + + ret = cma_modify_qp_rts(&id_priv->id); + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(&id_priv->id); + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +} + +static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + enum rdma_cm_event_type event; + u8 private_data_len = 0; + int ret = 0, status = 0; + + atomic_inc(&id_priv->dev_remove); + if (!cma_comp(id_priv, CMA_CONNECT)) + goto out; + + switch (ib_event->event) { + case IB_CM_REQ_ERROR: + case IB_CM_REP_ERROR: + event = RDMA_CM_EVENT_UNREACHABLE; + status = -ETIMEDOUT; + break; + case IB_CM_REP_RECEIVED: + status = cma_verify_rep(id_priv, ib_event->private_data); + if (status) + event = RDMA_CM_EVENT_CONNECT_ERROR; + else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { + status = cma_rep_recv(id_priv); + event = status ? RDMA_CM_EVENT_CONNECT_ERROR : + RDMA_CM_EVENT_ESTABLISHED; + } else + event = RDMA_CM_EVENT_CONNECT_RESPONSE; + private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + break; + case IB_CM_RTU_RECEIVED: + status = cma_rtu_recv(id_priv); + event = status ? RDMA_CM_EVENT_CONNECT_ERROR : + RDMA_CM_EVENT_ESTABLISHED; + break; + case IB_CM_DREQ_ERROR: + status = -ETIMEDOUT; /* fall through */ + case IB_CM_DREQ_RECEIVED: + case IB_CM_DREP_RECEIVED: + if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT)) + goto out; + event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IB_CM_TIMEWAIT_EXIT: + case IB_CM_MRA_RECEIVED: + /* ignore event */ + goto out; + case IB_CM_REJ_RECEIVED: + cma_modify_qp_err(&id_priv->id); + status = ib_event->param.rej_rcvd.reason; + event = RDMA_CM_EVENT_REJECTED; + break; + default: + printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d", + ib_event->event); + goto out; + } + + ret = cma_notify_user(id_priv, event, status, ib_event->private_data, + private_data_len); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. 
*/ + id_priv->cm_id.ib = NULL; + cma_exch(id_priv, CMA_DESTROYING); + cma_release_remove(id_priv); + rdma_destroy_id(&id_priv->id); + return ret; + } +out: + cma_release_remove(id_priv); + return ret; +} + +static struct rdma_id_private *cma_new_id(struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + struct rdma_route *rt; + union cma_ip_addr *src, *dst; + __u16 port; + u8 ip_ver; + + id = rdma_create_id(listen_id->event_handler, listen_id->context, + listen_id->ps); + if (IS_ERR(id)) + return NULL; + + rt = &id->route; + rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; + rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL); + if (!rt->path_rec) + goto err; + + if (cma_get_net_info(ib_event->private_data, listen_id->ps, + &ip_ver, &port, &src, &dst)) + goto err; + + cma_save_net_info(&id->route.addr, &listen_id->route.addr, + ip_ver, port, src, dst); + rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; + if (rt->num_paths == 2) + rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; + + ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + rt->addr.dev_addr.dev_type = IB_NODE_CA; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->state = CMA_CONNECT; + return id_priv; +err: + rdma_destroy_id(id); + return NULL; +} + +static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *listen_id, *conn_id; + int offset, ret; + + listen_id = cm_id->context; + atomic_inc(&listen_id->dev_remove); + if (!cma_comp(listen_id, CMA_LISTEN)) { + ret = -ECONNABORTED; + goto out; + } + + conn_id = cma_new_id(&listen_id->id, ib_event); + if (!conn_id) { + ret = -ENOMEM; + goto out; + } + + atomic_inc(&conn_id->dev_remove); + ret = cma_acquire_ib_dev(conn_id); + if (ret) { + ret = -ENODEV; + cma_release_remove(conn_id); + rdma_destroy_id(&conn_id->id); + goto out; + } + + conn_id->cm_id.ib = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_ib_handler; + + offset = cma_user_data_offset(listen_id->id.ps); + ret = cma_notify_user(conn_id, RDMA_CM_EVENT_CONNECT_REQUEST, 0, + ib_event->private_data + offset, + IB_CM_REQ_PRIVATE_DATA_SIZE - offset); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. 
*/ + conn_id->cm_id.ib = NULL; + cma_exch(conn_id, CMA_DESTROYING); + cma_release_remove(conn_id); + rdma_destroy_id(&conn_id->id); + } +out: + cma_release_remove(listen_id); + return ret; +} + +static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) +{ + return cpu_to_be64(((u64)ps << 16) + + be16_to_cpu(((struct sockaddr_in *) addr)->sin_port)); +} + +static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, + struct ib_cm_compare_data *compare) +{ + struct cma_hdr *cma_data, *cma_mask; + struct sdp_hh *sdp_data, *sdp_mask; + __u32 ip4_addr; + struct in6_addr ip6_addr; + + memset(compare, 0, sizeof *compare); + cma_data = (void *) compare->data; + cma_mask = (void *) compare->mask; + sdp_data = (void *) compare->data; + sdp_mask = (void *) compare->mask; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; + if (ps == RDMA_PS_SDP) { + sdp_set_ip_ver(sdp_data, 4); + sdp_set_ip_ver(sdp_mask, 0xF); + sdp_data->dst_addr.ip4.addr = ip4_addr; + sdp_mask->dst_addr.ip4.addr = ~0; + } else { + cma_set_ip_ver(cma_data, 4); + cma_set_ip_ver(cma_mask, 0xF); + cma_data->dst_addr.ip4.addr = ip4_addr; + cma_mask->dst_addr.ip4.addr = ~0; + } + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; + if (ps == RDMA_PS_SDP) { + sdp_set_ip_ver(sdp_data, 6); + sdp_set_ip_ver(sdp_mask, 0xF); + sdp_data->dst_addr.ip6 = ip6_addr; + memset(&sdp_mask->dst_addr.ip6, 0xFF, + sizeof sdp_mask->dst_addr.ip6); + } else { + cma_set_ip_ver(cma_data, 6); + cma_set_ip_ver(cma_mask, 0xF); + cma_data->dst_addr.ip6 = ip6_addr; + memset(&cma_mask->dst_addr.ip6, 0xFF, + sizeof cma_mask->dst_addr.ip6); + } + break; + default: + break; + } +} + +static int cma_ib_listen(struct rdma_id_private *id_priv) +{ + struct ib_cm_compare_data compare_data; + struct sockaddr *addr; + __be64 svc_id; + int ret; + + id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler, + id_priv); + if (IS_ERR(id_priv->cm_id.ib)) + return PTR_ERR(id_priv->cm_id.ib); + + addr = &id_priv->id.route.addr.src_addr; + svc_id = cma_get_service_id(id_priv->id.ps, addr); + if (cma_any_addr(addr)) + ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); + else { + cma_set_compare_data(id_priv->id.ps, addr, &compare_data); + ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); + } + + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } + + return ret; +} + +static int cma_listen_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct rdma_id_private *id_priv = id->context; + + id->context = id_priv->id.context; + id->event_handler = id_priv->id.event_handler; + return id_priv->id.event_handler(id, event); +} + +static void cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + struct rdma_id_private *dev_id_priv; + struct rdma_cm_id *id; + int ret; + + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps); + if (IS_ERR(id)) + return; + + dev_id_priv = container_of(id, struct rdma_id_private, id); + + dev_id_priv->state = CMA_ADDR_BOUND; + memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, + ip_addr_size(&id_priv->id.route.addr.src_addr)); + + cma_attach_to_dev(dev_id_priv, cma_dev); + list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); + + ret = rdma_listen(id, id_priv->backlog); + if (ret) + goto err; + + return; +err: + cma_destroy_listen(dev_id_priv); +} + +static void cma_listen_on_all(struct 
rdma_id_private *id_priv) +{ + struct cma_device *cma_dev; + + mutex_lock(&lock); + list_add_tail(&id_priv->list, &listen_any_list); + list_for_each_entry(cma_dev, &dev_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mutex_unlock(&lock); +} + +static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af) +{ + struct sockaddr_in addr_in; + + memset(&addr_in, 0, sizeof addr_in); + addr_in.sin_family = af; + return rdma_bind_addr(id, (struct sockaddr *) &addr_in); +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == CMA_IDLE) { + ret = cma_bind_any(id, AF_INET); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN)) + return -EINVAL; + + id_priv->backlog = backlog; + if (id->device) { + switch (id->device->node_type) { + case IB_NODE_CA: + ret = cma_ib_listen(id_priv); + if (ret) + goto err; + break; + default: + ret = -ENOSYS; + goto err; + } + } else + cma_listen_on_all(id_priv); + + return 0; +err: + id_priv->backlog = 0; + cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_listen); + +static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, + void *context) +{ + struct cma_work *work = context; + struct rdma_route *route; + + route = &work->id->id.route; + + if (!status) { + route->num_paths = 1; + *route->path_rec = *path_rec; + } else { + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + } + + queue_work(cma_wq, &work->work); +} + +static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, + struct cma_work *work) +{ + struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr; + struct ib_sa_path_rec path_rec; + + memset(&path_rec, 0, sizeof path_rec); + path_rec.sgid = *ib_addr_get_sgid(addr); + path_rec.dgid = *ib_addr_get_dgid(addr); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr)); + path_rec.numb_path = 1; + + id_priv->query_id = ib_sa_path_rec_get(id_priv->id.device, + id_priv->id.port_num, &path_rec, + IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH, + timeout_ms, GFP_KERNEL, + cma_query_handler, work, &id_priv->query); + + return (id_priv->query_id < 0) ? 
id_priv->query_id : 0; +} + +static void cma_work_handler(void *data) +{ + struct cma_work *work = data; + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + atomic_inc(&id_priv->dev_remove); + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, CMA_DESTROYING); + destroy = 1; + } +out: + cma_release_remove(id_priv); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + kfree(work); +} + +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +{ + struct rdma_route *route = &id_priv->id.route; + struct cma_work *work; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler, work); + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + + route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + if (!route->path_rec) { + ret = -ENOMEM; + goto err1; + } + + ret = cma_query_ib_route(id_priv, timeout_ms, work); + if (ret) + goto err2; + + return 0; +err2: + kfree(route->path_rec); + route->path_rec = NULL; +err1: + kfree(work); + return ret; +} + +int rdma_set_ib_paths(struct rdma_cm_id *id, + struct ib_sa_path_rec *path_rec, int num_paths) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED)) + return -EINVAL; + + id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL); + if (!id->route.path_rec) { + ret = -ENOMEM; + goto err; + } + + memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths); + return 0; +err: + cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED); + return ret; +} +EXPORT_SYMBOL(rdma_set_ib_paths); + +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY)) + return -EINVAL; + + atomic_inc(&id_priv->refcount); + switch (id->device->node_type) { + case IB_NODE_CA: + ret = cma_resolve_ib_route(id_priv, timeout_ms); + break; + default: + ret = -ENOSYS; + break; + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED); + cma_deref_id(id_priv); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_route); + +static int cma_bind_loopback(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev; + struct ib_port_attr port_attr; + union ib_gid *gid; + u16 pkey; + int ret; + u8 p; + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) + for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) + if (!ib_query_port (cma_dev->device, p, &port_attr) && + port_attr.state == IB_PORT_ACTIVE) + goto port_found; + + if (!list_empty(&dev_list)) { + p = 1; + cma_dev = list_entry(dev_list.next, struct cma_device, list); + } else { + ret = -ENODEV; + goto out; + } + +port_found: + gid = ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr); + ret = ib_get_cached_gid(cma_dev->device, p, 0, gid); + if (ret) + goto out; + + ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); + if (ret) + goto out; + + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); + id_priv->id.port_num = p; + cma_attach_to_dev(id_priv, cma_dev); +out: + mutex_unlock(&lock); + 
return ret; +} + +static void addr_handler(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *dev_addr, void *context) +{ + struct rdma_id_private *id_priv = context; + enum rdma_cm_event_type event; + + atomic_inc(&id_priv->dev_remove); + if (!id_priv->cma_dev && !status) + status = cma_acquire_dev(id_priv); + + if (status) { + if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND)) + goto out; + event = RDMA_CM_EVENT_ADDR_ERROR; + } else { + if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) + goto out; + memcpy(&id_priv->id.route.addr.src_addr, src_addr, + ip_addr_size(src_addr)); + event = RDMA_CM_EVENT_ADDR_RESOLVED; + } + + if (cma_notify_user(id_priv, event, status, NULL, 0)) { + cma_exch(id_priv, CMA_DESTROYING); + cma_release_remove(id_priv); + cma_deref_id(id_priv); + rdma_destroy_id(&id_priv->id); + return; + } +out: + cma_release_remove(id_priv); + cma_deref_id(id_priv); +} + +static int cma_resolve_loopback(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + struct sockaddr_in *src_in, *dst_in; + int ret; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + if (!id_priv->cma_dev) { + ret = cma_bind_loopback(id_priv); + if (ret) + goto err; + } + + ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, + ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr)); + + if (cma_zero_addr(&id_priv->id.route.addr.src_addr)) { + src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr; + dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr; + src_in->sin_family = dst_in->sin_family; + src_in->sin_addr.s_addr = dst_in->sin_addr.s_addr; + } + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler, work); + work->old_state = CMA_ADDR_QUERY; + work->new_state = CMA_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + queue_work(cma_wq, &work->work); + return 0; +err: + kfree(work); + return ret; +} + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr) +{ + if (src_addr && src_addr->sa_family) + return rdma_bind_addr(id, src_addr); + else + return cma_bind_any(id, dst_addr->sa_family); +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == CMA_IDLE) { + ret = cma_bind_addr(id, src_addr, dst_addr); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY)) + return -EINVAL; + + atomic_inc(&id_priv->refcount); + memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); + if (cma_any_addr(dst_addr)) + ret = cma_resolve_loopback(id_priv); + else + ret = rdma_resolve_ip(&id->route.addr.src_addr, dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND); + cma_deref_id(id_priv); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_addr); + +static void cma_bind_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; + sin->sin_port = htons(bind_list->port); + id_priv->bind_list = bind_list; + hlist_add_head(&id_priv->node, &bind_list->owners); +} + +static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, + unsigned short snum) +{ + struct rdma_bind_list 
*bind_list; + int port, start, ret; + + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); + if (!bind_list) + return -ENOMEM; + + start = snum ? snum : sysctl_local_port_range[0]; + + do { + ret = idr_get_new_above(ps, bind_list, start, &port); + } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); + + if (ret) + goto err; + + if ((snum && port != snum) || + (!snum && port > sysctl_local_port_range[1])) { + idr_remove(ps, port); + ret = -EADDRNOTAVAIL; + goto err; + } + + bind_list->ps = ps; + bind_list->port = (unsigned short) port; + cma_bind_port(bind_list, id_priv); + return 0; +err: + kfree(bind_list); + return ret; +} + +static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +{ + struct rdma_id_private *cur_id; + struct sockaddr_in *sin, *cur_sin; + struct rdma_bind_list *bind_list; + struct hlist_node *node; + unsigned short snum; + + sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; + snum = ntohs(sin->sin_port); + if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + bind_list = idr_find(ps, snum); + if (!bind_list) + return cma_alloc_port(ps, id_priv, snum); + + /* + * We don't support binding to any address if anyone is bound to + * a specific address on the same port. + */ + if (cma_any_addr(&id_priv->id.route.addr.src_addr)) + return -EADDRNOTAVAIL; + + hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { + if (cma_any_addr(&cur_id->id.route.addr.src_addr)) + return -EADDRNOTAVAIL; + + cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr; + if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr) + return -EADDRINUSE; + } + + cma_bind_port(bind_list, id_priv); + return 0; +} + +static int cma_get_port(struct rdma_id_private *id_priv) +{ + struct idr *ps; + int ret; + + switch (id_priv->id.ps) { + case RDMA_PS_SDP: + ps = &sdp_ps; + break; + case RDMA_PS_TCP: + ps = &tcp_ps; + break; + default: + return -EPROTONOSUPPORT; + } + + mutex_lock(&lock); + if (cma_any_port(&id_priv->id.route.addr.src_addr)) + ret = cma_alloc_port(ps, id_priv, 0); + else + ret = cma_use_port(ps, id_priv); + mutex_unlock(&lock); + + return ret; +} + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + int ret; + + if (addr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) + return -EINVAL; + + if (!cma_any_addr(addr)) { + ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); + if (!ret) + ret = cma_acquire_dev(id_priv); + if (ret) + goto err; + } + + memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); + ret = cma_get_port(id_priv); + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE); + return ret; +} +EXPORT_SYMBOL(rdma_bind_addr); + +static int cma_format_hdr(void *hdr, enum rdma_port_space ps, + struct rdma_route *route) +{ + struct sockaddr_in *src4, *dst4; + struct cma_hdr *cma_hdr; + struct sdp_hh *sdp_hdr; + + src4 = (struct sockaddr_in *) &route->addr.src_addr; + dst4 = (struct sockaddr_in *) &route->addr.dst_addr; + + switch (ps) { + case RDMA_PS_SDP: + sdp_hdr = hdr; + if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) + return -EINVAL; + sdp_set_ip_ver(sdp_hdr, 4); + sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + sdp_hdr->port = src4->sin_port; + break; + default: + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + 
cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + break; + } + return 0; +} + +static int cma_connect_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_req_param req; + struct rdma_route *route; + void *private_data; + int offset, ret; + + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv->id.ps); + req.private_data_len = offset + conn_param->private_data_len; + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler, + id_priv); + if (IS_ERR(id_priv->cm_id.ib)) { + ret = PTR_ERR(id_priv->cm_id.ib); + goto out; + } + + route = &id_priv->id.route; + ret = cma_format_hdr(private_data, id_priv->id.ps, route); + if (ret) + goto out; + req.private_data = private_data; + + req.primary_path = &route->path_rec[0]; + if (route->num_paths == 2) + req.alternate_path = &route->path_rec[1]; + + req.service_id = cma_get_service_id(id_priv->id.ps, + &route->addr.dst_addr); + req.qp_num = id_priv->qp_num; + req.qp_type = id_priv->qp_type; + req.starting_psn = id_priv->seq_num; + req.responder_resources = conn_param->responder_resources; + req.initiator_depth = conn_param->initiator_depth; + req.flow_control = conn_param->flow_control; + req.retry_count = conn_param->retry_count; + req.rnr_retry_count = conn_param->rnr_retry_count; + req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.max_cm_retries = CMA_MAX_CM_RETRIES; + req.srq = id_priv->srq ? 1 : 0; + + ret = ib_send_cm_req(id_priv->cm_id.ib, &req); +out: + kfree(private_data); + return ret; +} + +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT)) + return -EINVAL; + + if (!id->qp) { + id_priv->qp_num = conn_param->qp_num; + id_priv->qp_type = conn_param->qp_type; + id_priv->srq = conn_param->srq; + } + + switch (id->device->node_type) { + case IB_NODE_CA: + ret = cma_connect_ib(id_priv, conn_param); + break; + default: + ret = -ENOSYS; + break; + } + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED); + return ret; +} +EXPORT_SYMBOL(rdma_connect); + +static int cma_accept_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_rep_param rep; + int ret; + + ret = cma_modify_qp_rtr(&id_priv->id); + if (ret) + return ret; + + memset(&rep, 0, sizeof rep); + rep.qp_num = id_priv->qp_num; + rep.starting_psn = id_priv->seq_num; + rep.private_data = conn_param->private_data; + rep.private_data_len = conn_param->private_data_len; + rep.responder_resources = conn_param->responder_resources; + rep.initiator_depth = conn_param->initiator_depth; + rep.target_ack_delay = CMA_CM_RESPONSE_TIMEOUT; + rep.failover_accepted = 0; + rep.flow_control = conn_param->flow_control; + rep.rnr_retry_count = conn_param->rnr_retry_count; + rep.srq = id_priv->srq ? 
1 : 0; + + return ib_send_cm_rep(id_priv->cm_id.ib, &rep); +} + +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp(id_priv, CMA_CONNECT)) + return -EINVAL; + + if (!id->qp && conn_param) { + id_priv->qp_num = conn_param->qp_num; + id_priv->qp_type = conn_param->qp_type; + id_priv->srq = conn_param->srq; + } + + switch (id->device->node_type) { + case IB_NODE_CA: + if (conn_param) + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); + break; + default: + ret = -ENOSYS; + break; + } + + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(id); + rdma_reject(id, NULL, 0); + return ret; +} +EXPORT_SYMBOL(rdma_accept); + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp(id_priv, CMA_CONNECT)) + return -EINVAL; + + switch (id->device->node_type) { + case IB_NODE_CA: + ret = ib_send_cm_rej(id_priv->cm_id.ib, + IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, + private_data, private_data_len); + break; + default: + ret = -ENOSYS; + break; + } + return ret; +} +EXPORT_SYMBOL(rdma_reject); + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp(id_priv, CMA_CONNECT) && + !cma_comp(id_priv, CMA_DISCONNECT)) + return -EINVAL; + + ret = cma_modify_qp_err(id); + if (ret) + goto out; + + switch (id->device->node_type) { + case IB_NODE_CA: + /* Initiate or respond to a disconnect. */ + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) + ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + break; + default: + break; + } +out: + return ret; +} +EXPORT_SYMBOL(rdma_disconnect); + +static void cma_add_one(struct ib_device *device) +{ + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + + cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); + if (!cma_dev) + return; + + cma_dev->device = device; + cma_dev->node_guid = device->node_guid; + if (!cma_dev->node_guid) + goto err; + + init_completion(&cma_dev->comp); + atomic_set(&cma_dev->refcount, 1); + INIT_LIST_HEAD(&cma_dev->id_list); + ib_set_client_data(device, &cma_client, cma_dev); + + mutex_lock(&lock); + list_add_tail(&cma_dev->list, &dev_list); + list_for_each_entry(id_priv, &listen_any_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mutex_unlock(&lock); + return; +err: + kfree(cma_dev); +} + +static int cma_remove_id_dev(struct rdma_id_private *id_priv) +{ + enum cma_state state; + + /* Record that we want to remove the device */ + state = cma_exch(id_priv, CMA_DEVICE_REMOVAL); + if (state == CMA_DESTROYING) + return 0; + + cma_cancel_operation(id_priv, state); + wait_event(id_priv->wait_remove, !atomic_read(&id_priv->dev_remove)); + + /* Check for destruction from another callback. 
*/ + if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL)) + return 0; + + return cma_notify_user(id_priv, RDMA_CM_EVENT_DEVICE_REMOVAL, + 0, NULL, 0); +} + +static void cma_process_remove(struct cma_device *cma_dev) +{ + struct list_head remove_list; + struct rdma_id_private *id_priv; + int ret; + + INIT_LIST_HEAD(&remove_list); + + mutex_lock(&lock); + while (!list_empty(&cma_dev->id_list)) { + id_priv = list_entry(cma_dev->id_list.next, + struct rdma_id_private, list); + + if (cma_internal_listen(id_priv)) { + cma_destroy_listen(id_priv); + continue; + } + + list_del(&id_priv->list); + list_add_tail(&id_priv->list, &remove_list); + atomic_inc(&id_priv->refcount); + mutex_unlock(&lock); + + ret = cma_remove_id_dev(id_priv); + cma_deref_id(id_priv); + if (ret) + rdma_destroy_id(&id_priv->id); + + mutex_lock(&lock); + } + mutex_unlock(&lock); + + cma_deref_dev(cma_dev); + wait_for_completion(&cma_dev->comp); +} + +static void cma_remove_one(struct ib_device *device) +{ + struct cma_device *cma_dev; + + cma_dev = ib_get_client_data(device, &cma_client); + if (!cma_dev) + return; + + mutex_lock(&lock); + list_del(&cma_dev->list); + mutex_unlock(&lock); + + cma_process_remove(cma_dev); + kfree(cma_dev); +} + +static int cma_init(void) +{ + int ret; + + cma_wq = create_singlethread_workqueue("rdma_cm_wq"); + if (!cma_wq) + return -ENOMEM; + + ret = ib_register_client(&cma_client); + if (ret) + goto err; + return 0; + +err: + destroy_workqueue(cma_wq); + return ret; +} + +static void cma_cleanup(void) +{ + ib_unregister_client(&cma_client); + destroy_workqueue(cma_wq); + idr_destroy(&sdp_ps); + idr_destroy(&tcp_ps); +} + +module_init(cma_init); +module_exit(cma_cleanup); -- cgit v1.2.3 From a26026c1221c3e78e6e9f156994c41fb74e99022 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 17 Jun 2006 20:37:29 -0700 Subject: IB/mthca: Remove dead code Kill some dead code in mthca_eq.c Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_eq.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 99f109c3815..d536217e700 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -695,10 +695,6 @@ static void mthca_unmap_reg(struct mthca_dev *dev, unsigned long offset, static int __devinit mthca_map_eq_regs(struct mthca_dev *dev) { - unsigned long mthca_base; - - mthca_base = pci_resource_start(dev->pdev, 0); - if (mthca_is_memfree(dev)) { /* * We assume that the EQ arm and EQ set CI registers -- cgit v1.2.3 From f5358a172f79e3f995919224401b25637f4324f6 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:29 -0700 Subject: IB/srp: Use FMRs to map gather/scatter lists Create an SRP FMR pool on HCAs that support FMRs, and use FMRs to map gather/scatter lists that have more than one entry into a single memory region that appears virtually contiguous to the SRP target (which is the RDMA initiator). This patch bails out on FMR mapping for SCSI commands where the gather/scatter list cannot be mapped into a single FMR because there are sub-page-sized entries in middle of the list. An unaligned start or end of the list is OK. Based on a patch by Vu Pham . 
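To make the eligibility rule concrete, here is a small self-contained sketch of the check (illustrative only -- the struct and function names below are invented for this note and are not the driver's own): only the first entry of the list may start on a non-page boundary, and only the last entry may end on one; any other misalignment forces the fallback to an indirect descriptor.

	struct sg_ent {
		unsigned long addr;	/* DMA address of the entry */
		unsigned long len;	/* length in bytes */
	};

	/*
	 * Return 1 if the list can be folded into a single FMR mapping built
	 * from page-sized chunks, 0 otherwise.  page_mask keeps the aligned
	 * part of an address, e.g. ~(page_size - 1).
	 */
	static int can_use_single_fmr(const struct sg_ent *sg, int n,
				      unsigned long page_mask)
	{
		int i;

		for (i = 0; i < n; ++i) {
			/* An unaligned start is only acceptable for the first entry. */
			if ((sg[i].addr & ~page_mask) && i > 0)
				return 0;
			/* An unaligned end is only acceptable for the last entry. */
			if (((sg[i].addr + sg[i].len) & ~page_mask) && i < n - 1)
				return 0;
		}
		return 1;
	}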
Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 290 ++++++++++++++++++++++++++---------- drivers/infiniband/ulp/srp/ib_srp.h | 25 +++- 2 files changed, 228 insertions(+), 87 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9cbdffa08dc..9c373139ee6 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -105,7 +105,8 @@ static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, if (!iu->buf) goto out_free_iu; - iu->dma = dma_map_single(host->dev->dma_device, iu->buf, size, direction); + iu->dma = dma_map_single(host->dev->dev->dma_device, + iu->buf, size, direction); if (dma_mapping_error(iu->dma)) goto out_free_buf; @@ -127,7 +128,8 @@ static void srp_free_iu(struct srp_host *host, struct srp_iu *iu) if (!iu) return; - dma_unmap_single(host->dev->dma_device, iu->dma, iu->size, iu->direction); + dma_unmap_single(host->dev->dev->dma_device, + iu->dma, iu->size, iu->direction); kfree(iu->buf); kfree(iu); } @@ -147,7 +149,7 @@ static int srp_init_qp(struct srp_target_port *target, if (!attr) return -ENOMEM; - ret = ib_find_cached_pkey(target->srp_host->dev, + ret = ib_find_cached_pkey(target->srp_host->dev->dev, target->srp_host->port, be16_to_cpu(target->path.pkey), &attr->pkey_index); @@ -179,7 +181,7 @@ static int srp_create_target_ib(struct srp_target_port *target) if (!init_attr) return -ENOMEM; - target->cq = ib_create_cq(target->srp_host->dev, srp_completion, + target->cq = ib_create_cq(target->srp_host->dev->dev, srp_completion, NULL, target, SRP_CQ_SIZE); if (IS_ERR(target->cq)) { ret = PTR_ERR(target->cq); @@ -198,7 +200,7 @@ static int srp_create_target_ib(struct srp_target_port *target) init_attr->send_cq = target->cq; init_attr->recv_cq = target->cq; - target->qp = ib_create_qp(target->srp_host->pd, init_attr); + target->qp = ib_create_qp(target->srp_host->dev->pd, init_attr); if (IS_ERR(target->qp)) { ret = PTR_ERR(target->qp); ib_destroy_cq(target->cq); @@ -250,7 +252,7 @@ static int srp_lookup_path(struct srp_target_port *target) init_completion(&target->done); - target->path_query_id = ib_sa_path_rec_get(target->srp_host->dev, + target->path_query_id = ib_sa_path_rec_get(target->srp_host->dev->dev, target->srp_host->port, &target->path, IB_SA_PATH_REC_DGID | @@ -421,6 +423,11 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, scmnd->sc_data_direction != DMA_FROM_DEVICE)) return; + if (req->fmr) { + ib_fmr_pool_unmap(req->fmr); + req->fmr = NULL; + } + /* * This handling of non-SG commands can be killed when the * SCSI midlayer no longer generates non-SG commands. @@ -433,7 +440,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, scat = &req->fake_sg; } - dma_unmap_sg(target->srp_host->dev->dma_device, scat, nents, + dma_unmap_sg(target->srp_host->dev->dev->dma_device, scat, nents, scmnd->sc_data_direction); } @@ -459,7 +466,7 @@ static int srp_reconnect_target(struct srp_target_port *target) * Now get a new local CM ID so that we avoid confusing the * target in case things are really fouled up. 
*/ - new_cm_id = ib_create_cm_id(target->srp_host->dev, + new_cm_id = ib_create_cm_id(target->srp_host->dev->dev, srp_cm_handler, target); if (IS_ERR(new_cm_id)) { ret = PTR_ERR(new_cm_id); @@ -528,14 +535,79 @@ err: return ret; } +static int srp_map_fmr(struct srp_device *dev, struct scatterlist *scat, + int sg_cnt, struct srp_request *req, + struct srp_direct_buf *buf) +{ + u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt; + int i, j; + int ret; + + if (!dev->fmr_pool) + return -ENODEV; + + len = page_cnt = 0; + for (i = 0; i < sg_cnt; ++i) { + if (sg_dma_address(&scat[i]) & ~dev->fmr_page_mask) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((sg_dma_address(&scat[i]) + sg_dma_len(&scat[i])) & + ~dev->fmr_page_mask) { + if (i < sg_cnt - 1) + return -EINVAL; + else + ++page_cnt; + } + + len += sg_dma_len(&scat[i]); + } + + page_cnt += len >> dev->fmr_page_shift; + if (page_cnt > SRP_FMR_SIZE) + return -ENOMEM; + + dma_pages = kmalloc(sizeof (u64) * page_cnt, GFP_ATOMIC); + if (!dma_pages) + return -ENOMEM; + + page_cnt = 0; + for (i = 0; i < sg_cnt; ++i) + for (j = 0; j < sg_dma_len(&scat[i]); j += dev->fmr_page_size) + dma_pages[page_cnt++] = + (sg_dma_address(&scat[i]) & dev->fmr_page_mask) + j; + + req->fmr = ib_fmr_pool_map_phys(dev->fmr_pool, + dma_pages, page_cnt, &io_addr); + if (IS_ERR(req->fmr)) { + ret = PTR_ERR(req->fmr); + goto out; + } + + buf->va = cpu_to_be64(sg_dma_address(&scat[0]) & ~dev->fmr_page_mask); + buf->key = cpu_to_be32(req->fmr->fmr->rkey); + buf->len = cpu_to_be32(len); + + ret = 0; + +out: + kfree(dma_pages); + + return ret; +} + static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { struct scatterlist *scat; struct srp_cmd *cmd = req->cmd->buf; int len, nents, count; - int i; - u8 fmt; + u8 fmt = SRP_DATA_DESC_DIRECT; if (!scmnd->request_buffer || scmnd->sc_data_direction == DMA_NONE) return sizeof (struct srp_cmd); @@ -560,53 +632,63 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, sg_init_one(scat, scmnd->request_buffer, scmnd->request_bufflen); } - count = dma_map_sg(target->srp_host->dev->dma_device, scat, nents, - scmnd->sc_data_direction); + count = dma_map_sg(target->srp_host->dev->dev->dma_device, + scat, nents, scmnd->sc_data_direction); + + fmt = SRP_DATA_DESC_DIRECT; + len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); if (count == 1) { + /* + * The midlayer only generated a single gather/scatter + * entry, or DMA mapping coalesced everything to a + * single entry. So a direct descriptor along with + * the DMA MR suffices. + */ struct srp_direct_buf *buf = (void *) cmd->add_data; - fmt = SRP_DATA_DESC_DIRECT; - buf->va = cpu_to_be64(sg_dma_address(scat)); - buf->key = cpu_to_be32(target->srp_host->mr->rkey); + buf->key = cpu_to_be32(target->srp_host->dev->mr->rkey); buf->len = cpu_to_be32(sg_dma_len(scat)); - - len = sizeof (struct srp_cmd) + - sizeof (struct srp_direct_buf); - } else { + } else if (srp_map_fmr(target->srp_host->dev, scat, count, req, + (void *) cmd->add_data)) { + /* + * FMR mapping failed, and the scatterlist has more + * than one entry. Generate an indirect memory + * descriptor. 
+ */ struct srp_indirect_buf *buf = (void *) cmd->add_data; u32 datalen = 0; + int i; fmt = SRP_DATA_DESC_INDIRECT; + len = sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + count * sizeof (struct srp_direct_buf); + + for (i = 0; i < count; ++i) { + buf->desc_list[i].va = + cpu_to_be64(sg_dma_address(&scat[i])); + buf->desc_list[i].key = + cpu_to_be32(target->srp_host->dev->mr->rkey); + buf->desc_list[i].len = + cpu_to_be32(sg_dma_len(&scat[i])); + datalen += sg_dma_len(&scat[i]); + } if (scmnd->sc_data_direction == DMA_TO_DEVICE) cmd->data_out_desc_cnt = count; else cmd->data_in_desc_cnt = count; - buf->table_desc.va = cpu_to_be64(req->cmd->dma + - sizeof *cmd + - sizeof *buf); + buf->table_desc.va = + cpu_to_be64(req->cmd->dma + sizeof *cmd + sizeof *buf); buf->table_desc.key = - cpu_to_be32(target->srp_host->mr->rkey); + cpu_to_be32(target->srp_host->dev->mr->rkey); buf->table_desc.len = cpu_to_be32(count * sizeof (struct srp_direct_buf)); - for (i = 0; i < count; ++i) { - buf->desc_list[i].va = cpu_to_be64(sg_dma_address(&scat[i])); - buf->desc_list[i].key = - cpu_to_be32(target->srp_host->mr->rkey); - buf->desc_list[i].len = cpu_to_be32(sg_dma_len(&scat[i])); - - datalen += sg_dma_len(&scat[i]); - } - buf->len = cpu_to_be32(datalen); - - len = sizeof (struct srp_cmd) + - sizeof (struct srp_indirect_buf) + - count * sizeof (struct srp_direct_buf); } if (scmnd->sc_data_direction == DMA_TO_DEVICE) @@ -689,7 +771,7 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) iu = target->rx_ring[wc->wr_id & ~SRP_OP_RECV]; - dma_sync_single_for_cpu(target->srp_host->dev->dma_device, iu->dma, + dma_sync_single_for_cpu(target->srp_host->dev->dev->dma_device, iu->dma, target->max_ti_iu_len, DMA_FROM_DEVICE); opcode = *(u8 *) iu->buf; @@ -726,7 +808,7 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) break; } - dma_sync_single_for_device(target->srp_host->dev->dma_device, iu->dma, + dma_sync_single_for_device(target->srp_host->dev->dev->dma_device, iu->dma, target->max_ti_iu_len, DMA_FROM_DEVICE); } @@ -770,7 +852,7 @@ static int __srp_post_recv(struct srp_target_port *target) list.addr = iu->dma; list.length = iu->size; - list.lkey = target->srp_host->mr->lkey; + list.lkey = target->srp_host->dev->mr->lkey; wr.next = NULL; wr.sg_list = &list; @@ -828,7 +910,7 @@ static int __srp_post_send(struct srp_target_port *target, list.addr = iu->dma; list.length = len; - list.lkey = target->srp_host->mr->lkey; + list.lkey = target->srp_host->dev->mr->lkey; wr.next = NULL; wr.wr_id = target->tx_head & SRP_SQ_SIZE; @@ -870,7 +952,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, if (!iu) goto err; - dma_sync_single_for_cpu(target->srp_host->dev->dma_device, iu->dma, + dma_sync_single_for_cpu(target->srp_host->dev->dev->dma_device, iu->dma, SRP_MAX_IU_LEN, DMA_TO_DEVICE); req = list_entry(target->free_reqs.next, struct srp_request, list); @@ -903,7 +985,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, goto err_unmap; } - dma_sync_single_for_device(target->srp_host->dev->dma_device, iu->dma, + dma_sync_single_for_device(target->srp_host->dev->dev->dma_device, iu->dma, SRP_MAX_IU_LEN, DMA_TO_DEVICE); if (__srp_post_send(target, iu, len)) { @@ -1365,7 +1447,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) sprintf(target->target_name, "SRP.T10:%016llX", (unsigned long long) be64_to_cpu(target->id_ext)); - if (scsi_add_host(target->scsi_host, host->dev->dma_device)) + if 
(scsi_add_host(target->scsi_host, host->dev->dev->dma_device)) return -ENODEV; mutex_lock(&host->target_mutex); @@ -1558,7 +1640,7 @@ static ssize_t srp_create_target(struct class_device *class_dev, if (ret) goto err; - ib_get_cached_gid(host->dev, host->port, 0, &target->path.sgid); + ib_get_cached_gid(host->dev->dev, host->port, 0, &target->path.sgid); printk(KERN_DEBUG PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x " "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", @@ -1579,7 +1661,7 @@ static ssize_t srp_create_target(struct class_device *class_dev, if (ret) goto err; - target->cm_id = ib_create_cm_id(host->dev, srp_cm_handler, target); + target->cm_id = ib_create_cm_id(host->dev->dev, srp_cm_handler, target); if (IS_ERR(target->cm_id)) { ret = PTR_ERR(target->cm_id); goto err_free; @@ -1619,7 +1701,7 @@ static ssize_t show_ibdev(struct class_device *class_dev, char *buf) struct srp_host *host = container_of(class_dev, struct srp_host, class_dev); - return sprintf(buf, "%s\n", host->dev->name); + return sprintf(buf, "%s\n", host->dev->dev->name); } static CLASS_DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -1634,7 +1716,7 @@ static ssize_t show_port(struct class_device *class_dev, char *buf) static CLASS_DEVICE_ATTR(port, S_IRUGO, show_port, NULL); -static struct srp_host *srp_add_port(struct ib_device *device, u8 port) +static struct srp_host *srp_add_port(struct srp_device *device, u8 port) { struct srp_host *host; @@ -1649,26 +1731,15 @@ static struct srp_host *srp_add_port(struct ib_device *device, u8 port) host->port = port; host->initiator_port_id[7] = port; - memcpy(host->initiator_port_id + 8, &device->node_guid, 8); - - host->pd = ib_alloc_pd(device); - if (IS_ERR(host->pd)) - goto err_free; - - host->mr = ib_get_dma_mr(host->pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(host->mr)) - goto err_pd; + memcpy(host->initiator_port_id + 8, &device->dev->node_guid, 8); host->class_dev.class = &srp_class; - host->class_dev.dev = device->dma_device; + host->class_dev.dev = device->dev->dma_device; snprintf(host->class_dev.class_id, BUS_ID_SIZE, "srp-%s-%d", - device->name, port); + device->dev->name, port); if (class_device_register(&host->class_dev)) - goto err_mr; + goto free_host; if (class_device_create_file(&host->class_dev, &class_device_attr_add_target)) goto err_class; if (class_device_create_file(&host->class_dev, &class_device_attr_ibdev)) @@ -1681,13 +1752,7 @@ static struct srp_host *srp_add_port(struct ib_device *device, u8 port) err_class: class_device_unregister(&host->class_dev); -err_mr: - ib_dereg_mr(host->mr); - -err_pd: - ib_dealloc_pd(host->pd); - -err_free: +free_host: kfree(host); return NULL; @@ -1695,15 +1760,62 @@ err_free: static void srp_add_one(struct ib_device *device) { - struct list_head *dev_list; + struct srp_device *srp_dev; + struct ib_device_attr *dev_attr; + struct ib_fmr_pool_param fmr_param; struct srp_host *host; int s, e, p; - dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); - if (!dev_list) + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) return; - INIT_LIST_HEAD(dev_list); + if (ib_query_device(device, dev_attr)) { + printk(KERN_WARNING PFX "Query device failed for %s\n", + device->name); + goto free_attr; + } + + srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL); + if (!srp_dev) + goto free_attr; + + /* + * Use the smallest page size supported by the HCA, down to a + * minimum of 512 bytes (which is the smallest sector that a + * SCSI 
command will ever carry). + */ + srp_dev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); + srp_dev->fmr_page_size = 1 << srp_dev->fmr_page_shift; + srp_dev->fmr_page_mask = ~((unsigned long) srp_dev->fmr_page_size - 1); + + INIT_LIST_HEAD(&srp_dev->dev_list); + + srp_dev->dev = device; + srp_dev->pd = ib_alloc_pd(device); + if (IS_ERR(srp_dev->pd)) + goto free_dev; + + srp_dev->mr = ib_get_dma_mr(srp_dev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(srp_dev->mr)) + goto err_pd; + + memset(&fmr_param, 0, sizeof fmr_param); + fmr_param.pool_size = SRP_FMR_POOL_SIZE; + fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; + fmr_param.cache = 1; + fmr_param.max_pages_per_fmr = SRP_FMR_SIZE; + fmr_param.page_shift = srp_dev->fmr_page_shift; + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); + if (IS_ERR(srp_dev->fmr_pool)) + srp_dev->fmr_pool = NULL; if (device->node_type == IB_NODE_SWITCH) { s = 0; @@ -1714,25 +1826,36 @@ static void srp_add_one(struct ib_device *device) } for (p = s; p <= e; ++p) { - host = srp_add_port(device, p); + host = srp_add_port(srp_dev, p); if (host) - list_add_tail(&host->list, dev_list); + list_add_tail(&host->list, &srp_dev->dev_list); } - ib_set_client_data(device, &srp_client, dev_list); + ib_set_client_data(device, &srp_client, srp_dev); + + goto free_attr; + +err_pd: + ib_dealloc_pd(srp_dev->pd); + +free_dev: + kfree(srp_dev); + +free_attr: + kfree(dev_attr); } static void srp_remove_one(struct ib_device *device) { - struct list_head *dev_list; + struct srp_device *srp_dev; struct srp_host *host, *tmp_host; LIST_HEAD(target_list); struct srp_target_port *target, *tmp_target; unsigned long flags; - dev_list = ib_get_client_data(device, &srp_client); + srp_dev = ib_get_client_data(device, &srp_client); - list_for_each_entry_safe(host, tmp_host, dev_list, list) { + list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { class_device_unregister(&host->class_dev); /* * Wait for the sysfs entry to go away, so that no new @@ -1770,12 +1893,15 @@ static void srp_remove_one(struct ib_device *device) scsi_host_put(target->scsi_host); } - ib_dereg_mr(host->mr); - ib_dealloc_pd(host->pd); kfree(host); } - kfree(dev_list); + if (srp_dev->fmr_pool) + ib_destroy_fmr_pool(srp_dev->fmr_pool); + ib_dereg_mr(srp_dev->mr); + ib_dealloc_pd(srp_dev->pd); + + kfree(srp_dev); } static int __init srp_init_module(void) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index c5cd43aae86..5ecda4ad890 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -46,6 +46,7 @@ #include #include #include +#include enum { SRP_PATH_REC_TIMEOUT_MS = 1000, @@ -62,7 +63,11 @@ enum { SRP_SQ_SIZE = SRP_RQ_SIZE - 1, SRP_CQ_SIZE = SRP_SQ_SIZE + SRP_RQ_SIZE, - SRP_TAG_TSK_MGMT = 1 << (SRP_RQ_SHIFT + 1) + SRP_TAG_TSK_MGMT = 1 << (SRP_RQ_SHIFT + 1), + + SRP_FMR_SIZE = 256, + SRP_FMR_POOL_SIZE = 1024, + SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4 }; #define SRP_OP_RECV (1 << 31) @@ -77,15 +82,24 @@ enum srp_target_state { SRP_TARGET_REMOVED }; -struct srp_host { - u8 initiator_port_id[16]; +struct srp_device { + struct list_head dev_list; struct ib_device *dev; - u8 port; struct ib_pd *pd; struct ib_mr *mr; + struct ib_fmr_pool *fmr_pool; + int fmr_page_shift; + int fmr_page_size; + unsigned long fmr_page_mask; +}; + +struct srp_host { + u8 
initiator_port_id[16]; + struct srp_device *dev; + u8 port; struct class_device class_dev; struct list_head target_list; - struct mutex target_mutex; + struct mutex target_mutex; struct completion released; struct list_head list; }; @@ -95,6 +109,7 @@ struct srp_request { struct scsi_cmnd *scmnd; struct srp_iu *cmd; struct srp_iu *tsk_mgmt; + struct ib_pool_fmr *fmr; /* * Fake scatterlist used when scmnd->use_sg==0. Can be killed * when the SCSI midlayer no longer generates non-SG commands. -- cgit v1.2.3 From e9cd59418f049966a690372c4919e98c88bb119b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:30 -0700 Subject: IB/mthca: Convert FW commands to use wait_for_completion_timeout() The kernel has had wait_for_completion_timeout() for a long time now. mthca should use it to handle FW commands timing out, instead of implementing the same thing in a much more complicated way by using wait_for_completion() along with a timer that does complete(). Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cmd.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 798e13e14fa..d0f7731802c 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -174,7 +174,6 @@ enum { struct mthca_cmd_context { struct completion done; - struct timer_list timer; int result; int next; u64 out_param; @@ -362,15 +361,6 @@ void mthca_cmd_event(struct mthca_dev *dev, complete(&context->done); } -static void event_timeout(unsigned long context_ptr) -{ - struct mthca_cmd_context *context = - (struct mthca_cmd_context *) context_ptr; - - context->result = -EBUSY; - complete(&context->done); -} - static int mthca_cmd_wait(struct mthca_dev *dev, u64 in_param, u64 *out_param, @@ -401,11 +391,10 @@ static int mthca_cmd_wait(struct mthca_dev *dev, if (err) goto out; - context->timer.expires = jiffies + timeout; - add_timer(&context->timer); - - wait_for_completion(&context->done); - del_timer_sync(&context->timer); + if (!wait_for_completion_timeout(&context->done, timeout)) { + err = -EBUSY; + goto out; + } err = context->result; if (err) @@ -535,10 +524,6 @@ int mthca_cmd_use_events(struct mthca_dev *dev) for (i = 0; i < dev->cmd.max_cmds; ++i) { dev->cmd.context[i].token = i; dev->cmd.context[i].next = i + 1; - init_timer(&dev->cmd.context[i].timer); - dev->cmd.context[i].timer.data = - (unsigned long) &dev->cmd.context[i]; - dev->cmd.context[i].timer.function = event_timeout; } dev->cmd.context[dev->cmd.max_cmds - 1].next = -1; -- cgit v1.2.3 From 1962a4a1e4b3716aa836ebeb5b80c804a7f7c5ba Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 17 Jun 2006 20:37:30 -0700 Subject: IB/srp: Use SCAN_WILD_CARD from SCSI headers SCAN_WILD_CARD is indeed available from , which is already included. So get rid of private hack. 
Signed-off-by: Matthew Wilcox Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9c373139ee6..28d741e7805 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1456,9 +1456,8 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) target->state = SRP_TARGET_LIVE; - /* XXX: are we supposed to have a definition of SCAN_WILD_CARD ?? */ scsi_scan_target(&target->scsi_host->shost_gendev, - 0, target->scsi_id, ~0, 0); + 0, target->scsi_id, SCAN_WILD_CARD, 0); return 0; } -- cgit v1.2.3 From 549c5fc2c8149498118f2494a1b6a4938ca05985 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 17 Jun 2006 20:37:30 -0700 Subject: IB/srp: Get rid of unneeded use of list_for_each_entry_safe() list_for_each_entry_safe() is used in one place where the list isn't modified. So just change it to list_for_each_entry(). Signed-off-by: Matthew Wilcox Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 28d741e7805..7c0a3d9c905 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1867,8 +1867,7 @@ static void srp_remove_one(struct ib_device *device) * commands and don't try to reconnect. */ mutex_lock(&host->target_mutex); - list_for_each_entry_safe(target, tmp_target, - &host->target_list, list) { + list_for_each_entry(target, &host->target_list, list) { spin_lock_irqsave(target->scsi_host->host_lock, flags); if (target->state != SRP_TARGET_REMOVED) target->state = SRP_TARGET_REMOVED; -- cgit v1.2.3 From b3589fd49067bab9fe0c60430860e6befbd5ba37 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 17 Jun 2006 20:37:30 -0700 Subject: IB/srp: Change target_mutex to a spinlock The SRP driver never sleeps while holding target_mutex, and it's just used to protect some simple list operations, so hold times will be short. So just convert it to a spinlock, which is smaller and faster. 
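The constraint this relies on, sketched for illustration (not part of the patch itself): nothing that can sleep may run while the lock is held, so blocking work such as scsi_remove_host() stays outside the locked region and only the list manipulation is covered, along the lines of:

	spin_lock(&host->target_lock);
	list_del(&target->list);		/* short, non-blocking critical section */
	spin_unlock(&host->target_lock);

	scsi_remove_host(target->scsi_host);	/* may sleep, so called after unlock */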
Signed-off-by: Matthew Wilcox Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 14 +++++++------- drivers/infiniband/ulp/srp/ib_srp.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 7c0a3d9c905..ae1ab6acf46 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -361,9 +361,9 @@ static void srp_remove_work(void *target_ptr) target->state = SRP_TARGET_REMOVED; spin_unlock_irq(target->scsi_host->host_lock); - mutex_lock(&target->srp_host->target_mutex); + spin_lock(&target->srp_host->target_lock); list_del(&target->list); - mutex_unlock(&target->srp_host->target_mutex); + spin_unlock(&target->srp_host->target_lock); scsi_remove_host(target->scsi_host); ib_destroy_cm_id(target->cm_id); @@ -1450,9 +1450,9 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) if (scsi_add_host(target->scsi_host, host->dev->dev->dma_device)) return -ENODEV; - mutex_lock(&host->target_mutex); + spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); - mutex_unlock(&host->target_mutex); + spin_unlock(&host->target_lock); target->state = SRP_TARGET_LIVE; @@ -1724,7 +1724,7 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port) return NULL; INIT_LIST_HEAD(&host->target_list); - mutex_init(&host->target_mutex); + spin_lock_init(&host->target_lock); init_completion(&host->released); host->dev = device; host->port = port; @@ -1866,14 +1866,14 @@ static void srp_remove_one(struct ib_device *device) * Mark all target ports as removed, so we stop queueing * commands and don't try to reconnect. */ - mutex_lock(&host->target_mutex); + spin_lock(&host->target_lock); list_for_each_entry(target, &host->target_list, list) { spin_lock_irqsave(target->scsi_host->host_lock, flags); if (target->state != SRP_TARGET_REMOVED) target->state = SRP_TARGET_REMOVED; spin_unlock_irqrestore(target->scsi_host->host_lock, flags); } - mutex_unlock(&host->target_mutex); + spin_unlock(&host->target_lock); /* * Wait for any reconnection tasks that may have diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 5ecda4ad890..c071c303270 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -99,7 +99,7 @@ struct srp_host { u8 port; struct class_device class_dev; struct list_head target_list; - struct mutex target_mutex; + spinlock_t target_lock; struct completion released; struct list_head list; }; -- cgit v1.2.3 From 403a496fd4af3036c12e1f9c90a89cf846fadd35 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:31 -0700 Subject: IB: Make needlessly global ib_mad_cache static Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 3 +-- drivers/infiniband/core/mad_priv.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 5ad41a64314..92c7362f588 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -45,8 +45,7 @@ MODULE_DESCRIPTION("kernel IB MAD API"); MODULE_AUTHOR("Hal Rosenstock"); MODULE_AUTHOR("Sean Hefty"); - -kmem_cache_t *ib_mad_cache; +static kmem_cache_t *ib_mad_cache; static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h 
index b4fa28d3160..d147f3bad2c 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -212,8 +212,6 @@ struct ib_mad_port_private { struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; }; -extern kmem_cache_t *ib_mad_cache; - int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr); struct ib_mad_send_wr_private * -- cgit v1.2.3 From 0c5b395239cdea4db3d9c23a5738fdaf3b9ada4c Mon Sep 17 00:00:00 2001 From: Ishai Rabinovitz Date: Sat, 17 Jun 2006 20:37:31 -0700 Subject: IB/srp: Clean up loop in srp_remove_one() Interrupts will always be enabled in srp_remove_one(), so spin_lock_irq() can be used instead of spin_lock_irqsave(). Also, the loop takes target->scsi_host->host_lock, so target->state can just be set to SRP_TARGET_REMOVED witout testing the old value. Signed-off-by: Ishai Rabinovitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index ae1ab6acf46..46292561c44 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1850,7 +1850,6 @@ static void srp_remove_one(struct ib_device *device) struct srp_host *host, *tmp_host; LIST_HEAD(target_list); struct srp_target_port *target, *tmp_target; - unsigned long flags; srp_dev = ib_get_client_data(device, &srp_client); @@ -1868,10 +1867,9 @@ static void srp_remove_one(struct ib_device *device) */ spin_lock(&host->target_lock); list_for_each_entry(target, &host->target_list, list) { - spin_lock_irqsave(target->scsi_host->host_lock, flags); - if (target->state != SRP_TARGET_REMOVED) - target->state = SRP_TARGET_REMOVED; - spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + spin_lock_irq(target->scsi_host->host_lock); + target->state = SRP_TARGET_REMOVED; + spin_unlock_irq(target->scsi_host->host_lock); } spin_unlock(&host->target_lock); -- cgit v1.2.3 From 52fb2b50c4baa1430064c1e6c1c7df473d469df1 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Sat, 17 Jun 2006 20:37:31 -0700 Subject: IB/srp: Allow cmd_per_lun to be set per target port Allow userspace to throttle traffic on a given connection to a target port by adding "max_cmd_per_lun=xyz" to lower the cmd_per_lun value set for that scsi_host. 
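As an illustration (the target identifiers here are placeholders, not values taken from this patch), writing a string such as id_ext=...,ioc_guid=...,dgid=...,pkey=...,service_id=...,max_cmd_per_lun=16 to a host's add_target attribute caps that scsi_host's cmd_per_lun at 16; as the code below shows, the value is also clamped to SRP_SQ_SIZE, so it can never exceed the send queue depth.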
Signed-off-by: Vu Pham Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 46292561c44..8e6b1036971 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1491,6 +1491,7 @@ enum { SRP_OPT_PKEY = 1 << 3, SRP_OPT_SERVICE_ID = 1 << 4, SRP_OPT_MAX_SECT = 1 << 5, + SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1499,13 +1500,14 @@ enum { }; static match_table_t srp_opt_tokens = { - { SRP_OPT_ID_EXT, "id_ext=%s" }, - { SRP_OPT_IOC_GUID, "ioc_guid=%s" }, - { SRP_OPT_DGID, "dgid=%s" }, - { SRP_OPT_PKEY, "pkey=%x" }, - { SRP_OPT_SERVICE_ID, "service_id=%s" }, - { SRP_OPT_MAX_SECT, "max_sect=%d" }, - { SRP_OPT_ERR, NULL } + { SRP_OPT_ID_EXT, "id_ext=%s" }, + { SRP_OPT_IOC_GUID, "ioc_guid=%s" }, + { SRP_OPT_DGID, "dgid=%s" }, + { SRP_OPT_PKEY, "pkey=%x" }, + { SRP_OPT_SERVICE_ID, "service_id=%s" }, + { SRP_OPT_MAX_SECT, "max_sect=%d" }, + { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, + { SRP_OPT_ERR, NULL } }; static int srp_parse_options(const char *buf, struct srp_target_port *target) @@ -1581,6 +1583,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->scsi_host->max_sectors = token; break; + case SRP_OPT_MAX_CMD_PER_LUN: + if (match_int(args, &token)) { + printk(KERN_WARNING PFX "bad max cmd_per_lun parameter '%s'\n", p); + goto out; + } + target->scsi_host->cmd_per_lun = min(token, SRP_SQ_SIZE); + break; + default: printk(KERN_WARNING PFX "unknown parameter or missing value " "'%s' in target creation request\n", p); -- cgit v1.2.3 From 74b0a15b5e18e44206c98419745a472c3d28e561 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Sat, 17 Jun 2006 20:37:32 -0700 Subject: IB/srp: Allow sg_tablesize to be adjusted Make the sg_tablesize used by SRP adjustable at module load time via a module parameter. Calculate the corresponding IU length required to support this. 
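A sketch of the size relationship the patch below encodes (illustrative reasoning only, not new driver code):

/*
 * Old scheme (fixed 256-byte IU): the table size was derived from the IU:
 *     SRP_MAX_INDIRECT = (256 - sizeof (struct srp_cmd)
 *                             - sizeof (struct srp_indirect_buf)) / 16;
 * New scheme (table size from the module parameter): the IU length is
 * derived the other way around:
 *     srp_max_iu_len = sizeof (struct srp_cmd)
 *                    + sizeof (struct srp_indirect_buf)
 *                    + srp_sg_tablesize * 16;
 * where 16 bytes is one srp_direct_buf descriptor per scatter/gather entry.
 */

So a larger sg_tablesize simply grows the request IU instead of being impossible under the old fixed limit.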
Signed-off-by: Vu Pham Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 21 ++++++++++++++++----- drivers/infiniband/ulp/srp/ib_srp.h | 5 +---- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 8e6b1036971..a01b73bb976 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -62,6 +62,13 @@ MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator " "v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_LICENSE("Dual BSD/GPL"); +static int srp_sg_tablesize = SRP_DEF_SG_TABLESIZE; +static int srp_max_iu_len; + +module_param(srp_sg_tablesize, int, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, + "Max number of gather/scatter entries per I/O (default is 12)"); + static int topspin_workarounds = 1; module_param(topspin_workarounds, int, 0444); @@ -311,7 +318,7 @@ static int srp_send_req(struct srp_target_port *target) req->priv.opcode = SRP_LOGIN_REQ; req->priv.tag = 0; - req->priv.req_it_iu_len = cpu_to_be32(SRP_MAX_IU_LEN); + req->priv.req_it_iu_len = cpu_to_be32(srp_max_iu_len); req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); memcpy(req->priv.initiator_port_id, target->srp_host->initiator_port_id, 16); @@ -953,7 +960,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, goto err; dma_sync_single_for_cpu(target->srp_host->dev->dev->dma_device, iu->dma, - SRP_MAX_IU_LEN, DMA_TO_DEVICE); + srp_max_iu_len, DMA_TO_DEVICE); req = list_entry(target->free_reqs.next, struct srp_request, list); @@ -986,7 +993,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd, } dma_sync_single_for_device(target->srp_host->dev->dev->dma_device, iu->dma, - SRP_MAX_IU_LEN, DMA_TO_DEVICE); + srp_max_iu_len, DMA_TO_DEVICE); if (__srp_post_send(target, iu, len)) { printk(KERN_ERR PFX "Send failed\n"); @@ -1018,7 +1025,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) for (i = 0; i < SRP_SQ_SIZE + 1; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, - SRP_MAX_IU_LEN, + srp_max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); if (!target->tx_ring[i]) goto err; @@ -1436,7 +1443,6 @@ static struct scsi_host_template srp_template = { .eh_host_reset_handler = srp_reset_host, .can_queue = SRP_SQ_SIZE, .this_id = -1, - .sg_tablesize = SRP_MAX_INDIRECT, .cmd_per_lun = SRP_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = srp_host_attrs @@ -1914,6 +1920,11 @@ static int __init srp_init_module(void) { int ret; + srp_template.sg_tablesize = srp_sg_tablesize; + srp_max_iu_len = (sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + srp_sg_tablesize * 16); + ret = class_register(&srp_class); if (ret) { printk(KERN_ERR PFX "couldn't register class infiniband_srp\n"); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index c071c303270..033a44772ae 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -56,7 +56,7 @@ enum { SRP_DLID_REDIRECT = 2, SRP_MAX_LUN = 512, - SRP_MAX_IU_LEN = 256, + SRP_DEF_SG_TABLESIZE = 12, SRP_RQ_SHIFT = 6, SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, @@ -71,9 +71,6 @@ enum { }; #define SRP_OP_RECV (1 << 31) -#define SRP_MAX_INDIRECT ((SRP_MAX_IU_LEN - \ - sizeof (struct srp_cmd) - \ - sizeof (struct srp_indirect_buf)) / 16) enum srp_target_state { SRP_TARGET_LIVE, -- cgit v1.2.3 From b7ac4ab497e44cba75fb0e9e5afca06776518934 Mon Sep 17 00:00:00 2001 From: Ishai Rabinovitz Date: Sat, 17 Jun 2006 
20:37:32 -0700 Subject: IB/srp: Handle DREQ events from CM Handle IB_CM_DREQ_ERROR and IB_CM_DREQ_RECEIVED events from the CM, instead of just printing "Unhandled CM event". In the case of DREQ_ERROR, just ignore the event -- a TIMEWAIT_EXIT will be generated also. For DREQ_RECEIVED, send a DREP in response to shut the connection down cleanly. Signed-off-by: Ishai Rabinovitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index a01b73bb976..4ba790f54a2 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1196,11 +1196,10 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) srp_cm_rej_handler(cm_id, event, target); break; - case IB_CM_MRA_RECEIVED: - printk(KERN_ERR PFX "MRA received\n"); - break; - - case IB_CM_DREP_RECEIVED: + case IB_CM_DREQ_RECEIVED: + printk(KERN_WARNING PFX "DREQ received - connection closed\n"); + if (ib_send_cm_drep(cm_id, NULL, 0)) + printk(KERN_ERR PFX "Sending CM DREP failed\n"); break; case IB_CM_TIMEWAIT_EXIT: @@ -1210,6 +1209,11 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) target->status = 0; break; + case IB_CM_MRA_RECEIVED: + case IB_CM_DREQ_ERROR: + case IB_CM_DREP_RECEIVED: + break; + default: printk(KERN_WARNING PFX "Unhandled CM event %d\n", event->event); break; -- cgit v1.2.3 From 6bfa24fa3e189269e113197a80e12862c211b3d3 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:33 -0700 Subject: IB/srp: Get rid of "Target has req_lim 0" messages It's perfectly valid for a connection to an SRP target to have a request limit of 0, so get rid of the message about it, which can spam kernel logs even with printk_ratelimit(). Keep a count of such events in a "zero_req_lim" SCSI host attribute instead, so someone who cares can look at the statistics. 
Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 21 +++++++++++++++------ drivers/infiniband/ulp/srp/ib_srp.h | 2 ++ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 4ba790f54a2..f1401e1f2f9 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -894,12 +894,8 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target) if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) return NULL; - if (unlikely(target->req_lim < 1)) { - if (printk_ratelimit()) - printk(KERN_DEBUG PFX "Target has req_lim %d\n", - target->req_lim); - return NULL; - } + if (unlikely(target->req_lim < 1)) + ++target->zero_req_lim; return target->tx_ring[target->tx_head & SRP_SQ_SIZE]; } @@ -1422,11 +1418,23 @@ static ssize_t show_dgid(struct class_device *cdev, char *buf) be16_to_cpu(((__be16 *) target->path.dgid.raw)[7])); } +static ssize_t show_zero_req_lim(struct class_device *cdev, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(cdev)); + + if (target->state == SRP_TARGET_DEAD || + target->state == SRP_TARGET_REMOVED) + return -ENODEV; + + return sprintf(buf, "%d\n", target->zero_req_lim); +} + static CLASS_DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); static CLASS_DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); static CLASS_DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); static CLASS_DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); +static CLASS_DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static struct class_device_attribute *srp_host_attrs[] = { &class_device_attr_id_ext, @@ -1434,6 +1442,7 @@ static struct class_device_attribute *srp_host_attrs[] = { &class_device_attr_service_id, &class_device_attr_pkey, &class_device_attr_dgid, + &class_device_attr_zero_req_lim, NULL }; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 033a44772ae..ad45e4856e1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -138,6 +138,8 @@ struct srp_target_port { int max_ti_iu_len; s32 req_lim; + int zero_req_lim; + unsigned rx_head; struct srp_iu *rx_ring[SRP_RQ_SIZE]; -- cgit v1.2.3 From 4be10c1e6dcafec993ac4f9abfa5fdcd83728302 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 25 May 2006 10:03:23 -0700 Subject: IB/ucm: convert semaphore to mutex Convert semaphore in ib_ucm_file to a real mutex. 
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/ucm.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 0136aee0faa..67caf36504e 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -64,7 +64,7 @@ struct ib_ucm_device { }; struct ib_ucm_file { - struct semaphore mutex; + struct mutex file_mutex; struct file *filp; struct ib_ucm_device *device; @@ -153,7 +153,7 @@ static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) { struct ib_ucm_event *uevent; - down(&ctx->file->mutex); + mutex_lock(&ctx->file->file_mutex); list_del(&ctx->file_list); while (!list_empty(&ctx->events)) { @@ -168,7 +168,7 @@ static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) kfree(uevent); } - up(&ctx->file->mutex); + mutex_unlock(&ctx->file->file_mutex); } static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) @@ -375,11 +375,11 @@ static int ib_ucm_event_handler(struct ib_cm_id *cm_id, if (result) goto err2; - down(&ctx->file->mutex); + mutex_lock(&ctx->file->file_mutex); list_add_tail(&uevent->file_list, &ctx->file->events); list_add_tail(&uevent->ctx_list, &ctx->events); wake_up_interruptible(&ctx->file->poll_wait); - up(&ctx->file->mutex); + mutex_unlock(&ctx->file->file_mutex); return 0; err2: @@ -405,7 +405,7 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - down(&file->mutex); + mutex_lock(&file->file_mutex); while (list_empty(&file->events)) { if (file->filp->f_flags & O_NONBLOCK) { @@ -420,9 +420,9 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file, prepare_to_wait(&file->poll_wait, &wait, TASK_INTERRUPTIBLE); - up(&file->mutex); + mutex_unlock(&file->file_mutex); schedule(); - down(&file->mutex); + mutex_lock(&file->file_mutex); finish_wait(&file->poll_wait, &wait); } @@ -482,7 +482,7 @@ static ssize_t ib_ucm_event(struct ib_ucm_file *file, kfree(uevent->info); kfree(uevent); done: - up(&file->mutex); + mutex_unlock(&file->file_mutex); return result; } @@ -501,9 +501,9 @@ static ssize_t ib_ucm_create_id(struct ib_ucm_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - down(&file->mutex); + mutex_lock(&file->file_mutex); ctx = ib_ucm_ctx_alloc(file); - up(&file->mutex); + mutex_unlock(&file->file_mutex); if (!ctx) return -ENOMEM; @@ -1177,7 +1177,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp) INIT_LIST_HEAD(&file->ctxs); init_waitqueue_head(&file->poll_wait); - init_MUTEX(&file->mutex); + mutex_init(&file->file_mutex); filp->private_data = file; file->filp = filp; @@ -1191,11 +1191,11 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) struct ib_ucm_file *file = filp->private_data; struct ib_ucm_context *ctx; - down(&file->mutex); + mutex_lock(&file->file_mutex); while (!list_empty(&file->ctxs)) { ctx = list_entry(file->ctxs.next, struct ib_ucm_context, file_list); - up(&file->mutex); + mutex_unlock(&file->file_mutex); mutex_lock(&ctx_id_mutex); idr_remove(&ctx_id_table, ctx->id); @@ -1205,9 +1205,9 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) ib_ucm_cleanup_events(ctx); kfree(ctx); - down(&file->mutex); + mutex_lock(&file->file_mutex); } - up(&file->mutex); + mutex_unlock(&file->file_mutex); kfree(file); return 0; } -- cgit v1.2.3 From 856c256f883f027a14b546164294b4a86fea81a4 Mon Sep 17 00:00:00 2001 From: 
"Michael S. Tsirkin" Date: Thu, 25 May 2006 14:51:33 -0700 Subject: IB/cm: remove unneeded flush_workqueue destroy_workqueue() already does flush_workqueue(). Signed-off-by: Michael S. Tsirkin Signed-off-by: Sean Hefty --- drivers/infiniband/core/cm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 490fd03766d..1c7463b7c5f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3358,7 +3358,6 @@ error: static void __exit ib_cm_cleanup(void) { - flush_workqueue(cm.wq); destroy_workqueue(cm.wq); ib_unregister_client(&cm_client); idr_destroy(&cm.local_id_table); -- cgit v1.2.3 From 6fb9cdbf2cdb2ea187e57ec2e16cc59df2adf86a Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sat, 17 Jun 2006 20:37:34 -0700 Subject: IB: Add caching of ports' LMC Add an LMC cache to struct ib_device, and add a function ib_get_cached_lmc() to query the cache. Signed-off-by: Jack Morgenstein Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cache.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 50364c0b090..e05ca2cdc73 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -191,6 +191,24 @@ int ib_find_cached_pkey(struct ib_device *device, } EXPORT_SYMBOL(ib_find_cached_pkey); +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc) +{ + unsigned long flags; + int ret = 0; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + *lmc = device->cache.lmc_cache[port_num - start_port(device)]; + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_lmc); + static void ib_cache_update(struct ib_device *device, u8 port) { @@ -251,6 +269,8 @@ static void ib_cache_update(struct ib_device *device, device->cache.pkey_cache[port - start_port(device)] = pkey_cache; device->cache.gid_cache [port - start_port(device)] = gid_cache; + device->cache.lmc_cache[port - start_port(device)] = tprops->lmc; + write_unlock_irq(&device->cache.lock); kfree(old_pkey_cache); @@ -305,7 +325,13 @@ static void ib_cache_setup_one(struct ib_device *device) kmalloc(sizeof *device->cache.gid_cache * (end_port(device) - start_port(device) + 1), GFP_KERNEL); - if (!device->cache.pkey_cache || !device->cache.gid_cache) { + device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * + (end_port(device) - + start_port(device) + 1), + GFP_KERNEL); + + if (!device->cache.pkey_cache || !device->cache.gid_cache || + !device->cache.lmc_cache) { printk(KERN_WARNING "Couldn't allocate cache " "for %s\n", device->name); goto err; @@ -333,6 +359,7 @@ err_cache: err: kfree(device->cache.pkey_cache); kfree(device->cache.gid_cache); + kfree(device->cache.lmc_cache); } static void ib_cache_cleanup_one(struct ib_device *device) @@ -349,6 +376,7 @@ static void ib_cache_cleanup_one(struct ib_device *device) kfree(device->cache.pkey_cache); kfree(device->cache.gid_cache); + kfree(device->cache.lmc_cache); } static struct ib_client cache_client = { -- cgit v1.2.3 From 9874e746550fbd366484621b8838b98589bb2a15 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sat, 17 Jun 2006 20:37:34 -0700 Subject: IB/mad: Check GID/LID when matching requests Check GID/LID for requester side 
when searching for request which matches received response. This is in order to guarantee uniqueness if the same TID is used when requesting via multiple source LIDs (when LMC is not zero). Use ports' cached LMC to perform the check. Further, do not perform LID check for direct-routed packets, since the permissive LID makes a proper check impossible. Signed-off-by: Jack Morgenstein Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 94 +++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 31 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 92c7362f588..b38e02a5db3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -34,6 +34,7 @@ * $Id: mad.c 5596 2006-03-03 01:00:07Z sean.hefty $ */ #include +#include #include "mad_priv.h" #include "mad_rmpp.h" @@ -1672,20 +1673,21 @@ static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr, rwc->recv_buf.mad->mad_hdr.mgmt_class; } -static inline int rcv_has_same_gid(struct ib_mad_send_wr_private *wr, +static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv, + struct ib_mad_send_wr_private *wr, struct ib_mad_recv_wc *rwc ) { struct ib_ah_attr attr; u8 send_resp, rcv_resp; + union ib_gid sgid; + struct ib_device *device = mad_agent_priv->agent.device; + u8 port_num = mad_agent_priv->agent.port_num; + u8 lmc; send_resp = ((struct ib_mad *)(wr->send_buf.mad))-> mad_hdr.method & IB_MGMT_METHOD_RESP; rcv_resp = rwc->recv_buf.mad->mad_hdr.method & IB_MGMT_METHOD_RESP; - if (!send_resp && rcv_resp) - /* is request/response. GID/LIDs are both local (same). */ - return 1; - if (send_resp == rcv_resp) /* both requests, or both responses. GIDs different */ return 0; @@ -1694,48 +1696,78 @@ static inline int rcv_has_same_gid(struct ib_mad_send_wr_private *wr, /* Assume not equal, to avoid false positives. */ return 0; - if (!(attr.ah_flags & IB_AH_GRH) && !(rwc->wc->wc_flags & IB_WC_GRH)) - return attr.dlid == rwc->wc->slid; - else if ((attr.ah_flags & IB_AH_GRH) && - (rwc->wc->wc_flags & IB_WC_GRH)) - return memcmp(attr.grh.dgid.raw, - rwc->recv_buf.grh->sgid.raw, 16) == 0; - else + if (!!(attr.ah_flags & IB_AH_GRH) != + !!(rwc->wc->wc_flags & IB_WC_GRH)) /* one has GID, other does not. Assume different */ return 0; + + if (!send_resp && rcv_resp) { + /* is request/response. 
*/ + if (!(attr.ah_flags & IB_AH_GRH)) { + if (ib_get_cached_lmc(device, port_num, &lmc)) + return 0; + return (!lmc || !((attr.src_path_bits ^ + rwc->wc->dlid_path_bits) & + ((1 << lmc) - 1))); + } else { + if (ib_get_cached_gid(device, port_num, + attr.grh.sgid_index, &sgid)) + return 0; + return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, + 16); + } + } + + if (!(attr.ah_flags & IB_AH_GRH)) + return attr.dlid == rwc->wc->slid; + else + return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw, + 16); } + +static inline int is_direct(u8 class) +{ + return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE); +} + struct ib_mad_send_wr_private* ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, - struct ib_mad_recv_wc *mad_recv_wc) + struct ib_mad_recv_wc *wc) { - struct ib_mad_send_wr_private *mad_send_wr; + struct ib_mad_send_wr_private *wr; struct ib_mad *mad; - mad = (struct ib_mad *)mad_recv_wc->recv_buf.mad; + mad = (struct ib_mad *)wc->recv_buf.mad; - list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list, - agent_list) { - if ((mad_send_wr->tid == mad->mad_hdr.tid) && - rcv_has_same_class(mad_send_wr, mad_recv_wc) && - rcv_has_same_gid(mad_send_wr, mad_recv_wc)) - return mad_send_wr; + list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) { + if ((wr->tid == mad->mad_hdr.tid) && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) + return wr; } /* * It's possible to receive the response before we've * been notified that the send has completed */ - list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list, - agent_list) { - if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) && - mad_send_wr->tid == mad->mad_hdr.tid && - mad_send_wr->timeout && - rcv_has_same_class(mad_send_wr, mad_recv_wc) && - rcv_has_same_gid(mad_send_wr, mad_recv_wc)) { + list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) { + if (is_data_mad(mad_agent_priv, wr->send_buf.mad) && + wr->tid == mad->mad_hdr.tid && + wr->timeout && + rcv_has_same_class(wr, wc) && + /* + * Don't check GID for direct routed MADs. + * These might have permissive LIDs. + */ + (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) || + rcv_has_same_gid(mad_agent_priv, wr, wc))) /* Verify request has not been canceled */ - return (mad_send_wr->status == IB_WC_SUCCESS) ? - mad_send_wr : NULL; - } + return (wr->status == IB_WC_SUCCESS) ? wr : NULL; } return NULL; } -- cgit v1.2.3 From 31c02e215700c2b704d9441f629ae87bb9aeb561 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:34 -0700 Subject: IPoIB: Avoid using stale last_send counter when reaping AHs The comparisons of priv->tx_tail to ah->last_send in ipoib_free_ah() and ipoib_post_receive() are slightly unsafe, because priv->tx_lock is not held and hence a stale value of ah->last_send might be used, which would lead to freeing an AH before the driver was really done with it. The simple way to fix this is to drop the optimization of early free from ipoib_free_ah() and unconditionally queue AHs for reaping, and then take priv->tx_lock in __ipoib_reap_ah().
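The reap condition kept by the patch below, (int) priv->tx_tail - (int) ah->last_send >= 0, is worth a worked example; the counter values here are made up purely for illustration and are not driver code:

/* Illustration only -- hypothetical counter values. */
unsigned int tx_tail   = 0x00000002;	/* completed sends, counter has wrapped  */
unsigned int last_send = 0xfffffffe;	/* value of tx_head when the AH was used */

/*
 * (int) tx_tail - (int) last_send == 2 - (-2) == 4 >= 0, so every send
 * that referenced this AH has completed and it may be destroyed.  With
 * tx_tail == 2 and last_send == 5 the result would be -3 < 0 and the AH
 * must stay on the dead list until the reaper runs again.
 */
int safe_to_free = (int) tx_tail - (int) last_send >= 0;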
Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 8406839b91c..5033666b148 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -84,15 +84,9 @@ void ipoib_free_ah(struct kref *kref) unsigned long flags; - if ((int) priv->tx_tail - (int) ah->last_send >= 0) { - ipoib_dbg(priv, "Freeing ah %p\n", ah->ah); - ib_destroy_ah(ah->ah); - kfree(ah); - } else { - spin_lock_irqsave(&priv->lock, flags); - list_add_tail(&ah->list, &priv->dead_ahs); - spin_unlock_irqrestore(&priv->lock, flags); - } + spin_lock_irqsave(&priv->lock, flags); + list_add_tail(&ah->list, &priv->dead_ahs); + spin_unlock_irqrestore(&priv->lock, flags); } static int ipoib_ib_post_receive(struct net_device *dev, int id) @@ -377,19 +371,16 @@ static void __ipoib_reap_ah(struct net_device *dev) struct ipoib_ah *ah, *tah; LIST_HEAD(remove_list); - spin_lock_irq(&priv->lock); + spin_lock_irq(&priv->tx_lock); + spin_lock(&priv->lock); list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list) if ((int) priv->tx_tail - (int) ah->last_send >= 0) { list_del(&ah->list); - list_add_tail(&ah->list, &remove_list); + ib_destroy_ah(ah->ah); + kfree(ah); } - spin_unlock_irq(&priv->lock); - - list_for_each_entry_safe(ah, tah, &remove_list, list) { - ipoib_dbg(priv, "Reaping ah %p\n", ah->ah); - ib_destroy_ah(ah->ah); - kfree(ah); - } + spin_unlock(&priv->lock); + spin_unlock_irq(&priv->tx_lock); } void ipoib_reap_ah(void *dev_ptr) -- cgit v1.2.3 From 37c22a77212c13201497378cc8becc5c95d0f3f5 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 29 May 2006 19:14:05 +0300 Subject: IPoIB: Fix kernel unaligned access on ia64 Fix misaligned access faults on ia64: never cast a misaligned neighbour->ha + 4 pointer to union ib_gid type; pass a void * pointer instead. The memcpy was being optimized to use full word accesses because the compiler thought that union ib_gid is always aligned. The cast in IPOIB_GID_ARG is safe, since it is fixed to access each byte separately. Signed-off-by: Jack Morgenstein Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 34 +++++++++++++++++--------- drivers/infiniband/ulp/ipoib/ipoib_main.c | 28 +++++++++------------ drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 11 ++++----- 3 files changed, 39 insertions(+), 34 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 12a1e0572ef..491d2afaf5b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -272,8 +272,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct net_device *dev); void ipoib_mcast_join_task(void *dev_ptr); -void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, - struct sk_buff *skb); +void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); void ipoib_mcast_restart_task(void *dev_ptr); int ipoib_mcast_start_thread(struct net_device *dev); @@ -369,15 +368,26 @@ extern int ipoib_debug_level; #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */ -#define IPOIB_GID_FMT "%x:%x:%x:%x:%x:%x:%x:%x" - -#define IPOIB_GID_ARG(gid) be16_to_cpup((__be16 *) ((gid).raw + 0)), \ - be16_to_cpup((__be16 *) ((gid).raw + 2)), \ - be16_to_cpup((__be16 *) ((gid).raw + 4)), \ - be16_to_cpup((__be16 *) ((gid).raw + 6)), \ - be16_to_cpup((__be16 *) ((gid).raw + 8)), \ - be16_to_cpup((__be16 *) ((gid).raw + 10)), \ - be16_to_cpup((__be16 *) ((gid).raw + 12)), \ - be16_to_cpup((__be16 *) ((gid).raw + 14)) +#define IPOIB_GID_FMT "%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x:" \ + "%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x:%2.2x%2.2x" + +#define IPOIB_GID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7], \ + ((u8 *)(gid))[8], \ + ((u8 *)(gid))[9], \ + ((u8 *)(gid))[10],\ + ((u8 *)(gid))[11],\ + ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] + +#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw) #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index cb078a7d0bf..1c6ea1c682a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -185,8 +185,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) return 0; } -static struct ipoib_path *__path_find(struct net_device *dev, - union ib_gid *gid) +static struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node *n = priv->path_tree.rb_node; @@ -196,7 +195,7 @@ static struct ipoib_path *__path_find(struct net_device *dev, while (n) { path = rb_entry(n, struct ipoib_path, rb_node); - ret = memcmp(gid->raw, path->pathrec.dgid.raw, + ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) @@ -424,8 +423,7 @@ static void path_rec_completion(int status, } } -static struct ipoib_path *path_rec_create(struct net_device *dev, - union ib_gid *gid) +static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; @@ -440,7 +438,7 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, INIT_LIST_HEAD(&path->neigh_list); - memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid)); + memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); 
path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; @@ -498,10 +496,9 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) */ spin_lock(&priv->lock); - path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4)); + path = __path_find(dev, skb->dst->neighbour->ha + 4); if (!path) { - path = path_rec_create(dev, - (union ib_gid *) (skb->dst->neighbour->ha + 4)); + path = path_rec_create(dev, skb->dst->neighbour->ha + 4); if (!path) goto err_path; @@ -551,7 +548,7 @@ static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) /* Add in the P_Key for multicasts */ skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff; skb->dst->neighbour->ha[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb); + ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb); } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, @@ -566,10 +563,9 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, */ spin_lock(&priv->lock); - path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4)); + path = __path_find(dev, phdr->hwaddr + 4); if (!path) { - path = path_rec_create(dev, - (union ib_gid *) (phdr->hwaddr + 4)); + path = path_rec_create(dev, phdr->hwaddr + 4); if (path) { /* put pseudoheader back on for next time */ skb_push(skb, sizeof *phdr); @@ -660,7 +656,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; phdr->hwaddr[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb); + ipoib_mcast_send(dev, phdr->hwaddr + 4, skb); } else { /* unicast GID -- should be ARP or RARP reply */ @@ -671,7 +667,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->dst ? 
"neigh" : "dst", be16_to_cpup((__be16 *) skb->data), be32_to_cpup((__be32 *) phdr->hwaddr), - IPOIB_GID_ARG(*(union ib_gid *) (phdr->hwaddr + 4))); + IPOIB_GID_RAW_ARG(phdr->hwaddr + 4)); dev_kfree_skb_any(skb); ++priv->stats.tx_dropped; goto out; @@ -754,7 +750,7 @@ static void ipoib_neigh_destructor(struct neighbour *n) ipoib_dbg(priv, "neigh_destructor for %06x " IPOIB_GID_FMT "\n", be32_to_cpup((__be32 *) n->ha), - IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4)))); + IPOIB_GID_RAW_ARG(n->ha + 4)); spin_lock_irqsave(&priv->lock, flags); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 1dae4b23825..ec41c8f46c7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -154,7 +154,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, return mcast; } -static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_gid *mgid) +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node *n = priv->multicast_tree.rb_node; @@ -165,7 +165,7 @@ static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, union ib_g mcast = rb_entry(n, struct ipoib_mcast, rb_node); - ret = memcmp(mgid->raw, mcast->mcmember.mgid.raw, + ret = memcmp(mgid, mcast->mcmember.mgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; @@ -694,8 +694,7 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) return 0; } -void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, - struct sk_buff *skb) +void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_mcast *mcast; @@ -718,7 +717,7 @@ void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, if (!mcast) { /* Let's create a new send only group now */ ipoib_dbg_mcast(priv, "setting up send only multicast group for " - IPOIB_GID_FMT "\n", IPOIB_GID_ARG(*mgid)); + IPOIB_GID_FMT "\n", IPOIB_GID_RAW_ARG(mgid)); mcast = ipoib_mcast_alloc(dev, 0); if (!mcast) { @@ -730,7 +729,7 @@ void ipoib_mcast_send(struct net_device *dev, union ib_gid *mgid, } set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); - mcast->mcmember.mgid = *mgid; + memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); __ipoib_mcast_add(dev, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } -- cgit v1.2.3 From 508e434123b136c96d1bf989e8d4ab0c8d7498b1 Mon Sep 17 00:00:00 2001 From: Leonid Arsh Date: Sat, 17 Jun 2006 20:37:36 -0700 Subject: IPoIB: Handle client reregister events Handle client reregister events by treating them just like LID or SM changes -- flush all cached paths and rejoin multicast groups. 
Signed-off-by: Leonid Arsh Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 1d49d1643c5..7b717c648f7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -255,7 +255,8 @@ void ipoib_event(struct ib_event_handler *handler, record->event == IB_EVENT_PKEY_CHANGE || record->event == IB_EVENT_PORT_ACTIVE || record->event == IB_EVENT_LID_CHANGE || - record->event == IB_EVENT_SM_CHANGE) { + record->event == IB_EVENT_SM_CHANGE || + record->event == IB_EVENT_CLIENT_REREGISTER) { ipoib_dbg(priv, "Port state change event\n"); queue_work(ipoib_workqueue, &priv->flush_task); } -- cgit v1.2.3 From da2ab62ab5e430e6ffafc2d0e6046dcd2780f570 Mon Sep 17 00:00:00 2001 From: Leonid Arsh Date: Sat, 17 Jun 2006 20:37:36 -0700 Subject: IB: Move struct port_info from ipath to Move ipath's struct port_info into , so that it can be used by mthca to implement client reregister support. Remove the __attribute__((packed)) because all the members of the struct are naturally aligned anyway. Signed-off-by: Leonid Arsh Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_mad.c | 40 ++------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index f7f8391fe43..49acd1e1570 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -137,47 +137,11 @@ static int recv_subn_get_guidinfo(struct ib_smp *smp, return reply(smp); } -struct port_info { - __be64 mkey; - __be64 gid_prefix; - __be16 lid; - __be16 sm_lid; - __be32 cap_mask; - __be16 diag_code; - __be16 mkey_lease_period; - u8 local_port_num; - u8 link_width_enabled; - u8 link_width_supported; - u8 link_width_active; - u8 linkspeed_portstate; /* 4 bits, 4 bits */ - u8 portphysstate_linkdown; /* 4 bits, 4 bits */ - u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */ - u8 linkspeedactive_enabled; /* 4 bits, 4 bits */ - u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */ - u8 vlcap_inittype; /* 4 bits, 4 bits */ - u8 vl_high_limit; - u8 vl_arb_high_cap; - u8 vl_arb_low_cap; - u8 inittypereply_mtucap; /* 4 bits, 4 bits */ - u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */ - u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */ - __be16 mkey_violations; - __be16 pkey_violations; - __be16 qkey_violations; - u8 guid_cap; - u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */ - u8 resv_resptimevalue; /* 3 bits, 5 bits */ - u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */ - __be16 max_credit_hint; - u8 resv; - u8 link_roundtrip_latency[3]; -} __attribute__ ((packed)); - static int recv_subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, u8 port) { struct ipath_ibdev *dev; - struct port_info *pip = (struct port_info *)smp->data; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; u16 lid; u8 ibcstat; u8 mtu; @@ -312,7 +276,7 @@ static int recv_subn_set_guidinfo(struct ib_smp *smp, static int recv_subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev, u8 port) { - struct port_info *pip = (struct port_info *)smp->data; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; struct ib_event event; struct ipath_ibdev *dev; u32 flags; -- cgit v1.2.3 From 
12bbb2b7be7f5564952ebe0196623e97464b8ac5 Mon Sep 17 00:00:00 2001 From: Leonid Arsh Date: Sat, 17 Jun 2006 20:37:36 -0700 Subject: IB/mthca: Add client reregister event generation Change the mthca snoop of MADs that set PortInfo to check if the SM has set the client reregister bit, and if it has, generate a client reregister event. If the bit is not set, just generate a LID change event as usual. Signed-off-by: Leonid Arsh Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_mad.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 4730863ece9..d9bc030bccc 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -114,14 +114,22 @@ static void smp_snoop(struct ib_device *ibdev, mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && mad->mad_hdr.method == IB_MGMT_METHOD_SET) { if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { + struct ib_port_info *pinfo = + (struct ib_port_info *) ((struct ib_smp *) mad)->data; + mthca_update_rate(to_mdev(ibdev), port_num); update_sm_ah(to_mdev(ibdev), port_num, - be16_to_cpup((__be16 *) (mad->data + 58)), - (*(u8 *) (mad->data + 76)) & 0xf); + be16_to_cpu(pinfo->lid), + pinfo->neighbormtu_mastersmsl & 0xf); event.device = ibdev; - event.event = IB_EVENT_LID_CHANGE; event.element.port_num = port_num; + + if(pinfo->clientrereg_resv_subnetto & 0x80) + event.event = IB_EVENT_CLIENT_REREGISTER; + else + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); } -- cgit v1.2.3 From 6eddb5cb906ac5c9a17a1b76464eadacd88b6c92 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:37 -0700 Subject: IB/ipath: Add client reregister event generation Generate a client reregister event instead of a LID change event when client reregister bit is set. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_mad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 49acd1e1570..1a9d0a2c33c 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -409,7 +409,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, if (pip->clientrereg_resv_subnetto & 0x80) { clientrereg = 1; - event.event = IB_EVENT_LID_CHANGE; + event.event = IB_EVENT_CLIENT_REREGISTER; ib_dispatch_event(&event); } -- cgit v1.2.3 From d4cb0784fd1ea99ef3d20526811bd5608146fe60 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sat, 17 Jun 2006 20:37:37 -0700 Subject: IB/mthca: Fill in max_map_per_fmr device attribute Report the true max_map_per_fmr value from mthca_query_device(), taking into account the change in FMR remapping introduced by the Sinai performance optimization. 
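The patch below derives max_map_per_fmr from the MPT table size; as a worked example (the MPT count is hypothetical, chosen only to make the arithmetic concrete):

/*
 * Hypothetical device with num_mpts = 1 << 17:
 *   non-Sinai HCA: unused MPT index bits are reused for remapping, so
 *       max_map_per_fmr = (1 << (32 - long_log2(1 << 17))) - 1 = 32767;
 *   Sinai HCA with the memory key optimization: only the 8-bit key byte
 *       changes on each remap, so max_map_per_fmr = 255.
 */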
Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_provider.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index a2eae8a3016..8f89ba7c914 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -115,6 +115,16 @@ static int mthca_query_device(struct ib_device *ibdev, props->max_mcast_qp_attach = MTHCA_QP_PER_MGM; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; + /* + * If Sinai memory key optimization is being used, then only + * the 8-bit key portion will change. For other HCAs, the + * unused index bits will also be used for FMR remapping. + */ + if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) + props->max_map_per_fmr = 255; + else + props->max_map_per_fmr = + (1 << (32 - long_log2(mdev->limits.num_mpts))) - 1; err = 0; out: -- cgit v1.2.3 From 6c8c1aa25d213a288df381f431ce5b6a155146ec Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sat, 17 Jun 2006 20:37:37 -0700 Subject: IB/fmr: Use device's max_map_per_fmr attribute in FMR pool. When creating an FMR pool, query the IB device and use the returned max_map_per_fmr attribute as the max number of FMR remaps. If the device does not support querying this attribute, use the original IB_FMR_MAX_REMAPS (32) default. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/core/fmr_pool.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 838bf54458d..615fe9cc6c5 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -54,7 +54,7 @@ enum { /* * If an FMR is not in use, then the list member will point to either * its pool's free_list (if the FMR can be mapped again; that is, - * remap_count < IB_FMR_MAX_REMAPS) or its pool's dirty_list (if the + * remap_count < pool->max_remaps) or its pool's dirty_list (if the * FMR needs to be unmapped before being remapped). In either of * these cases it is a bug if the ref_count is not 0.
In other words, * if ref_count is > 0, then the list member must not be linked into @@ -84,6 +84,7 @@ struct ib_fmr_pool { int pool_size; int max_pages; + int max_remaps; int dirty_watermark; int dirty_len; struct list_head free_list; @@ -214,8 +215,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, { struct ib_device *device; struct ib_fmr_pool *pool; + struct ib_device_attr *attr; int i; int ret; + int max_remaps; if (!params) return ERR_PTR(-EINVAL); @@ -228,6 +231,26 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, return ERR_PTR(-ENOSYS); } + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) { + printk(KERN_WARNING "couldn't allocate device attr struct"); + return ERR_PTR(-ENOMEM); + } + + ret = ib_query_device(device, attr); + if (ret) { + printk(KERN_WARNING "couldn't query device"); + kfree(attr); + return ERR_PTR(ret); + } + + if (!attr->max_map_per_fmr) + max_remaps = IB_FMR_MAX_REMAPS; + else + max_remaps = attr->max_map_per_fmr; + + kfree(attr); + pool = kmalloc(sizeof *pool, GFP_KERNEL); if (!pool) { printk(KERN_WARNING "couldn't allocate pool struct"); @@ -258,6 +281,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, pool->pool_size = 0; pool->max_pages = params->max_pages_per_fmr; + pool->max_remaps = max_remaps; pool->dirty_watermark = params->dirty_watermark; pool->dirty_len = 0; spin_lock_init(&pool->pool_lock); @@ -279,7 +303,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, struct ib_pool_fmr *fmr; struct ib_fmr_attr attr = { .max_pages = params->max_pages_per_fmr, - .max_maps = IB_FMR_MAX_REMAPS, + .max_maps = pool->max_remaps, .page_shift = params->page_shift }; @@ -489,7 +513,7 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) --fmr->ref_count; if (!fmr->ref_count) { - if (fmr->remap_count < IB_FMR_MAX_REMAPS) { + if (fmr->remap_count < pool->max_remaps) { list_add_tail(&fmr->list, &pool->free_list); } else { list_add_tail(&fmr->list, &pool->dirty_list); -- cgit v1.2.3 From 0c0450db31481aa01a04e7faecc93ee6841972d6 Mon Sep 17 00:00:00 2001 From: Ramachandra K Date: Sat, 17 Jun 2006 20:37:38 -0700 Subject: IB/srp: Support SRP rev. 10 targets There has been a change in the format of port identifiers between revision 10 of the SRP specification and the current revision 16A. Revision 10 specifies port identifier format as lower 8 bytes : GUID upper 8 bytes : Extension Whereas revision 16A specifies it as lower 8 bytes : Extension upper 8 bytes : GUID There are older targets (e.g. SilverStorm Virtual Fibre Channel Bridge) which conform to revision 10 of the SRP specification. The I/O class of revision 10 is 0xFF00 and the I/O class of revision 16A is 0x0100. For supporting older targets, this patch: 1) Adds a new optional target creation parameter "io_class". Default value of io_class is 0x0100 (i.e. revision 16A) 2) Uses the correct port identifier format for targets with IO class of 0xFF00 (i.e. 
conforming to revision 10) Signed-off-by: Ramachandra K Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 44 ++++++++++++++++++++++++++++++++++--- drivers/infiniband/ulp/srp/ib_srp.h | 1 + 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f1401e1f2f9..e7fa8b680cb 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -321,7 +321,29 @@ static int srp_send_req(struct srp_target_port *target) req->priv.req_it_iu_len = cpu_to_be32(srp_max_iu_len); req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); - memcpy(req->priv.initiator_port_id, target->srp_host->initiator_port_id, 16); + /* + * In the published SRP specification (draft rev. 16a), the + * port identifier format is 8 bytes of ID extension followed + * by 8 bytes of GUID. Older drafts put the two halves in the + * opposite order, so that the GUID comes first. + * + * Targets conforming to these obsolete drafts can be + * recognized by the I/O Class they report. + */ + if (target->io_class == SRP_REV10_IB_IO_CLASS) { + memcpy(req->priv.initiator_port_id, + target->srp_host->initiator_port_id + 8, 8); + memcpy(req->priv.initiator_port_id + 8, + target->srp_host->initiator_port_id, 8); + memcpy(req->priv.target_port_id, &target->ioc_guid, 8); + memcpy(req->priv.target_port_id + 8, &target->id_ext, 8); + } else { + memcpy(req->priv.initiator_port_id, + target->srp_host->initiator_port_id, 16); + memcpy(req->priv.target_port_id, &target->id_ext, 8); + memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8); + } + /* * Topspin/Cisco SRP targets will reject our login unless we * zero out the first 8 bytes of our initiator port ID. 
The @@ -334,8 +356,6 @@ static int srp_send_req(struct srp_target_port *target) (unsigned long long) be64_to_cpu(target->ioc_guid)); memset(req->priv.initiator_port_id, 0, 8); } - memcpy(req->priv.target_port_id, &target->id_ext, 8); - memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8); status = ib_send_cm_req(target->cm_id, &req->param); @@ -1511,6 +1531,7 @@ enum { SRP_OPT_SERVICE_ID = 1 << 4, SRP_OPT_MAX_SECT = 1 << 5, SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, + SRP_OPT_IO_CLASS = 1 << 7, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1526,6 +1547,7 @@ static match_table_t srp_opt_tokens = { { SRP_OPT_SERVICE_ID, "service_id=%s" }, { SRP_OPT_MAX_SECT, "max_sect=%d" }, { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, + { SRP_OPT_IO_CLASS, "io_class=%x" }, { SRP_OPT_ERR, NULL } }; @@ -1610,6 +1632,21 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->scsi_host->cmd_per_lun = min(token, SRP_SQ_SIZE); break; + case SRP_OPT_IO_CLASS: + if (match_hex(args, &token)) { + printk(KERN_WARNING PFX "bad IO class parameter '%s' \n", p); + goto out; + } + if (token != SRP_REV10_IB_IO_CLASS && + token != SRP_REV16A_IB_IO_CLASS) { + printk(KERN_WARNING PFX "unknown IO class parameter value" + " %x specified (use %x or %x).\n", + token, SRP_REV10_IB_IO_CLASS, SRP_REV16A_IB_IO_CLASS); + goto out; + } + target->io_class = token; + break; + default: printk(KERN_WARNING PFX "unknown parameter or missing value " "'%s' in target creation request\n", p); @@ -1652,6 +1689,7 @@ static ssize_t srp_create_target(struct class_device *class_dev, target = host_to_target(target_host); memset(target, 0, sizeof *target); + target->io_class = SRP_REV16A_IB_IO_CLASS; target->scsi_host = target_host; target->srp_host = host; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index ad45e4856e1..5b581fb8eb0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -122,6 +122,7 @@ struct srp_target_port { __be64 id_ext; __be64 ioc_guid; __be64 service_id; + u16 io_class; struct srp_host *srp_host; struct Scsi_Host *scsi_host; char target_name[32]; -- cgit v1.2.3 From 526b4caa0a48382115fa9d8f7d8caf68dbcaa2bf Mon Sep 17 00:00:00 2001 From: Ishai Rabinovitz Date: Sat, 17 Jun 2006 20:37:38 -0700 Subject: IB/srp: Factor out common request reset code Misc cleanups in ib_srp: 1) I think that it is more efficient to move the req entries from req_list to free_list in srp_reconnect_target (rather than rebuild the free_list). (In any case this code is shorter). 2) This allows us to reuse code in srp_reset_device and srp_reconnect_target and call a new function srp_reset_req. 
Signed-off-by: Ishai Rabinovitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 40 +++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e7fa8b680cb..4e22afef720 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -471,14 +471,26 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, scmnd->sc_data_direction); } +static void srp_remove_req(struct srp_target_port *target, struct srp_request *req) +{ + srp_unmap_data(req->scmnd, target, req); + list_move_tail(&req->list, &target->free_reqs); +} + +static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) +{ + req->scmnd->result = DID_RESET << 16; + req->scmnd->scsi_done(req->scmnd); + srp_remove_req(target, req); +} + static int srp_reconnect_target(struct srp_target_port *target) { struct ib_cm_id *new_cm_id; struct ib_qp_attr qp_attr; - struct srp_request *req; + struct srp_request *req, *tmp; struct ib_wc wc; int ret; - int i; spin_lock_irq(target->scsi_host->host_lock); if (target->state != SRP_TARGET_LIVE) { @@ -514,19 +526,12 @@ static int srp_reconnect_target(struct srp_target_port *target) while (ib_poll_cq(target->cq, 1, &wc) > 0) ; /* nothing */ - list_for_each_entry(req, &target->req_queue, list) { - req->scmnd->result = DID_RESET << 16; - req->scmnd->scsi_done(req->scmnd); - srp_unmap_data(req->scmnd, target, req); - } + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req); target->rx_head = 0; target->tx_head = 0; target->tx_tail = 0; - INIT_LIST_HEAD(&target->free_reqs); - INIT_LIST_HEAD(&target->req_queue); - for (i = 0; i < SRP_SQ_SIZE; ++i) - list_add_tail(&target->req_ring[i].list, &target->free_reqs); ret = srp_connect_target(target); if (ret) @@ -726,12 +731,6 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, return len; } -static void srp_remove_req(struct srp_target_port *target, struct srp_request *req) -{ - srp_unmap_data(req->scmnd, target, req); - list_move_tail(&req->list, &target->free_reqs); -} - static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) { struct srp_request *req; @@ -1348,11 +1347,8 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) spin_lock_irq(target->scsi_host->host_lock); list_for_each_entry_safe(req, tmp, &target->req_queue, list) - if (req->scmnd->device == scmnd->device) { - req->scmnd->result = DID_RESET << 16; - req->scmnd->scsi_done(req->scmnd); - srp_remove_req(target, req); - } + if (req->scmnd->device == scmnd->device) + srp_reset_req(target, req); spin_unlock_irq(target->scsi_host->host_lock); -- cgit v1.2.3 From 75af9088514432ef0c1052ba3767ceb0beb6f101 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:39 -0700 Subject: IB/ucm: Get rid of duplicate P_Key parameter The P_Key is provided into a SIDR REQ in two places, once as a parameter, and again in the path record. Remove the P_Key as a parameter and always use the one given in the path record. This change has no practical effect on ABI functionality. 
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 2 +- drivers/infiniband/core/ucm.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 1c7463b7c5f..629ed26a017 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -2615,7 +2615,7 @@ static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR)); sidr_req_msg->request_id = cm_id_priv->id.local_id; - sidr_req_msg->pkey = cpu_to_be16(param->pkey); + sidr_req_msg->pkey = cpu_to_be16(param->path->pkey); sidr_req_msg->service_id = param->service_id; if (param->private_data && param->private_data_len) diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 67caf36504e..c1c6fda9452 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -1044,7 +1044,6 @@ static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, param.service_id = cmd.sid; param.timeout_ms = cmd.timeout; param.max_cm_retries = cmd.max_cm_retries; - param.pkey = cmd.pkey; ctx = ib_ucm_ctx_get(file, cmd.id); if (!IS_ERR(ctx)) { -- cgit v1.2.3 From 4e00d69454a8747798de11dc4eeef1edeee5ce98 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:39 -0700 Subject: IB: Add ib_init_ah_from_wc() Add a function to initialize address handle attributes from a work completion. This functionality is duplicated by both verbs and the CM. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/verbs.c | 44 ++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b78e7dc6933..468999c3880 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -125,35 +125,47 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) } EXPORT_SYMBOL(ib_create_ah); -struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, - struct ib_grh *grh, u8 port_num) +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, + struct ib_grh *grh, struct ib_ah_attr *ah_attr) { - struct ib_ah_attr ah_attr; u32 flow_class; u16 gid_index; int ret; - memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.dlid = wc->slid; - ah_attr.sl = wc->sl; - ah_attr.src_path_bits = wc->dlid_path_bits; - ah_attr.port_num = port_num; + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; if (wc->wc_flags & IB_WC_GRH) { - ah_attr.ah_flags = IB_AH_GRH; - ah_attr.grh.dgid = grh->sgid; + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = grh->sgid; - ret = ib_find_cached_gid(pd->device, &grh->dgid, &port_num, + ret = ib_find_cached_gid(device, &grh->dgid, &port_num, &gid_index); if (ret) - return ERR_PTR(ret); + return ret; - ah_attr.grh.sgid_index = (u8) gid_index; + ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); - ah_attr.grh.flow_label = flow_class & 0xFFFFF; - ah_attr.grh.traffic_class = (flow_class >> 20) & 0xFF; - ah_attr.grh.hop_limit = grh->hop_limit; + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = grh->hop_limit; + ah_attr->grh.traffic_class = 
(flow_class >> 20) & 0xFF; } + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_wc); + +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, + struct ib_grh *grh, u8 port_num) +{ + struct ib_ah_attr ah_attr; + int ret; + + ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + if (ret) + return ERR_PTR(ret); return ib_create_ah(pd, &ah_attr); } -- cgit v1.2.3 From 6d969a471ba107d94cf03dab3c69f45b9733f500 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:39 -0700 Subject: IB/sa: Add ib_init_ah_from_path() Add a call to initialize address handle attributes given a path record. This is used by the CM, and would be useful for users of UD QPs. Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/sa_query.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 501cc054cb3..08d9dd5e048 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -47,6 +47,7 @@ #include #include +#include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand subnet administration query support"); @@ -441,6 +442,36 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query) } EXPORT_SYMBOL(ib_sa_cancel_query); +int ib_init_ah_from_path(struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr) +{ + int ret; + u16 gid_index; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = be16_to_cpu(rec->dlid); + ah_attr->sl = rec->sl; + ah_attr->src_path_bits = be16_to_cpu(rec->slid) & 0x7f; + ah_attr->port_num = port_num; + + if (rec->hop_limit > 1) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = rec->dgid; + + ret = ib_find_cached_gid(device, &rec->sgid, &port_num, + &gid_index); + if (ret) + return ret; + + ah_attr->grh.sgid_index = gid_index; + ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); + ah_attr->grh.hop_limit = rec->hop_limit; + ah_attr->grh.traffic_class = rec->traffic_class; + } + return 0; +} +EXPORT_SYMBOL(ib_init_ah_from_path); + static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) { unsigned long flags; -- cgit v1.2.3 From ca222c6b2c48e1e0be330a55611ba394251330cb Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 17 Jun 2006 20:37:40 -0700 Subject: IB/cm: Use address handle helpers Use new ib_init_ah_from_wc() and ib_init_ah_from_path() helper functions to clean up the IB CM. 
Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 25 ++++++++----------------- drivers/infiniband/core/sa_query.c | 6 +++--- 2 files changed, 11 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 629ed26a017..450adfe0a4f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -254,23 +254,13 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } -static void cm_set_ah_attr(struct ib_ah_attr *ah_attr, u8 port_num, - u16 dlid, u8 sl, u16 src_path_bits) -{ - memset(ah_attr, 0, sizeof ah_attr); - ah_attr->dlid = dlid; - ah_attr->sl = sl; - ah_attr->src_path_bits = src_path_bits; - ah_attr->port_num = port_num; -} - -static void cm_init_av_for_response(struct cm_port *port, - struct ib_wc *wc, struct cm_av *av) +static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) { av->port = port; av->pkey_index = wc->pkey_index; - cm_set_ah_attr(&av->ah_attr, port->port_num, wc->slid, - wc->sl, wc->dlid_path_bits); + ib_init_ah_from_wc(port->cm_dev->device, port->port_num, wc, + grh, &av->ah_attr); } static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) @@ -300,9 +290,8 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) return ret; av->port = port; - cm_set_ah_attr(&av->ah_attr, av->port->port_num, - be16_to_cpu(path->dlid), path->sl, - be16_to_cpu(path->slid) & 0x7F); + ib_init_ah_from_path(cm_dev->device, port->port_num, path, + &av->ah_attr); av->packet_life_time = path->packet_life_time; return 0; } @@ -1342,6 +1331,7 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv = container_of(cm_id, struct cm_id_private, id); cm_id_priv->id.remote_id = req_msg->local_comm_id; cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); @@ -2707,6 +2697,7 @@ static int cm_sidr_req_handler(struct cm_work *work) cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); cm_id_priv->av.dgid.global.interface_id = 0; cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); cm_id_priv->id.remote_id = sidr_req_msg->request_id; cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 08d9dd5e048..e911c99ff84 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -463,9 +463,9 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, if (ret) return ret; - ah_attr->grh.sgid_index = gid_index; - ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); - ah_attr->grh.hop_limit = rec->hop_limit; + ah_attr->grh.sgid_index = gid_index; + ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); + ah_attr->grh.hop_limit = rec->hop_limit; ah_attr->grh.traffic_class = rec->traffic_class; } return 0; -- cgit v1.2.3 From 77f76013e3ffda605b20184db5862ce1efcb6f5a Mon Sep 17 00:00:00 2001 From: Ganapathi CH Date: Sat, 17 Jun 2006 20:37:40 -0700 Subject: IB/uverbs: Release lock on error path If ibdev->alloc_ucontext() fails then ib_uverbs_get_context() does not unlock file->mutex before returning error. 
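As background, a minimal standalone sketch of the idiom the fix below relies on (the function and lock names here are hypothetical): every exit path funnels through a single label, so a newly added error return cannot skip the mutex_unlock().

#include <linux/err.h>
#include <linux/mutex.h>

static int setup_under_lock(struct mutex *lock, void *(*alloc)(void))
{
	void *obj;
	int ret = 0;

	mutex_lock(lock);

	obj = alloc();
	if (IS_ERR(obj)) {
		ret = PTR_ERR(obj);
		goto out;		/* still drops the lock below */
	}

	/* ... initialize obj while the lock is held ... */

out:
	mutex_unlock(lock);
	return ret;
}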
Signed-off-by: Ganapathi CH Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9f69bd48eb1..4266d2e5eba 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -80,8 +80,10 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, in_len - sizeof cmd, out_len - sizeof resp); ucontext = ibdev->alloc_ucontext(ibdev, &udata); - if (IS_ERR(ucontext)) - return PTR_ERR(file->ucontext); + if (IS_ERR(ucontext)) { + ret = PTR_ERR(file->ucontext); + goto err; + } ucontext->device = ibdev; INIT_LIST_HEAD(&ucontext->pd_list); -- cgit v1.2.3 From 92b1582268e269b3a9e54e186e740396f0f2012b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:40 -0700 Subject: IB/uverbs: Don't decrement usecnt on error paths In error paths when destroying an object, uverbs should not decrement associated objects' usecnt, since ib_dereg_mr(), ib_destroy_qp(), etc. already do that. Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4266d2e5eba..37824f68e37 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -492,7 +492,6 @@ err_idr: err_unreg: ib_dereg_mr(mr); - atomic_dec(&pd->usecnt); err_up: mutex_unlock(&ib_uverbs_idr_mutex); @@ -986,11 +985,6 @@ err_idr: err_destroy: ib_destroy_qp(qp); - atomic_dec(&pd->usecnt); - atomic_dec(&attr.send_cq->usecnt); - atomic_dec(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_dec(&attr.srq->usecnt); err_up: mutex_unlock(&ib_uverbs_idr_mutex); @@ -1888,7 +1882,6 @@ err_idr: err_destroy: ib_destroy_srq(srq); - atomic_dec(&pd->usecnt); err_up: mutex_unlock(&ib_uverbs_idr_mutex); -- cgit v1.2.3 From 3463175d6ee55fdbd5cda2a03415e2068599b2b7 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:40 -0700 Subject: IB/uverbs: Factor out common idr code Factor out common code for adding a userspace object to an idr into a function idr_add_uobj().
This shrinks both the source and object code: add/remove: 1/0 grow/shrink: 0/6 up/down: 57/-220 (-163) function old new delta idr_add_uobj - 57 +57 ib_uverbs_create_ah 543 512 -31 ib_uverbs_create_srq 662 630 -32 ib_uverbs_reg_mr 737 699 -38 ib_uverbs_create_cq 639 600 -39 ib_uverbs_alloc_pd 485 446 -39 ib_uverbs_create_qp 1020 979 -41 Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 82 ++++++++++-------------------------- 1 file changed, 22 insertions(+), 60 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 37824f68e37..403dd811ec7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -50,6 +50,22 @@ (udata)->outlen = (olen); \ } while (0) +static int idr_add_uobj(struct idr *idr, void *obj, struct ib_uobject *uobj) +{ + int ret; + +retry: + if (!idr_pre_get(idr, GFP_KERNEL)) + return -ENOMEM; + + ret = idr_get_new(idr, uobj, &uobj->id); + + if (ret == -EAGAIN) + goto retry; + + return ret; +} + ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -295,16 +311,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, mutex_lock(&ib_uverbs_idr_mutex); -retry: - if (!idr_pre_get(&ib_uverbs_pd_idr, GFP_KERNEL)) { - ret = -ENOMEM; - goto err_up; - } - - ret = idr_get_new(&ib_uverbs_pd_idr, pd, &uobj->id); - - if (ret == -EAGAIN) - goto retry; + ret = idr_add_uobj(&ib_uverbs_pd_idr, pd, uobj); if (ret) goto err_up; @@ -458,16 +465,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, resp.lkey = mr->lkey; resp.rkey = mr->rkey; -retry: - if (!idr_pre_get(&ib_uverbs_mr_idr, GFP_KERNEL)) { - ret = -ENOMEM; - goto err_unreg; - } - - ret = idr_get_new(&ib_uverbs_mr_idr, mr, &obj->uobject.id); - - if (ret == -EAGAIN) - goto retry; + ret = idr_add_uobj(&ib_uverbs_mr_idr, mr, &obj->uobject); if (ret) goto err_unreg; @@ -632,16 +630,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, mutex_lock(&ib_uverbs_idr_mutex); -retry: - if (!idr_pre_get(&ib_uverbs_cq_idr, GFP_KERNEL)) { - ret = -ENOMEM; - goto err_up; - } - - ret = idr_get_new(&ib_uverbs_cq_idr, cq, &uobj->uobject.id); - - if (ret == -EAGAIN) - goto retry; + ret = idr_add_uobj(&ib_uverbs_cq_idr, cq, &uobj->uobject); if (ret) goto err_up; @@ -946,16 +935,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, memset(&resp, 0, sizeof resp); resp.qpn = qp->qp_num; -retry: - if (!idr_pre_get(&ib_uverbs_qp_idr, GFP_KERNEL)) { - ret = -ENOMEM; - goto err_destroy; - } - - ret = idr_get_new(&ib_uverbs_qp_idr, qp, &uobj->uevent.uobject.id); - - if (ret == -EAGAIN) - goto retry; + ret = idr_add_uobj(&ib_uverbs_qp_idr, qp, &uobj->uevent.uobject); if (ret) goto err_destroy; @@ -1614,16 +1594,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, ah->uobject = uobj; -retry: - if (!idr_pre_get(&ib_uverbs_ah_idr, GFP_KERNEL)) { - ret = -ENOMEM; - goto err_destroy; - } - - ret = idr_get_new(&ib_uverbs_ah_idr, ah, &uobj->id); - - if (ret == -EAGAIN) - goto retry; + ret = idr_add_uobj(&ib_uverbs_ah_idr, ah, uobj); if (ret) goto err_destroy; @@ -1846,16 +1817,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, memset(&resp, 0, sizeof resp); -retry: - if (!idr_pre_get(&ib_uverbs_srq_idr, GFP_KERNEL)) { - ret = -ENOMEM; - goto err_destroy; - } - - ret = idr_get_new(&ib_uverbs_srq_idr, srq, &uobj->uobject.id); - - if (ret == -EAGAIN) - goto retry; + ret = idr_add_uobj(&ib_uverbs_srq_idr, srq, &uobj->uobject); if 
(ret) goto err_destroy; -- cgit v1.2.3 From c9c5d9feef86debee4d8e77a738ad86877cf371a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:41 -0700 Subject: IB/mthca: Fix memory leak on modify_qp error paths Some error paths after the mthca_alloc_mailbox() call in mthca_modify_qp() just do a "return -EINVAL" without freeing the mailbox. Convert these returns to "goto out" to avoid leaking the mailbox storage. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 07c13be07a4..322bc32a93f 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -534,7 +534,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) struct mthca_qp_context *qp_context; u32 sqd_event = 0; u8 status; - int err; + int err = -EINVAL; if (attr_mask & IB_QP_CUR_STATE) { cur_state = attr->cur_qp_state; @@ -618,7 +618,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) { mthca_dbg(dev, "path MTU (%u) is invalid\n", attr->path_mtu); - return -EINVAL; + goto out; } qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31; } @@ -672,7 +672,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (attr_mask & IB_QP_AV) { if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) - return -EINVAL; + goto out; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); } @@ -686,18 +686,18 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (attr->alt_pkey_index >= dev->limits.pkey_table_len) { mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n", attr->alt_pkey_index, dev->limits.pkey_table_len-1); - return -EINVAL; + goto out; } if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) { mthca_dbg(dev, "Alternate port number (%u) is invalid\n", attr->alt_port_num); - return -EINVAL; + goto out; } if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, attr->alt_ah_attr.port_num)) - return -EINVAL; + goto out; qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | attr->alt_port_num << 24); -- cgit v1.2.3 From c93b6fbaa99bb3a1552e14317296be14dde51dfb Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:37:41 -0700 Subject: IB/mthca: Make all device methods truly reentrant Documentation/infiniband/core_locking.txt says: All of the methods in struct ib_device exported by a low-level driver must be fully reentrant. The low-level driver is required to perform all synchronization necessary to maintain consistency, even if multiple function calls using the same object are run simultaneously. However, mthca's modify_qp, modify_srq and resize_cq methods are currently not reentrant. Add a mutex to the QP, SRQ and CQ structures so that these calls can be properly serialized. 
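A minimal sketch of the pattern being applied (the structure and function names are made up, not mthca's): each object carries its own mutex, initialized when the object is created and held across the whole modify operation, so concurrent verbs calls on the same object serialize against each other without any global lock.

#include <linux/errno.h>
#include <linux/mutex.h>

struct my_obj {
	struct mutex	mutex;
	int		state;
};

static void my_obj_init(struct my_obj *obj)
{
	mutex_init(&obj->mutex);
	obj->state = 0;
}

static int my_obj_modify(struct my_obj *obj, int new_state)
{
	int err = 0;

	mutex_lock(&obj->mutex);

	/* Validate and update under the per-object lock. */
	if (new_state < obj->state) {
		err = -EINVAL;
		goto out;
	}

	/* A real driver would issue the firmware command here. */
	obj->state = new_state;

out:
	mutex_unlock(&obj->mutex);
	return err;
}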
Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cq.c | 1 + drivers/infiniband/hw/mthca/mthca_provider.c | 23 ++++++++++++----- drivers/infiniband/hw/mthca/mthca_provider.h | 3 +++ drivers/infiniband/hw/mthca/mthca_qp.c | 38 +++++++++++++++++----------- drivers/infiniband/hw/mthca/mthca_srq.c | 5 ++++ 5 files changed, 48 insertions(+), 22 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index 87a8f1166a3..3e27a084257 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -822,6 +822,7 @@ int mthca_init_cq(struct mthca_dev *dev, int nent, spin_lock_init(&cq->lock); cq->refcount = 1; init_waitqueue_head(&cq->wait); + mutex_init(&cq->mutex); memset(cq_context, 0, sizeof *cq_context); cq_context->flags = cpu_to_be32(MTHCA_CQ_STATUS_OK | diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 8f89ba7c914..230ae21db8f 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -793,18 +793,24 @@ static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *uda if (entries < 1 || entries > dev->limits.max_cqes) return -EINVAL; + mutex_lock(&cq->mutex); + entries = roundup_pow_of_two(entries + 1); - if (entries == ibcq->cqe + 1) - return 0; + if (entries == ibcq->cqe + 1) { + ret = 0; + goto out; + } if (cq->is_kernel) { ret = mthca_alloc_resize_buf(dev, cq, entries); if (ret) - return ret; + goto out; lkey = cq->resize_buf->buf.mr.ibmr.lkey; } else { - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) - return -EFAULT; + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + ret = -EFAULT; + goto out; + } lkey = ucmd.lkey; } @@ -821,7 +827,7 @@ static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *uda cq->resize_buf = NULL; spin_unlock_irq(&cq->lock); } - return ret; + goto out; } if (cq->is_kernel) { @@ -848,7 +854,10 @@ static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *uda } else ibcq->cqe = entries - 1; - return 0; +out: + mutex_unlock(&cq->mutex); + + return ret; } static int mthca_destroy_cq(struct ib_cq *cq) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 179a8f610d0..8de2887ba15 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -214,6 +214,7 @@ struct mthca_cq { int arm_sn; wait_queue_head_t wait; + struct mutex mutex; }; struct mthca_srq { @@ -237,6 +238,7 @@ struct mthca_srq { struct mthca_mr mr; wait_queue_head_t wait; + struct mutex mutex; }; struct mthca_wq { @@ -278,6 +280,7 @@ struct mthca_qp { union mthca_buf queue; wait_queue_head_t wait; + struct mutex mutex; }; struct mthca_sqp { diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 322bc32a93f..16c387d8170 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -536,6 +536,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) u8 status; int err = -EINVAL; + mutex_lock(&qp->mutex); + if (attr_mask & IB_QP_CUR_STATE) { cur_state = attr->cur_qp_state; } else { @@ -553,39 +555,41 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, attr_mask); - return -EINVAL; + goto out; } 
if ((attr_mask & IB_QP_PKEY_INDEX) && attr->pkey_index >= dev->limits.pkey_table_len) { mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n", attr->pkey_index, dev->limits.pkey_table_len-1); - return -EINVAL; + goto out; } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) { mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num); - return -EINVAL; + goto out; } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->limits.max_qp_init_rdma) { mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n", attr->max_rd_atomic, dev->limits.max_qp_init_rdma); - return -EINVAL; + goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) { mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n", attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift); - return -EINVAL; + goto out; } mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto out; + } qp_param = mailbox->buf; qp_context = &qp_param->context; memset(qp_param, 0, sizeof *qp_param); @@ -618,7 +622,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) { mthca_dbg(dev, "path MTU (%u) is invalid\n", attr->path_mtu); - goto out; + goto out_mailbox; } qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31; } @@ -672,7 +676,7 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (attr_mask & IB_QP_AV) { if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) - goto out; + goto out_mailbox; qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH); } @@ -686,18 +690,18 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) if (attr->alt_pkey_index >= dev->limits.pkey_table_len) { mthca_dbg(dev, "Alternate P_Key index (%u) too large. 
max is %d\n", attr->alt_pkey_index, dev->limits.pkey_table_len-1); - goto out; + goto out_mailbox; } if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) { mthca_dbg(dev, "Alternate port number (%u) is invalid\n", attr->alt_port_num); - goto out; + goto out_mailbox; } if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path, attr->alt_ah_attr.port_num)) - goto out; + goto out_mailbox; qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index | attr->alt_port_num << 24); @@ -793,12 +797,12 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0, mailbox, sqd_event, &status); if (err) - goto out; + goto out_mailbox; if (status) { mthca_warn(dev, "modify QP %d->%d returned status %02x.\n", cur_state, new_state, status); err = -EINVAL; - goto out; + goto out_mailbox; } qp->state = new_state; @@ -853,8 +857,11 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) } } -out: +out_mailbox: mthca_free_mailbox(dev, mailbox); + +out: + mutex_unlock(&qp->mutex); return err; } @@ -1100,6 +1107,7 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev, qp->refcount = 1; init_waitqueue_head(&qp->wait); + mutex_init(&qp->mutex); qp->state = IB_QPS_RESET; qp->atomic_rd_en = 0; qp->resp_depth = 0; diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index b292fefa3b4..fab417c5cf4 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -243,6 +243,7 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, spin_lock_init(&srq->lock); srq->refcount = 1; init_waitqueue_head(&srq->wait); + mutex_init(&srq->mutex); if (mthca_is_memfree(dev)) mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf); @@ -371,7 +372,11 @@ int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (attr_mask & IB_SRQ_LIMIT) { if (attr->srq_limit > srq->max) return -EINVAL; + + mutex_lock(&srq->mutex); ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit, &status); + mutex_unlock(&srq->mutex); + if (ret) return ret; if (status) -- cgit v1.2.3 From 9ead190bfde2a434c74ea604382d08acb2eceef5 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 17 Jun 2006 20:44:49 -0700 Subject: IB/uverbs: Don't serialize with ib_uverbs_idr_mutex Currently, all userspace verbs operations that call into the kernel are serialized by ib_uverbs_idr_mutex. This can be a scalability issue for some workloads, especially for devices driven by the ipath driver, which needs to call into the kernel even for datapath operations. Fix this by adding reference counts to the userspace objects, and then converting ib_uverbs_idr_mutex into a spinlock that only protects the idrs long enough to take a reference on the object being looked up. Because remove operations may fail, we have to do a slightly funky two-step deletion, which is described in the comments at the top of uverbs_cmd.c. This also still leaves ib_uverbs_idr_lock as a single lock that is possibly subject to contention. However, the lock hold time will only be a single idr operation, so multiple threads should still be able to make progress, even if ib_uverbs_idr_lock is being ping-ponged. 
Surprisingly, these changes even shrink the object code: add/remove: 23/5 grow/shrink: 4/21 up/down: 633/-693 (-60) Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 4 +- drivers/infiniband/core/uverbs_cmd.c | 890 ++++++++++++++++++++-------------- drivers/infiniband/core/uverbs_main.c | 35 +- 3 files changed, 553 insertions(+), 376 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 3372d67ff13..bb9bee56a82 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -132,7 +132,7 @@ struct ib_ucq_object { u32 async_events_reported; }; -extern struct mutex ib_uverbs_idr_mutex; +extern spinlock_t ib_uverbs_idr_lock; extern struct idr ib_uverbs_pd_idr; extern struct idr ib_uverbs_mr_idr; extern struct idr ib_uverbs_mw_idr; @@ -141,6 +141,8 @@ extern struct idr ib_uverbs_cq_idr; extern struct idr ib_uverbs_qp_idr; extern struct idr ib_uverbs_srq_idr; +void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); + struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, int is_async, int *fd); void ib_uverbs_release_event_file(struct kref *ref); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 403dd811ec7..76bf61e9b55 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -50,7 +50,64 @@ (udata)->outlen = (olen); \ } while (0) -static int idr_add_uobj(struct idr *idr, void *obj, struct ib_uobject *uobj) +/* + * The ib_uobject locking scheme is as follows: + * + * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it + * needs to be held during all idr operations. When an object is + * looked up, a reference must be taken on the object's kref before + * dropping this lock. + * + * - Each object also has an rwsem. This rwsem must be held for + * reading while an operation that uses the object is performed. + * For example, while registering an MR, the associated PD's + * uobject.mutex must be held for reading. The rwsem must be held + * for writing while initializing or destroying an object. + * + * - In addition, each object has a "live" flag. If this flag is not + * set, then lookups of the object will fail even if it is found in + * the idr. This handles a reader that blocks and does not acquire + * the rwsem until after the object is destroyed. The destroy + * operation will set the live flag to 0 and then drop the rwsem; + * this will allow the reader to acquire the rwsem, see that the + * live flag is 0, and then drop the rwsem and its reference to + * object. The underlying storage will not be freed until the last + * reference to the object is dropped. 
+ */ + +static void init_uobj(struct ib_uobject *uobj, u64 user_handle, + struct ib_ucontext *context) +{ + uobj->user_handle = user_handle; + uobj->context = context; + kref_init(&uobj->ref); + init_rwsem(&uobj->mutex); + uobj->live = 0; +} + +static void release_uobj(struct kref *kref) +{ + kfree(container_of(kref, struct ib_uobject, ref)); +} + +static void put_uobj(struct ib_uobject *uobj) +{ + kref_put(&uobj->ref, release_uobj); +} + +static void put_uobj_read(struct ib_uobject *uobj) +{ + up_read(&uobj->mutex); + put_uobj(uobj); +} + +static void put_uobj_write(struct ib_uobject *uobj) +{ + up_write(&uobj->mutex); + put_uobj(uobj); +} + +static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj) { int ret; @@ -58,7 +115,9 @@ retry: if (!idr_pre_get(idr, GFP_KERNEL)) return -ENOMEM; + spin_lock(&ib_uverbs_idr_lock); ret = idr_get_new(idr, uobj, &uobj->id); + spin_unlock(&ib_uverbs_idr_lock); if (ret == -EAGAIN) goto retry; @@ -66,6 +125,121 @@ retry: return ret; } +void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj) +{ + spin_lock(&ib_uverbs_idr_lock); + idr_remove(idr, uobj->id); + spin_unlock(&ib_uverbs_idr_lock); +} + +static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, + struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + spin_lock(&ib_uverbs_idr_lock); + uobj = idr_find(idr, id); + if (uobj) + kref_get(&uobj->ref); + spin_unlock(&ib_uverbs_idr_lock); + + return uobj; +} + +static struct ib_uobject *idr_read_uobj(struct idr *idr, int id, + struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + uobj = __idr_get_uobj(idr, id, context); + if (!uobj) + return NULL; + + down_read(&uobj->mutex); + if (!uobj->live) { + put_uobj_read(uobj); + return NULL; + } + + return uobj; +} + +static struct ib_uobject *idr_write_uobj(struct idr *idr, int id, + struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + uobj = __idr_get_uobj(idr, id, context); + if (!uobj) + return NULL; + + down_write(&uobj->mutex); + if (!uobj->live) { + put_uobj_write(uobj); + return NULL; + } + + return uobj; +} + +static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context) +{ + struct ib_uobject *uobj; + + uobj = idr_read_uobj(idr, id, context); + return uobj ? 
uobj->object : NULL; +} + +static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context); +} + +static void put_pd_read(struct ib_pd *pd) +{ + put_uobj_read(pd->uobject); +} + +static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context); +} + +static void put_cq_read(struct ib_cq *cq) +{ + put_uobj_read(cq->uobject); +} + +static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context); +} + +static void put_ah_read(struct ib_ah *ah) +{ + put_uobj_read(ah->uobject); +} + +static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context); +} + +static void put_qp_read(struct ib_qp *qp) +{ + put_uobj_read(qp->uobject); +} + +static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context); +} + +static void put_srq_read(struct ib_srq *srq) +{ + put_uobj_read(srq->uobject); +} + ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -296,7 +470,8 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, if (!uobj) return -ENOMEM; - uobj->context = file->ucontext; + init_uobj(uobj, 0, file->ucontext); + down_write(&uobj->mutex); pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, file->ucontext, &udata); @@ -309,11 +484,10 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, pd->uobject = uobj; atomic_set(&pd->usecnt, 0); - mutex_lock(&ib_uverbs_idr_mutex); - - ret = idr_add_uobj(&ib_uverbs_pd_idr, pd, uobj); + uobj->object = pd; + ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj); if (ret) - goto err_up; + goto err_idr; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; @@ -321,26 +495,27 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; - goto err_idr; + goto err_copy; } mutex_lock(&file->mutex); list_add_tail(&uobj->list, &file->ucontext->pd_list); mutex_unlock(&file->mutex); - mutex_unlock(&ib_uverbs_idr_mutex); + uobj->live = 1; + + up_write(&uobj->mutex); return in_len; -err_idr: - idr_remove(&ib_uverbs_pd_idr, uobj->id); +err_copy: + idr_remove_uobj(&ib_uverbs_pd_idr, uobj); -err_up: - mutex_unlock(&ib_uverbs_idr_mutex); +err_idr: ib_dealloc_pd(pd); err: - kfree(uobj); + put_uobj_write(uobj); return ret; } @@ -349,37 +524,34 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, int in_len, int out_len) { struct ib_uverbs_dealloc_pd cmd; - struct ib_pd *pd; struct ib_uobject *uobj; - int ret = -EINVAL; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - mutex_lock(&ib_uverbs_idr_mutex); + uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); + if (!uobj) + return -EINVAL; - pd = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle); - if (!pd || pd->uobject->context != file->ucontext) - goto out; + ret = ib_dealloc_pd(uobj->object); + if (!ret) + uobj->live = 0; - uobj = pd->uobject; + put_uobj_write(uobj); - ret = ib_dealloc_pd(pd); if (ret) - goto out; + return ret; - idr_remove(&ib_uverbs_pd_idr, cmd.pd_handle); + idr_remove_uobj(&ib_uverbs_pd_idr, uobj); mutex_lock(&file->mutex); list_del(&uobj->list); mutex_unlock(&file->mutex); - kfree(uobj); + put_uobj(uobj); -out: - 
mutex_unlock(&ib_uverbs_idr_mutex); - - return ret ? ret : in_len; + return in_len; } ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, @@ -419,7 +591,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - obj->uobject.context = file->ucontext; + init_uobj(&obj->uobject, 0, file->ucontext); + down_write(&obj->uobject.mutex); /* * We ask for writable memory if any access flags other than @@ -436,23 +609,14 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, obj->umem.virt_base = cmd.hca_va; - mutex_lock(&ib_uverbs_idr_mutex); - - pd = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle); - if (!pd || pd->uobject->context != file->ucontext) { - ret = -EINVAL; - goto err_up; - } - - if (!pd->device->reg_user_mr) { - ret = -ENOSYS; - goto err_up; - } + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) + goto err_release; mr = pd->device->reg_user_mr(pd, &obj->umem, cmd.access_flags, &udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); - goto err_up; + goto err_put; } mr->device = pd->device; @@ -461,43 +625,48 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, atomic_inc(&pd->usecnt); atomic_set(&mr->usecnt, 0); - memset(&resp, 0, sizeof resp); - resp.lkey = mr->lkey; - resp.rkey = mr->rkey; - - ret = idr_add_uobj(&ib_uverbs_mr_idr, mr, &obj->uobject); + obj->uobject.object = mr; + ret = idr_add_uobj(&ib_uverbs_mr_idr, &obj->uobject); if (ret) goto err_unreg; + memset(&resp, 0, sizeof resp); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; resp.mr_handle = obj->uobject.id; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; - goto err_idr; + goto err_copy; } + put_pd_read(pd); + mutex_lock(&file->mutex); list_add_tail(&obj->uobject.list, &file->ucontext->mr_list); mutex_unlock(&file->mutex); - mutex_unlock(&ib_uverbs_idr_mutex); + obj->uobject.live = 1; + + up_write(&obj->uobject.mutex); return in_len; -err_idr: - idr_remove(&ib_uverbs_mr_idr, obj->uobject.id); +err_copy: + idr_remove_uobj(&ib_uverbs_mr_idr, &obj->uobject); err_unreg: ib_dereg_mr(mr); -err_up: - mutex_unlock(&ib_uverbs_idr_mutex); +err_put: + put_pd_read(pd); +err_release: ib_umem_release(file->device->ib_dev, &obj->umem); err_free: - kfree(obj); + put_uobj_write(&obj->uobject); return ret; } @@ -507,37 +676,40 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, { struct ib_uverbs_dereg_mr cmd; struct ib_mr *mr; + struct ib_uobject *uobj; struct ib_umem_object *memobj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - mutex_lock(&ib_uverbs_idr_mutex); - - mr = idr_find(&ib_uverbs_mr_idr, cmd.mr_handle); - if (!mr || mr->uobject->context != file->ucontext) - goto out; + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext); + if (!uobj) + return -EINVAL; - memobj = container_of(mr->uobject, struct ib_umem_object, uobject); + memobj = container_of(uobj, struct ib_umem_object, uobject); + mr = uobj->object; ret = ib_dereg_mr(mr); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + if (ret) - goto out; + return ret; - idr_remove(&ib_uverbs_mr_idr, cmd.mr_handle); + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); mutex_lock(&file->mutex); - list_del(&memobj->uobject.list); + list_del(&uobj->list); mutex_unlock(&file->mutex); ib_umem_release(file->device->ib_dev, &memobj->umem); - kfree(memobj); -out: - mutex_unlock(&ib_uverbs_idr_mutex); + put_uobj(uobj); - return ret ? 
ret : in_len; + return in_len; } ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, @@ -576,7 +748,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, struct ib_uverbs_create_cq cmd; struct ib_uverbs_create_cq_resp resp; struct ib_udata udata; - struct ib_ucq_object *uobj; + struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file = NULL; struct ib_cq *cq; int ret; @@ -594,10 +766,13 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, if (cmd.comp_vector >= file->device->num_comp_vectors) return -EINVAL; - uobj = kmalloc(sizeof *uobj, GFP_KERNEL); - if (!uobj) + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) return -ENOMEM; + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext); + down_write(&obj->uobject.mutex); + if (cmd.comp_channel >= 0) { ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel); if (!ev_file) { @@ -606,63 +781,64 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, } } - uobj->uobject.user_handle = cmd.user_handle; - uobj->uobject.context = file->ucontext; - uobj->uverbs_file = file; - uobj->comp_events_reported = 0; - uobj->async_events_reported = 0; - INIT_LIST_HEAD(&uobj->comp_list); - INIT_LIST_HEAD(&uobj->async_list); + obj->uverbs_file = file; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe, file->ucontext, &udata); if (IS_ERR(cq)) { ret = PTR_ERR(cq); - goto err; + goto err_file; } cq->device = file->device->ib_dev; - cq->uobject = &uobj->uobject; + cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file; atomic_set(&cq->usecnt, 0); - mutex_lock(&ib_uverbs_idr_mutex); - - ret = idr_add_uobj(&ib_uverbs_cq_idr, cq, &uobj->uobject); + obj->uobject.object = cq; + ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject); if (ret) - goto err_up; + goto err_free; memset(&resp, 0, sizeof resp); - resp.cq_handle = uobj->uobject.id; + resp.cq_handle = obj->uobject.id; resp.cqe = cq->cqe; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; - goto err_idr; + goto err_copy; } mutex_lock(&file->mutex); - list_add_tail(&uobj->uobject.list, &file->ucontext->cq_list); + list_add_tail(&obj->uobject.list, &file->ucontext->cq_list); mutex_unlock(&file->mutex); - mutex_unlock(&ib_uverbs_idr_mutex); + obj->uobject.live = 1; + + up_write(&obj->uobject.mutex); return in_len; -err_idr: - idr_remove(&ib_uverbs_cq_idr, uobj->uobject.id); +err_copy: + idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject); -err_up: - mutex_unlock(&ib_uverbs_idr_mutex); + +err_free: ib_destroy_cq(cq); -err: +err_file: if (ev_file) - ib_uverbs_release_ucq(file, ev_file, uobj); - kfree(uobj); + ib_uverbs_release_ucq(file, ev_file, obj); + +err: + put_uobj_write(&obj->uobject); return ret; } @@ -683,11 +859,9 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - mutex_lock(&ib_uverbs_idr_mutex); - - cq = idr_find(&ib_uverbs_cq_idr, cmd.cq_handle); - if (!cq || cq->uobject->context != file->ucontext || !cq->device->resize_cq) - goto out; + cq = idr_read_cq(cmd.cq_handle, file->ucontext); + if (!cq) + return -EINVAL; ret = cq->device->resize_cq(cq, cmd.cqe, &udata); if (ret) @@ -701,7 +875,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, ret = -EFAULT; out: - 
mutex_unlock(&ib_uverbs_idr_mutex); + put_cq_read(cq); return ret ? ret : in_len; } @@ -712,6 +886,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, { struct ib_uverbs_poll_cq cmd; struct ib_uverbs_poll_cq_resp *resp; + struct ib_uobject *uobj; struct ib_cq *cq; struct ib_wc *wc; int ret = 0; @@ -732,15 +907,17 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, goto out_wc; } - mutex_lock(&ib_uverbs_idr_mutex); - cq = idr_find(&ib_uverbs_cq_idr, cmd.cq_handle); - if (!cq || cq->uobject->context != file->ucontext) { + uobj = idr_read_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext); + if (!uobj) { ret = -EINVAL; goto out; } + cq = uobj->object; resp->count = ib_poll_cq(cq, cmd.ne, wc); + put_uobj_read(uobj); + for (i = 0; i < resp->count; i++) { resp->wc[i].wr_id = wc[i].wr_id; resp->wc[i].status = wc[i].status; @@ -762,7 +939,6 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, ret = -EFAULT; out: - mutex_unlock(&ib_uverbs_idr_mutex); kfree(resp); out_wc: @@ -775,22 +951,23 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, int out_len) { struct ib_uverbs_req_notify_cq cmd; + struct ib_uobject *uobj; struct ib_cq *cq; - int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - mutex_lock(&ib_uverbs_idr_mutex); - cq = idr_find(&ib_uverbs_cq_idr, cmd.cq_handle); - if (cq && cq->uobject->context == file->ucontext) { - ib_req_notify_cq(cq, cmd.solicited_only ? - IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); - ret = in_len; - } - mutex_unlock(&ib_uverbs_idr_mutex); + uobj = idr_read_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext); + if (!uobj) + return -EINVAL; + cq = uobj->object; - return ret; + ib_req_notify_cq(cq, cmd.solicited_only ? + IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); + + put_uobj_read(uobj); + + return in_len; } ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, @@ -799,52 +976,50 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, { struct ib_uverbs_destroy_cq cmd; struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj; struct ib_cq *cq; - struct ib_ucq_object *uobj; + struct ib_ucq_object *obj; struct ib_uverbs_event_file *ev_file; - u64 user_handle; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - memset(&resp, 0, sizeof resp); - - mutex_lock(&ib_uverbs_idr_mutex); + uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext); + if (!uobj) + return -EINVAL; + cq = uobj->object; + ev_file = cq->cq_context; + obj = container_of(cq->uobject, struct ib_ucq_object, uobject); - cq = idr_find(&ib_uverbs_cq_idr, cmd.cq_handle); - if (!cq || cq->uobject->context != file->ucontext) - goto out; + ret = ib_destroy_cq(cq); + if (!ret) + uobj->live = 0; - user_handle = cq->uobject->user_handle; - uobj = container_of(cq->uobject, struct ib_ucq_object, uobject); - ev_file = cq->cq_context; + put_uobj_write(uobj); - ret = ib_destroy_cq(cq); if (ret) - goto out; + return ret; - idr_remove(&ib_uverbs_cq_idr, cmd.cq_handle); + idr_remove_uobj(&ib_uverbs_cq_idr, uobj); mutex_lock(&file->mutex); - list_del(&uobj->uobject.list); + list_del(&uobj->list); mutex_unlock(&file->mutex); - ib_uverbs_release_ucq(file, ev_file, uobj); + ib_uverbs_release_ucq(file, ev_file, obj); - resp.comp_events_reported = uobj->comp_events_reported; - resp.async_events_reported = uobj->async_events_reported; + memset(&resp, 0, sizeof resp); + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; - kfree(uobj); + put_uobj(uobj); if 
(copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) - ret = -EFAULT; - -out: - mutex_unlock(&ib_uverbs_idr_mutex); + return -EFAULT; - return ret ? ret : in_len; + return in_len; } ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, @@ -854,7 +1029,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, struct ib_uverbs_create_qp cmd; struct ib_uverbs_create_qp_resp resp; struct ib_udata udata; - struct ib_uqp_object *uobj; + struct ib_uqp_object *obj; struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; @@ -872,23 +1047,21 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - uobj = kmalloc(sizeof *uobj, GFP_KERNEL); - if (!uobj) + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) return -ENOMEM; - mutex_lock(&ib_uverbs_idr_mutex); + init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext); + down_write(&obj->uevent.uobject.mutex); - pd = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle); - scq = idr_find(&ib_uverbs_cq_idr, cmd.send_cq_handle); - rcq = idr_find(&ib_uverbs_cq_idr, cmd.recv_cq_handle); - srq = cmd.is_srq ? idr_find(&ib_uverbs_srq_idr, cmd.srq_handle) : NULL; + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + scq = idr_read_cq(cmd.send_cq_handle, file->ucontext); + rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext); + srq = cmd.is_srq ? idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; - if (!pd || pd->uobject->context != file->ucontext || - !scq || scq->uobject->context != file->ucontext || - !rcq || rcq->uobject->context != file->ucontext || - (cmd.is_srq && (!srq || srq->uobject->context != file->ucontext))) { + if (!pd || !scq || !rcq || (cmd.is_srq && !srq)) { ret = -EINVAL; - goto err_up; + goto err_put; } attr.event_handler = ib_uverbs_qp_event_handler; @@ -905,16 +1078,14 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, attr.cap.max_recv_sge = cmd.max_recv_sge; attr.cap.max_inline_data = cmd.max_inline_data; - uobj->uevent.uobject.user_handle = cmd.user_handle; - uobj->uevent.uobject.context = file->ucontext; - uobj->uevent.events_reported = 0; - INIT_LIST_HEAD(&uobj->uevent.event_list); - INIT_LIST_HEAD(&uobj->mcast_list); + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); qp = pd->device->create_qp(pd, &attr, &udata); if (IS_ERR(qp)) { ret = PTR_ERR(qp); - goto err_up; + goto err_put; } qp->device = pd->device; @@ -922,7 +1093,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; qp->srq = attr.srq; - qp->uobject = &uobj->uevent.uobject; + qp->uobject = &obj->uevent.uobject; qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; @@ -932,14 +1103,14 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, if (attr.srq) atomic_inc(&attr.srq->usecnt); - memset(&resp, 0, sizeof resp); - resp.qpn = qp->qp_num; - - ret = idr_add_uobj(&ib_uverbs_qp_idr, qp, &uobj->uevent.uobject); + obj->uevent.uobject.object = qp; + ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); if (ret) goto err_destroy; - resp.qp_handle = uobj->uevent.uobject.id; + memset(&resp, 0, sizeof resp); + resp.qpn = qp->qp_num; + resp.qp_handle = obj->uevent.uobject.id; resp.max_recv_sge = attr.cap.max_recv_sge; resp.max_send_sge = attr.cap.max_send_sge; resp.max_recv_wr = attr.cap.max_recv_wr; @@ -949,27 +1120,42 @@ ssize_t ib_uverbs_create_qp(struct 
ib_uverbs_file *file, if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; - goto err_idr; + goto err_copy; } + put_pd_read(pd); + put_cq_read(scq); + put_cq_read(rcq); + if (srq) + put_srq_read(srq); + mutex_lock(&file->mutex); - list_add_tail(&uobj->uevent.uobject.list, &file->ucontext->qp_list); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); mutex_unlock(&file->mutex); - mutex_unlock(&ib_uverbs_idr_mutex); + obj->uevent.uobject.live = 1; + + up_write(&obj->uevent.uobject.mutex); return in_len; -err_idr: - idr_remove(&ib_uverbs_qp_idr, uobj->uevent.uobject.id); +err_copy: + idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); err_destroy: ib_destroy_qp(qp); -err_up: - mutex_unlock(&ib_uverbs_idr_mutex); - - kfree(uobj); +err_put: + if (pd) + put_pd_read(pd); + if (scq) + put_cq_read(scq); + if (rcq) + put_cq_read(rcq); + if (srq) + put_srq_read(srq); + + put_uobj_write(&obj->uevent.uobject); return ret; } @@ -994,15 +1180,15 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, goto out; } - mutex_lock(&ib_uverbs_idr_mutex); - - qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle); - if (qp && qp->uobject->context == file->ucontext) - ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); - else + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { ret = -EINVAL; + goto out; + } + + ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); - mutex_unlock(&ib_uverbs_idr_mutex); + put_qp_read(qp); if (ret) goto out; @@ -1089,10 +1275,8 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, if (!attr) return -ENOMEM; - mutex_lock(&ib_uverbs_idr_mutex); - - qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle); - if (!qp || qp->uobject->context != file->ucontext) { + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { ret = -EINVAL; goto out; } @@ -1144,13 +1328,15 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; ret = ib_modify_qp(qp, attr, cmd.attr_mask); + + put_qp_read(qp); + if (ret) goto out; ret = in_len; out: - mutex_unlock(&ib_uverbs_idr_mutex); kfree(attr); return ret; @@ -1162,8 +1348,9 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, { struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp resp; + struct ib_uobject *uobj; struct ib_qp *qp; - struct ib_uqp_object *uobj; + struct ib_uqp_object *obj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -1171,43 +1358,43 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, memset(&resp, 0, sizeof resp); - mutex_lock(&ib_uverbs_idr_mutex); - - qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle); - if (!qp || qp->uobject->context != file->ucontext) - goto out; - - uobj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext); + if (!uobj) + return -EINVAL; + qp = uobj->object; + obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); - if (!list_empty(&uobj->mcast_list)) { - ret = -EBUSY; - goto out; + if (!list_empty(&obj->mcast_list)) { + put_uobj_write(uobj); + return -EBUSY; } ret = ib_destroy_qp(qp); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + if (ret) - goto out; + return ret; - idr_remove(&ib_uverbs_qp_idr, cmd.qp_handle); + idr_remove_uobj(&ib_uverbs_qp_idr, uobj); mutex_lock(&file->mutex); - list_del(&uobj->uevent.uobject.list); + list_del(&uobj->list); mutex_unlock(&file->mutex); - ib_uverbs_release_uevent(file, &uobj->uevent); + 
ib_uverbs_release_uevent(file, &obj->uevent); - resp.events_reported = uobj->uevent.events_reported; + resp.events_reported = obj->uevent.events_reported; - kfree(uobj); + put_uobj(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) - ret = -EFAULT; - -out: - mutex_unlock(&ib_uverbs_idr_mutex); + return -EFAULT; - return ret ? ret : in_len; + return in_len; } ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, @@ -1220,6 +1407,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; struct ib_qp *qp; int i, sg_ind; + int is_ud; ssize_t ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -1236,12 +1424,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, if (!user_wr) return -ENOMEM; - mutex_lock(&ib_uverbs_idr_mutex); - - qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle); - if (!qp || qp->uobject->context != file->ucontext) + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) goto out; + is_ud = qp->qp_type == IB_QPT_UD; sg_ind = 0; last = NULL; for (i = 0; i < cmd.wr_count; ++i) { @@ -1249,12 +1436,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, buf + sizeof cmd + i * cmd.wqe_size, cmd.wqe_size)) { ret = -EFAULT; - goto out; + goto out_put; } if (user_wr->num_sge + sg_ind > cmd.sge_count) { ret = -EINVAL; - goto out; + goto out_put; } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + @@ -1262,7 +1449,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, GFP_KERNEL); if (!next) { ret = -ENOMEM; - goto out; + goto out_put; } if (!last) @@ -1278,12 +1465,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, next->send_flags = user_wr->send_flags; next->imm_data = (__be32 __force) user_wr->imm_data; - if (qp->qp_type == IB_QPT_UD) { - next->wr.ud.ah = idr_find(&ib_uverbs_ah_idr, - user_wr->wr.ud.ah); + if (is_ud) { + next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, + file->ucontext); if (!next->wr.ud.ah) { ret = -EINVAL; - goto out; + goto out_put; } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; @@ -1320,7 +1507,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, sg_ind * sizeof (struct ib_sge), next->num_sge * sizeof (struct ib_sge))) { ret = -EFAULT; - goto out; + goto out_put; } sg_ind += next->num_sge; } else @@ -1340,10 +1527,13 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, &resp, sizeof resp)) ret = -EFAULT; -out: - mutex_unlock(&ib_uverbs_idr_mutex); +out_put: + put_qp_read(qp); +out: while (wr) { + if (is_ud && wr->wr.ud.ah) + put_ah_read(wr->wr.ud.ah); next = wr->next; kfree(wr); wr = next; @@ -1458,14 +1648,15 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - mutex_lock(&ib_uverbs_idr_mutex); - - qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle); - if (!qp || qp->uobject->context != file->ucontext) + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) goto out; resp.bad_wr = 0; ret = qp->device->post_recv(qp, wr, &bad_wr); + + put_qp_read(qp); + if (ret) for (next = wr; next; next = next->next) { ++resp.bad_wr; @@ -1479,8 +1670,6 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, ret = -EFAULT; out: - mutex_unlock(&ib_uverbs_idr_mutex); - while (wr) { next = wr->next; kfree(wr); @@ -1509,14 +1698,15 @@ ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - mutex_lock(&ib_uverbs_idr_mutex); - - srq = idr_find(&ib_uverbs_srq_idr, 
						     cmd.srq_handle);
-	if (!srq || srq->uobject->context != file->ucontext)
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
 		goto out;
 
 	resp.bad_wr = 0;
 	ret = srq->device->post_srq_recv(srq, wr, &bad_wr);
+
+	put_srq_read(srq);
+
 	if (ret)
 		for (next = wr; next; next = next->next) {
 			++resp.bad_wr;
@@ -1530,8 +1720,6 @@ ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
 		ret = -EFAULT;
 
 out:
-	mutex_unlock(&ib_uverbs_idr_mutex);
-
 	while (wr) {
 		next = wr->next;
 		kfree(wr);
@@ -1563,17 +1751,15 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 	if (!uobj)
 		return -ENOMEM;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
+	init_uobj(uobj, cmd.user_handle, file->ucontext);
+	down_write(&uobj->mutex);
 
-	pd = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle);
-	if (!pd || pd->uobject->context != file->ucontext) {
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
 		ret = -EINVAL;
-		goto err_up;
+		goto err;
 	}
 
-	uobj->user_handle = cmd.user_handle;
-	uobj->context = file->ucontext;
-
 	attr.dlid = cmd.attr.dlid;
 	attr.sl = cmd.attr.sl;
 	attr.src_path_bits = cmd.attr.src_path_bits;
@@ -1589,12 +1775,13 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 	ah = ib_create_ah(pd, &attr);
 	if (IS_ERR(ah)) {
 		ret = PTR_ERR(ah);
-		goto err_up;
+		goto err;
 	}
 
-	ah->uobject = uobj;
+	ah->uobject  = uobj;
+	uobj->object = ah;
 
-	ret = idr_add_uobj(&ib_uverbs_ah_idr, ah, uobj);
+	ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
 	if (ret)
 		goto err_destroy;
 
@@ -1603,27 +1790,29 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp)) {
 		ret = -EFAULT;
-		goto err_idr;
+		goto err_copy;
 	}
 
+	put_pd_read(pd);
+
 	mutex_lock(&file->mutex);
 	list_add_tail(&uobj->list, &file->ucontext->ah_list);
 	mutex_unlock(&file->mutex);
 
-	mutex_unlock(&ib_uverbs_idr_mutex);
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
 
 	return in_len;
 
-err_idr:
-	idr_remove(&ib_uverbs_ah_idr, uobj->id);
+err_copy:
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 
 err_destroy:
 	ib_destroy_ah(ah);
 
-err_up:
-	mutex_unlock(&ib_uverbs_idr_mutex);
-
-	kfree(uobj);
+err:
+	put_uobj_write(uobj);
 	return ret;
 }
 
@@ -1633,35 +1822,34 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 	struct ib_uverbs_destroy_ah cmd;
 	struct ib_ah *ah;
 	struct ib_uobject *uobj;
-	int ret = -EINVAL;
+	int ret;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
+	uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	ah = uobj->object;
 
-	ah = idr_find(&ib_uverbs_ah_idr, cmd.ah_handle);
-	if (!ah || ah->uobject->context != file->ucontext)
-		goto out;
+	ret = ib_destroy_ah(ah);
+	if (!ret)
+		uobj->live = 0;
 
-	uobj = ah->uobject;
+	put_uobj_write(uobj);
 
-	ret = ib_destroy_ah(ah);
 	if (ret)
-		goto out;
+		return ret;
 
-	idr_remove(&ib_uverbs_ah_idr, cmd.ah_handle);
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 
 	mutex_lock(&file->mutex);
 	list_del(&uobj->list);
 	mutex_unlock(&file->mutex);
 
-	kfree(uobj);
+	put_uobj(uobj);
 
-out:
-	mutex_unlock(&ib_uverbs_idr_mutex);
-
-	return ret ? ret : in_len;
+	return in_len;
 }
 
 ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
@@ -1670,47 +1858,43 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
 {
 	struct ib_uverbs_attach_mcast cmd;
 	struct ib_qp *qp;
-	struct ib_uqp_object *uobj;
+	struct ib_uqp_object *obj;
 	struct ib_uverbs_mcast_entry *mcast;
-	int ret = -EINVAL;
+	int ret;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
-
-	qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle);
-	if (!qp || qp->uobject->context != file->ucontext)
-		goto out;
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
 
-	uobj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+	obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
 
-	list_for_each_entry(mcast, &uobj->mcast_list, list)
+	list_for_each_entry(mcast, &obj->mcast_list, list)
 		if (cmd.mlid == mcast->lid &&
 		    !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
 			ret = 0;
-			goto out;
+			goto out_put;
 		}
 
 	mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
 	if (!mcast) {
 		ret = -ENOMEM;
-		goto out;
+		goto out_put;
 	}
 
 	mcast->lid = cmd.mlid;
 	memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
 
 	ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
-	if (!ret) {
-		uobj = container_of(qp->uobject, struct ib_uqp_object,
-				    uevent.uobject);
-		list_add_tail(&mcast->list, &uobj->mcast_list);
-	} else
+	if (!ret)
+		list_add_tail(&mcast->list, &obj->mcast_list);
+	else
 		kfree(mcast);
 
-out:
-	mutex_unlock(&ib_uverbs_idr_mutex);
+out_put:
+	put_qp_read(qp);
 
 	return ret ? ret : in_len;
 }
@@ -1720,7 +1904,7 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
 			       int out_len)
 {
 	struct ib_uverbs_detach_mcast cmd;
-	struct ib_uqp_object *uobj;
+	struct ib_uqp_object *obj;
 	struct ib_qp *qp;
 	struct ib_uverbs_mcast_entry *mcast;
 	int ret = -EINVAL;
@@ -1728,19 +1912,17 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
-
-	qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle);
-	if (!qp || qp->uobject->context != file->ucontext)
-		goto out;
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
 
 	ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
 	if (ret)
-		goto out;
+		goto out_put;
 
-	uobj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+	obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
 
-	list_for_each_entry(mcast, &uobj->mcast_list, list)
+	list_for_each_entry(mcast, &obj->mcast_list, list)
 		if (cmd.mlid == mcast->lid &&
 		    !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
 			list_del(&mcast->list);
@@ -1748,8 +1930,8 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
 			break;
 		}
 
-out:
-	mutex_unlock(&ib_uverbs_idr_mutex);
+out_put:
+	put_qp_read(qp);
 
 	return ret ? ret : in_len;
 }
@@ -1761,7 +1943,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 	struct ib_uverbs_create_srq cmd;
 	struct ib_uverbs_create_srq_resp resp;
 	struct ib_udata udata;
-	struct ib_uevent_object *uobj;
+	struct ib_uevent_object *obj;
 	struct ib_pd *pd;
 	struct ib_srq *srq;
 	struct ib_srq_init_attr attr;
@@ -1777,17 +1959,17 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
-	if (!uobj)
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
 		return -ENOMEM;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
-
-	pd = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle);
+	init_uobj(&obj->uobject, 0, file->ucontext);
+	down_write(&obj->uobject.mutex);
 
-	if (!pd || pd->uobject->context != file->ucontext) {
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
 		ret = -EINVAL;
-		goto err_up;
+		goto err;
 	}
 
 	attr.event_handler = ib_uverbs_srq_event_handler;
@@ -1796,59 +1978,59 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 	attr.attr.max_sge = cmd.max_sge;
 	attr.attr.srq_limit = cmd.srq_limit;
 
-	uobj->uobject.user_handle = cmd.user_handle;
-	uobj->uobject.context = file->ucontext;
-	uobj->events_reported = 0;
-	INIT_LIST_HEAD(&uobj->event_list);
+	obj->events_reported = 0;
+	INIT_LIST_HEAD(&obj->event_list);
 
 	srq = pd->device->create_srq(pd, &attr, &udata);
 	if (IS_ERR(srq)) {
 		ret = PTR_ERR(srq);
-		goto err_up;
+		goto err;
 	}
 
 	srq->device = pd->device;
 	srq->pd = pd;
-	srq->uobject = &uobj->uobject;
+	srq->uobject = &obj->uobject;
 	srq->event_handler = attr.event_handler;
 	srq->srq_context = attr.srq_context;
 	atomic_inc(&pd->usecnt);
 	atomic_set(&srq->usecnt, 0);
 
-	memset(&resp, 0, sizeof resp);
-
-	ret = idr_add_uobj(&ib_uverbs_srq_idr, srq, &uobj->uobject);
+	obj->uobject.object = srq;
+	ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
 	if (ret)
 		goto err_destroy;
 
-	resp.srq_handle = uobj->uobject.id;
+	memset(&resp, 0, sizeof resp);
+	resp.srq_handle = obj->uobject.id;
 	resp.max_wr = attr.attr.max_wr;
 	resp.max_sge = attr.attr.max_sge;
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp)) {
 		ret = -EFAULT;
-		goto err_idr;
+		goto err_copy;
 	}
 
+	put_pd_read(pd);
+
 	mutex_lock(&file->mutex);
-	list_add_tail(&uobj->uobject.list, &file->ucontext->srq_list);
+	list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
 	mutex_unlock(&file->mutex);
 
-	mutex_unlock(&ib_uverbs_idr_mutex);
+	obj->uobject.live = 1;
+
+	up_write(&obj->uobject.mutex);
 
 	return in_len;
 
-err_idr:
-	idr_remove(&ib_uverbs_srq_idr, uobj->uobject.id);
+err_copy:
	idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
 
 err_destroy:
 	ib_destroy_srq(srq);
 
-err_up:
-	mutex_unlock(&ib_uverbs_idr_mutex);
-
-	kfree(uobj);
+err:
+	put_uobj_write(&obj->uobject);
 
 	return ret;
 }
@@ -1864,21 +2046,16 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
-
-	srq = idr_find(&ib_uverbs_srq_idr, cmd.srq_handle);
-	if (!srq || srq->uobject->context != file->ucontext) {
-		ret = -EINVAL;
-		goto out;
-	}
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
 
 	attr.max_wr = cmd.max_wr;
 	attr.srq_limit = cmd.srq_limit;
 
 	ret = ib_modify_srq(srq, &attr, cmd.attr_mask);
 
-out:
-	mutex_unlock(&ib_uverbs_idr_mutex);
+	put_srq_read(srq);
 
 	return ret ? ret : in_len;
 }
@@ -1899,18 +2076,16 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
 
-	srq = idr_find(&ib_uverbs_srq_idr, cmd.srq_handle);
-	if (srq && srq->uobject->context == file->ucontext)
-		ret = ib_query_srq(srq, &attr);
-	else
-		ret = -EINVAL;
+	ret = ib_query_srq(srq, &attr);
 
-	mutex_unlock(&ib_uverbs_idr_mutex);
+	put_srq_read(srq);
 
 	if (ret)
-		goto out;
+		return ret;
 
 	memset(&resp, 0, sizeof resp);
 
@@ -1920,10 +2095,9 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp))
-		ret = -EFAULT;
+		return -EFAULT;
 
-out:
-	return ret ? ret : in_len;
+	return in_len;
 }
 
 ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
@@ -1932,45 +2106,45 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 {
 	struct ib_uverbs_destroy_srq cmd;
 	struct ib_uverbs_destroy_srq_resp resp;
+	struct ib_uobject *uobj;
 	struct ib_srq *srq;
-	struct ib_uevent_object *uobj;
+	struct ib_uevent_object *obj;
 	int ret = -EINVAL;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
-
-	memset(&resp, 0, sizeof resp);
+	uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	srq = uobj->object;
+	obj = container_of(uobj, struct ib_uevent_object, uobject);
 
-	srq = idr_find(&ib_uverbs_srq_idr, cmd.srq_handle);
-	if (!srq || srq->uobject->context != file->ucontext)
-		goto out;
+	ret = ib_destroy_srq(srq);
+	if (!ret)
+		uobj->live = 0;
 
-	uobj = container_of(srq->uobject, struct ib_uevent_object, uobject);
+	put_uobj_write(uobj);
 
-	ret = ib_destroy_srq(srq);
 	if (ret)
-		goto out;
+		return ret;
 
-	idr_remove(&ib_uverbs_srq_idr, cmd.srq_handle);
+	idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
 
 	mutex_lock(&file->mutex);
-	list_del(&uobj->uobject.list);
+	list_del(&uobj->list);
 	mutex_unlock(&file->mutex);
 
-	ib_uverbs_release_uevent(file, uobj);
+	ib_uverbs_release_uevent(file, obj);
 
-	resp.events_reported = uobj->events_reported;
+	memset(&resp, 0, sizeof resp);
+	resp.events_reported = obj->events_reported;
 
-	kfree(uobj);
+	put_uobj(uobj);
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp))
 		ret = -EFAULT;
 
-out:
-	mutex_unlock(&ib_uverbs_idr_mutex);
-
 	return ret ? ret : in_len;
 }
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index ff092a0a94d..5ec2d49e9bb 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -66,7 +66,7 @@ enum {
 
 static struct class *uverbs_class;
 
-DEFINE_MUTEX(ib_uverbs_idr_mutex);
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
 DEFINE_IDR(ib_uverbs_pd_idr);
 DEFINE_IDR(ib_uverbs_mr_idr);
 DEFINE_IDR(ib_uverbs_mw_idr);
@@ -183,21 +183,21 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 	if (!context)
 		return 0;
 
-	mutex_lock(&ib_uverbs_idr_mutex);
-
 	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
-		struct ib_ah *ah = idr_find(&ib_uverbs_ah_idr, uobj->id);
-		idr_remove(&ib_uverbs_ah_idr, uobj->id);
+		struct ib_ah *ah = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
 		ib_destroy_ah(ah);
 		list_del(&uobj->list);
 		kfree(uobj);
 	}
 
 	list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
-		struct ib_qp *qp = idr_find(&ib_uverbs_qp_idr, uobj->id);
+		struct ib_qp *qp = uobj->object;
 		struct ib_uqp_object *uqp =
 			container_of(uobj, struct ib_uqp_object, uevent.uobject);
-		idr_remove(&ib_uverbs_qp_idr, uobj->id);
+
+		idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
 		ib_uverbs_detach_umcast(qp, uqp);
 		ib_destroy_qp(qp);
 		list_del(&uobj->list);
@@ -206,11 +206,12 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 	list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
-		struct ib_cq *cq = idr_find(&ib_uverbs_cq_idr, uobj->id);
+		struct ib_cq *cq = uobj->object;
 		struct ib_uverbs_event_file *ev_file = cq->cq_context;
 		struct ib_ucq_object *ucq =
 			container_of(uobj, struct ib_ucq_object, uobject);
-		idr_remove(&ib_uverbs_cq_idr, uobj->id);
+
+		idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
 		ib_destroy_cq(cq);
 		list_del(&uobj->list);
 		ib_uverbs_release_ucq(file, ev_file, ucq);
@@ -218,10 +219,11 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 	list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
-		struct ib_srq *srq = idr_find(&ib_uverbs_srq_idr, uobj->id);
+		struct ib_srq *srq = uobj->object;
 		struct ib_uevent_object *uevent =
 			container_of(uobj, struct ib_uevent_object, uobject);
-		idr_remove(&ib_uverbs_srq_idr, uobj->id);
+
+		idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
 		ib_destroy_srq(srq);
 		list_del(&uobj->list);
 		ib_uverbs_release_uevent(file, uevent);
@@ -231,11 +233,11 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 	/* XXX Free MWs */
 
 	list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
-		struct ib_mr *mr = idr_find(&ib_uverbs_mr_idr, uobj->id);
+		struct ib_mr *mr = uobj->object;
 		struct ib_device *mrdev = mr->device;
 		struct ib_umem_object *memobj;
 
-		idr_remove(&ib_uverbs_mr_idr, uobj->id);
+		idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
 		ib_dereg_mr(mr);
 
 		memobj = container_of(uobj, struct ib_umem_object, uobject);
@@ -246,15 +248,14 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 
 	list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
-		struct ib_pd *pd = idr_find(&ib_uverbs_pd_idr, uobj->id);
-		idr_remove(&ib_uverbs_pd_idr, uobj->id);
+		struct ib_pd *pd = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
 		ib_dealloc_pd(pd);
 		list_del(&uobj->list);
 		kfree(uobj);
 	}
 
-	mutex_unlock(&ib_uverbs_idr_mutex);
-
 	return context->device->dealloc_ucontext(context);
 }
-- cgit v1.2.3