From a3285aa4eecd722508dab01c4932b11b4ba80134 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 9 May 2006 10:50:29 -0700 Subject: IB/mthca: Fix race in reference counting Fix races in in destroying various objects. If a destroy routine waits for an object to become free by doing wait_event(&obj->wait, !atomic_read(&obj->refcount)); /* now clean up and destroy the object */ and another place drops a reference to the object by doing if (atomic_dec_and_test(&obj->refcount)) wake_up(&obj->wait); then this is susceptible to a race where the wait_event() and final freeing of the object occur between the atomic_dec_and_test() and the wake_up(). And this is a use-after-free, since wake_up() will be called on part of the already-freed object. Fix this in mthca by replacing the atomic_t refcounts with plain old integers protected by a spinlock. This makes it possible to do the decrement of the reference count and the wake_up() so that it appears as a single atomic operation to the code waiting on the wait queue. While touching this code, also simplify mthca_cq_clean(): the CQ being cleaned cannot go away, because it still has a QP attached to it. So there's no reason to be paranoid and look up the CQ by number; it's perfectly safe to use the pointer that the callers already have. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cq.c | 41 ++++++++++++++-------------- drivers/infiniband/hw/mthca/mthca_dev.h | 2 +- drivers/infiniband/hw/mthca/mthca_provider.h | 22 ++++++++------- drivers/infiniband/hw/mthca/mthca_qp.c | 31 +++++++++++++++------ drivers/infiniband/hw/mthca/mthca_srq.c | 23 ++++++++++++---- 5 files changed, 74 insertions(+), 45 deletions(-) (limited to 'drivers/infiniband/hw/mthca') diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index 312cf90731e..205854e9c66 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -238,9 +238,9 @@ void mthca_cq_event(struct mthca_dev *dev, u32 cqn, spin_lock(&dev->cq_table.lock); cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); - if (cq) - atomic_inc(&cq->refcount); + ++cq->refcount; + spin_unlock(&dev->cq_table.lock); if (!cq) { @@ -254,8 +254,10 @@ void mthca_cq_event(struct mthca_dev *dev, u32 cqn, if (cq->ibcq.event_handler) cq->ibcq.event_handler(&event, cq->ibcq.cq_context); - if (atomic_dec_and_test(&cq->refcount)) + spin_lock(&dev->cq_table.lock); + if (!--cq->refcount) wake_up(&cq->wait); + spin_unlock(&dev->cq_table.lock); } static inline int is_recv_cqe(struct mthca_cqe *cqe) @@ -267,23 +269,13 @@ static inline int is_recv_cqe(struct mthca_cqe *cqe) return !(cqe->is_send & 0x80); } -void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn, +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, struct mthca_srq *srq) { - struct mthca_cq *cq; struct mthca_cqe *cqe; u32 prod_index; int nfreed = 0; - spin_lock_irq(&dev->cq_table.lock); - cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1)); - if (cq) - atomic_inc(&cq->refcount); - spin_unlock_irq(&dev->cq_table.lock); - - if (!cq) - return; - spin_lock_irq(&cq->lock); /* @@ -301,7 +293,7 @@ void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn, if (0) mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n", - qpn, cqn, cq->cons_index, prod_index); + qpn, cq->cqn, cq->cons_index, prod_index); /* * Now sweep backwards through the CQ, removing CQ entries @@ -325,8 +317,6 @@ void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn, } spin_unlock_irq(&cq->lock); - if (atomic_dec_and_test(&cq->refcount)) - wake_up(&cq->wait); } void mthca_cq_resize_copy_cqes(struct mthca_cq *cq) @@ -821,7 +811,7 @@ int mthca_init_cq(struct mthca_dev *dev, int nent, } spin_lock_init(&cq->lock); - atomic_set(&cq->refcount, 1); + cq->refcount = 1; init_waitqueue_head(&cq->wait); memset(cq_context, 0, sizeof *cq_context); @@ -896,6 +886,17 @@ err_out: return err; } +static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq) +{ + int c; + + spin_lock_irq(&dev->cq_table.lock); + c = cq->refcount; + spin_unlock_irq(&dev->cq_table.lock); + + return c; +} + void mthca_free_cq(struct mthca_dev *dev, struct mthca_cq *cq) { @@ -929,6 +930,7 @@ void mthca_free_cq(struct mthca_dev *dev, spin_lock_irq(&dev->cq_table.lock); mthca_array_clear(&dev->cq_table.cq, cq->cqn & (dev->limits.num_cqs - 1)); + --cq->refcount; spin_unlock_irq(&dev->cq_table.lock); if (dev->mthca_flags & MTHCA_FLAG_MSI_X) @@ -936,8 +938,7 @@ void mthca_free_cq(struct mthca_dev *dev, else synchronize_irq(dev->pdev->irq); - atomic_dec(&cq->refcount); - wait_event(cq->wait, !atomic_read(&cq->refcount)); + wait_event(cq->wait, !get_cq_refcount(dev, cq)); if (cq->is_kernel) { mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 4c1dcb4c182..f8160b8de09 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -496,7 +496,7 @@ void mthca_free_cq(struct mthca_dev *dev, void mthca_cq_completion(struct mthca_dev *dev, u32 cqn); void mthca_cq_event(struct mthca_dev *dev, u32 cqn, enum ib_event_type event_type); -void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn, +void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn, struct mthca_srq *srq); void mthca_cq_resize_copy_cqes(struct mthca_cq *cq); int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 6676a786d69..179a8f610d0 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -139,11 +139,12 @@ struct mthca_ah { * a qp may be locked, with the send cq locked first. No other * nesting should be done. * - * Each struct mthca_cq/qp also has an atomic_t ref count. The - * pointer from the cq/qp_table to the struct counts as one reference. - * This reference also is good for access through the consumer API, so - * modifying the CQ/QP etc doesn't need to take another reference. - * Access because of a completion being polled does need a reference. + * Each struct mthca_cq/qp also has an ref count, protected by the + * corresponding table lock. The pointer from the cq/qp_table to the + * struct counts as one reference. This reference also is good for + * access through the consumer API, so modifying the CQ/QP etc doesn't + * need to take another reference. Access to a QP because of a + * completion being polled does not need a reference either. * * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the * destroy function to sleep on. @@ -159,8 +160,9 @@ struct mthca_ah { * - decrement ref count; if zero, wake up waiters * * To destroy a CQ/QP, we can do the following: - * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock - * - decrement ref count + * - lock cq/qp_table + * - remove pointer and decrement ref count + * - unlock cq/qp_table lock * - wait_event until ref count is zero * * It is the consumer's responsibilty to make sure that no QP @@ -197,7 +199,7 @@ struct mthca_cq_resize { struct mthca_cq { struct ib_cq ibcq; spinlock_t lock; - atomic_t refcount; + int refcount; int cqn; u32 cons_index; struct mthca_cq_buf buf; @@ -217,7 +219,7 @@ struct mthca_cq { struct mthca_srq { struct ib_srq ibsrq; spinlock_t lock; - atomic_t refcount; + int refcount; int srqn; int max; int max_gs; @@ -254,7 +256,7 @@ struct mthca_wq { struct mthca_qp { struct ib_qp ibqp; - atomic_t refcount; + int refcount; u32 qpn; int is_direct; u8 port; /* for SQP and memfree use only */ diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index f37b0e36732..19765f6f8d5 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -240,7 +240,7 @@ void mthca_qp_event(struct mthca_dev *dev, u32 qpn, spin_lock(&dev->qp_table.lock); qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1)); if (qp) - atomic_inc(&qp->refcount); + ++qp->refcount; spin_unlock(&dev->qp_table.lock); if (!qp) { @@ -257,8 +257,10 @@ void mthca_qp_event(struct mthca_dev *dev, u32 qpn, if (qp->ibqp.event_handler) qp->ibqp.event_handler(&event, qp->ibqp.qp_context); - if (atomic_dec_and_test(&qp->refcount)) + spin_lock(&dev->qp_table.lock); + if (!--qp->refcount) wake_up(&qp->wait); + spin_unlock(&dev->qp_table.lock); } static int to_mthca_state(enum ib_qp_state ib_state) @@ -833,10 +835,10 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask) * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) { - mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq)->cqn, qp->qpn, + mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); if (qp->ibqp.send_cq != qp->ibqp.recv_cq) - mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq)->cqn, qp->qpn, + mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); mthca_wq_init(&qp->sq); @@ -1096,7 +1098,7 @@ static int mthca_alloc_qp_common(struct mthca_dev *dev, int ret; int i; - atomic_set(&qp->refcount, 1); + qp->refcount = 1; init_waitqueue_head(&qp->wait); qp->state = IB_QPS_RESET; qp->atomic_rd_en = 0; @@ -1318,6 +1320,17 @@ int mthca_alloc_sqp(struct mthca_dev *dev, return err; } +static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp) +{ + int c; + + spin_lock_irq(&dev->qp_table.lock); + c = qp->refcount; + spin_unlock_irq(&dev->qp_table.lock); + + return c; +} + void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp) { @@ -1339,14 +1352,14 @@ void mthca_free_qp(struct mthca_dev *dev, spin_lock(&dev->qp_table.lock); mthca_array_clear(&dev->qp_table.qp, qp->qpn & (dev->limits.num_qps - 1)); + --qp->refcount; spin_unlock(&dev->qp_table.lock); if (send_cq != recv_cq) spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); - atomic_dec(&qp->refcount); - wait_event(qp->wait, !atomic_read(&qp->refcount)); + wait_event(qp->wait, !get_qp_refcount(dev, qp)); if (qp->state != IB_QPS_RESET) mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0, @@ -1358,10 +1371,10 @@ void mthca_free_qp(struct mthca_dev *dev, * unref the mem-free tables and free the QPN in our table. */ if (!qp->ibqp.uobject) { - mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq)->cqn, qp->qpn, + mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); if (qp->ibqp.send_cq != qp->ibqp.recv_cq) - mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq)->cqn, qp->qpn, + mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); mthca_free_memfree(dev, qp); diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index adcaf85355a..1ea433291fa 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -241,7 +241,7 @@ int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd, goto err_out_mailbox; spin_lock_init(&srq->lock); - atomic_set(&srq->refcount, 1); + srq->refcount = 1; init_waitqueue_head(&srq->wait); if (mthca_is_memfree(dev)) @@ -308,6 +308,17 @@ err_out: return err; } +static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq) +{ + int c; + + spin_lock_irq(&dev->srq_table.lock); + c = srq->refcount; + spin_unlock_irq(&dev->srq_table.lock); + + return c; +} + void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq) { struct mthca_mailbox *mailbox; @@ -329,10 +340,10 @@ void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq) spin_lock_irq(&dev->srq_table.lock); mthca_array_clear(&dev->srq_table.srq, srq->srqn & (dev->limits.num_srqs - 1)); + --srq->refcount; spin_unlock_irq(&dev->srq_table.lock); - atomic_dec(&srq->refcount); - wait_event(srq->wait, !atomic_read(&srq->refcount)); + wait_event(srq->wait, !get_srq_refcount(dev, srq)); if (!srq->ibsrq.uobject) { mthca_free_srq_buf(dev, srq); @@ -414,7 +425,7 @@ void mthca_srq_event(struct mthca_dev *dev, u32 srqn, spin_lock(&dev->srq_table.lock); srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1)); if (srq) - atomic_inc(&srq->refcount); + ++srq->refcount; spin_unlock(&dev->srq_table.lock); if (!srq) { @@ -431,8 +442,10 @@ void mthca_srq_event(struct mthca_dev *dev, u32 srqn, srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context); out: - if (atomic_dec_and_test(&srq->refcount)) + spin_lock(&dev->srq_table.lock); + if (!--srq->refcount) wake_up(&srq->wait); + spin_unlock(&dev->srq_table.lock); } /* -- cgit v1.2.3 From ce477ae4f8c75c94587c3157deffad8219db09a0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 10 May 2006 17:58:41 +0300 Subject: IB/mthca: FMR ioremap fix Addresses for ioremap must be calculated off of pci_resource_start; we can't directly use the bus address as seen by the HCA. Fix the code that remaps device memory for FMR access. Based on patch by Klaus Smolin. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_mr.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/hw/mthca') diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 25e1c1db9a4..a486dec1707 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -761,6 +761,7 @@ void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) int __devinit mthca_init_mr_table(struct mthca_dev *dev) { + unsigned long addr; int err, i; err = mthca_alloc_init(&dev->mr_table.mpt_alloc, @@ -796,9 +797,12 @@ int __devinit mthca_init_mr_table(struct mthca_dev *dev) goto err_fmr_mpt; } + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mpt_base); + dev->mr_table.tavor_fmr.mpt_base = - ioremap(dev->mr_table.mpt_base, - (1 << i) * sizeof (struct mthca_mpt_entry)); + ioremap(addr, (1 << i) * sizeof(struct mthca_mpt_entry)); if (!dev->mr_table.tavor_fmr.mpt_base) { mthca_warn(dev, "MPT ioremap for FMR failed.\n"); @@ -806,9 +810,12 @@ int __devinit mthca_init_mr_table(struct mthca_dev *dev) goto err_fmr_mpt; } + addr = pci_resource_start(dev->pdev, 4) + + ((pci_resource_len(dev->pdev, 4) - 1) & + dev->mr_table.mtt_base); + dev->mr_table.tavor_fmr.mtt_base = - ioremap(dev->mr_table.mtt_base, - (1 << i) * MTHCA_MTT_SEG_SIZE); + ioremap(addr, (1 << i) * MTHCA_MTT_SEG_SIZE); if (!dev->mr_table.tavor_fmr.mtt_base) { mthca_warn(dev, "MTT ioremap for FMR failed.\n"); err = -ENOMEM; -- cgit v1.2.3 From 1db76c14d215c8b26024dd532de3dcaf66ea30f7 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 17 May 2006 07:48:07 -0700 Subject: IB/mthca: Make fw_cmd_doorbell default to 0 Setting fw_cmd_doorbell allows FW command to be queued using posted writes instead of requiring polling on a "go" bit, so it should be a performance boost. However, the option causes problems with at least some device/firmware combinations, so set the default to 0 until we understand what's going on better. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/mthca') diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 1985b5dfa48..798e13e14fa 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -182,7 +182,7 @@ struct mthca_cmd_context { u8 status; }; -static int fw_cmd_doorbell = 1; +static int fw_cmd_doorbell = 0; module_param(fw_cmd_doorbell, int, 0644); MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero " "(and supported by FW)"); -- cgit v1.2.3 From 23f3bc0f2c1e26215b671499c07047c325d54d9c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 18 May 2006 18:32:54 +0300 Subject: IB/mthca: Fix posting lists of 256 receive requests for Tavor If we post a list of length 256 exactly, nreq in doorbell gets set to 256 which is wrong: it should be encoded by 0. This is because we only zero it out on the next WR, which may not be there. The solution is to ring the doorbell after posting a WQE, not before posting the next one. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 35 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/hw/mthca') diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 19765f6f8d5..07c13be07a4 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1727,23 +1727,7 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, ind = qp->rq.next_ind; - for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { - nreq = 0; - - doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0); - doorbell[1] = cpu_to_be32(qp->qpn << 8); - - wmb(); - - mthca_write64(doorbell, - dev->kar + MTHCA_RECEIVE_DOORBELL, - MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); - - qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; - size0 = 0; - } - + for (nreq = 0; wr; wr = wr->next) { if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { mthca_err(dev, "RQ %06x full (%u head, %u tail," " %d max, %d nreq)\n", qp->qpn, @@ -1797,6 +1781,23 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, ++ind; if (unlikely(ind >= qp->rq.max)) ind -= qp->rq.max; + + ++nreq; + if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { + nreq = 0; + + doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0); + doorbell[1] = cpu_to_be32(qp->qpn << 8); + + wmb(); + + mthca_write64(doorbell, + dev->kar + MTHCA_RECEIVE_DOORBELL, + MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); + + qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB; + size0 = 0; + } } out: -- cgit v1.2.3