diff options
Diffstat (limited to 'drivers/infiniband/hw')
54 files changed, 1979 insertions, 1163 deletions
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index 6af2c0f79a6..2acf9b62cf9 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -452,7 +452,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(-ENOMEM); c2mr->pd = c2pd; - c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc); + c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(c2mr->umem)) { err = PTR_ERR(c2mr->umem); kfree(c2mr); diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 66eb7030aea..3f441fc57c1 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -359,9 +359,10 @@ static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) cq->sw_wptr++; } -void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) { u32 ptr; + int flushed = 0; PDBG("%s wq %p cq %p\n", __func__, wq, cq); @@ -369,8 +370,11 @@ void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__, wq->rq_rptr, wq->rq_wptr, count); ptr = wq->rq_rptr + count; - while (ptr++ != wq->rq_wptr) + while (ptr++ != wq->rq_wptr) { insert_recv_cqe(wq, cq); + flushed++; + } + return flushed; } static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, @@ -394,18 +398,21 @@ static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, cq->sw_wptr++; } -void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) { __u32 ptr; + int flushed = 0; struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); ptr = wq->sq_rptr + count; - sqp += count; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); while (ptr != wq->sq_wptr) { insert_sq_cqe(wq, cq, sqp); - sqp++; ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + flushed++; } + return flushed; } /* @@ -456,7 +463,8 @@ void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) ptr = cq->sw_rptr; while (!Q_EMPTY(ptr, cq->sw_wptr)) { cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); - if ((SQ_TYPE(*cqe) || (CQE_OPCODE(*cqe) == T3_READ_RESP)) && + if ((SQ_TYPE(*cqe) || + ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) && (CQE_QPID(*cqe) == wq->qpid)) (*count)++; ptr++; @@ -580,7 +588,7 @@ static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) * caller aquires the ctrl_qp lock before the call */ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, - u32 len, void *data, int completion) + u32 len, void *data) { u32 i, nr_wqe, copy_len; u8 *copy_data; @@ -616,7 +624,7 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, flag = 0; if (i == (nr_wqe - 1)) { /* last WQE */ - flag = completion ? T3_COMPLETION_FLAG : 0; + flag = T3_COMPLETION_FLAG; if (len % 32) utx_len = len / 32 + 1; else @@ -675,21 +683,20 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, return 0; } -/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size - * OUT: stag index, actual pbl_size, pbl_addr allocated. +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr + * OUT: stag index * TBD: shared memory region support */ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, u32 *stag, u8 stag_state, u32 pdid, enum tpt_mem_type type, enum tpt_mem_perm perm, - u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, - u32 *pbl_size, u32 *pbl_addr) + u32 zbva, u64 to, u32 len, u8 page_size, + u32 pbl_size, u32 pbl_addr) { int err; struct tpt_entry tpt; u32 stag_idx; u32 wptr; - int rereg = (*stag != T3_STAG_UNSET); stag_state = stag_state > 0; stag_idx = (*stag) >> 8; @@ -703,30 +710,8 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", __func__, stag_state, type, pdid, stag_idx); - if (reset_tpt_entry) - cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); - else if (!rereg) { - *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); - if (!*pbl_addr) { - return -ENOMEM; - } - } - mutex_lock(&rdev_p->ctrl_qp.lock); - /* write PBL first if any - update pbl only if pbl list exist */ - if (pbl) { - - PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", - __func__, *pbl_addr, rdev_p->rnic_info.pbl_base, - *pbl_size); - err = cxio_hal_ctrl_qp_write_mem(rdev_p, - (*pbl_addr >> 5), - (*pbl_size << 3), pbl, 0); - if (err) - goto ret; - } - /* write TPT entry */ if (reset_tpt_entry) memset(&tpt, 0, sizeof(tpt)); @@ -741,23 +726,23 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | V_TPT_PAGE_SIZE(page_size)); tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); tpt.len = cpu_to_be32(len); tpt.va_hi = cpu_to_be32((u32) (to >> 32)); tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); tpt.rsvd_bind_cnt_or_pstag = 0; tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); } err = cxio_hal_ctrl_qp_write_mem(rdev_p, stag_idx + (rdev_p->rnic_info.tpt_base >> 5), - sizeof(tpt), &tpt, 1); + sizeof(tpt), &tpt); /* release the stag index to free pool */ if (reset_tpt_entry) cxio_hal_put_stag(rdev_p->rscp, stag_idx); -ret: + wptr = rdev_p->ctrl_qp.wptr; mutex_unlock(&rdev_p->ctrl_qp.lock); if (!err) @@ -768,44 +753,67 @@ ret: return err; } +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + u32 wptr; + int err; + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev_p->rnic_info.pbl_base, + pbl_size); + + mutex_lock(&rdev_p->ctrl_qp.lock); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3, + pbl); + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (err) + return err; + + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + + return 0; +} + int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, u32 pbl_addr) { - return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - &pbl_size, &pbl_addr); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + pbl_size, pbl_addr); } int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) { - u32 pbl_size = 0; *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, - NULL, &pbl_size, NULL); + 0, 0); } int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) { - return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - NULL, NULL); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + 0, 0); } int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) @@ -829,7 +837,8 @@ int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) wqe->mpaattrs = attr->mpaattrs; wqe->qpcaps = attr->qpcaps; wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); - wqe->flags = cpu_to_be32(attr->flags); + wqe->rqe_count = cpu_to_be16(attr->rqe_count); + wqe->flags_rtr_type = cpu_to_be16(attr->flags|V_RTR_TYPE(attr->rtr_type)); wqe->ord = cpu_to_be32(attr->ord); wqe->ird = cpu_to_be32(attr->ird); wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); @@ -1135,6 +1144,18 @@ int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { /* + * If this is an unsolicited read response, then the read + * was generated by the kernel driver as part of peer-2-peer + * connection setup. So ignore the completion. + */ + if (!wq->oldest_read) { + if (CQE_STATUS(*hw_cqe)) + wq->error = 1; + ret = -1; + goto skip_cqe; + } + + /* * Don't write to the HWCQ, so create a new read req CQE * in local memory. */ diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 99543d63470..6e128f6bab0 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -53,6 +53,7 @@ #define T3_MAX_PBL_SIZE 256 #define T3_MAX_RQ_SIZE 1024 #define T3_MAX_NUM_STAG (1<<15) +#define T3_MAX_MR_SIZE 0x100000000ULL #define T3_STAG_UNSET 0xffffffff @@ -153,14 +154,14 @@ int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, struct cxio_ucontext *uctx); int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size); int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, u32 pbl_addr); int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); @@ -172,8 +173,8 @@ u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); int __init cxio_hal_init(void); void __exit cxio_hal_exit(void); -void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); -void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); void cxio_flush_hw_cq(struct t3_cq *cq); diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 45ed4f25ef7..bd233c08765 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -250,7 +250,6 @@ void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) */ #define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ -#define PBL_CHUNK 2*1024*1024 u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) { @@ -267,14 +266,35 @@ void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) { - unsigned long i; + unsigned pbl_start, pbl_chunk; + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); - if (rdev_p->pbl_pool) - for (i = rdev_p->rnic_info.pbl_base; - i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1; - i += PBL_CHUNK) - gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1); - return rdev_p->pbl_pool ? 0 : -ENOMEM; + if (!rdev_p->pbl_pool) + return -ENOMEM; + + pbl_start = rdev_p->rnic_info.pbl_base; + pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1; + + while (pbl_start < rdev_p->rnic_info.pbl_top) { + pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1, + pbl_chunk); + if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) { + PDBG("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + printk(KERN_WARNING MOD "%s: Failed to add all PBL chunks (%x/%x)\n", + __func__, pbl_start, rdev_p->rnic_info.pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + PDBG("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; } void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h index 969d4d92845..f1a25a821a4 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_wr.h +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -278,6 +278,17 @@ enum t3_qp_caps { uP_RI_QP_STAG0_ENABLE = 0x10 } __attribute__ ((packed)); +enum rdma_init_rtr_types { + RTR_READ = 1, + RTR_WRITE = 2, + RTR_SEND = 3, +}; + +#define S_RTR_TYPE 2 +#define M_RTR_TYPE 0x3 +#define V_RTR_TYPE(x) ((x) << S_RTR_TYPE) +#define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE) + struct t3_rdma_init_attr { u32 tid; u32 qpid; @@ -293,7 +304,9 @@ struct t3_rdma_init_attr { u32 ird; u64 qp_dma_addr; u32 qp_dma_size; - u32 flags; + enum rdma_init_rtr_types rtr_type; + u16 flags; + u16 rqe_count; u32 irs; }; @@ -309,8 +322,8 @@ struct t3_rdma_init_wr { u8 mpaattrs; /* 5 */ u8 qpcaps; __be16 ulpdu_size; - __be32 flags; /* bits 31-1 - reservered */ - /* bit 0 - set if RECV posted */ + __be16 flags_rtr_type; + __be16 rqe_count; __be32 ord; /* 6 */ __be32 ird; __be64 qp_dma_addr; /* 7 */ @@ -324,7 +337,7 @@ struct t3_genbit { }; enum rdma_init_wr_flags { - RECVS_POSTED = (1<<0), + MPA_INITIATOR = (1<<0), PRIV_QP = (1<<1), }; diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c index 6ba4138c8ec..71554eacb13 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.c +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -83,6 +83,7 @@ static void rnic_init(struct iwch_dev *rnicp) rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */ + rnicp->attr.max_mr_size = T3_MAX_MR_SIZE; rnicp->attr.can_resize_wq = 0; rnicp->attr.max_rdma_reads_per_qp = 8; rnicp->attr.max_rdma_read_resources = diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h index 9ad9b1e7c8c..d2409a505e8 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.h +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -66,6 +66,7 @@ struct iwch_rnic_attributes { * size (4k)^i. Phys block list mode unsupported. */ u32 mem_pgsizes_bitmask; + u64 max_mr_size; u8 can_resize_wq; /* diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index 72ca360c3db..c325c44807e 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -63,10 +63,14 @@ static char *states[] = { NULL, }; -static int ep_timeout_secs = 10; +int peer2peer = 0; +module_param(peer2peer, int, 0644); +MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)"); + +static int ep_timeout_secs = 60; module_param(ep_timeout_secs, int, 0644); MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " - "in seconds (default=10)"); + "in seconds (default=60)"); static int mpa_rev = 1; module_param(mpa_rev, int, 0644); @@ -125,6 +129,12 @@ static void start_ep_timer(struct iwch_ep *ep) static void stop_ep_timer(struct iwch_ep *ep) { PDBG("%s ep %p\n", __func__, ep); + if (!timer_pending(&ep->timer)) { + printk(KERN_ERR "%s timer stopped when its not running! ep %p state %u\n", + __func__, ep, ep->com.state); + WARN_ON(1); + return; + } del_timer_sync(&ep->timer); put_ep(&ep->com); } @@ -508,7 +518,7 @@ static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) skb_reset_transport_header(skb); len = skb->len; req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); - req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); req->wr_lo = htonl(V_WR_TID(ep->hwtid)); req->len = htonl(len); req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | @@ -559,7 +569,7 @@ static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) set_arp_failure_handler(skb, arp_failure_discard); skb_reset_transport_header(skb); req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); - req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); req->wr_lo = htonl(V_WR_TID(ep->hwtid)); req->len = htonl(mpalen); req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | @@ -611,7 +621,7 @@ static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) skb_reset_transport_header(skb); len = skb->len; req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); - req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL); req->wr_lo = htonl(V_WR_TID(ep->hwtid)); req->len = htonl(len); req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | @@ -879,6 +889,7 @@ static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb) * the MPA header is valid. */ state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.initiator = 1; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; @@ -901,8 +912,14 @@ static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb) /* bind QP and TID with INIT_WR */ err = iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); - if (!err) - goto out; + if (err) + goto err; + + if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) { + iwch_post_zb_read(ep->com.qp); + } + + goto out; err: abort_connection(ep, skb, GFP_KERNEL); out: @@ -995,6 +1012,7 @@ static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb) * If we get here we have accumulated the entire mpa * start reply message including private data. */ + ep->mpa_attr.initiator = 0; ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; ep->mpa_attr.recv_marker_enabled = markers_enabled; ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; @@ -1065,17 +1083,33 @@ static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) PDBG("%s ep %p credits %u\n", __func__, ep, credits); - if (credits == 0) + if (credits == 0) { + PDBG(KERN_ERR "%s 0 credit ack ep %p state %u\n", + __func__, ep, state_read(&ep->com)); return CPL_RET_BUF_DONE; + } + BUG_ON(credits != 1); - BUG_ON(ep->mpa_skb == NULL); - kfree_skb(ep->mpa_skb); - ep->mpa_skb = NULL; dst_confirm(ep->dst); - if (state_read(&ep->com) == MPA_REP_SENT) { - ep->com.rpl_done = 1; - PDBG("waking up ep %p\n", ep); - wake_up(&ep->com.waitq); + if (!ep->mpa_skb) { + PDBG("%s rdma_init wr_ack ep %p state %u\n", + __func__, ep, state_read(&ep->com)); + if (ep->mpa_attr.initiator) { + PDBG("%s initiator ep %p state %u\n", + __func__, ep, state_read(&ep->com)); + if (peer2peer) + iwch_post_zb_read(ep->com.qp); + } else { + PDBG("%s responder ep %p state %u\n", + __func__, ep, state_read(&ep->com)); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + } + } else { + PDBG("%s lsm ack ep %p state %u freeing skb\n", + __func__, ep, state_read(&ep->com)); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; } return CPL_RET_BUF_DONE; } @@ -1083,8 +1117,11 @@ static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) { struct iwch_ep *ep = ctx; + unsigned long flags; + int release = 0; PDBG("%s ep %p\n", __func__, ep); + BUG_ON(!ep); /* * We get 2 abort replies from the HW. The first one must @@ -1095,9 +1132,22 @@ static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) return CPL_RET_BUF_DONE; } - close_complete_upcall(ep); - state_set(&ep->com, DEAD); - release_ep_resources(ep); + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case ABORTING: + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + default: + printk(KERN_ERR "%s ep %p state %d\n", + __func__, ep, ep->com.state); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + + if (release) + release_ep_resources(ep); return CPL_RET_BUF_DONE; } @@ -1470,7 +1520,8 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) struct sk_buff *rpl_skb; struct iwch_qp_attributes attrs; int ret; - int state; + int release = 0; + unsigned long flags; if (is_neg_adv_abort(req->status)) { PDBG("%s neg_adv_abort ep %p tid %d\n", __func__, ep, @@ -1488,9 +1539,9 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) return CPL_RET_BUF_DONE; } - state = state_read(&ep->com); - PDBG("%s ep %p state %u\n", __func__, ep, state); - switch (state) { + spin_lock_irqsave(&ep->com.lock, flags); + PDBG("%s ep %p state %u\n", __func__, ep, ep->com.state); + switch (ep->com.state) { case CONNECTING: break; case MPA_REQ_WAIT: @@ -1536,21 +1587,25 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) break; case DEAD: PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); + spin_unlock_irqrestore(&ep->com.lock, flags); return CPL_RET_BUF_DONE; default: BUG_ON(1); break; } dst_confirm(ep->dst); + if (ep->com.state != ABORTING) { + __state_set(&ep->com, DEAD); + release = 1; + } + spin_unlock_irqrestore(&ep->com.lock, flags); rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); if (!rpl_skb) { printk(KERN_ERR MOD "%s - cannot allocate skb!\n", __func__); - dst_release(ep->dst); - l2t_release(L2DATA(ep->com.tdev), ep->l2t); - put_ep(&ep->com); - return CPL_RET_BUF_DONE; + release = 1; + goto out; } rpl_skb->priority = CPL_PRIORITY_DATA; rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl)); @@ -1559,10 +1614,9 @@ static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); rpl->cmd = CPL_ABORT_NO_RST; cxgb3_ofld_send(ep->com.tdev, rpl_skb); - if (state != ABORTING) { - state_set(&ep->com, DEAD); +out: + if (release) release_ep_resources(ep); - } return CPL_RET_BUF_DONE; } @@ -1596,8 +1650,8 @@ static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) release = 1; break; case ABORTING: - break; case DEAD: + break; default: BUG_ON(1); break; @@ -1661,15 +1715,18 @@ static void ep_timeout(unsigned long arg) struct iwch_ep *ep = (struct iwch_ep *)arg; struct iwch_qp_attributes attrs; unsigned long flags; + int abort = 1; spin_lock_irqsave(&ep->com.lock, flags); PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid, ep->com.state); switch (ep->com.state) { case MPA_REQ_SENT: + __state_set(&ep->com, ABORTING); connect_reply_upcall(ep, -ETIMEDOUT); break; case MPA_REQ_WAIT: + __state_set(&ep->com, ABORTING); break; case CLOSING: case MORIBUND: @@ -1679,13 +1736,17 @@ static void ep_timeout(unsigned long arg) ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); } + __state_set(&ep->com, ABORTING); break; default: - BUG(); + printk(KERN_ERR "%s unexpected state ep %p state %u\n", + __func__, ep, ep->com.state); + WARN_ON(1); + abort = 0; } - __state_set(&ep->com, CLOSING); spin_unlock_irqrestore(&ep->com.lock, flags); - abort_connection(ep, NULL, GFP_ATOMIC); + if (abort) + abort_connection(ep, NULL, GFP_ATOMIC); put_ep(&ep->com); } @@ -1762,16 +1823,19 @@ int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (err) goto err; + /* if needed, wait for wr_ack */ + if (iwch_rqes_posted(qp)) { + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (err) + goto err; + } + err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) goto err; - /* wait for wr_ack */ - wait_event(ep->com.waitq, ep->com.rpl_done); - err = ep->com.rpl_err; - if (err) - goto err; state_set(&ep->com, FPDU_MODE); established_upcall(ep); @@ -1968,40 +2032,39 @@ int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, states[ep->com.state], abrupt); - if (ep->com.state == DEAD) { - PDBG("%s already dead ep %p\n", __func__, ep); - goto out; - } - - if (abrupt) { - if (ep->com.state != ABORTING) { - ep->com.state = ABORTING; - close = 1; - } - goto out; - } - switch (ep->com.state) { case MPA_REQ_WAIT: case MPA_REQ_SENT: case MPA_REQ_RCVD: case MPA_REP_SENT: case FPDU_MODE: - start_ep_timer(ep); - ep->com.state = CLOSING; close = 1; + if (abrupt) + ep->com.state = ABORTING; + else { + ep->com.state = CLOSING; + start_ep_timer(ep); + } break; case CLOSING: - ep->com.state = MORIBUND; close = 1; + if (abrupt) { + stop_ep_timer(ep); + ep->com.state = ABORTING; + } else + ep->com.state = MORIBUND; break; case MORIBUND: + case ABORTING: + case DEAD: + PDBG("%s ignoring disconnect ep %p state %u\n", + __func__, ep, ep->com.state); break; default: BUG(); break; } -out: + spin_unlock_irqrestore(&ep->com.lock, flags); if (close) { if (abrupt) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h index 2bb7fbdb3ff..d7c7e09f099 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.h +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -56,6 +56,7 @@ #define put_ep(ep) { \ PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \ ep, atomic_read(&((ep)->kref.refcount))); \ + WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \ kref_put(&((ep)->kref), __free_ep); \ } @@ -225,5 +226,6 @@ int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, st int __init iwch_cm_init(void); void __exit iwch_cm_term(void); +extern int peer2peer; #endif /* _IWCH_CM_H_ */ diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c index 58c3d61bcd1..ec49a5cbdeb 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_mem.c +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -35,17 +35,26 @@ #include <rdma/ib_verbs.h> #include "cxio_hal.h" +#include "cxio_resource.h" #include "iwch.h" #include "iwch_provider.h" -int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - __be64 *page_list) +static void iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) { - u32 stag; u32 mmid; + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); +} + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift) +{ + u32 stag; if (cxio_register_phys_mem(&rhp->rdev, &stag, mhp->attr.pdid, @@ -53,28 +62,21 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + + iwch_finish_mem_reg(mhp, stag); + return 0; } int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages) { u32 stag; - u32 mmid; - /* We could support this... */ if (npages > mhp->attr.pbl_size) @@ -87,19 +89,40 @@ int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + + iwch_finish_mem_reg(mhp, stag); + + return 0; +} + +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + return 0; } +void iwch_free_pbl(struct iwch_mr *mhp) +{ + cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +} + +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) +{ + return cxio_write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (offset << 3), npages); +} + int build_phys_page_list(struct ib_phys_buf *buffer_list, int num_phys_buf, u64 *iova_start, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index ab4695c1dd5..8934178a23e 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -442,6 +442,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) mmid = mhp->attr.stag >> 8; cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, mhp->attr.pbl_addr); + iwch_free_pbl(mhp); remove_handle(rhp, &rhp->mmidr, mmid); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); @@ -475,6 +476,8 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->rhp = rhp; + /* First check that we have enough alignment */ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { ret = -EINVAL; @@ -492,7 +495,17 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (ret) goto err; - mhp->rhp = rhp; + ret = iwch_alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err_pbl; + } + + ret = iwch_write_pbl(mhp, page_list, npages, 0); + kfree(page_list); + if (ret) + goto err_pbl; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; @@ -502,12 +515,15 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->attr.len = (u32) total_size; mhp->attr.pbl_size = npages; - ret = iwch_register_mem(rhp, php, mhp, shift, page_list); - kfree(page_list); - if (ret) { - goto err; - } + ret = iwch_register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + err: kfree(mhp); return ERR_PTR(ret); @@ -560,7 +576,7 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, return ret; } - ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); kfree(page_list); if (ret) { return ret; @@ -602,7 +618,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mhp) return ERR_PTR(-ENOMEM); - mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc); + mhp->rhp = rhp; + + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); kfree(mhp); @@ -615,10 +633,14 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, list_for_each_entry(chunk, &mhp->umem->chunk_list, list) n += chunk->nents; - pages = kmalloc(n * sizeof(u64), GFP_KERNEL); + err = iwch_alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); if (!pages) { err = -ENOMEM; - goto err; + goto err_pbl; } i = n = 0; @@ -630,25 +652,38 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, pages[i++] = cpu_to_be64(sg_dma_address( &chunk->page_list[j]) + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } } } - mhp->rhp = rhp; + if (i) + err = iwch_write_pbl(mhp, pages, i, n); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); mhp->attr.va_fbo = virt; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) length; - mhp->attr.pbl_size = i; - err = iwch_register_mem(rhp, php, mhp, shift, pages); - kfree(pages); + + err = iwch_register_mem(rhp, php, mhp, shift); if (err) - goto err; + goto err_pbl; if (udata && !t3a_device(rhp)) { uresp.pbl_addr = (mhp->attr.pbl_addr - - rhp->rdev.rnic_info.pbl_base) >> 3; + rhp->rdev.rnic_info.pbl_base) >> 3; PDBG("%s user resp pbl_addr 0x%x\n", __func__, uresp.pbl_addr); @@ -661,6 +696,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mhp->ibmr; +err_pbl: + iwch_free_pbl(mhp); + err: ib_umem_release(mhp->umem); kfree(mhp); @@ -998,7 +1036,7 @@ static int iwch_query_device(struct ib_device *ibdev, props->device_cap_flags = dev->device_cap_flags; props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; - props->max_mr_size = ~0ull; + props->max_mr_size = dev->attr.max_mr_size; props->max_qp = dev->attr.max_qps; props->max_qp_wr = dev->attr.max_wrs; props->max_sge = dev->attr.max_sge_per_wr; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index 61356f91109..836163fc542 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -118,6 +118,7 @@ enum IWCH_QP_FLAGS { }; struct iwch_mpa_attributes { + u8 initiator; u8 recv_marker_enabled; u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ u8 crc_enabled; @@ -322,6 +323,7 @@ enum iwch_qp_query_flags { IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ }; +u16 iwch_rqes_posted(struct iwch_qp *qhp); int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, @@ -331,20 +333,21 @@ int iwch_bind_mw(struct ib_qp *qp, struct ib_mw_bind *mw_bind); int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_post_zb_read(struct iwch_qp *qhp); int iwch_register_device(struct iwch_dev *dev); void iwch_unregister_device(struct iwch_dev *dev); int iwch_quiesce_qps(struct iwch_cq *chp); int iwch_resume_qps(struct iwch_cq *chp); void stop_read_rep_timer(struct iwch_qp *qhp); int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - __be64 *page_list); + struct iwch_mr *mhp, int shift); int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages); +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); +void iwch_free_pbl(struct iwch_mr *mhp); +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); int build_phys_page_list(struct ib_phys_buf *buffer_list, int num_phys_buf, u64 *iova_start, diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 8891c3b0a3d..79dbe5beae5 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -586,6 +586,36 @@ static inline void build_term_codes(struct respQ_msg_t *rsp_msg, } } +int iwch_post_zb_read(struct iwch_qp *qhp) +{ + union t3_wr *wqe; + struct sk_buff *skb; + u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + + PDBG("%s enter\n", __func__); + skb = alloc_skb(40, GFP_KERNEL); + if (!skb) { + printk(KERN_ERR "%s cannot send zb_read!!\n", __func__); + return -ENOMEM; + } + wqe = (union t3_wr *)skb_put(skb, sizeof(struct t3_rdma_read_wr)); + memset(wqe, 0, sizeof(struct t3_rdma_read_wr)); + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.reserved[2] = 0; + wqe->read.rem_stag = cpu_to_be32(1); + wqe->read.rem_to = cpu_to_be64(1); + wqe->read.local_stag = cpu_to_be32(1); + wqe->read.local_len = cpu_to_be32(0); + wqe->read.local_to = cpu_to_be64(1); + wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ)); + wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid)| + V_FW_RIWR_LEN(flit_cnt)); + skb->priority = CPL_PRIORITY_DATA; + return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); +} + /* * This posts a TERMINATE with layer=RDMA, type=catastrophic. */ @@ -625,6 +655,7 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) { struct iwch_cq *rchp, *schp; int count; + int flushed; rchp = get_chp(qhp->rhp, qhp->attr.rcq); schp = get_chp(qhp->rhp, qhp->attr.scq); @@ -639,20 +670,22 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) spin_lock(&qhp->lock); cxio_flush_hw_cq(&rchp->cq); cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); - cxio_flush_rq(&qhp->wq, &rchp->cq, count); + flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, *flag); - (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); + if (flushed) + (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); /* locking heirarchy: cq lock first, then qp lock. */ spin_lock_irqsave(&schp->lock, *flag); spin_lock(&qhp->lock); cxio_flush_hw_cq(&schp->cq); cxio_count_scqes(&schp->cq, &qhp->wq, &count); - cxio_flush_sq(&qhp->wq, &schp->cq, count); + flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, *flag); - (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); + if (flushed) + (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); /* deref */ if (atomic_dec_and_test(&qhp->refcnt)) @@ -671,11 +704,18 @@ static void flush_qp(struct iwch_qp *qhp, unsigned long *flag) /* - * Return non zero if at least one RECV was pre-posted. + * Return count of RECV WRs posted */ -static int rqes_posted(struct iwch_qp *qhp) +u16 iwch_rqes_posted(struct iwch_qp *qhp) { - return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV; + union t3_wr *wqe = qhp->wq.queue; + u16 count = 0; + while ((count+1) != 0 && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) { + count++; + wqe++; + } + PDBG("%s qhp %p count %u\n", __func__, qhp, count); + return count; } static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, @@ -716,8 +756,17 @@ static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, init_attr.ird = qhp->attr.max_ird; init_attr.qp_dma_addr = qhp->wq.dma_addr; init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); - init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0; + init_attr.rqe_count = iwch_rqes_posted(qhp); + init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0; init_attr.flags |= capable(CAP_NET_BIND_SERVICE) ? PRIV_QP : 0; + if (peer2peer) { + init_attr.rtr_type = RTR_READ; + if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator) + init_attr.ord = 1; + if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator) + init_attr.ird = 1; + } else + init_attr.rtr_type = 0; init_attr.irs = qhp->ep->rcv_seq; PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " "flags 0x%x qpcaps 0x%x\n", __func__, @@ -832,8 +881,8 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, abort=0; disconnect = 1; ep = qhp->ep; + get_ep(&ep->com); } - flush_qp(qhp, &flag); break; case IWCH_QP_STATE_TERMINATE: qhp->attr.state = IWCH_QP_STATE_TERMINATE; @@ -848,6 +897,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, abort=1; disconnect = 1; ep = qhp->ep; + get_ep(&ep->com); } goto err; break; @@ -863,6 +913,7 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, } switch (attrs->next_state) { case IWCH_QP_STATE_IDLE: + flush_qp(qhp, &flag); qhp->attr.state = IWCH_QP_STATE_IDLE; qhp->attr.llp_stream_handle = NULL; put_ep(&qhp->ep->com); @@ -929,8 +980,10 @@ out: * on the EP. This can be a normal close (RTS->CLOSING) or * an abnormal close (RTS/CLOSING->ERROR). */ - if (disconnect) + if (disconnect) { iwch_ep_disconnect(ep, abort, GFP_KERNEL); + put_ep(&ep->com); + } /* * If free is 1, then we've disassociated the EP from the QP diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 3d6d9461c31..1e9e99a1393 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -66,6 +66,7 @@ struct ehca_av; #include "ehca_irq.h" #define EHCA_EQE_CACHE_SIZE 20 +#define EHCA_MAX_NUM_QUEUES 0xffff struct ehca_eqe_cache_entry { struct ehca_eqe *eqe; @@ -127,6 +128,8 @@ struct ehca_shca { /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */ u32 hca_cap_mr_pgsize; int max_mtu; + atomic_t num_cqs; + atomic_t num_qps; }; struct ehca_pd { @@ -189,6 +192,8 @@ struct ehca_qp { int mtu_shift; u32 message_count; u32 packet_count; + atomic_t nr_events; /* events seen */ + wait_queue_head_t wait_completion; }; #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ) @@ -344,6 +349,8 @@ extern int ehca_use_hp_mr; extern int ehca_scaling_code; extern int ehca_lock_hcalls; extern int ehca_nr_ports; +extern int ehca_max_cq; +extern int ehca_max_qp; struct ipzu_queue_resp { u32 qe_size; /* queue entry size */ diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c index ec0cfcf3073..5540b276a33 100644 --- a/drivers/infiniband/hw/ehca/ehca_cq.c +++ b/drivers/infiniband/hw/ehca/ehca_cq.c @@ -132,10 +132,19 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector, if (cqe >= 0xFFFFFFFF - 64 - additional_cqe) return ERR_PTR(-EINVAL); + if (!atomic_add_unless(&shca->num_cqs, 1, ehca_max_cq)) { + ehca_err(device, "Unable to create CQ, max number of %i " + "CQs reached.", ehca_max_cq); + ehca_err(device, "To increase the maximum number of CQs " + "use the number_of_cqs module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL); if (!my_cq) { ehca_err(device, "Out of memory for ehca_cq struct device=%p", device); + atomic_dec(&shca->num_cqs); return ERR_PTR(-ENOMEM); } @@ -305,6 +314,7 @@ create_cq_exit2: create_cq_exit1: kmem_cache_free(cq_cache, my_cq); + atomic_dec(&shca->num_cqs); return cq; } @@ -359,6 +369,7 @@ int ehca_destroy_cq(struct ib_cq *cq) ipz_queue_dtor(NULL, &my_cq->ipz_queue); kmem_cache_free(cq_cache, my_cq); + atomic_dec(&shca->num_cqs); return 0; } diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c index b4ac617a70e..49660dfa186 100644 --- a/drivers/infiniband/hw/ehca/ehca_eq.c +++ b/drivers/infiniband/hw/ehca/ehca_eq.c @@ -54,7 +54,8 @@ int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq, const enum ehca_eq_type type, const u32 length) { - u64 ret; + int ret; + u64 h_ret; u32 nr_pages; u32 i; void *vpage; @@ -73,15 +74,15 @@ int ehca_create_eq(struct ehca_shca *shca, return -EINVAL; } - ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle, - &eq->pf, - type, - length, - &eq->ipz_eq_handle, - &eq->length, - &nr_pages, &eq->ist); + h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle, + &eq->pf, + type, + length, + &eq->ipz_eq_handle, + &eq->length, + &nr_pages, &eq->ist); - if (ret != H_SUCCESS) { + if (h_ret != H_SUCCESS) { ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq); return -EINVAL; } @@ -97,24 +98,22 @@ int ehca_create_eq(struct ehca_shca *shca, u64 rpage; vpage = ipz_qpageit_get_inc(&eq->ipz_queue); - if (!vpage) { - ret = H_RESOURCE; + if (!vpage) goto create_eq_exit2; - } rpage = virt_to_abs(vpage); - ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle, - eq->ipz_eq_handle, - &eq->pf, - 0, 0, rpage, 1); + h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle, + eq->ipz_eq_handle, + &eq->pf, + 0, 0, rpage, 1); if (i == (nr_pages - 1)) { /* last page */ vpage = ipz_qpageit_get_inc(&eq->ipz_queue); - if (ret != H_SUCCESS || vpage) + if (h_ret != H_SUCCESS || vpage) goto create_eq_exit2; } else { - if (ret != H_PAGE_REGISTERED || !vpage) + if (h_ret != H_PAGE_REGISTERED || !vpage) goto create_eq_exit2; } } diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c index 2515cbde7e6..bc3b37d2070 100644 --- a/drivers/infiniband/hw/ehca/ehca_hca.c +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -101,7 +101,6 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) props->max_ee = limit_uint(rblock->max_rd_ee_context); props->max_rdd = limit_uint(rblock->max_rd_domain); props->max_fmr = limit_uint(rblock->max_mr); - props->local_ca_ack_delay = limit_uint(rblock->local_ca_ack_delay); props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp); props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context); props->max_res_rd_atom = limit_uint(rblock->max_rr_hca); @@ -115,7 +114,7 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) } props->max_pkeys = 16; - props->local_ca_ack_delay = limit_uint(rblock->local_ca_ack_delay); + props->local_ca_ack_delay = min_t(u8, rblock->local_ca_ack_delay, 255); props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp); props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp); props->max_mcast_grp = limit_uint(rblock->max_mcast_grp); @@ -136,7 +135,7 @@ query_device1: return ret; } -static int map_mtu(struct ehca_shca *shca, u32 fw_mtu) +static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu) { switch (fw_mtu) { case 0x1: @@ -156,7 +155,7 @@ static int map_mtu(struct ehca_shca *shca, u32 fw_mtu) } } -static int map_number_of_vls(struct ehca_shca *shca, u32 vl_cap) +static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap) { switch (vl_cap) { case 0x1: diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index ca5eb0cb628..ce1ab0571be 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -204,6 +204,8 @@ static void qp_event_callback(struct ehca_shca *shca, u64 eqe, read_lock(&ehca_qp_idr_lock); qp = idr_find(&ehca_qp_idr, token); + if (qp) + atomic_inc(&qp->nr_events); read_unlock(&ehca_qp_idr_lock); if (!qp) @@ -223,6 +225,8 @@ static void qp_event_callback(struct ehca_shca *shca, u64 eqe, if (fatal && qp->ext_type == EQPT_SRQBASE) dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED); + if (atomic_dec_and_test(&qp->nr_events)) + wake_up(&qp->wait_completion); return; } diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 65048976198..482103eb6ea 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -68,6 +68,8 @@ int ehca_port_act_time = 30; int ehca_static_rate = -1; int ehca_scaling_code = 0; int ehca_lock_hcalls = -1; +int ehca_max_cq = -1; +int ehca_max_qp = -1; module_param_named(open_aqp1, ehca_open_aqp1, bool, S_IRUGO); module_param_named(debug_level, ehca_debug_level, int, S_IRUGO); @@ -79,6 +81,8 @@ module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO); module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO); module_param_named(lock_hcalls, ehca_lock_hcalls, bool, S_IRUGO); +module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO); +module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO); MODULE_PARM_DESC(open_aqp1, "Open AQP1 on startup (default: no)"); @@ -104,6 +108,12 @@ MODULE_PARM_DESC(scaling_code, MODULE_PARM_DESC(lock_hcalls, "Serialize all hCalls made by the driver " "(default: autodetect)"); +MODULE_PARM_DESC(number_of_cqs, + "Max number of CQs which can be allocated " + "(default: autodetect)"); +MODULE_PARM_DESC(number_of_qps, + "Max number of QPs which can be allocated " + "(default: autodetect)"); DEFINE_RWLOCK(ehca_qp_idr_lock); DEFINE_RWLOCK(ehca_cq_idr_lock); @@ -355,6 +365,25 @@ static int ehca_sense_attributes(struct ehca_shca *shca) if (rblock->memory_page_size_supported & pgsize_map[i]) shca->hca_cap_mr_pgsize |= pgsize_map[i + 1]; + /* Set maximum number of CQs and QPs to calculate EQ size */ + if (ehca_max_qp == -1) + ehca_max_qp = min_t(int, rblock->max_qp, EHCA_MAX_NUM_QUEUES); + else if (ehca_max_qp < 1 || ehca_max_qp > rblock->max_qp) { + ehca_gen_err("Requested number of QPs is out of range (1 - %i) " + "specified by HW", rblock->max_qp); + ret = -EINVAL; + goto sense_attributes1; + } + + if (ehca_max_cq == -1) + ehca_max_cq = min_t(int, rblock->max_cq, EHCA_MAX_NUM_QUEUES); + else if (ehca_max_cq < 1 || ehca_max_cq > rblock->max_cq) { + ehca_gen_err("Requested number of CQs is out of range (1 - %i) " + "specified by HW", rblock->max_cq); + ret = -EINVAL; + goto sense_attributes1; + } + /* query max MTU from first port -- it's the same for all ports */ port = (struct hipz_query_port *)rblock; h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port); @@ -684,7 +713,7 @@ static int __devinit ehca_probe(struct of_device *dev, struct ehca_shca *shca; const u64 *handle; struct ib_pd *ibpd; - int ret, i; + int ret, i, eq_size; handle = of_get_property(dev->node, "ibm,hca-handle", NULL); if (!handle) { @@ -705,6 +734,8 @@ static int __devinit ehca_probe(struct of_device *dev, return -ENOMEM; } mutex_init(&shca->modify_mutex); + atomic_set(&shca->num_cqs, 0); + atomic_set(&shca->num_qps, 0); for (i = 0; i < ARRAY_SIZE(shca->sport); i++) spin_lock_init(&shca->sport[i].mod_sqp_lock); @@ -724,8 +755,9 @@ static int __devinit ehca_probe(struct of_device *dev, goto probe1; } + eq_size = 2 * ehca_max_cq + 4 * ehca_max_qp; /* create event queues */ - ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, 2048); + ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size); if (ret) { ehca_err(&shca->ib_device, "Cannot create EQ."); goto probe1; diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 46ae4eb2c4e..f974367cad4 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -323,7 +323,7 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } e_mr->umem = ib_umem_get(pd->uobject->context, start, length, - mr_access_flags); + mr_access_flags, 0); if (IS_ERR(e_mr->umem)) { ib_mr = (void *)e_mr->umem; goto reg_user_mr_exit1; diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 57bef1152cc..3f59587338e 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -421,8 +421,18 @@ static struct ehca_qp *internal_create_qp( u32 swqe_size = 0, rwqe_size = 0, ib_qp_num; unsigned long flags; - if (init_attr->create_flags) + if (!atomic_add_unless(&shca->num_qps, 1, ehca_max_qp)) { + ehca_err(pd->device, "Unable to create QP, max number of %i " + "QPs reached.", ehca_max_qp); + ehca_err(pd->device, "To increase the maximum number of QPs " + "use the number_of_qps module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + + if (init_attr->create_flags) { + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); + } memset(&parms, 0, sizeof(parms)); qp_type = init_attr->qp_type; @@ -431,6 +441,7 @@ static struct ehca_qp *internal_create_qp( init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed", init_attr->sq_sig_type); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } @@ -455,6 +466,7 @@ static struct ehca_qp *internal_create_qp( if (is_llqp && has_srq) { ehca_err(pd->device, "LLQPs can't have an SRQ"); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } @@ -466,6 +478,7 @@ static struct ehca_qp *internal_create_qp( ehca_err(pd->device, "no more than three SGEs " "supported for SRQ pd=%p max_sge=%x", pd, init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } } @@ -477,6 +490,7 @@ static struct ehca_qp *internal_create_qp( qp_type != IB_QPT_SMI && qp_type != IB_QPT_GSI) { ehca_err(pd->device, "wrong QP Type=%x", qp_type); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } @@ -490,6 +504,7 @@ static struct ehca_qp *internal_create_qp( "or max_rq_wr=%x for RC LLQP", init_attr->cap.max_send_wr, init_attr->cap.max_recv_wr); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } break; @@ -497,6 +512,7 @@ static struct ehca_qp *internal_create_qp( if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) { ehca_err(pd->device, "UD LLQP not supported " "by this adapter"); + atomic_dec(&shca->num_qps); return ERR_PTR(-ENOSYS); } if (!(init_attr->cap.max_send_sge <= 5 @@ -508,20 +524,22 @@ static struct ehca_qp *internal_create_qp( "or max_recv_sge=%x for UD LLQP", init_attr->cap.max_send_sge, init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } else if (init_attr->cap.max_send_wr > 255) { ehca_err(pd->device, "Invalid Number of " "max_send_wr=%x for UD QP_TYPE=%x", init_attr->cap.max_send_wr, qp_type); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } break; default: ehca_err(pd->device, "unsupported LL QP Type=%x", qp_type); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); - break; } } else { int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI @@ -533,6 +551,7 @@ static struct ehca_qp *internal_create_qp( "send_sge=%x recv_sge=%x max_sge=%x", init_attr->cap.max_send_sge, init_attr->cap.max_recv_sge, max_sge); + atomic_dec(&shca->num_qps); return ERR_PTR(-EINVAL); } } @@ -543,9 +562,12 @@ static struct ehca_qp *internal_create_qp( my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL); if (!my_qp) { ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd); + atomic_dec(&shca->num_qps); return ERR_PTR(-ENOMEM); } + atomic_set(&my_qp->nr_events, 0); + init_waitqueue_head(&my_qp->wait_completion); spin_lock_init(&my_qp->spinlock_s); spin_lock_init(&my_qp->spinlock_r); my_qp->qp_type = qp_type; @@ -823,6 +845,7 @@ create_qp_exit1: create_qp_exit0: kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); return ERR_PTR(ret); } @@ -1913,6 +1936,9 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, idr_remove(&ehca_qp_idr, my_qp->token); write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + /* now wait until all pending events have completed */ + wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events)); + h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); if (h_ret != H_SUCCESS) { ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%li " @@ -1948,6 +1974,7 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, if (HAS_SQ(my_qp)) ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); return 0; } diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index acf30c06a0c..daad09a4591 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1197,7 +1197,7 @@ void ipath_kreceive(struct ipath_portdata *pd) } reloop: - for (last = 0, i = 1; !last; i++) { + for (last = 0, i = 1; !last; i += !last) { hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); eflags = ipath_hdrget_err_flags(rhf_addr); etype = ipath_hdrget_rcv_type(rhf_addr); @@ -1428,6 +1428,40 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) spin_unlock_irqrestore(&ipath_pioavail_lock, flags); } +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). + */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. + */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, oldval, dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + /** * ipath_setrcvhdrsize - set the receive header size * @dd: the infinipath device @@ -1482,9 +1516,12 @@ static noinline void no_pio_bufs(struct ipath_devdata *dd) */ ipath_stats.sps_nopiobufs++; if (!(++dd->ipath_consec_nopiobuf % 100000)) { - ipath_dbg("%u pio sends with no bufavail; dmacopy: " - "%llx %llx %llx %llx; shadow: %lx %lx %lx %lx\n", + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), (unsigned long long) le64_to_cpu(dma[0]), (unsigned long long) le64_to_cpu(dma[1]), (unsigned long long) le64_to_cpu(dma[2]), @@ -1496,14 +1533,17 @@ static noinline void no_pio_bufs(struct ipath_devdata *dd) */ if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > (sizeof(shadow[0]) * 4 * 4)) - ipath_dbg("2nd group: dmacopy: %llx %llx " - "%llx %llx; shadow: %lx %lx %lx %lx\n", + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", (unsigned long long)le64_to_cpu(dma[4]), (unsigned long long)le64_to_cpu(dma[5]), (unsigned long long)le64_to_cpu(dma[6]), (unsigned long long)le64_to_cpu(dma[7]), - shadow[4], shadow[5], shadow[6], - shadow[7]); + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); } } @@ -1652,19 +1692,46 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, unsigned len, int avail) { unsigned long flags; - unsigned end; + unsigned end, cnt = 0, next; /* There are two bits per send buffer (busy and generation) */ start *= 2; - len *= 2; - end = start + len; + end = start + len * 2; - /* Set or clear the generation bits. */ spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ while (start < end) { if (avail) { - __clear_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, - dd->ipath_pioavailshadow); + unsigned long dma; + int i, im; + /* + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); __set_bit(start, dd->ipath_pioavailkernel); } else { __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, @@ -1673,7 +1740,44 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, } start += 2; } + + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + next = find_first_bit(dd->ipath_pioavailkernel, end); + while (next < end) { + cnt++; + next = find_next_bit(dd->ipath_pioavailkernel, end, + next + 1); + } + } spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + /* + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. + */ + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } } /** @@ -1790,12 +1894,12 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) */ if (dd->ipath_flags & IPATH_HAS_SEND_DMA) { int skip_cancel; - u64 *statp = &dd->ipath_sdma_status; + unsigned long *statp = &dd->ipath_sdma_status; spin_lock_irqsave(&dd->ipath_sdma_lock, flags); skip_cancel = - !test_bit(IPATH_SDMA_DISABLED, statp) && - test_and_set_bit(IPATH_SDMA_ABORTING, statp); + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); if (skip_cancel) goto bail; @@ -1826,6 +1930,9 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) ipath_disarm_piobufs(dd, 0, dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + if (restore_sendctrl) { /* else done by caller later if needed */ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); @@ -1845,7 +1952,6 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) /* only wait so long for intr */ dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; dd->ipath_sdma_reset_wait = 200; - __set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) tasklet_hi_schedule(&dd->ipath_sdma_abort_task); spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); @@ -2510,7 +2616,7 @@ int ipath_reset_device(int unit) ipath_dbg("unit %u port %d is in use " "(PID %u cmd %s), can't reset\n", unit, i, - dd->ipath_pd[i]->port_pid, + pid_nr(dd->ipath_pd[i]->port_pid), dd->ipath_pd[i]->port_comm); ret = -EBUSY; goto bail; @@ -2548,19 +2654,21 @@ bail: static int ipath_signal_procs(struct ipath_devdata *dd, int sig) { int i, sub, any = 0; - pid_t pid; + struct pid *pid; if (!dd->ipath_pd) return 0; for (i = 1; i < dd->ipath_cfgports; i++) { - if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt || - !dd->ipath_pd[i]->port_pid) + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) continue; pid = dd->ipath_pd[i]->port_pid; + if (!pid) + continue; + dev_info(&dd->pcidev->dev, "context %d in use " "(PID %u), sending signal %d\n", - i, pid, sig); - kill_proc(pid, sig, 1); + i, pid_nr(pid), sig); + kill_pid(pid, sig, 1); any++; for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) { pid = dd->ipath_pd[i]->port_subpid[sub]; @@ -2568,8 +2676,8 @@ static int ipath_signal_procs(struct ipath_devdata *dd, int sig) continue; dev_info(&dd->pcidev->dev, "sub-context " "%d:%d in use (PID %u), sending " - "signal %d\n", i, sub, pid, sig); - kill_proc(pid, sig, 1); + "signal %d\n", i, sub, pid_nr(pid), sig); + kill_pid(pid, sig, 1); any++; } } diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 8b1752202e7..b472b15637f 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -173,47 +173,25 @@ static int ipath_get_base_info(struct file *fp, (void *) dd->ipath_statusp - (void *) dd->ipath_pioavailregs_dma; if (!shared) { - kinfo->spi_piocnt = dd->ipath_pbufsport; + kinfo->spi_piocnt = pd->port_piocnt; kinfo->spi_piobufbase = (u64) pd->port_piobufs; kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; } else if (master) { - kinfo->spi_piocnt = (dd->ipath_pbufsport / subport_cnt) + - (dd->ipath_pbufsport % subport_cnt); + kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) + + (pd->port_piocnt % subport_cnt); /* Master's PIO buffers are after all the slave's */ kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * - (dd->ipath_pbufsport - kinfo->spi_piocnt); + (pd->port_piocnt - kinfo->spi_piocnt); } else { unsigned slave = subport_fp(fp) - 1; - kinfo->spi_piocnt = dd->ipath_pbufsport / subport_cnt; + kinfo->spi_piocnt = pd->port_piocnt / subport_cnt; kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * kinfo->spi_piocnt * slave; } - /* - * Set the PIO avail update threshold to no larger - * than the number of buffers per process. Note that - * we decrease it here, but won't ever increase it. - */ - if (dd->ipath_pioupd_thresh && - kinfo->spi_piocnt < dd->ipath_pioupd_thresh) { - unsigned long flags; - - dd->ipath_pioupd_thresh = kinfo->spi_piocnt; - ipath_dbg("Decreased pio update threshold to %u\n", - dd->ipath_pioupd_thresh); - spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); - dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK - << INFINIPATH_S_UPDTHRESH_SHIFT); - dd->ipath_sendctrl |= dd->ipath_pioupd_thresh - << INFINIPATH_S_UPDTHRESH_SHIFT; - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); - spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - } - if (shared) { kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; @@ -577,7 +555,7 @@ static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport, p = dd->ipath_pageshadow[porttid + tid]; dd->ipath_pageshadow[porttid + tid] = NULL; ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n", - pd->port_pid, tid); + pid_nr(pd->port_pid), tid); dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED, dd->ipath_tidinvalid); @@ -1309,19 +1287,19 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; if (!pd->port_subport_cnt) { /* port is not shared */ - piocnt = dd->ipath_pbufsport; + piocnt = pd->port_piocnt; piobufs = pd->port_piobufs; } else if (!subport_fp(fp)) { /* caller is the master */ - piocnt = (dd->ipath_pbufsport / pd->port_subport_cnt) + - (dd->ipath_pbufsport % pd->port_subport_cnt); + piocnt = (pd->port_piocnt / pd->port_subport_cnt) + + (pd->port_piocnt % pd->port_subport_cnt); piobufs = pd->port_piobufs + - dd->ipath_palign * (dd->ipath_pbufsport - piocnt); + dd->ipath_palign * (pd->port_piocnt - piocnt); } else { unsigned slave = subport_fp(fp) - 1; /* caller is a slave */ - piocnt = dd->ipath_pbufsport / pd->port_subport_cnt; + piocnt = pd->port_piocnt / pd->port_subport_cnt; piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; } @@ -1631,11 +1609,8 @@ static int try_alloc_port(struct ipath_devdata *dd, int port, port); pd->port_cnt = 1; port_fp(fp) = pd; - pd->port_pid = current->pid; + pd->port_pid = get_pid(task_pid(current)); strncpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); - ipath_chg_pioavailkernel(dd, - dd->ipath_pbufsport * (pd->port_port - 1), - dd->ipath_pbufsport, 0); ipath_stats.sps_ports++; ret = 0; } else @@ -1818,14 +1793,15 @@ static int find_shared_port(struct file *fp, } port_fp(fp) = pd; subport_fp(fp) = pd->port_cnt++; - pd->port_subpid[subport_fp(fp)] = current->pid; + pd->port_subpid[subport_fp(fp)] = + get_pid(task_pid(current)); tidcursor_fp(fp) = 0; pd->active_slaves |= 1 << subport_fp(fp); ipath_cdbg(PROC, "%s[%u] %u sharing %s[%u] unit:port %u:%u\n", current->comm, current->pid, subport_fp(fp), - pd->port_comm, pd->port_pid, + pd->port_comm, pid_nr(pd->port_pid), dd->ipath_unit, pd->port_port); ret = 1; goto done; @@ -1938,11 +1914,25 @@ static int ipath_do_user_init(struct file *fp, /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + /* some ports may get extra buffers, calculate that here */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_piocnt = dd->ipath_pbufsport + 1; + else + pd->port_piocnt = dd->ipath_pbufsport; + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_pio_base = (dd->ipath_pbufsport + 1) + * (pd->port_port - 1); + else + pd->port_pio_base = dd->ipath_ports_extrabuf + + dd->ipath_pbufsport * (pd->port_port - 1); pd->port_piobufs = dd->ipath_piobufbase + - dd->ipath_pbufsport * (pd->port_port - 1) * dd->ipath_palign; - ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n", - pd->port_port, pd->port_piobufs); + pd->port_pio_base * dd->ipath_palign; + ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u," + " first pio %u\n", pd->port_port, pd->port_piobufs, + pd->port_piocnt, pd->port_pio_base); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0); /* * Now allocate the rcvhdr Q and eager TIDs; skip the TID @@ -2077,7 +2067,8 @@ static int ipath_close(struct inode *in, struct file *fp) * the slave(s) don't wait for receive data forever. */ pd->active_slaves &= ~(1 << fd->subport); - pd->port_subpid[fd->subport] = 0; + put_pid(pd->port_subpid[fd->subport]); + pd->port_subpid[fd->subport] = NULL; mutex_unlock(&ipath_mutex); goto bail; } @@ -2085,7 +2076,7 @@ static int ipath_close(struct inode *in, struct file *fp) if (pd->port_hdrqfull) { ipath_cdbg(PROC, "%s[%u] had %u rcvhdrqfull errors " - "during run\n", pd->port_comm, pd->port_pid, + "during run\n", pd->port_comm, pid_nr(pd->port_pid), pd->port_hdrqfull); pd->port_hdrqfull = 0; } @@ -2107,7 +2098,6 @@ static int ipath_close(struct inode *in, struct file *fp) } if (dd->ipath_kregbase) { - int i; /* atomically clear receive enable port and intr avail. */ clear_bit(dd->ipath_r_portenable_shift + port, &dd->ipath_rcvctrl); @@ -2136,9 +2126,9 @@ static int ipath_close(struct inode *in, struct file *fp) ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, pd->port_port, dd->ipath_dummy_hdrq_phys); - i = dd->ipath_pbufsport * (port - 1); - ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport); - ipath_chg_pioavailkernel(dd, i, dd->ipath_pbufsport, 1); + ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, + pd->port_piocnt, 1); dd->ipath_f_clear_tids(dd, pd->port_port); @@ -2146,11 +2136,12 @@ static int ipath_close(struct inode *in, struct file *fp) unlock_expected_tids(pd); ipath_stats.sps_ports--; ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", - pd->port_comm, pd->port_pid, + pd->port_comm, pid_nr(pd->port_pid), dd->ipath_unit, port); } - pd->port_pid = 0; + put_pid(pd->port_pid); + pd->port_pid = NULL; dd->ipath_pd[pd->port_port] = NULL; /* before releasing mutex */ mutex_unlock(&ipath_mutex); ipath_free_pddata(dd, pd); /* after releasing the mutex */ diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index e3ec0d1bdf5..8eee7830f04 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -595,7 +595,7 @@ static void ipath_7220_txe_recover(struct ipath_devdata *dd) dev_info(&dd->pcidev->dev, "Recovering from TXE PIO parity error\n"); - ipath_disarm_senderrbufs(dd, 1); + ipath_disarm_senderrbufs(dd); } @@ -675,10 +675,8 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { /* - * Parity errors in send memory are recoverable, - * just cancel the send (if indicated in * sendbuffererror), - * count the occurrence, unfreeze (if no other handled - * hardware error bits are set), and continue. + * Parity errors in send memory are recoverable by h/w + * just do housekeeping, exit freeze mode and continue. */ if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) @@ -687,13 +685,6 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); - if (!hwerrs) { - /* else leave in freeze mode */ - ipath_write_kreg(dd, - dd->ipath_kregs->kr_control, - dd->ipath_control); - goto bail; - } } if (hwerrs) { /* @@ -723,8 +714,8 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, *dd->ipath_statusp |= IPATH_STATUS_HWERROR; dd->ipath_flags &= ~IPATH_INITTED; } else { - ipath_dbg("Clearing freezemode on ignored hardware " - "error\n"); + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); ipath_clear_freeze(dd); } } @@ -870,8 +861,9 @@ static int ipath_7220_boardname(struct ipath_devdata *dd, char *name, "revision %u.%u!\n", dd->ipath_majrev, dd->ipath_minrev); ret = 1; - } else if (dd->ipath_minrev == 1) { - /* Rev1 chips are prototype. Complain, but allow use */ + } else if (dd->ipath_minrev == 1 && + !(dd->ipath_flags & IPATH_INITTED)) { + /* Rev1 chips are prototype. Complain at init, but allow use */ ipath_dev_err(dd, "Unsupported hardware " "revision %u.%u, Contact support@qlogic.com\n", dd->ipath_majrev, dd->ipath_minrev); @@ -1966,7 +1958,7 @@ static void ipath_7220_config_ports(struct ipath_devdata *dd, ushort cfgports) dd->ipath_rcvctrl); dd->ipath_p0_rcvegrcnt = 2048; /* always */ if (dd->ipath_flags & IPATH_HAS_SEND_DMA) - dd->ipath_pioreserved = 1; /* reserve a buffer */ + dd->ipath_pioreserved = 3; /* kpiobufs used for PIO */ } diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 27dd8947666..3e5baa43fc8 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -41,7 +41,7 @@ /* * min buffers we want to have per port, after driver */ -#define IPATH_MIN_USER_PORT_BUFCNT 8 +#define IPATH_MIN_USER_PORT_BUFCNT 7 /* * Number of ports we are configured to use (to allow for more pio @@ -54,13 +54,9 @@ MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); /* * Number of buffers reserved for driver (verbs and layered drivers.) - * Reserved at end of buffer list. Initialized based on - * number of PIO buffers if not set via module interface. + * Initialized based on number of PIO buffers if not set via module interface. * The problem with this is that it's global, but we'll use different - * numbers for different chip types. So the default value is not - * very useful. I've redefined it for the 1.3 release so that it's - * zero unless set by the user to something else, in which case we - * try to respect it. + * numbers for different chip types. */ static ushort ipath_kpiobufs; @@ -546,9 +542,12 @@ static void enable_chip(struct ipath_devdata *dd, int reinit) pioavail = dd->ipath_pioavailregs_dma[i ^ 1]; else pioavail = dd->ipath_pioavailregs_dma[i]; - dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail) | - (~dd->ipath_pioavailkernel[i] << - INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT); + /* + * don't need to worry about ipath_pioavailkernel here + * because we will call ipath_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed + */ + dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail); } /* can get counters, stats, etc. */ dd->ipath_flags |= IPATH_PRESENT; @@ -708,12 +707,11 @@ static void verify_interrupt(unsigned long opaque) int ipath_init_chip(struct ipath_devdata *dd, int reinit) { int ret = 0; - u32 val32, kpiobufs; + u32 kpiobufs, defkbufs; u32 piobufs, uports; u64 val; struct ipath_portdata *pd; gfp_t gfp_flags = GFP_USER | __GFP_COMP; - unsigned long flags; ret = init_housekeeping(dd, reinit); if (ret) @@ -753,56 +751,46 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / (sizeof(u64) * BITS_PER_BYTE / 2); uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0; - if (ipath_kpiobufs == 0) { - /* not set by user (this is default) */ - if (piobufs > 144) - kpiobufs = 32; - else - kpiobufs = 16; - } + if (piobufs > 144) + defkbufs = 32 + dd->ipath_pioreserved; else - kpiobufs = ipath_kpiobufs; + defkbufs = 16 + dd->ipath_pioreserved; - if (kpiobufs + (uports * IPATH_MIN_USER_PORT_BUFCNT) > piobufs) { + if (ipath_kpiobufs && (ipath_kpiobufs + + (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) { int i = (int) piobufs - (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); if (i < 1) i = 1; dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " "%d for kernel leaves too few for %d user ports " - "(%d each); using %u\n", kpiobufs, + "(%d each); using %u\n", ipath_kpiobufs, piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); /* * shouldn't change ipath_kpiobufs, because could be * different for different devices... */ kpiobufs = i; - } + } else if (ipath_kpiobufs) + kpiobufs = ipath_kpiobufs; + else + kpiobufs = defkbufs; dd->ipath_lastport_piobuf = piobufs - kpiobufs; dd->ipath_pbufsport = uports ? dd->ipath_lastport_piobuf / uports : 0; - val32 = dd->ipath_lastport_piobuf - (dd->ipath_pbufsport * uports); - if (val32 > 0) { - ipath_dbg("allocating %u pbufs/port leaves %u unused, " - "add to kernel\n", dd->ipath_pbufsport, val32); - dd->ipath_lastport_piobuf -= val32; - kpiobufs += val32; - ipath_dbg("%u pbufs/port leaves %u unused, add to kernel\n", - dd->ipath_pbufsport, val32); - } + /* if not an even divisor, some user ports get extra buffers */ + dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * uports); + if (dd->ipath_ports_extrabuf) + ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to " + "ports <= %u\n", dd->ipath_pbufsport, + dd->ipath_ports_extrabuf); dd->ipath_lastpioindex = 0; dd->ipath_lastpioindexl = dd->ipath_piobcnt2k; - ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + /* ipath_pioavailshadow initialized earlier */ ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " "each for %u user ports\n", kpiobufs, piobufs, dd->ipath_pbufsport, uports); - if (dd->ipath_pioupd_thresh) { - if (dd->ipath_pbufsport < dd->ipath_pioupd_thresh) - dd->ipath_pioupd_thresh = dd->ipath_pbufsport; - if (kpiobufs < dd->ipath_pioupd_thresh) - dd->ipath_pioupd_thresh = kpiobufs; - } - ret = dd->ipath_f_early_init(dd); if (ret) { ipath_dev_err(dd, "Early initialization failure\n"); @@ -810,13 +798,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) } /* - * Cancel any possible active sends from early driver load. - * Follows early_init because some chips have to initialize - * PIO buffers in early_init to avoid false parity errors. - */ - ipath_cancel_sends(dd, 0); - - /* * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be * done after early_init. */ @@ -836,6 +817,7 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, dd->ipath_pioavailregs_phys); + /* * this is to detect s/w errors, which the h/w works around by * ignoring the low 6 bits of address, if it wasn't aligned. @@ -862,12 +844,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); - spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); - dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE; - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); - ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - /* * before error clears, since we expect serdes pll errors during * this, the first time after reset @@ -940,6 +916,19 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) else enable_chip(dd, reinit); + /* after enable_chip, so pioavailshadow setup */ + ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + + /* + * Cancel any possible active sends from early driver load. + * Follows early_init because some chips have to initialize + * PIO buffers in early_init to avoid false parity errors. + * After enable and ipath_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE; packets are now + * ready to go out. + */ + ipath_cancel_sends(dd, 1); + if (!reinit) { /* * Used when we close a port, for DMA already in flight diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 1b58f4737c7..26900b3b7a4 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -38,42 +38,12 @@ #include "ipath_verbs.h" #include "ipath_common.h" -/* - * clear (write) a pio buffer, to clear a parity error. This routine - * should only be called when in freeze mode, and the buffer should be - * canceled afterwards. - */ -static void ipath_clrpiobuf(struct ipath_devdata *dd, u32 pnum) -{ - u32 __iomem *pbuf; - u32 dwcnt; /* dword count to write */ - if (pnum < dd->ipath_piobcnt2k) { - pbuf = (u32 __iomem *) (dd->ipath_pio2kbase + pnum * - dd->ipath_palign); - dwcnt = dd->ipath_piosize2k >> 2; - } - else { - pbuf = (u32 __iomem *) (dd->ipath_pio4kbase + - (pnum - dd->ipath_piobcnt2k) * dd->ipath_4kalign); - dwcnt = dd->ipath_piosize4k >> 2; - } - dev_info(&dd->pcidev->dev, - "Rewrite PIO buffer %u, to recover from parity error\n", - pnum); - - /* no flush required, since already in freeze */ - writel(dwcnt + 1, pbuf); - while (--dwcnt) - writel(0, pbuf++); -} /* * Called when we might have an error that is specific to a particular * PIO buffer, and may need to cancel that buffer, so it can be re-used. - * If rewrite is true, and bits are set in the sendbufferror registers, - * we'll write to the buffer, for error recovery on parity errors. */ -void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite) +void ipath_disarm_senderrbufs(struct ipath_devdata *dd) { u32 piobcnt; unsigned long sbuf[4]; @@ -109,11 +79,8 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite) } for (i = 0; i < piobcnt; i++) - if (test_bit(i, sbuf)) { - if (rewrite) - ipath_clrpiobuf(dd, i); + if (test_bit(i, sbuf)) ipath_disarm_piobufs(dd, i, 1); - } /* ignore armlaunch errs for a bit */ dd->ipath_lastcancel = jiffies+3; } @@ -164,7 +131,7 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) { u64 ignore_this_time = 0; - ipath_disarm_senderrbufs(dd, 0); + ipath_disarm_senderrbufs(dd); if ((errs & E_SUM_LINK_PKTERRS) && !(dd->ipath_flags & IPATH_LINKACTIVE)) { /* @@ -909,8 +876,8 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * processes (causing armlaunch), send errors due to going into freeze mode, * etc., and try to avoid causing extra interrupts while doing so. * Forcibly update the in-memory pioavail register copies after cleanup - * because the chip won't do it for anything changing while in freeze mode - * (we don't want to wait for the next pio buffer state change). + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). * Make sure that we don't lose any important interrupts by using the chip * feature that says that writing 0 to a bit in *clear that is set in * *status will cause an interrupt to be generated again (if allowed by @@ -918,44 +885,23 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) */ void ipath_clear_freeze(struct ipath_devdata *dd) { - int i, im; - u64 val; - /* disable error interrupts, to avoid confusion */ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL); /* also disable interrupts; errormask is sometimes overwriten */ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); - /* - * clear all sends, because they have may been - * completed by usercode while in freeze mode, and - * therefore would not be sent, and eventually - * might cause the process to run out of bufs - */ - ipath_cancel_sends(dd, 0); + ipath_cancel_sends(dd, 1); + + /* clear the freeze, and be sure chip saw it */ ipath_write_kreg(dd, dd->ipath_kregs->kr_control, dd->ipath_control); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - /* ensure pio avail updates continue */ + /* force in-memory update now we are out of freeze */ ipath_force_pio_avail_update(dd); /* - * We just enabled pioavailupdate, so dma copy is almost certainly - * not yet right, so read the registers directly. Similar to init - */ - for (i = 0; i < dd->ipath_pioavregs; i++) { - /* deal with 6110 chip bug */ - im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? - i ^ 1 : i; - val = ipath_read_kreg64(dd, (0x1000 / sizeof(u64)) + im); - dd->ipath_pioavailregs_dma[i] = cpu_to_le64(val); - dd->ipath_pioavailshadow[i] = val | - (~dd->ipath_pioavailkernel[i] << - INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT); - } - - /* * force new interrupt if any hwerr, error or interrupt bits are * still set, and clear "safe" send packet errors related to freeze * and cancelling sends. Re-enable error interrupts before possible @@ -1312,10 +1258,8 @@ irqreturn_t ipath_intr(int irq, void *data) ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) - handle_layer_pioavail(dd); - else - ipath_dbg("unexpected BUFAVAIL intr\n"); + /* always process; sdma verbs uses PIO for acks and VL15 */ + handle_layer_pioavail(dd); } ret = IRQ_HANDLED; diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 202337ae90d..59a8b254b97 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -117,6 +117,10 @@ struct ipath_portdata { u16 port_subport_cnt; /* non-zero if port is being shared. */ u16 port_subport_id; + /* number of pio bufs for this port (all procs, if shared) */ + u32 port_piocnt; + /* first pio buffer for this port */ + u32 port_pio_base; /* chip offset of PIO buffers for this port */ u32 port_piobufs; /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ @@ -155,8 +159,8 @@ struct ipath_portdata { /* saved total number of polled urgent packets for poll edge trigger */ u32 port_urgent_poll; /* pid of process using this port */ - pid_t port_pid; - pid_t port_subpid[INFINIPATH_MAX_SUBPORT]; + struct pid *port_pid; + struct pid *port_subpid[INFINIPATH_MAX_SUBPORT]; /* same size as task_struct .comm[] */ char port_comm[16]; /* pkeys set by this use of this port */ @@ -384,6 +388,8 @@ struct ipath_devdata { u32 ipath_lastrpkts; /* pio bufs allocated per port */ u32 ipath_pbufsport; + /* if remainder on bufs/port, ports < extrabuf get 1 extra */ + u32 ipath_ports_extrabuf; u32 ipath_pioupd_thresh; /* update threshold, some chips */ /* * number of ports configured as max; zero is set to number chip @@ -477,7 +483,7 @@ struct ipath_devdata { /* SendDMA related entries */ spinlock_t ipath_sdma_lock; - u64 ipath_sdma_status; + unsigned long ipath_sdma_status; unsigned long ipath_sdma_abort_jiffies; unsigned long ipath_sdma_abort_intr_timeout; unsigned long ipath_sdma_buf_jiffies; @@ -816,8 +822,8 @@ struct ipath_devdata { #define IPATH_SDMA_DISARMED 1 #define IPATH_SDMA_DISABLED 2 #define IPATH_SDMA_LAYERBUF 3 -#define IPATH_SDMA_RUNNING 62 -#define IPATH_SDMA_SHUTDOWN 63 +#define IPATH_SDMA_RUNNING 30 +#define IPATH_SDMA_SHUTDOWN 31 /* bit combinations that correspond to abort states */ #define IPATH_SDMA_ABORT_NONE 0 @@ -1011,7 +1017,7 @@ void ipath_get_eeprom_info(struct ipath_devdata *); int ipath_update_eeprom_log(struct ipath_devdata *dd); void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); -void ipath_disarm_senderrbufs(struct ipath_devdata *, int); +void ipath_disarm_senderrbufs(struct ipath_devdata *); void ipath_force_pio_avail_update(struct ipath_devdata *); void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index db4ba92f79f..9d343b7c2f3 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -195,7 +195,8 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto bail; } - umem = ib_umem_get(pd->uobject->context, start, length, mr_access_flags); + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); if (IS_ERR(umem)) return (void *) umem; diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index dd5b6e9d57c..4715911101e 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -242,7 +242,6 @@ static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) { struct ipath_qp *q, **qpp; unsigned long flags; - int fnd = 0; spin_lock_irqsave(&qpt->lock, flags); @@ -253,51 +252,40 @@ static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) *qpp = qp->next; qp->next = NULL; atomic_dec(&qp->refcount); - fnd = 1; break; } } spin_unlock_irqrestore(&qpt->lock, flags); - - if (!fnd) - return; - - free_qpn(qpt, qp->ibqp.qp_num); - - wait_event(qp->wait, !atomic_read(&qp->refcount)); } /** - * ipath_free_all_qps - remove all QPs from the table + * ipath_free_all_qps - check for QPs still in use * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. */ -void ipath_free_all_qps(struct ipath_qp_table *qpt) +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt) { unsigned long flags; - struct ipath_qp *qp, *nqp; - u32 n; + struct ipath_qp *qp; + u32 n, qp_inuse = 0; + spin_lock_irqsave(&qpt->lock, flags); for (n = 0; n < qpt->max; n++) { - spin_lock_irqsave(&qpt->lock, flags); qp = qpt->table[n]; qpt->table[n] = NULL; - spin_unlock_irqrestore(&qpt->lock, flags); - - while (qp) { - nqp = qp->next; - free_qpn(qpt, qp->ibqp.qp_num); - if (!atomic_dec_and_test(&qp->refcount) || - !ipath_destroy_qp(&qp->ibqp)) - ipath_dbg("QP memory leak!\n"); - qp = nqp; - } + + for (; qp; qp = qp->next) + qp_inuse++; } + spin_unlock_irqrestore(&qpt->lock, flags); - for (n = 0; n < ARRAY_SIZE(qpt->map); n++) { + for (n = 0; n < ARRAY_SIZE(qpt->map); n++) if (qpt->map[n].page) - free_page((unsigned long)qpt->map[n].page); - } + free_page((unsigned long) qpt->map[n].page); + return qp_inuse; } /** @@ -336,11 +324,12 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) qp->remote_qpn = 0; qp->qkey = 0; qp->qp_access_flags = 0; - qp->s_busy = 0; + atomic_set(&qp->s_dma_busy, 0); qp->s_flags &= IPATH_S_SIGNAL_REQ_WR; qp->s_hdrwords = 0; qp->s_wqe = NULL; qp->s_pkt_delay = 0; + qp->s_draining = 0; qp->s_psn = 0; qp->r_psn = 0; qp->r_msn = 0; @@ -353,7 +342,8 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) } qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; qp->r_nak_state = 0; - qp->r_wrid_valid = 0; + qp->r_aflags = 0; + qp->r_flags = 0; qp->s_rnr_timeout = 0; qp->s_head = 0; qp->s_tail = 0; @@ -361,7 +351,6 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) qp->s_last = 0; qp->s_ssn = 1; qp->s_lsn = 0; - qp->s_wait_credit = 0; memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); qp->r_head_ack_queue = 0; qp->s_tail_ack_queue = 0; @@ -370,17 +359,17 @@ static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) qp->r_rq.wq->head = 0; qp->r_rq.wq->tail = 0; } - qp->r_reuse_sge = 0; } /** - * ipath_error_qp - put a QP into an error state - * @qp: the QP to put into an error state + * ipath_error_qp - put a QP into the error state + * @qp: the QP to put into the error state * @err: the receive completion error to signal if a RWQE is active * * Flushes both send and receive work queues. * Returns true if last WQE event should be generated. * The QP s_lock should be held and interrupts disabled. + * If we are already in error state, just return. */ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) @@ -389,8 +378,10 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) struct ib_wc wc; int ret = 0; - ipath_dbg("QP%d/%d in error state (%d)\n", - qp->ibqp.qp_num, qp->remote_qpn, err); + if (qp->state == IB_QPS_ERR) + goto bail; + + qp->state = IB_QPS_ERR; spin_lock(&dev->pending_lock); if (!list_empty(&qp->timerwait)) @@ -399,39 +390,21 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) list_del_init(&qp->piowait); spin_unlock(&dev->pending_lock); - wc.vendor_err = 0; - wc.byte_len = 0; - wc.imm_data = 0; + /* Schedule the sending tasklet to drain the send work queue. */ + if (qp->s_last != qp->s_head) + ipath_schedule_send(qp); + + memset(&wc, 0, sizeof(wc)); wc.qp = &qp->ibqp; - wc.src_qp = 0; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; - if (qp->r_wrid_valid) { - qp->r_wrid_valid = 0; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) { wc.wr_id = qp->r_wr_id; - wc.opcode = IB_WC_RECV; wc.status = err; ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); } wc.status = IB_WC_WR_FLUSH_ERR; - while (qp->s_last != qp->s_head) { - struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); - - wc.wr_id = wqe->wr.wr_id; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1); - } - qp->s_cur = qp->s_tail = qp->s_head; - qp->s_hdrwords = 0; - qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; - if (qp->r_rq.wq) { struct ipath_rwq *wq; u32 head; @@ -447,7 +420,6 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; - wc.opcode = IB_WC_RECV; while (tail != head) { wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; if (++tail >= qp->r_rq.size) @@ -460,6 +432,7 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) } else if (qp->ibqp.event_handler) ret = 1; +bail: return ret; } @@ -478,11 +451,10 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct ipath_ibdev *dev = to_idev(ibqp->device); struct ipath_qp *qp = to_iqp(ibqp); enum ib_qp_state cur_state, new_state; - unsigned long flags; int lastwqe = 0; int ret; - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irq(&qp->s_lock); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; @@ -535,16 +507,42 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, switch (new_state) { case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + spin_lock_irq(&qp->s_lock); + } ipath_reset_qp(qp, ibqp->qp_type); break; + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + case IB_QPS_ERR: lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); break; default: + qp->state = new_state; break; - } if (attr_mask & IB_QP_PKEY_INDEX) @@ -597,8 +595,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) qp->s_max_rd_atomic = attr->max_rd_atomic; - qp->state = new_state; - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irq(&qp->s_lock); if (lastwqe) { struct ib_event ev; @@ -612,7 +609,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto bail; inval: - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irq(&qp->s_lock); ret = -EINVAL; bail: @@ -643,7 +640,7 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->pkey_index = qp->s_pkey_index; attr->alt_pkey_index = 0; attr->en_sqd_async_notify = 0; - attr->sq_draining = 0; + attr->sq_draining = qp->s_draining; attr->max_rd_atomic = qp->s_max_rd_atomic; attr->max_dest_rd_atomic = qp->r_max_rd_atomic; attr->min_rnr_timer = qp->r_min_rnr_timer; @@ -833,6 +830,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); init_waitqueue_head(&qp->wait); + init_waitqueue_head(&qp->wait_dma); tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp); INIT_LIST_HEAD(&qp->piowait); INIT_LIST_HEAD(&qp->timerwait); @@ -926,6 +924,7 @@ bail_ip: else vfree(qp->r_rq.wq); ipath_free_qp(&dev->qp_table, qp); + free_qpn(&dev->qp_table, qp->ibqp.qp_num); bail_qp: kfree(qp); bail_swq: @@ -947,41 +946,44 @@ int ipath_destroy_qp(struct ib_qp *ibqp) { struct ipath_qp *qp = to_iqp(ibqp); struct ipath_ibdev *dev = to_idev(ibqp->device); - unsigned long flags; - spin_lock_irqsave(&qp->s_lock, flags); - qp->state = IB_QPS_ERR; - spin_unlock_irqrestore(&qp->s_lock, flags); - spin_lock(&dev->n_qps_lock); - dev->n_qps_allocated--; - spin_unlock(&dev->n_qps_lock); + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + } else + spin_unlock_irq(&qp->s_lock); - /* Stop the sending tasklet. */ - tasklet_kill(&qp->s_task); + ipath_free_qp(&dev->qp_table, qp); if (qp->s_tx) { atomic_dec(&qp->refcount); if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) kfree(qp->s_tx->txreq.map_addr); + spin_lock_irq(&dev->pending_lock); + list_add(&qp->s_tx->txreq.list, &dev->txreq_free); + spin_unlock_irq(&dev->pending_lock); + qp->s_tx = NULL; } - /* Make sure the QP isn't on the timeout list. */ - spin_lock_irqsave(&dev->pending_lock, flags); - if (!list_empty(&qp->timerwait)) - list_del_init(&qp->timerwait); - if (!list_empty(&qp->piowait)) - list_del_init(&qp->piowait); - if (qp->s_tx) - list_add(&qp->s_tx->txreq.list, &dev->txreq_free); - spin_unlock_irqrestore(&dev->pending_lock, flags); + wait_event(qp->wait, !atomic_read(&qp->refcount)); - /* - * Make sure that the QP is not in the QPN table so receive - * interrupts will discard packets for this QP. XXX Also remove QP - * from multicast table. - */ - if (atomic_read(&qp->refcount) != 0) - ipath_free_qp(&dev->qp_table, qp); + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qp_table, qp->ibqp.qp_num); + spin_lock(&dev->n_qps_lock); + dev->n_qps_allocated--; + spin_unlock(&dev->n_qps_lock); if (qp->ip) kref_put(&qp->ip->ref, ipath_release_mmap_info); @@ -1026,48 +1028,6 @@ bail: } /** - * ipath_sqerror_qp - put a QP's send queue into an error state - * @qp: QP who's send queue will be put into an error state - * @wc: the WC responsible for putting the QP in this state - * - * Flushes the send work queue. - * The QP s_lock should be held and interrupts disabled. - */ - -void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc) -{ - struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); - - ipath_dbg("Send queue error on QP%d/%d: err: %d\n", - qp->ibqp.qp_num, qp->remote_qpn, wc->status); - - spin_lock(&dev->pending_lock); - if (!list_empty(&qp->timerwait)) - list_del_init(&qp->timerwait); - if (!list_empty(&qp->piowait)) - list_del_init(&qp->piowait); - spin_unlock(&dev->pending_lock); - - ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1); - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - - wc->status = IB_WC_WR_FLUSH_ERR; - - while (qp->s_last != qp->s_head) { - wqe = get_swqe_ptr(qp, qp->s_last); - wc->wr_id = wqe->wr.wr_id; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1); - if (++qp->s_last >= qp->s_size) - qp->s_last = 0; - } - qp->s_cur = qp->s_tail = qp->s_head; - qp->state = IB_QPS_SQE; -} - -/** * ipath_get_credit - flush the send work queue of a QP * @qp: the qp who's send work queue to flush * @aeth: the Acknowledge Extended Transport Header @@ -1093,9 +1053,10 @@ void ipath_get_credit(struct ipath_qp *qp, u32 aeth) } /* Restart sending if it was blocked due to lack of credits. */ - if (qp->s_cur != qp->s_head && + if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) && + qp->s_cur != qp->s_head && (qp->s_lsn == (u32) -1 || ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn, qp->s_lsn + 1) <= 0)) - tasklet_hi_schedule(&qp->s_task); + ipath_schedule_send(qp); } diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index c405dfba553..108df667d2e 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -92,6 +92,10 @@ static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp, u32 bth0; u32 bth2; + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto bail; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; @@ -238,14 +242,25 @@ int ipath_make_rc_req(struct ipath_qp *qp) ipath_make_rc_ack(dev, qp, ohdr, pmtu)) goto done; - if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) || - qp->s_rnr_timeout || qp->s_wait_credit) - goto bail; + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } - /* Limit the number of packets sent without an ACK. */ - if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) { - qp->s_wait_credit = 1; - dev->n_rc_stalls++; + /* Leave BUSY set until RNR timeout. */ + if (qp->s_rnr_timeout) { + qp->s_flags |= IPATH_S_WAITING; goto bail; } @@ -257,6 +272,9 @@ int ipath_make_rc_req(struct ipath_qp *qp) wqe = get_swqe_ptr(qp, qp->s_cur); switch (qp->s_state) { default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; /* * Resend an old request or start a new one. * @@ -294,8 +312,10 @@ int ipath_make_rc_req(struct ipath_qp *qp) case IB_WR_SEND_WITH_IMM: /* If no credit, return. */ if (qp->s_lsn != (u32) -1 && - ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; goto bail; + } wqe->lpsn = wqe->psn; if (len > pmtu) { wqe->lpsn += (len - 1) / pmtu; @@ -325,8 +345,10 @@ int ipath_make_rc_req(struct ipath_qp *qp) case IB_WR_RDMA_WRITE_WITH_IMM: /* If no credit, return. */ if (qp->s_lsn != (u32) -1 && - ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; goto bail; + } ohdr->u.rc.reth.vaddr = cpu_to_be64(wqe->wr.wr.rdma.remote_addr); ohdr->u.rc.reth.rkey = @@ -570,7 +592,11 @@ int ipath_make_rc_req(struct ipath_qp *qp) ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2); done: ret = 1; + goto unlock; + bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: spin_unlock_irqrestore(&qp->s_lock, flags); return ret; } @@ -606,7 +632,11 @@ static void send_rc_ack(struct ipath_qp *qp) spin_unlock_irqrestore(&qp->s_lock, flags); + /* Don't try to send ACKs if the link isn't ACTIVE */ dd = dev->dd; + if (!(dd->ipath_flags & IPATH_LINKACTIVE)) + goto done; + piobuf = ipath_getpiobuf(dd, 0, NULL); if (!piobuf) { /* @@ -668,15 +698,16 @@ static void send_rc_ack(struct ipath_qp *qp) goto done; queue_ack: - dev->n_rc_qacks++; - qp->s_flags |= IPATH_S_ACK_PENDING; - qp->s_nak_state = qp->r_nak_state; - qp->s_ack_psn = qp->r_ack_psn; + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) { + dev->n_rc_qacks++; + qp->s_flags |= IPATH_S_ACK_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + } spin_unlock_irqrestore(&qp->s_lock, flags); - - /* Call ipath_do_rc_send() in another thread. */ - tasklet_hi_schedule(&qp->s_task); - done: return; } @@ -735,7 +766,7 @@ static void reset_psn(struct ipath_qp *qp, u32 psn) /* * Set the state to restart in the middle of a request. * Don't change the s_sge, s_cur_sge, or s_cur_size. - * See ipath_do_rc_send(). + * See ipath_make_rc_req(). */ switch (opcode) { case IB_WR_SEND: @@ -771,27 +802,14 @@ done: * * The QP s_lock should be held and interrupts disabled. */ -void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) +void ipath_restart_rc(struct ipath_qp *qp, u32 psn) { struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); struct ipath_ibdev *dev; if (qp->s_retry == 0) { - wc->wr_id = wqe->wr.wr_id; - wc->status = IB_WC_RETRY_EXC_ERR; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc->vendor_err = 0; - wc->byte_len = 0; - wc->qp = &qp->ibqp; - wc->imm_data = 0; - wc->src_qp = qp->remote_qpn; - wc->wc_flags = 0; - wc->pkey_index = 0; - wc->slid = qp->remote_ah_attr.dlid; - wc->sl = qp->remote_ah_attr.sl; - wc->dlid_path_bits = 0; - wc->port_num = 0; - ipath_sqerror_qp(qp, wc); + ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); goto bail; } qp->s_retry--; @@ -804,6 +822,8 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) spin_lock(&dev->pending_lock); if (!list_empty(&qp->timerwait)) list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); spin_unlock(&dev->pending_lock); if (wqe->wr.opcode == IB_WR_RDMA_READ) @@ -812,7 +832,7 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK; reset_psn(qp, psn); - tasklet_hi_schedule(&qp->s_task); + ipath_schedule_send(qp); bail: return; @@ -820,13 +840,7 @@ bail: static inline void update_last_psn(struct ipath_qp *qp, u32 psn) { - if (qp->s_last_psn != psn) { - qp->s_last_psn = psn; - if (qp->s_wait_credit) { - qp->s_wait_credit = 0; - tasklet_hi_schedule(&qp->s_task); - } - } + qp->s_last_psn = psn; } /** @@ -845,6 +859,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ib_wc wc; + enum ib_wc_status status; struct ipath_swqe *wqe; int ret = 0; u32 ack_psn; @@ -909,7 +924,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, */ update_last_psn(qp, wqe->psn - 1); /* Retry this request. */ - ipath_restart_rc(qp, wqe->psn, &wc); + ipath_restart_rc(qp, wqe->psn); /* * No need to process the ACK/NAK since we are * restarting an earlier request. @@ -925,32 +940,23 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { qp->s_num_rd_atomic--; /* Restart sending task if fence is complete */ - if ((qp->s_flags & IPATH_S_FENCE_PENDING) && - !qp->s_num_rd_atomic) { - qp->s_flags &= ~IPATH_S_FENCE_PENDING; - tasklet_hi_schedule(&qp->s_task); - } else if (qp->s_flags & IPATH_S_RDMAR_PENDING) { - qp->s_flags &= ~IPATH_S_RDMAR_PENDING; - tasklet_hi_schedule(&qp->s_task); - } + if (((qp->s_flags & IPATH_S_FENCE_PENDING) && + !qp->s_num_rd_atomic) || + qp->s_flags & IPATH_S_RDMAR_PENDING) + ipath_schedule_send(qp); } /* Post a send completion queue entry if requested. */ if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof wc); wc.wr_id = wqe->wr.wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; wc.byte_len = wqe->length; - wc.imm_data = 0; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - wc.wc_flags = 0; - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); } qp->s_retry = qp->s_retry_cnt; @@ -971,6 +977,8 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, } else { if (++qp->s_last >= qp->s_size) qp->s_last = 0; + if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur) + qp->s_draining = 0; if (qp->s_last == qp->s_tail) break; wqe = get_swqe_ptr(qp, qp->s_last); @@ -994,7 +1002,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, */ if (ipath_cmp24(qp->s_psn, psn) <= 0) { reset_psn(qp, psn + 1); - tasklet_hi_schedule(&qp->s_task); + ipath_schedule_send(qp); } } else if (ipath_cmp24(qp->s_psn, psn) <= 0) { qp->s_state = OP(SEND_LAST); @@ -1012,7 +1020,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, if (qp->s_last == qp->s_tail) goto bail; if (qp->s_rnr_retry == 0) { - wc.status = IB_WC_RNR_RETRY_EXC_ERR; + status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } if (qp->s_rnr_retry_cnt < 7) @@ -1033,6 +1041,7 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK]; ipath_insert_rnr_queue(qp); + ipath_schedule_send(qp); goto bail; case 3: /* NAK */ @@ -1050,37 +1059,25 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, * RDMA READ response which terminates the RDMA * READ. */ - ipath_restart_rc(qp, psn, &wc); + ipath_restart_rc(qp, psn); break; case 1: /* Invalid Request */ - wc.status = IB_WC_REM_INV_REQ_ERR; + status = IB_WC_REM_INV_REQ_ERR; dev->n_other_naks++; goto class_b; case 2: /* Remote Access Error */ - wc.status = IB_WC_REM_ACCESS_ERR; + status = IB_WC_REM_ACCESS_ERR; dev->n_other_naks++; goto class_b; case 3: /* Remote Operation Error */ - wc.status = IB_WC_REM_OP_ERR; + status = IB_WC_REM_OP_ERR; dev->n_other_naks++; class_b: - wc.wr_id = wqe->wr.wr_id; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.qp = &qp->ibqp; - wc.imm_data = 0; - wc.src_qp = qp->remote_qpn; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = qp->remote_ah_attr.dlid; - wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; - ipath_sqerror_qp(qp, &wc); + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); break; default: @@ -1126,8 +1123,8 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, int header_in_data) { struct ipath_swqe *wqe; + enum ib_wc_status status; unsigned long flags; - struct ib_wc wc; int diff; u32 pad; u32 aeth; @@ -1135,6 +1132,10 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto ack_done; + /* Ignore invalid responses. */ if (ipath_cmp24(psn, qp->s_next_psn) >= 0) goto ack_done; @@ -1159,6 +1160,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, if (unlikely(qp->s_last == qp->s_tail)) goto ack_done; wqe = get_swqe_ptr(qp, qp->s_last); + status = IB_WC_SUCCESS; switch (opcode) { case OP(ACKNOWLEDGE): @@ -1187,6 +1189,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, wqe = get_swqe_ptr(qp, qp->s_last); if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) goto ack_op_err; + qp->r_flags &= ~IPATH_R_RDMAR_SEQ; /* * If this is a response to a resent RDMA read, we * have to be careful to copy the data to the right @@ -1200,7 +1203,10 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, /* no AETH, no ACK */ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { dev->n_rdma_seq++; - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); goto ack_done; } if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) @@ -1261,7 +1267,10 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, /* ACKs READ req. */ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { dev->n_rdma_seq++; - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); goto ack_done; } if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) @@ -1291,31 +1300,16 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, goto ack_done; } -ack_done: - spin_unlock_irqrestore(&qp->s_lock, flags); - goto bail; - ack_op_err: - wc.status = IB_WC_LOC_QP_OP_ERR; + status = IB_WC_LOC_QP_OP_ERR; goto ack_err; ack_len_err: - wc.status = IB_WC_LOC_LEN_ERR; + status = IB_WC_LOC_LEN_ERR; ack_err: - wc.wr_id = wqe->wr.wr_id; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.imm_data = 0; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = qp->remote_ah_attr.dlid; - wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; - ipath_sqerror_qp(qp, &wc); + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); +ack_done: spin_unlock_irqrestore(&qp->s_lock, flags); bail: return; @@ -1384,7 +1378,12 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, psn &= IPATH_PSN_MASK; e = NULL; old_req = 1; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock_done; + for (i = qp->r_head_ack_queue; ; i = prev) { if (i == qp->s_tail_ack_queue) old_req = 0; @@ -1512,7 +1511,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, break; } qp->r_nak_state = 0; - tasklet_hi_schedule(&qp->s_task); + ipath_schedule_send(qp); unlock_done: spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1523,13 +1522,12 @@ send_ack: return 0; } -static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) { unsigned long flags; int lastwqe; spin_lock_irqsave(&qp->s_lock, flags); - qp->state = IB_QPS_ERR; lastwqe = ipath_error_qp(qp, err); spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1545,18 +1543,15 @@ static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n) { - unsigned long flags; unsigned next; next = n + 1; if (next > IPATH_MAX_RDMA_ATOMIC) next = 0; - spin_lock_irqsave(&qp->s_lock, flags); if (n == qp->s_tail_ack_queue) { qp->s_tail_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); } - spin_unlock_irqrestore(&qp->s_lock, flags); } /** @@ -1585,6 +1580,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int diff; struct ib_reth *reth; int header_in_data; + unsigned long flags; /* Validate the SLID. See Ch. 9.6.1.5 */ if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) @@ -1643,11 +1639,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, opcode == OP(SEND_LAST) || opcode == OP(SEND_LAST_WITH_IMMEDIATE)) break; - nack_inv: - ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR); - qp->r_nak_state = IB_NAK_INVALID_REQUEST; - qp->r_ack_psn = qp->r_psn; - goto send_ack; + goto nack_inv; case OP(RDMA_WRITE_FIRST): case OP(RDMA_WRITE_MIDDLE): @@ -1673,18 +1665,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, break; } - wc.imm_data = 0; - wc.wc_flags = 0; + memset(&wc, 0, sizeof wc); /* OK, process the packet. */ switch (opcode) { case OP(SEND_FIRST): - if (!ipath_get_rwqe(qp, 0)) { - rnr_nak: - qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; - qp->r_ack_psn = qp->r_psn; - goto send_ack; - } + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; qp->r_rcv_len = 0; /* FALLTHROUGH */ case OP(SEND_MIDDLE): @@ -1741,20 +1728,19 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, goto nack_inv; ipath_copy_sge(&qp->r_sge, data, tlen); qp->r_msn++; - if (!qp->r_wrid_valid) + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) break; - qp->r_wrid_valid = 0; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; - wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & @@ -1815,9 +1801,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, next = qp->r_head_ack_queue + 1; if (next > IPATH_MAX_RDMA_ATOMIC) next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; if (unlikely(next == qp->s_tail_ack_queue)) { if (!qp->s_ack_queue[next].sent) - goto nack_inv; + goto nack_inv_unlck; ipath_update_ack_queue(qp, next); } e = &qp->s_ack_queue[qp->r_head_ack_queue]; @@ -1838,7 +1828,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, IB_ACCESS_REMOTE_READ); if (unlikely(!ok)) - goto nack_acc; + goto nack_acc_unlck; /* * Update the next expected PSN. We add 1 later * below, so only add the remainder here. @@ -1865,13 +1855,12 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, qp->r_psn++; qp->r_state = opcode; qp->r_nak_state = 0; - barrier(); qp->r_head_ack_queue = next; - /* Call ipath_do_rc_send() in another thread. */ - tasklet_hi_schedule(&qp->s_task); + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); - goto done; + goto unlock; } case OP(COMPARE_SWAP): @@ -1890,9 +1879,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, next = qp->r_head_ack_queue + 1; if (next > IPATH_MAX_RDMA_ATOMIC) next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; if (unlikely(next == qp->s_tail_ack_queue)) { if (!qp->s_ack_queue[next].sent) - goto nack_inv; + goto nack_inv_unlck; ipath_update_ack_queue(qp, next); } if (!header_in_data) @@ -1902,13 +1895,13 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | be32_to_cpu(ateth->vaddr[1]); if (unlikely(vaddr & (sizeof(u64) - 1))) - goto nack_inv; + goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); /* Check rkey & NAK */ if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), vaddr, rkey, IB_ACCESS_REMOTE_ATOMIC))) - goto nack_acc; + goto nack_acc_unlck; /* Perform atomic OP and save result. */ maddr = (atomic64_t *) qp->r_sge.sge.vaddr; sdata = be64_to_cpu(ateth->swap_data); @@ -1925,13 +1918,12 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, qp->r_psn++; qp->r_state = opcode; qp->r_nak_state = 0; - barrier(); qp->r_head_ack_queue = next; - /* Call ipath_do_rc_send() in another thread. */ - tasklet_hi_schedule(&qp->s_task); + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); - goto done; + goto unlock; } default: @@ -1947,14 +1939,31 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, goto send_ack; goto done; +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); nack_acc: - ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR); + ipath_rc_error(qp, IB_WC_LOC_PROT_ERR); qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; qp->r_ack_psn = qp->r_psn; - send_ack: send_rc_ack(qp); + goto done; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); done: return; } diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index 8ac5c1d82cc..a4b5521567f 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -78,6 +78,7 @@ const u32 ib_ipath_rnr_table[32] = { * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device * @qp: the QP * + * Called with the QP s_lock held and interrupts disabled. * XXX Use a simple list for now. We might need a priority * queue if we have lots of QPs waiting for RNR timeouts * but that should be rare. @@ -85,9 +86,9 @@ const u32 ib_ipath_rnr_table[32] = { void ipath_insert_rnr_queue(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - unsigned long flags; - spin_lock_irqsave(&dev->pending_lock, flags); + /* We already did a spin_lock_irqsave(), so just use spin_lock */ + spin_lock(&dev->pending_lock); if (list_empty(&dev->rnrwait)) list_add(&qp->timerwait, &dev->rnrwait); else { @@ -109,7 +110,7 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp) nqp->s_rnr_timeout -= qp->s_rnr_timeout; list_add(&qp->timerwait, l); } - spin_unlock_irqrestore(&dev->pending_lock, flags); + spin_unlock(&dev->pending_lock); } /** @@ -140,20 +141,11 @@ int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, goto bail; bad_lkey: + memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr_id; wc.status = IB_WC_LOC_PROT_ERR; wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.imm_data = 0; wc.qp = &qp->ibqp; - wc.src_qp = 0; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; /* Signal solicited completion event. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); ret = 0; @@ -194,6 +186,11 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) } spin_lock_irqsave(&rq->lock, flags); + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + wq = rq->wq; tail = wq->tail; /* Validate tail before using it since it is user writable. */ @@ -201,9 +198,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) tail = 0; do { if (unlikely(tail == wq->head)) { - spin_unlock_irqrestore(&rq->lock, flags); ret = 0; - goto bail; + goto unlock; } /* Make sure entry is read after head index is read. */ smp_rmb(); @@ -216,7 +212,7 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) wq->tail = tail; ret = 1; - qp->r_wrid_valid = 1; + set_bit(IPATH_R_WRID_VALID, &qp->r_aflags); if (handler) { u32 n; @@ -243,8 +239,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) goto bail; } } +unlock: spin_unlock_irqrestore(&rq->lock, flags); - bail: return ret; } @@ -270,38 +266,63 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp) struct ib_wc wc; u64 sdata; atomic64_t *maddr; + enum ib_wc_status send_status; + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); - if (!qp) { - dev->n_pkt_drops++; - return; - } -again: spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK) || - sqp->s_rnr_timeout) { - spin_unlock_irqrestore(&sqp->s_lock, flags); - goto done; - } + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + goto unlock; - /* Get the next send request. */ - if (sqp->s_last == sqp->s_head) { - /* Send work queue is empty. */ - spin_unlock_irqrestore(&sqp->s_lock, flags); - goto done; + sqp->s_flags |= IPATH_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; } /* * We can rely on the entry not changing without the s_lock * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. */ - wqe = get_swqe_ptr(sqp, sqp->s_last); + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } spin_unlock_irqrestore(&sqp->s_lock, flags); - wc.wc_flags = 0; - wc.imm_data = 0; + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof wc); + send_status = IB_WC_SUCCESS; sqp->s_sge.sge = wqe->sg_list[0]; sqp->s_sge.sg_list = wqe->sg_list + 1; @@ -313,75 +334,33 @@ again: wc.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: - if (!ipath_get_rwqe(qp, 0)) { - rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - if (sqp->s_rnr_retry == 0) { - wc.status = IB_WC_RNR_RETRY_EXC_ERR; - goto err; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - dev->n_rnr_naks++; - sqp->s_rnr_timeout = - ib_ipath_rnr_table[qp->r_min_rnr_timer]; - ipath_insert_rnr_queue(sqp); - goto done; - } + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; break; case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_WRITE))) { - wc.status = IB_WC_REM_INV_REQ_ERR; - goto err; - } + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; wc.imm_data = wqe->wr.ex.imm_data; if (!ipath_get_rwqe(qp, 1)) goto rnr_nak; /* FALLTHROUGH */ case IB_WR_RDMA_WRITE: - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_WRITE))) { - wc.status = IB_WC_REM_INV_REQ_ERR; - goto err; - } + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; if (wqe->length == 0) break; if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length, wqe->wr.wr.rdma.remote_addr, wqe->wr.wr.rdma.rkey, - IB_ACCESS_REMOTE_WRITE))) { - acc_err: - wc.status = IB_WC_REM_ACCESS_ERR; - err: - wc.wr_id = wqe->wr.wr_id; - wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = 0; - wc.qp = &sqp->ibqp; - wc.src_qp = sqp->remote_qpn; - wc.pkey_index = 0; - wc.slid = sqp->remote_ah_attr.dlid; - wc.sl = sqp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; - spin_lock_irqsave(&sqp->s_lock, flags); - ipath_sqerror_qp(sqp, &wc); - spin_unlock_irqrestore(&sqp->s_lock, flags); - goto done; - } + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; break; case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_READ))) { - wc.status = IB_WC_REM_INV_REQ_ERR; - goto err; - } + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length, wqe->wr.wr.rdma.remote_addr, wqe->wr.wr.rdma.rkey, @@ -394,11 +373,8 @@ again: case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_ATOMIC))) { - wc.status = IB_WC_REM_INV_REQ_ERR; - goto err; - } + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), wqe->wr.wr.atomic.remote_addr, wqe->wr.wr.atomic.rkey, @@ -415,7 +391,8 @@ again: goto send_comp; default: - goto done; + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; } sge = &sqp->s_sge.sge; @@ -448,8 +425,7 @@ again: sqp->s_len -= len; } - if (wqe->wr.opcode == IB_WR_RDMA_WRITE || - wqe->wr.opcode == IB_WR_RDMA_READ) + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) goto send_comp; if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) @@ -458,32 +434,89 @@ again: wc.opcode = IB_WC_RECV; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; - wc.vendor_err = 0; wc.byte_len = wqe->length; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); +flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - ipath_send_complete(sqp, wqe, IB_WC_SUCCESS); + ipath_send_complete(sqp, wqe, send_status); goto again; +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= IPATH_S_WAITING; + dev->n_rnr_naks++; + sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer]; + ipath_insert_rnr_queue(sqp); + goto clr_busy; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + ipath_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + ipath_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~IPATH_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); done: - if (atomic_dec_and_test(&qp->refcount)) + if (qp && atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } -static void want_buffer(struct ipath_devdata *dd) +static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp) { - if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) { + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) || + qp->ibqp.qp_type == IB_QPT_SMI) { unsigned long flags; spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); @@ -501,26 +534,36 @@ static void want_buffer(struct ipath_devdata *dd) * @dev: the device we ran out of buffers on * * Called when we run out of PIO buffers. + * If we are now in the error state, return zero to flush the + * send work request. */ -static void ipath_no_bufs_available(struct ipath_qp *qp, +static int ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev) { unsigned long flags; + int ret = 1; /* * Note that as soon as want_buffer() is called and * possibly before it returns, ipath_ib_piobufavail() - * could be called. If we are still in the tasklet function, - * tasklet_hi_schedule() will not call us until the next time - * tasklet_hi_schedule() is called. - * We leave the busy flag set so that another post send doesn't - * try to put the same QP on the piowait list again. + * could be called. Therefore, put QP on the piowait list before + * enabling the PIO avail interrupt. */ - spin_lock_irqsave(&dev->pending_lock, flags); - list_add_tail(&qp->piowait, &dev->piowait); - spin_unlock_irqrestore(&dev->pending_lock, flags); - want_buffer(dev->dd); - dev->n_piowait++; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_piowait++; + qp->s_flags |= IPATH_S_WAITING; + qp->s_flags &= ~IPATH_S_BUSY; + spin_lock(&dev->pending_lock); + if (list_empty(&qp->piowait)) + list_add_tail(&qp->piowait, &dev->piowait); + spin_unlock(&dev->pending_lock); + } else + ret = 0; + spin_unlock_irqrestore(&qp->s_lock, flags); + if (ret) + want_buffer(dev->dd, qp); + return ret; } /** @@ -596,15 +639,13 @@ void ipath_do_send(unsigned long data) struct ipath_qp *qp = (struct ipath_qp *)data; struct ipath_ibdev *dev = to_idev(qp->ibqp.device); int (*make_req)(struct ipath_qp *qp); - - if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy)) - goto bail; + unsigned long flags; if ((qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) && qp->remote_ah_attr.dlid == dev->dd->ipath_lid) { ipath_ruc_loopback(qp); - goto clear; + goto bail; } if (qp->ibqp.qp_type == IB_QPT_RC) @@ -614,6 +655,19 @@ void ipath_do_send(unsigned long data) else make_req = ipath_make_ud_req; + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + goto bail; + } + + qp->s_flags |= IPATH_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); + again: /* Check for a constructed packet to be sent. */ if (qp->s_hdrwords != 0) { @@ -623,8 +677,8 @@ again: */ if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords, qp->s_cur_sge, qp->s_cur_size)) { - ipath_no_bufs_available(qp, dev); - goto bail; + if (ipath_no_bufs_available(qp, dev)) + goto bail; } dev->n_unicast_xmit++; /* Record that we sent the packet and s_hdr is empty. */ @@ -633,16 +687,20 @@ again: if (make_req(qp)) goto again; -clear: - clear_bit(IPATH_S_BUSY, &qp->s_busy); + bail:; } +/* + * This should be called with s_lock held. + */ void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, enum ib_wc_status status) { - unsigned long flags; - u32 last; + u32 old_last, last; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + return; /* See ch. 11.2.4.1 and 10.7.3.1 */ if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || @@ -650,27 +708,25 @@ void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, status != IB_WC_SUCCESS) { struct ib_wc wc; + memset(&wc, 0, sizeof wc); wc.wr_id = wqe->wr.wr_id; wc.status = status; wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc.vendor_err = 0; - wc.byte_len = wqe->length; - wc.imm_data = 0; wc.qp = &qp->ibqp; - wc.src_qp = 0; - wc.wc_flags = 0; - wc.pkey_index = 0; - wc.slid = 0; - wc.sl = 0; - wc.dlid_path_bits = 0; - wc.port_num = 0; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); } - spin_lock_irqsave(&qp->s_lock, flags); - last = qp->s_last; + old_last = last = qp->s_last; if (++last >= qp->s_size) last = 0; qp->s_last = last; - spin_unlock_irqrestore(&qp->s_lock, flags); + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; } diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 1974df7a9f7..3697449c1ba 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -308,13 +308,15 @@ static void sdma_abort_task(unsigned long opaque) spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); /* - * Don't restart sdma here. Wait until link is up to ACTIVE. - * VL15 MADs used to bring the link up use PIO, and multiple - * link transitions otherwise cause the sdma engine to be + * Don't restart sdma here (with the exception + * below). Wait until link is up to ACTIVE. VL15 MADs + * used to bring the link up use PIO, and multiple link + * transitions otherwise cause the sdma engine to be * stopped and started multiple times. - * The disable is done here, including the shadow, so the - * state is kept consistent. - * See ipath_restart_sdma() for the actual starting of sdma. + * The disable is done here, including the shadow, + * so the state is kept consistent. + * See ipath_restart_sdma() for the actual starting + * of sdma. */ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; @@ -326,6 +328,13 @@ static void sdma_abort_task(unsigned long opaque) /* make sure I see next message */ dd->ipath_sdma_abort_jiffies = 0; + /* + * Not everything that takes SDMA offline is a link + * status change. If the link was up, restart SDMA. + */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + ipath_restart_sdma(dd); + goto done; } @@ -427,7 +436,12 @@ int setup_sdma(struct ipath_devdata *dd) goto done; } - dd->ipath_sdma_status = 0; + /* + * Set initial status as if we had been up, then gone down. + * This lets initial start on transition to ACTIVE be the + * same as restart after link flap. + */ + dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED; dd->ipath_sdma_abort_jiffies = 0; dd->ipath_sdma_generation = 0; dd->ipath_sdma_descq_tail = 0; @@ -449,16 +463,19 @@ int setup_sdma(struct ipath_devdata *dd) ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, dd->ipath_sdma_head_phys); - /* Reserve all the former "kernel" piobufs */ - n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - dd->ipath_pioreserved; - for (i = dd->ipath_lastport_piobuf; i < n; ++i) { + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved; + ipath_chg_pioavailkernel(dd, i, n - i , 0); + for (; i < n; ++i) { unsigned word = i / 64; unsigned bit = i & 63; BUG_ON(word >= 3); senddmabufmask[word] |= 1ULL << bit; } - ipath_chg_pioavailkernel(dd, dd->ipath_lastport_piobuf, - n - dd->ipath_lastport_piobuf, 0); ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, senddmabufmask[0]); ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, @@ -615,6 +632,9 @@ void ipath_restart_sdma(struct ipath_devdata *dd) ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + /* notify upper layers */ + ipath_ib_piobufavail(dd->verbs_dev); + bail: return; } diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c index bfe8926b551..7fd18e83390 100644 --- a/drivers/infiniband/hw/ipath/ipath_uc.c +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -47,14 +47,30 @@ int ipath_make_uc_req(struct ipath_qp *qp) { struct ipath_other_headers *ohdr; struct ipath_swqe *wqe; + unsigned long flags; u32 hwords; u32 bth0; u32 len; u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); int ret = 0; - if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; + } ohdr = &qp->s_hdr.u.oth; if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) @@ -69,9 +85,12 @@ int ipath_make_uc_req(struct ipath_qp *qp) qp->s_wqe = NULL; switch (qp->s_state) { default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; /* Check if send work queue is empty. */ if (qp->s_cur == qp->s_head) - goto done; + goto bail; /* * Start a new request. */ @@ -134,7 +153,7 @@ int ipath_make_uc_req(struct ipath_qp *qp) break; default: - goto done; + goto bail; } break; @@ -194,9 +213,14 @@ int ipath_make_uc_req(struct ipath_qp *qp) ipath_make_ruc_header(to_idev(qp->ibqp.device), qp, ohdr, bth0 | (qp->s_state << 24), qp->s_next_psn++ & IPATH_PSN_MASK); +done: ret = 1; + goto unlock; -done: +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); return ret; } @@ -258,8 +282,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, */ opcode = be32_to_cpu(ohdr->bth[0]) >> 24; - wc.imm_data = 0; - wc.wc_flags = 0; + memset(&wc, 0, sizeof wc); /* Compare the PSN verses the expected PSN. */ if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { @@ -322,8 +345,8 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): send_first: - if (qp->r_reuse_sge) { - qp->r_reuse_sge = 0; + if (qp->r_flags & IPATH_R_REUSE_SGE) { + qp->r_flags &= ~IPATH_R_REUSE_SGE; qp->r_sge = qp->s_rdma_read_sge; } else if (!ipath_get_rwqe(qp, 0)) { dev->n_pkt_drops++; @@ -340,13 +363,13 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_MIDDLE): /* Check for invalid length PMTU or posted rwqe len. */ if (unlikely(tlen != (hdrsize + pmtu + 4))) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } @@ -372,7 +395,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, /* Check for invalid length. */ /* XXX LAST len should be >= 1 */ if (unlikely(tlen < (hdrsize + pad + 4))) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } @@ -380,7 +403,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, tlen -= (hdrsize + pad + 4); wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto done; } @@ -390,14 +413,10 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; - wc.pkey_index = 0; wc.slid = qp->remote_ah_attr.dlid; wc.sl = qp->remote_ah_attr.sl; - wc.dlid_path_bits = 0; - wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, (ohdr->bth[0] & @@ -488,8 +507,8 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, dev->n_pkt_drops++; goto done; } - if (qp->r_reuse_sge) - qp->r_reuse_sge = 0; + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; else if (!ipath_get_rwqe(qp, 1)) { dev->n_pkt_drops++; goto done; diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 8b6a261c89e..77ca8ca74e7 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -65,9 +65,9 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) u32 length; qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn); - if (!qp) { + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { dev->n_pkt_drops++; - goto send_comp; + goto done; } rsge.sg_list = NULL; @@ -91,14 +91,12 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) * present on the wire. */ length = swqe->length; + memset(&wc, 0, sizeof wc); wc.byte_len = length + sizeof(struct ib_grh); if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { wc.wc_flags = IB_WC_WITH_IMM; wc.imm_data = swqe->wr.ex.imm_data; - } else { - wc.wc_flags = 0; - wc.imm_data = 0; } /* @@ -229,7 +227,6 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) } wc.status = IB_WC_SUCCESS; wc.opcode = IB_WC_RECV; - wc.vendor_err = 0; wc.qp = &qp->ibqp; wc.src_qp = sqp->ibqp.qp_num; /* XXX do we know which pkey matched? Only needed for GSI. */ @@ -248,8 +245,7 @@ drop: kfree(rsge.sg_list); if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); -send_comp: - ipath_send_complete(sqp, swqe, IB_WC_SUCCESS); +done:; } /** @@ -264,6 +260,7 @@ int ipath_make_ud_req(struct ipath_qp *qp) struct ipath_other_headers *ohdr; struct ib_ah_attr *ah_attr; struct ipath_swqe *wqe; + unsigned long flags; u32 nwords; u32 extra_bytes; u32 bth0; @@ -271,13 +268,30 @@ int ipath_make_ud_req(struct ipath_qp *qp) u16 lid; int ret = 0; - if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK))) - goto bail; + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } if (qp->s_cur == qp->s_head) goto bail; wqe = get_swqe_ptr(qp, qp->s_cur); + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; /* Construct the header. */ ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; @@ -288,10 +302,23 @@ int ipath_make_ud_req(struct ipath_qp *qp) dev->n_unicast_xmit++; } else { dev->n_unicast_xmit++; - lid = ah_attr->dlid & - ~((1 << dev->dd->ipath_lmc) - 1); + lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1); if (unlikely(lid == dev->dd->ipath_lid)) { + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + spin_unlock_irqrestore(&qp->s_lock, flags); ipath_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } } @@ -368,11 +395,13 @@ int ipath_make_ud_req(struct ipath_qp *qp) ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); done: - if (++qp->s_cur >= qp->s_size) - qp->s_cur = 0; ret = 1; + goto unlock; bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); return ret; } @@ -506,8 +535,8 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, /* * Get the next work request entry to find where to put the data. */ - if (qp->r_reuse_sge) - qp->r_reuse_sge = 0; + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; else if (!ipath_get_rwqe(qp, 0)) { /* * Count VL15 packets dropped due to no receive buffer. @@ -523,7 +552,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, } /* Silently drop packets which are too big. */ if (wc.byte_len > qp->r_len) { - qp->r_reuse_sge = 1; + qp->r_flags |= IPATH_R_REUSE_SGE; dev->n_pkt_drops++; goto bail; } @@ -535,7 +564,8 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); ipath_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh)); - qp->r_wrid_valid = 0; + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto bail; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = IB_WC_RECV; diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h index e70946c1428..fc76316c4a5 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.h +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.h @@ -45,8 +45,6 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd, int ipath_user_sdma_make_progress(struct ipath_devdata *dd, struct ipath_user_sdma_queue *pq); -int ipath_user_sdma_pkt_sent(const struct ipath_user_sdma_queue *pq, - u32 counter); void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, struct ipath_user_sdma_queue *pq); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index e63927cce5b..e0ec540042b 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -111,16 +111,24 @@ static unsigned int ib_ipath_disable_sma; module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(disable_sma, "Disable the SMA"); +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; ipath_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ const int ib_ipath_state_ops[IB_QPS_ERR + 1] = { [IB_QPS_RESET] = 0, [IB_QPS_INIT] = IPATH_POST_RECV_OK, [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | - IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK | + IPATH_PROCESS_NEXT_SEND_OK, [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | - IPATH_POST_SEND_OK, - [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, - [IB_QPS_ERR] = 0, + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, + [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, }; struct ipath_ucontext { @@ -230,18 +238,6 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) } } -static void ipath_flush_wqe(struct ipath_qp *qp, struct ib_send_wr *wr) -{ - struct ib_wc wc; - - memset(&wc, 0, sizeof(wc)); - wc.wr_id = wr->wr_id; - wc.status = IB_WC_WR_FLUSH_ERR; - wc.opcode = ib_ipath_wc_opcode[wr->opcode]; - wc.qp = &qp->ibqp; - ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1); -} - /* * Count the number of DMA descriptors needed to send length bytes of data. * Don't modify the ipath_sge_state to get the count. @@ -347,14 +343,8 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) spin_lock_irqsave(&qp->s_lock, flags); /* Check that state is OK to post send. */ - if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) { - if (qp->state != IB_QPS_SQE && qp->state != IB_QPS_ERR) - goto bail_inval; - /* C10-96 says generate a flushed completion entry. */ - ipath_flush_wqe(qp, wr); - ret = 0; - goto bail; - } + if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) + goto bail_inval; /* IB spec says that num_sge == 0 is OK. */ if (wr->num_sge > qp->s_max_sge) @@ -396,7 +386,6 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) wqe = get_swqe_ptr(qp, qp->s_head); wqe->wr = *wr; - wqe->ssn = qp->s_ssn++; wqe->length = 0; if (wr->num_sge) { acc = wr->opcode >= IB_WR_RDMA_READ ? @@ -422,6 +411,7 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) goto bail_inval; } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) goto bail_inval; + wqe->ssn = qp->s_ssn++; qp->s_head = next; ret = 0; @@ -677,6 +667,7 @@ bail:; static void ipath_ib_timer(struct ipath_ibdev *dev) { struct ipath_qp *resend = NULL; + struct ipath_qp *rnr = NULL; struct list_head *last; struct ipath_qp *qp; unsigned long flags; @@ -703,7 +694,9 @@ static void ipath_ib_timer(struct ipath_ibdev *dev) if (--qp->s_rnr_timeout == 0) { do { list_del_init(&qp->timerwait); - tasklet_hi_schedule(&qp->s_task); + qp->timer_next = rnr; + rnr = qp; + atomic_inc(&qp->refcount); if (list_empty(last)) break; qp = list_entry(last->next, struct ipath_qp, @@ -743,13 +736,15 @@ static void ipath_ib_timer(struct ipath_ibdev *dev) spin_unlock_irqrestore(&dev->pending_lock, flags); /* XXX What if timer fires again while this is running? */ - for (qp = resend; qp != NULL; qp = qp->timer_next) { - struct ib_wc wc; + while (resend != NULL) { + qp = resend; + resend = qp->timer_next; spin_lock_irqsave(&qp->s_lock, flags); - if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) { + if (qp->s_last != qp->s_tail && + ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { dev->n_timeouts++; - ipath_restart_rc(qp, qp->s_last_psn + 1, &wc); + ipath_restart_rc(qp, qp->s_last_psn + 1); } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -757,6 +752,19 @@ static void ipath_ib_timer(struct ipath_ibdev *dev) if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); } + while (rnr != NULL) { + qp = rnr; + rnr = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } } static void update_sge(struct ipath_sge_state *ss, u32 length) @@ -1012,13 +1020,24 @@ static void sdma_complete(void *cookie, int status) struct ipath_verbs_txreq *tx = cookie; struct ipath_qp *qp = tx->qp; struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned int flags; + enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR; - /* Generate a completion queue entry if needed */ - if (qp->ibqp.qp_type != IB_QPT_RC && tx->wqe) { - enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ? - IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR; - + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if (tx->wqe) + ipath_send_complete(qp, tx->wqe, ibs); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } else if (tx->wqe) { + spin_lock_irqsave(&qp->s_lock, flags); ipath_send_complete(qp, tx->wqe, ibs); + spin_unlock_irqrestore(&qp->s_lock, flags); } if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) @@ -1029,6 +1048,21 @@ static void sdma_complete(void *cookie, int status) wake_up(&qp->wait); } +static void decrement_dma_busy(struct ipath_qp *qp) +{ + unsigned int flags; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } +} + /* * Compute the number of clock cycles of delay before sending the next packet. * The multipliers reflect the number of clocks for the fastest rate so @@ -1067,9 +1101,12 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp, if (tx) { qp->s_tx = NULL; /* resend previously constructed packet */ + atomic_inc(&qp->s_dma_busy); ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx); - if (ret) + if (ret) { qp->s_tx = tx; + decrement_dma_busy(qp); + } goto bail; } @@ -1120,12 +1157,14 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp, tx->txreq.sg_count = ndesc; tx->map_len = (hdrwords + 2) << 2; tx->txreq.map_addr = &tx->hdr; + atomic_inc(&qp->s_dma_busy); ret = ipath_sdma_verbs_send(dd, ss, dwords, tx); if (ret) { /* save ss and length in dwords */ tx->ss = ss; tx->len = dwords; qp->s_tx = tx; + decrement_dma_busy(qp); } goto bail; } @@ -1146,6 +1185,7 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp, memcpy(piobuf, hdr, hdrwords << 2); ipath_copy_from_sge(piobuf + hdrwords, ss, len); + atomic_inc(&qp->s_dma_busy); ret = ipath_sdma_verbs_send(dd, NULL, 0, tx); /* * If we couldn't queue the DMA request, save the info @@ -1156,6 +1196,7 @@ static int ipath_verbs_send_dma(struct ipath_qp *qp, tx->ss = NULL; tx->len = 0; qp->s_tx = tx; + decrement_dma_busy(qp); } dev->n_unaligned++; goto bail; @@ -1179,6 +1220,7 @@ static int ipath_verbs_send_pio(struct ipath_qp *qp, unsigned flush_wc; u32 control; int ret; + unsigned int flags; piobuf = ipath_getpiobuf(dd, plen, NULL); if (unlikely(piobuf == NULL)) { @@ -1249,8 +1291,11 @@ static int ipath_verbs_send_pio(struct ipath_qp *qp, } copy_io(piobuf, ss, len, flush_wc); done: - if (qp->s_wqe) + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } ret = 0; bail: return ret; @@ -1283,19 +1328,12 @@ int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, * can defer SDMA restart until link goes ACTIVE without * worrying about just how we got there. */ - if (qp->ibqp.qp_type == IB_QPT_SMI) + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->ipath_flags & IPATH_HAS_SEND_DMA)) ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len, plen, dwords); - /* All non-VL15 packets are dropped if link is not ACTIVE */ - else if (!(dd->ipath_flags & IPATH_LINKACTIVE)) { - if (qp->s_wqe) - ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); - ret = 0; - } else if (dd->ipath_flags & IPATH_HAS_SEND_DMA) - ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len, - plen, dwords); else - ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len, + ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len, plen, dwords); return ret; @@ -1403,27 +1441,46 @@ bail: * This is called from ipath_intr() at interrupt level when a PIO buffer is * available after ipath_verbs_send() returned an error that no buffers were * available. Return 1 if we consumed all the PIO buffers and we still have - * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and + * QPs waiting for buffers (for now, just restart the send tasklet and * return zero). */ int ipath_ib_piobufavail(struct ipath_ibdev *dev) { + struct list_head *list; + struct ipath_qp *qplist; struct ipath_qp *qp; unsigned long flags; if (dev == NULL) goto bail; + list = &dev->piowait; + qplist = NULL; + spin_lock_irqsave(&dev->pending_lock, flags); - while (!list_empty(&dev->piowait)) { - qp = list_entry(dev->piowait.next, struct ipath_qp, - piowait); + while (!list_empty(list)) { + qp = list_entry(list->next, struct ipath_qp, piowait); list_del_init(&qp->piowait); - clear_bit(IPATH_S_BUSY, &qp->s_busy); - tasklet_hi_schedule(&qp->s_task); + qp->pio_next = qplist; + qplist = qp; + atomic_inc(&qp->refcount); } spin_unlock_irqrestore(&dev->pending_lock, flags); + while (qplist != NULL) { + qp = qplist; + qplist = qp->pio_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + bail: return 0; } @@ -2145,11 +2202,12 @@ bail: void ipath_unregister_ib_device(struct ipath_ibdev *dev) { struct ib_device *ibdev = &dev->ibdev; - - disable_timer(dev->dd); + u32 qps_inuse; ib_unregister_device(ibdev); + disable_timer(dev->dd); + if (!list_empty(&dev->pending[0]) || !list_empty(&dev->pending[1]) || !list_empty(&dev->pending[2])) @@ -2164,7 +2222,10 @@ void ipath_unregister_ib_device(struct ipath_ibdev *dev) * Note that ipath_unregister_ib_device() can be called before all * the QPs are destroyed! */ - ipath_free_all_qps(&dev->qp_table); + qps_inuse = ipath_free_all_qps(&dev->qp_table); + if (qps_inuse) + ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n", + qps_inuse); kfree(dev->qp_table.table); kfree(dev->lk_table.table); kfree(dev->txreq_bufs); @@ -2215,17 +2276,14 @@ static ssize_t show_stats(struct device *device, struct device_attribute *attr, "RC OTH NAKs %d\n" "RC timeouts %d\n" "RC RDMA dup %d\n" - "RC stalls %d\n" "piobuf wait %d\n" - "no piobuf %d\n" "unaligned %d\n" "PKT drops %d\n" "WQE errs %d\n", dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks, dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks, dev->n_other_naks, dev->n_timeouts, - dev->n_rdma_dup_busy, dev->n_rc_stalls, dev->n_piowait, - dev->n_no_piobuf, dev->n_unaligned, + dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned, dev->n_pkt_drops, dev->n_wqe_errs); for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) { const struct ipath_opcode_stats *si = &dev->opstats[i]; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 6514aa8306c..9d12ae8a778 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -74,6 +74,11 @@ #define IPATH_POST_RECV_OK 0x02 #define IPATH_PROCESS_RECV_OK 0x04 #define IPATH_PROCESS_SEND_OK 0x08 +#define IPATH_PROCESS_NEXT_SEND_OK 0x10 +#define IPATH_FLUSH_SEND 0x20 +#define IPATH_FLUSH_RECV 0x40 +#define IPATH_PROCESS_OR_FLUSH_SEND \ + (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND) /* IB Performance Manager status values */ #define IB_PMA_SAMPLE_STATUS_DONE 0x00 @@ -353,12 +358,14 @@ struct ipath_qp { struct ib_qp ibqp; struct ipath_qp *next; /* link list for QPN hash table */ struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */ + struct ipath_qp *pio_next; /* link for ipath_ib_piobufavail() */ struct list_head piowait; /* link for wait PIO buf */ struct list_head timerwait; /* link for waiting for timeouts */ struct ib_ah_attr remote_ah_attr; struct ipath_ib_header s_hdr; /* next packet header to send */ atomic_t refcount; wait_queue_head_t wait; + wait_queue_head_t wait_dma; struct tasklet_struct s_task; struct ipath_mmap_info *ip; struct ipath_sge_state *s_cur_sge; @@ -369,7 +376,7 @@ struct ipath_qp { struct ipath_sge_state s_rdma_read_sge; struct ipath_sge_state r_sge; /* current receive data */ spinlock_t s_lock; - unsigned long s_busy; + atomic_t s_dma_busy; u16 s_pkt_delay; u16 s_hdrwords; /* size of s_hdr in 32 bit words */ u32 s_cur_size; /* size of send packet in bytes */ @@ -383,6 +390,7 @@ struct ipath_qp { u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ u64 r_wr_id; /* ID for current receive WQE */ + unsigned long r_aflags; u32 r_len; /* total length of r_sge */ u32 r_rcv_len; /* receive data len processed */ u32 r_psn; /* expected rcv packet sequence number */ @@ -394,8 +402,7 @@ struct ipath_qp { u8 r_state; /* opcode of last packet received */ u8 r_nak_state; /* non-zero if NAK is pending */ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ - u8 r_reuse_sge; /* for UC receive errors */ - u8 r_wrid_valid; /* r_wrid set but CQ entry not yet made */ + u8 r_flags; u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ u8 r_head_ack_queue; /* index into s_ack_queue[] */ u8 qp_access_flags; @@ -404,13 +411,13 @@ struct ipath_qp { u8 s_rnr_retry_cnt; u8 s_retry; /* requester retry counter */ u8 s_rnr_retry; /* requester RNR retry counter */ - u8 s_wait_credit; /* limit number of unacked packets sent */ u8 s_pkey_index; /* PKEY index to use */ u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ u8 s_tail_ack_queue; /* index into s_ack_queue[] */ u8 s_flags; u8 s_dmult; + u8 s_draining; u8 timeout; /* Timeout for this QP */ enum ib_mtu path_mtu; u32 remote_qpn; @@ -428,16 +435,40 @@ struct ipath_qp { struct ipath_sge r_sg_list[0]; /* verified SGEs */ }; -/* Bit definition for s_busy. */ -#define IPATH_S_BUSY 0 +/* + * Atomic bit definitions for r_aflags. + */ +#define IPATH_R_WRID_VALID 0 + +/* + * Bit definitions for r_flags. + */ +#define IPATH_R_REUSE_SGE 0x01 +#define IPATH_R_RDMAR_SEQ 0x02 /* * Bit definitions for s_flags. + * + * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_WAITING - waiting for RNR timeout or send buffer available. + * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA. */ #define IPATH_S_SIGNAL_REQ_WR 0x01 #define IPATH_S_FENCE_PENDING 0x02 #define IPATH_S_RDMAR_PENDING 0x04 #define IPATH_S_ACK_PENDING 0x08 +#define IPATH_S_BUSY 0x10 +#define IPATH_S_WAITING 0x20 +#define IPATH_S_WAIT_SSN_CREDIT 0x40 +#define IPATH_S_WAIT_DMA 0x80 + +#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \ + IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA) #define IPATH_PSN_CREDIT 512 @@ -573,13 +604,11 @@ struct ipath_ibdev { u32 n_rnr_naks; u32 n_other_naks; u32 n_timeouts; - u32 n_rc_stalls; u32 n_pkt_drops; u32 n_vl15_dropped; u32 n_wqe_errs; u32 n_rdma_dup_busy; u32 n_piowait; - u32 n_no_piobuf; u32 n_unaligned; u32 port_cap_flags; u32 pma_sample_start; @@ -657,6 +686,17 @@ static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev) return container_of(ibdev, struct ipath_ibdev, ibdev); } +/* + * This must be called with s_lock held. + */ +static inline void ipath_schedule_send(struct ipath_qp *qp) +{ + if (qp->s_flags & IPATH_S_ANY_WAIT) + qp->s_flags &= ~IPATH_S_ANY_WAIT; + if (!(qp->s_flags & IPATH_S_BUSY)) + tasklet_hi_schedule(&qp->s_task); +} + int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, @@ -706,12 +746,10 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr); -void ipath_free_all_qps(struct ipath_qp_table *qpt); +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt); int ipath_init_qp_table(struct ipath_ibdev *idev, int size); -void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc); - void ipath_get_credit(struct ipath_qp *qp, u32 aeth); unsigned ipath_ib_rate_to_mult(enum ib_rate rate); @@ -729,7 +767,9 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int has_grh, void *data, u32 tlen, struct ipath_qp *qp); -void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc); +void ipath_restart_rc(struct ipath_qp *qp, u32 psn); + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err); int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr); diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 5e570bb0bb6..4521319b140 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -137,7 +137,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont int err; *umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe), - IB_ACCESS_LOCAL_WRITE); + IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); @@ -221,7 +221,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector } err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, - cq->db.dma, &cq->mcq); + cq->db.dma, &cq->mcq, 0); if (err) goto err_dbmap; @@ -246,7 +246,7 @@ err_mtt: if (context) ib_umem_release(cq->umem); else - mlx4_ib_free_cq_buf(dev, &cq->buf, entries); + mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); err_db: if (!context) @@ -434,7 +434,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq) mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); ib_umem_release(mcq->umem); } else { - mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe + 1); + mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); mlx4_db_free(dev->dev, &mcq->db); } diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index 8e342cc9bae..8aee4233b38 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -63,7 +63,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, - PAGE_SIZE, 0); + PAGE_SIZE, 0, 0); if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index fe2c2e94a5f..68e92485fc7 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -132,7 +132,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(pd->uobject->context, start, length, access_flags); + mr->umem = ib_umem_get(pd->uobject->context, start, length, + access_flags, 0); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 80ea8b9e776..8e02ecfec18 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -482,7 +482,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0); + qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 204619702f9..12d6bc6f800 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -109,7 +109,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, } srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - buf_size, 0); + buf_size, 0, 0); if (IS_ERR(srq->umem)) { err = PTR_ERR(srq->umem); goto err_srq; diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 3538da16e3f..820205dec56 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -818,15 +818,9 @@ int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) { - u32 key; - if (!fmr->maps) return; - key = tavor_key_to_hw_index(fmr->ibmr.lkey); - key &= dev->limits.num_mpts - 1; - fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key); - fmr->maps = 0; writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); @@ -834,16 +828,9 @@ void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) { - u32 key; - if (!fmr->maps) return; - key = arbel_key_to_hw_index(fmr->ibmr.lkey); - key &= dev->limits.num_mpts - 1; - key = adjust_key(dev, key); - fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key); - fmr->maps = 0; *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 696e1f30233..be34f99ca62 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -39,6 +39,8 @@ #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> + +#include <linux/sched.h> #include <linux/mm.h> #include "mthca_dev.h" @@ -367,6 +369,8 @@ static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, return ERR_PTR(-EFAULT); } + context->reg_mr_warned = 0; + return &context->ibucontext; } @@ -1006,17 +1010,31 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct mthca_dev *dev = to_mdev(pd->device); struct ib_umem_chunk *chunk; struct mthca_mr *mr; + struct mthca_reg_mr ucmd; u64 *pages; int shift, n, len; int i, j, k; int err = 0; int write_mtt_size; + if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) { + if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { + mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", + current->comm); + mthca_warn(dev, " Update libmthca to fix this.\n"); + } + ++to_mucontext(pd->uobject->context)->reg_mr_warned; + ucmd.mr_attrs = 0; + } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) + return ERR_PTR(-EFAULT); + mr = kmalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(pd->uobject->context, start, length, acc); + mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, + ucmd.mr_attrs & MTHCA_MR_DMASYNC); + if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 262616c8ebb..934bf954403 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -67,6 +67,7 @@ struct mthca_ucontext { struct ib_ucontext ibucontext; struct mthca_uar uar; struct mthca_user_db_table *db_tab; + int reg_mr_warned; }; struct mthca_mtt; diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h index 02cc0a766f3..e1262c942db 100644 --- a/drivers/infiniband/hw/mthca/mthca_user.h +++ b/drivers/infiniband/hw/mthca/mthca_user.h @@ -61,6 +61,16 @@ struct mthca_alloc_pd_resp { __u32 reserved; }; +struct mthca_reg_mr { +/* + * Mark the memory region with a DMA attribute that causes + * in-flight DMA to be flushed when the region is written to: + */ +#define MTHCA_MR_DMASYNC 0x1 + __u32 mr_attrs; + __u32 reserved; +}; + struct mthca_create_cq { __u32 lkey; __u32 pdn; diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig index 2aeb7ac972a..d449eb6ec78 100644 --- a/drivers/infiniband/hw/nes/Kconfig +++ b/drivers/infiniband/hw/nes/Kconfig @@ -2,6 +2,7 @@ config INFINIBAND_NES tristate "NetEffect RNIC Driver" depends on PCI && INET && INFINIBAND select LIBCRC32C + select INET_LRO ---help--- This is a low-level driver for NetEffect RDMA enabled Network Interface Cards (RNIC). diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index cdf2e9ad62f..61b46e9c7d2 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -535,8 +535,8 @@ int nes_register_ofa_device(struct nes_ib_device *); int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *); void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16); void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *); -void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16); -void nes_read_10G_phy_reg(struct nes_device *, u16, u8); +void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16); +void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16); struct nes_cqp_request *nes_get_cqp_request(struct nes_device *); void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *, int); int nes_arp_table(struct nes_device *, u32, u8 *, u32); diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index d940fc27129..9a4b40fae40 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -594,7 +594,7 @@ static void nes_cm_timer_tick(unsigned long pass) continue; } /* this seems like the correct place, but leave send entry unprotected */ - // spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); + /* spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags); */ atomic_inc(&send_entry->skb->users); cm_packets_retrans++; nes_debug(NES_DBG_CM, "Retransmitting send_entry %p for node %p," @@ -1335,7 +1335,7 @@ static int process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb, cm_node->loc_addr, cm_node->loc_port, cm_node->rem_addr, cm_node->rem_port, cm_node->state, atomic_read(&cm_node->ref_count)); - // create event + /* create event */ cm_node->state = NES_CM_STATE_CLOSED; create_event(cm_node, NES_CM_EVENT_ABORTED); @@ -1669,7 +1669,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, if (!cm_node) return NULL; - // set our node side to client (active) side + /* set our node side to client (active) side */ cm_node->tcp_cntxt.client = 1; cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; @@ -1694,7 +1694,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core, loopbackremotenode->mpa_frame_size = mpa_frame_size - sizeof(struct ietf_mpa_frame); - // we are done handling this state, set node to a TSA state + /* we are done handling this state, set node to a TSA state */ cm_node->state = NES_CM_STATE_TSA; cm_node->tcp_cntxt.rcv_nxt = loopbackremotenode->tcp_cntxt.loc_seq_num; loopbackremotenode->tcp_cntxt.rcv_nxt = cm_node->tcp_cntxt.loc_seq_num; diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 08964cc7e98..d3278f111ca 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -38,9 +38,14 @@ #include <linux/ip.h> #include <linux/tcp.h> #include <linux/if_vlan.h> +#include <linux/inet_lro.h> #include "nes.h" +static unsigned int nes_lro_max_aggr = NES_LRO_MAX_AGGR; +module_param(nes_lro_max_aggr, uint, 0444); +MODULE_PARM_DESC(nes_lro_max_aggr, "NIC LRO max packet aggregation"); + static u32 crit_err_count; u32 int_mod_timer_init; u32 int_mod_cq_depth_256; @@ -832,7 +837,7 @@ static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_cou nes_write_indexed(nesdev, 0x00000900, 0x20000001); nes_write_indexed(nesdev, 0x000060C0, 0x0000028e); nes_write_indexed(nesdev, 0x000060C8, 0x00000020); - // + nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0); /* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */ @@ -1207,11 +1212,16 @@ int nes_init_phy(struct nes_device *nesdev) { struct nes_adapter *nesadapter = nesdev->nesadapter; u32 counter = 0; + u32 sds_common_control0; u32 mac_index = nesdev->mac_index; - u32 tx_config; + u32 tx_config = 0; u16 phy_data; + u32 temp_phy_data = 0; + u32 temp_phy_data2 = 0; + u32 i = 0; - if (nesadapter->OneG_Mode) { + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { nes_debug(NES_DBG_PHY, "1G PHY, mac_index = %d.\n", mac_index); if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_1G) { printk(PFX "%s: Programming mdc config for 1G\n", __func__); @@ -1223,7 +1233,7 @@ int nes_init_phy(struct nes_device *nesdev) nes_read_1G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index], &phy_data); nes_debug(NES_DBG_PHY, "Phy data from register 1 phy address %u = 0x%X.\n", nesadapter->phy_index[mac_index], phy_data); - nes_write_1G_phy_reg(nesdev, 23, nesadapter->phy_index[mac_index], 0xb000); + nes_write_1G_phy_reg(nesdev, 23, nesadapter->phy_index[mac_index], 0xb000); /* Reset the PHY */ nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], 0x8000); @@ -1277,12 +1287,126 @@ int nes_init_phy(struct nes_device *nesdev) nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], &phy_data); nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[mac_index], phy_data | 0x0300); } else { - if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) { + if ((nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) || + (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_ARGUS)) { /* setup 10G MDIO operation */ tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG); tx_config |= 0x14; nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config); } + if ((nesadapter->phy_type[mac_index] == NES_PHY_TYPE_ARGUS)) { + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); + + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + mdelay(10); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + + /* + * if firmware is already running (like from a + * driver un-load/load, don't do anything. + */ + if (temp_phy_data == temp_phy_data2) { + /* configure QT2505 AMCC PHY */ + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0x0000, 0x8000); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc300, 0x0000); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc302, 0x0044); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc318, 0x0052); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc319, 0x0008); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc31a, 0x0098); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0x0026, 0x0E00); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0x0027, 0x0000); + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0x0028, 0xA528); + + /* + * remove micro from reset; chip boots from ROM, + * uploads EEPROM f/w image, uC executes f/w + */ + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc300, 0x0002); + + /* + * wait for heart beat to start to + * know loading is done + */ + counter = 0; + do { + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if (counter++ > 1000) { + nes_debug(NES_DBG_PHY, "AMCC PHY- breaking from heartbeat check <this is bad!!!> \n"); + break; + } + mdelay(100); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7ee); + temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + } while ((temp_phy_data2 == temp_phy_data)); + + /* + * wait for tracking to start to know + * f/w is good to go + */ + counter = 0; + do { + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x3, 0xd7fd); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if (counter++ > 1000) { + nes_debug(NES_DBG_PHY, "AMCC PHY- breaking from status check <this is bad!!!> \n"); + break; + } + mdelay(1000); + /* + * nes_debug(NES_DBG_PHY, "AMCC PHY- phy_status not ready yet = 0x%02X\n", + * temp_phy_data); + */ + } while (((temp_phy_data & 0xff) != 0x50) && ((temp_phy_data & 0xff) != 0x70)); + + /* set LOS Control invert RXLOSB_I_PADINV */ + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd003, 0x0000); + /* set LOS Control to mask of RXLOSB_I */ + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xc314, 0x0042); + /* set LED1 to input mode (LED1 and LED2 share same LED) */ + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd006, 0x0007); + /* set LED2 to RX link_status and activity */ + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd007, 0x000A); + /* set LED3 to RX link_status */ + nes_write_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 0x1, 0xd008, 0x0009); + + /* + * reset the res-calibration on t2 + * serdes; ensures it is stable after + * the amcc phy is stable + */ + + sds_common_control0 = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0); + sds_common_control0 |= 0x1; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds_common_control0); + + /* release the res-calibration reset */ + sds_common_control0 &= 0xfffffffe; + nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds_common_control0); + + i = 0; + while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040) + && (i++ < 5000)) { + /* mdelay(1); */ + } + + /* + * wait for link train done before moving on, + * or will get an interupt storm + */ + counter = 0; + do { + temp_phy_data = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1))); + if (counter++ > 1000) { + nes_debug(NES_DBG_PHY, "AMCC PHY- breaking from link train wait <this is bad, link didnt train!!!>\n"); + break; + } + mdelay(1); + } while (((temp_phy_data & 0x0f1f0000) != 0x0f0f0000)); + } + } } return 0; } @@ -1375,6 +1499,25 @@ static void nes_rq_wqes_timeout(unsigned long parm) } +static int nes_lro_get_skb_hdr(struct sk_buff *skb, void **iphdr, + void **tcph, u64 *hdr_flags, void *priv) +{ + unsigned int ip_len; + struct iphdr *iph; + skb_reset_network_header(skb); + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_TCP) + return -1; + ip_len = ip_hdrlen(skb); + skb_set_transport_header(skb, ip_len); + *tcph = tcp_hdr(skb); + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + return 0; +} + + /** * nes_init_nic_qp */ @@ -1520,10 +1663,10 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) } u64temp = (u64)nesvnic->nic.sq_pbase; - nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp); nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); u64temp = (u64)nesvnic->nic.rq_pbase; - nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); + nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp); nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32)); cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP | @@ -1575,7 +1718,7 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) nic_rqe = &nesvnic->nic.rq_vbase[counter]; nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size); nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0; - nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); + nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem); nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32)); nesvnic->nic.rx_skb[counter] = skb; } @@ -1592,15 +1735,21 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev) nesvnic->rq_wqes_timer.function = nes_rq_wqes_timeout; nesvnic->rq_wqes_timer.data = (unsigned long)nesvnic; nes_debug(NES_DBG_INIT, "NAPI support Enabled\n"); - if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) { nes_nic_init_timer(nesdev); if (netdev->mtu > 1500) jumbomode = 1; - nes_nic_init_timer_defaults(nesdev, jumbomode); - } - + nes_nic_init_timer_defaults(nesdev, jumbomode); + } + nesvnic->lro_mgr.max_aggr = nes_lro_max_aggr; + nesvnic->lro_mgr.max_desc = NES_MAX_LRO_DESCRIPTORS; + nesvnic->lro_mgr.lro_arr = nesvnic->lro_desc; + nesvnic->lro_mgr.get_skb_header = nes_lro_get_skb_hdr; + nesvnic->lro_mgr.features = LRO_F_NAPI | LRO_F_EXTRACT_VLAN_ID; + nesvnic->lro_mgr.dev = netdev; + nesvnic->lro_mgr.ip_summed = CHECKSUM_UNNECESSARY; + nesvnic->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; return 0; } @@ -1620,8 +1769,8 @@ void nes_destroy_nic_qp(struct nes_vnic *nesvnic) /* Free remaining NIC receive buffers */ while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) { - nic_rqe = &nesvnic->nic.rq_vbase[nesvnic->nic.rq_tail]; - wqe_frag = (u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); + nic_rqe = &nesvnic->nic.rq_vbase[nesvnic->nic.rq_tail]; + wqe_frag = (u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]); wqe_frag |= ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32; pci_unmap_single(nesdev->pcidev, (dma_addr_t)wqe_frag, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE); @@ -1704,17 +1853,17 @@ int nes_napi_isr(struct nes_device *nesdev) /* iff NIC, process here, else wait for DPC */ if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) { nesdev->napi_isr_ran = 0; - nes_write32(nesdev->regs+NES_INT_STAT, - (int_stat & - ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0|NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3))); + nes_write32(nesdev->regs + NES_INT_STAT, + (int_stat & + ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); /* Process the CEQs */ nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]); if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) && - (!nesadapter->et_use_adaptive_rx_coalesce)) || - ((nesadapter->et_use_adaptive_rx_coalesce) && - (nesdev->deepcq_count > nesadapter->et_pkt_rate_low)))) ) { + (!nesadapter->et_use_adaptive_rx_coalesce)) || + ((nesadapter->et_use_adaptive_rx_coalesce) && + (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) { if ((nesdev->int_req & NES_INT_TIMER) == 0) { /* Enable Periodic timer interrupts */ nesdev->int_req |= NES_INT_TIMER; @@ -1792,12 +1941,12 @@ void nes_dpc(unsigned long param) } if (int_stat) { - if (int_stat & ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0| - NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3)) { + if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) { /* Ack the interrupts */ nes_write32(nesdev->regs+NES_INT_STAT, - (int_stat & ~(NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0| - NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3))); + (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0| + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3))); } temp_int_stat = int_stat; @@ -1862,8 +2011,8 @@ void nes_dpc(unsigned long param) } } /* Don't use the interface interrupt bit stay in loop */ - int_stat &= ~NES_INT_INTF|NES_INT_TIMER|NES_INT_MAC0| - NES_INT_MAC1|NES_INT_MAC2|NES_INT_MAC3; + int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | + NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3; } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS)); if (timer_ints == 1) { @@ -1874,9 +2023,9 @@ void nes_dpc(unsigned long param) nesdev->timer_only_int_count = 0; nesdev->int_req &= ~NES_INT_TIMER; nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req)); - nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req); + nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req); } else { - nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff|(~nesdev->int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); } } else { if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) @@ -1884,7 +2033,7 @@ void nes_dpc(unsigned long param) nes_nic_init_timer(nesdev); } nesdev->timer_only_int_count = 0; - nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff|(~nesdev->int_req)); + nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req)); } } else { nesdev->timer_only_int_count = 0; @@ -1933,7 +2082,7 @@ static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq) do { if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) & NES_CEQE_VALID) { - u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX])))<<32) | + u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) | ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX]))); u64temp <<= 1; cq = *((struct nes_hw_cq **)&u64temp); @@ -1961,7 +2110,7 @@ static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq) */ static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq) { -// u64 u64temp; + /* u64 u64temp; */ u32 head; u32 aeq_size; u32 aeqe_misc; @@ -1980,8 +2129,10 @@ static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq) if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) { if (aeqe_cq_id >= NES_FIRST_QPN) { /* dealing with an accelerated QP related AE */ -// u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])))<<32) | -// ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]))); + /* + * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) | + * ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]))); + */ nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe); } else { /* TODO: dealing with a CQP related AE */ @@ -2081,6 +2232,8 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) u32 u32temp; u16 phy_data; u16 temp_phy_data; + u32 pcs_val = 0x0f0f0000; + u32 pcs_mask = 0x0f1f0000; spin_lock_irqsave(&nesadapter->phy_lock, flags); if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) { @@ -2144,13 +2297,30 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n", nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0), nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200)); - pcs_control_status = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index&1)*0x200)); - pcs_control_status = nes_read_indexed(nesdev, - NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index&1)*0x200)); + + if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) { + switch (mac_index) { + case 1: + case 3: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200); + break; + default: + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0); + break; + } + } else { + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + pcs_control_status = nes_read_indexed(nesdev, + NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200)); + } + nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n", mac_index, pcs_control_status); - if (nesadapter->OneG_Mode) { + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) { u32temp = 0x01010000; if (nesadapter->port_count > 2) { u32temp |= 0x02020000; @@ -2159,24 +2329,59 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) phy_data = 0; nes_debug(NES_DBG_PHY, "PCS says the link is down\n"); } - } else if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_IRIS) { - nes_read_10G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index]); - temp_phy_data = (u16)nes_read_indexed(nesdev, - NES_IDX_MAC_MDIO_CONTROL); - u32temp = 20; - do { - nes_read_10G_phy_reg(nesdev, 1, nesadapter->phy_index[mac_index]); - phy_data = (u16)nes_read_indexed(nesdev, - NES_IDX_MAC_MDIO_CONTROL); - if ((phy_data == temp_phy_data) || (!(--u32temp))) - break; - temp_phy_data = phy_data; - } while (1); - nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", - __func__, phy_data, nesadapter->mac_link_down ? "DOWN" : "UP"); - } else { - phy_data = (0x0f0f0000 == (pcs_control_status & 0x0f1f0000)) ? 4 : 0; + switch (nesadapter->phy_type[mac_index]) { + case NES_PHY_TYPE_IRIS: + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + u32temp = 20; + do { + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1); + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if ((phy_data == temp_phy_data) || (!(--u32temp))) + break; + temp_phy_data = phy_data; + } while (1); + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP"); + break; + + case NES_PHY_TYPE_ARGUS: + /* clear the alarms */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004); + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005); + /* check link status */ + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1); + temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + u32temp = 100; + do { + nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1); + + phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); + if ((phy_data == temp_phy_data) || (!(--u32temp))) + break; + temp_phy_data = phy_data; + } while (1); + nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n", + __func__, phy_data, nesadapter->mac_link_down ? "DOWN" : "UP"); + break; + + case NES_PHY_TYPE_PUMA_1G: + if (mac_index < 2) + pcs_val = pcs_mask = 0x01010000; + else + pcs_val = pcs_mask = 0x02020000; + /* fall through */ + default: + phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0; + break; + } } if (phy_data & 0x0004) { @@ -2185,8 +2390,8 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) nes_debug(NES_DBG_PHY, "The Link is UP!!. linkup was %d\n", nesvnic->linkup); if (nesvnic->linkup == 0) { - printk(PFX "The Link is now up for port %u, netdev %p.\n", - mac_index, nesvnic->netdev); + printk(PFX "The Link is now up for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); if (netif_queue_stopped(nesvnic->netdev)) netif_start_queue(nesvnic->netdev); nesvnic->linkup = 1; @@ -2199,8 +2404,8 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number) nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n", nesvnic->linkup); if (nesvnic->linkup == 1) { - printk(PFX "The Link is now down for port %u, netdev %p.\n", - mac_index, nesvnic->netdev); + printk(PFX "The Link is now down for port %s, netdev %p.\n", + nesvnic->netdev->name, nesvnic->netdev); if (!(netif_queue_stopped(nesvnic->netdev))) netif_stop_queue(nesvnic->netdev); nesvnic->linkup = 0; @@ -2254,10 +2459,13 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) u16 pkt_type; u16 rqes_processed = 0; u8 sq_cqes = 0; + u8 nes_use_lro = 0; head = cq->cq_head; cq_size = cq->cq_size; cq->cqes_pending = 1; + if (nesvnic->netdev->features & NETIF_F_LRO) + nes_use_lro = 1; do { if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) & NES_NIC_CQE_VALID) { @@ -2272,8 +2480,10 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) /* bump past the vlan tag */ wqe_fragment_length++; if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) { - u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+wqe_fragment_index*2]); - u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+wqe_fragment_index*2]))<<32; + u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) << 32; bus_address = (dma_addr_t)u64temp; if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) { pci_unmap_single(nesdev->pcidev, @@ -2283,8 +2493,10 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) } for (; wqe_fragment_index < 5; wqe_fragment_index++) { if (wqe_fragment_length[wqe_fragment_index]) { - u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+wqe_fragment_index*2]); - u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+wqe_fragment_index*2]))<<32; + u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX + + wqe_fragment_index * 2]); + u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX + + wqe_fragment_index * 2])) <<32; bus_address = (dma_addr_t)u64temp; pci_unmap_page(nesdev->pcidev, bus_address, @@ -2331,7 +2543,7 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) { nes_write32(nesdev->regs+NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16)); -// nesadapter->tune_timer.cq_count += cqe_count; + /* nesadapter->tune_timer.cq_count += cqe_count; */ nesdev->currcq_count += cqe_count; cqe_count = 0; nes_replenish_nic_rq(nesvnic); @@ -2379,9 +2591,16 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) >> 16); nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n", nesvnic->netdev->name, vlan_tag); - nes_vlan_rx(rx_skb, nesvnic->vlan_grp, vlan_tag); + if (nes_use_lro) + lro_vlan_hwaccel_receive_skb(&nesvnic->lro_mgr, rx_skb, + nesvnic->vlan_grp, vlan_tag, NULL); + else + nes_vlan_rx(rx_skb, nesvnic->vlan_grp, vlan_tag); } else { - nes_netif_rx(rx_skb); + if (nes_use_lro) + lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL); + else + nes_netif_rx(rx_skb); } } @@ -2399,7 +2618,7 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) /* Replenish Nic CQ */ nes_write32(nesdev->regs+NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16)); -// nesdev->nesadapter->tune_timer.cq_count += cqe_count; + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ nesdev->currcq_count += cqe_count; cqe_count = 0; } @@ -2413,26 +2632,27 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq) } while (1); + if (nes_use_lro) + lro_flush_all(&nesvnic->lro_mgr); if (sq_cqes) { barrier(); /* restart the queue if it had been stopped */ if (netif_queue_stopped(nesvnic->netdev)) netif_wake_queue(nesvnic->netdev); } - cq->cq_head = head; /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n", cq->cq_number, cqe_count, cq->cq_head); */ cq->cqe_allocs_pending = cqe_count; if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) { -// nesdev->nesadapter->tune_timer.cq_count += cqe_count; + /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */ nesdev->currcq_count += cqe_count; nes_nic_tune_timer(nesdev); } if (atomic_read(&nesvnic->rx_skbs_needed)) nes_replenish_nic_rq(nesvnic); - } +} /** @@ -2461,7 +2681,7 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) { u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head]. - cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) | + cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) | ((u64)(le32_to_cpu(cq->cq_vbase[head]. cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]))); cqp = *((struct nes_hw_cqp **)&u64temp); @@ -2478,7 +2698,7 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) } u64temp = (((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail]. - wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX])))<<32) | + wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) | ((u64)(le32_to_cpu(nesdev->cqp.sq_vbase[cqp->sq_tail]. wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX]))); cqp_request = *((struct nes_cqp_request **)&u64temp); @@ -2515,7 +2735,7 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) } else { nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n", cqp_request, - le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f); + le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f); if (cqp_request->dynamic) { kfree(cqp_request); } else { @@ -2529,7 +2749,7 @@ static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq) } cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0; - nes_write32(nesdev->regs+NES_CQE_ALLOC, cq->cq_number | (1 << 16)); + nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16)); if (++cqp->sq_tail >= cqp->sq_size) cqp->sq_tail = 0; @@ -2598,13 +2818,13 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, nes_debug(NES_DBG_AEQ, "\n"); aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]); if ((NES_AEQE_INBOUND_RDMA&aeq_info) || (!(NES_AEQE_QP&aeq_info))) { - context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); + context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; } else { aeqe_context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]); aeqe_context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32; context = (unsigned long)nesadapter->qp_table[le32_to_cpu( - aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX])-NES_FIRST_QPN]; + aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN]; BUG_ON(!context); } @@ -2617,7 +2837,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe, nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]); - switch (async_event_id) { case NES_AEQE_AEID_LLP_FIN_RECEIVED: nesqp = *((struct nes_qp **)&context); @@ -3021,7 +3240,7 @@ void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr, cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID); cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32( (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) | - (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); + (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]); cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32( (((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]); } else { diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index 8f36e231bdf..745bf94f3f0 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -33,8 +33,12 @@ #ifndef __NES_HW_H #define __NES_HW_H -#define NES_PHY_TYPE_1G 2 -#define NES_PHY_TYPE_IRIS 3 +#include <linux/inet_lro.h> + +#define NES_PHY_TYPE_1G 2 +#define NES_PHY_TYPE_IRIS 3 +#define NES_PHY_TYPE_ARGUS 4 +#define NES_PHY_TYPE_PUMA_1G 5 #define NES_PHY_TYPE_PUMA_10G 6 #define NES_MULTICAST_PF_MAX 8 @@ -965,7 +969,7 @@ struct nes_arp_entry { #define NES_NIC_CQ_DOWNWARD_TREND 16 struct nes_hw_tune_timer { - //u16 cq_count; + /* u16 cq_count; */ u16 threshold_low; u16 threshold_target; u16 threshold_high; @@ -982,8 +986,10 @@ struct nes_hw_tune_timer { #define NES_TIMER_INT_LIMIT 2 #define NES_TIMER_INT_LIMIT_DYNAMIC 10 #define NES_TIMER_ENABLE_LIMIT 4 -#define NES_MAX_LINK_INTERRUPTS 128 -#define NES_MAX_LINK_CHECK 200 +#define NES_MAX_LINK_INTERRUPTS 128 +#define NES_MAX_LINK_CHECK 200 +#define NES_MAX_LRO_DESCRIPTORS 32 +#define NES_LRO_MAX_AGGR 64 struct nes_adapter { u64 fw_ver; @@ -1183,6 +1189,9 @@ struct nes_vnic { u8 of_device_registered; u8 rdma_enabled; u8 rx_checksum_disabled; + u32 lro_max_aggr; + struct net_lro_mgr lro_mgr; + struct net_lro_desc lro_desc[NES_MAX_LRO_DESCRIPTORS]; }; struct nes_ib_device { diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index e5366b013c1..1b0938c8777 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -185,12 +185,13 @@ static int nes_netdev_open(struct net_device *netdev) nic_active |= nic_active_bit; nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active); - macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; macaddr_high += (u16)netdev->dev_addr[1]; - macaddr_low = ((u32)netdev->dev_addr[2]) << 24; - macaddr_low += ((u32)netdev->dev_addr[3]) << 16; - macaddr_low += ((u32)netdev->dev_addr[4]) << 8; - macaddr_low += (u32)netdev->dev_addr[5]; + + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; /* Program the various MAC regs */ for (i = 0; i < NES_MAX_PORT_COUNT; i++) { @@ -451,7 +452,7 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) __le16 *wqe_fragment_length; u32 nr_frags; u32 original_first_length; -// u64 *wqe_fragment_address; + /* u64 *wqe_fragment_address; */ /* first fragment (0) is used by copy buffer */ u16 wqe_fragment_index=1; u16 hoffset; @@ -461,11 +462,12 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) u32 old_head; u32 wqe_misc; - /* nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," - " (%u frags), tso_size=%u\n", - netdev->name, skb->len, skb_headlen(skb), - skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); - */ + /* + * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u," + * " (%u frags), tso_size=%u\n", + * netdev->name, skb->len, skb_headlen(skb), + * skb_shinfo(skb)->nr_frags, skb_is_gso(skb)); + */ if (!netif_carrier_ok(netdev)) return NETDEV_TX_OK; @@ -795,12 +797,12 @@ static int nes_netdev_set_mac_address(struct net_device *netdev, void *p) memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len); printk(PFX "%s: Address length = %d, Address = %s\n", __func__, netdev->addr_len, print_mac(mac, mac_addr->sa_data)); - macaddr_high = ((u16)netdev->dev_addr[0]) << 8; + macaddr_high = ((u16)netdev->dev_addr[0]) << 8; macaddr_high += (u16)netdev->dev_addr[1]; - macaddr_low = ((u32)netdev->dev_addr[2]) << 24; - macaddr_low += ((u32)netdev->dev_addr[3]) << 16; - macaddr_low += ((u32)netdev->dev_addr[4]) << 8; - macaddr_low += (u32)netdev->dev_addr[5]; + macaddr_low = ((u32)netdev->dev_addr[2]) << 24; + macaddr_low += ((u32)netdev->dev_addr[3]) << 16; + macaddr_low += ((u32)netdev->dev_addr[4]) << 8; + macaddr_low += (u32)netdev->dev_addr[5]; for (i = 0; i < NES_MAX_PORT_COUNT; i++) { if (nesvnic->qp_nic_index[i] == 0xf) { @@ -881,12 +883,12 @@ static void nes_netdev_set_multicast_list(struct net_device *netdev) print_mac(mac, multicast_addr->dmi_addr), perfect_filter_register_address+(mc_index * 8), mc_nic_index); - macaddr_high = ((u16)multicast_addr->dmi_addr[0]) << 8; + macaddr_high = ((u16)multicast_addr->dmi_addr[0]) << 8; macaddr_high += (u16)multicast_addr->dmi_addr[1]; - macaddr_low = ((u32)multicast_addr->dmi_addr[2]) << 24; - macaddr_low += ((u32)multicast_addr->dmi_addr[3]) << 16; - macaddr_low += ((u32)multicast_addr->dmi_addr[4]) << 8; - macaddr_low += (u32)multicast_addr->dmi_addr[5]; + macaddr_low = ((u32)multicast_addr->dmi_addr[2]) << 24; + macaddr_low += ((u32)multicast_addr->dmi_addr[3]) << 16; + macaddr_low += ((u32)multicast_addr->dmi_addr[4]) << 8; + macaddr_low += (u32)multicast_addr->dmi_addr[5]; nes_write_indexed(nesdev, perfect_filter_register_address+(mc_index * 8), macaddr_low); @@ -910,23 +912,23 @@ static void nes_netdev_set_multicast_list(struct net_device *netdev) /** * nes_netdev_change_mtu */ -static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) +static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) { struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; - int ret = 0; - u8 jumbomode=0; + struct nes_device *nesdev = nesvnic->nesdev; + int ret = 0; + u8 jumbomode = 0; - if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu)) + if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu)) return -EINVAL; - netdev->mtu = new_mtu; + netdev->mtu = new_mtu; nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN; if (netdev->mtu > 1500) { jumbomode=1; } - nes_nic_init_timer_defaults(nesdev, jumbomode); + nes_nic_init_timer_defaults(nesdev, jumbomode); if (netif_running(netdev)) { nes_netdev_stop(netdev); @@ -936,8 +938,7 @@ static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu) return ret; } -#define NES_ETHTOOL_STAT_COUNT 55 -static const char nes_ethtool_stringset[NES_ETHTOOL_STAT_COUNT][ETH_GSTRING_LEN] = { +static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = { "Link Change Interrupts", "Linearized SKBs", "T/GSO Requests", @@ -993,8 +994,12 @@ static const char nes_ethtool_stringset[NES_ETHTOOL_STAT_COUNT][ETH_GSTRING_LEN] "CQ Depth 32", "CQ Depth 128", "CQ Depth 256", + "LRO aggregated", + "LRO flushed", + "LRO no_desc", }; +#define NES_ETHTOOL_STAT_COUNT ARRAY_SIZE(nes_ethtool_stringset) /** * nes_netdev_get_rx_csum @@ -1189,6 +1194,9 @@ static void nes_netdev_get_ethtool_stats(struct net_device *netdev, target_stat_values[52] = int_mod_cq_depth_32; target_stat_values[53] = int_mod_cq_depth_128; target_stat_values[54] = int_mod_cq_depth_256; + target_stat_values[55] = nesvnic->lro_mgr.stats.aggregated; + target_stat_values[56] = nesvnic->lro_mgr.stats.flushed; + target_stat_values[57] = nesvnic->lro_mgr.stats.no_desc; } @@ -1219,14 +1227,14 @@ static int nes_netdev_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *et_coalesce) { struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; + struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; unsigned long flags; - spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); + spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); if (et_coalesce->rx_max_coalesced_frames_low) { - shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low; + shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low; } if (et_coalesce->rx_max_coalesced_frames_irq) { shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq; @@ -1246,14 +1254,14 @@ static int nes_netdev_set_coalesce(struct net_device *netdev, nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq; if (et_coalesce->use_adaptive_rx_coalesce) { nesadapter->et_use_adaptive_rx_coalesce = 1; - nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC; nesadapter->et_rx_coalesce_usecs_irq = 0; if (et_coalesce->pkt_rate_low) { - nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low; + nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low; } } else { nesadapter->et_use_adaptive_rx_coalesce = 0; - nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; + nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT; if (nesadapter->et_rx_coalesce_usecs_irq) { nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8))); @@ -1270,28 +1278,28 @@ static int nes_netdev_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *et_coalesce) { struct nes_vnic *nesvnic = netdev_priv(netdev); - struct nes_device *nesdev = nesvnic->nesdev; + struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; struct ethtool_coalesce temp_et_coalesce; struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer; unsigned long flags; memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce)); - temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq; - temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce; - temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval; + temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq; + temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce; + temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval; temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low; spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags); - temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low; - temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target; + temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low; + temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target; temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high; - temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min; + temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min; temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max; if (nesadapter->et_use_adaptive_rx_coalesce) { temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use; } spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags); - memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce)); + memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce)); return 0; } @@ -1370,30 +1378,38 @@ static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd u16 phy_data; et_cmd->duplex = DUPLEX_FULL; - et_cmd->port = PORT_MII; + et_cmd->port = PORT_MII; + if (nesadapter->OneG_Mode) { - et_cmd->supported = SUPPORTED_1000baseT_Full|SUPPORTED_Autoneg; - et_cmd->advertising = ADVERTISED_1000baseT_Full|ADVERTISED_Autoneg; et_cmd->speed = SPEED_1000; - nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], - &phy_data); - if (phy_data&0x1000) { - et_cmd->autoneg = AUTONEG_ENABLE; + if (nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) { + et_cmd->supported = SUPPORTED_1000baseT_Full; + et_cmd->advertising = ADVERTISED_1000baseT_Full; + et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->transceiver = XCVR_INTERNAL; + et_cmd->phy_address = nesdev->mac_index; } else { - et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->supported = SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg; + et_cmd->advertising = ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg; + nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], &phy_data); + if (phy_data & 0x1000) + et_cmd->autoneg = AUTONEG_ENABLE; + else + et_cmd->autoneg = AUTONEG_DISABLE; + et_cmd->transceiver = XCVR_EXTERNAL; + et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index]; } - et_cmd->transceiver = XCVR_EXTERNAL; - et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index]; } else { - if (nesadapter->phy_type[nesvnic->logical_port] == NES_PHY_TYPE_IRIS) { + if ((nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_IRIS) || + (nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_ARGUS)) { et_cmd->transceiver = XCVR_EXTERNAL; - et_cmd->port = PORT_FIBRE; - et_cmd->supported = SUPPORTED_FIBRE; + et_cmd->port = PORT_FIBRE; + et_cmd->supported = SUPPORTED_FIBRE; et_cmd->advertising = ADVERTISED_FIBRE; et_cmd->phy_address = nesadapter->phy_index[nesdev->mac_index]; } else { et_cmd->transceiver = XCVR_INTERNAL; - et_cmd->supported = SUPPORTED_10000baseT_Full; + et_cmd->supported = SUPPORTED_10000baseT_Full; et_cmd->advertising = ADVERTISED_10000baseT_Full; et_cmd->phy_address = nesdev->mac_index; } @@ -1416,14 +1432,15 @@ static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd struct nes_adapter *nesadapter = nesdev->nesadapter; u16 phy_data; - if (nesadapter->OneG_Mode) { + if ((nesadapter->OneG_Mode) && + (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) { nes_read_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], &phy_data); if (et_cmd->autoneg) { /* Turn on Full duplex, Autoneg, and restart autonegotiation */ phy_data |= 0x1300; } else { - // Turn off autoneg + /* Turn off autoneg */ phy_data &= ~0x1000; } nes_write_1G_phy_reg(nesdev, 0, nesadapter->phy_index[nesdev->mac_index], @@ -1454,6 +1471,8 @@ static struct ethtool_ops nes_ethtool_ops = { .set_sg = ethtool_op_set_sg, .get_tso = ethtool_op_get_tso, .set_tso = ethtool_op_set_tso, + .get_flags = ethtool_op_get_flags, + .set_flags = ethtool_op_set_flags, }; @@ -1607,27 +1626,34 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev, list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]); if ((nesdev->netdev_count == 0) && - (PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index)) { - nes_debug(NES_DBG_INIT, "Setting up PHY interrupt mask. Using register index 0x%04X\n", - NES_IDX_PHY_PCS_CONTROL_STATUS0+(0x200*(nesvnic->logical_port&1))); + ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) || + ((nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) && + (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) || + ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) { + /* + * nes_debug(NES_DBG_INIT, "Setting up PHY interrupt mask. Using register index 0x%04X\n", + * NES_IDX_PHY_PCS_CONTROL_STATUS0 + (0x200 * (nesvnic->logical_port & 1))); + */ u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + - (0x200*(nesvnic->logical_port&1))); - u32temp |= 0x00200000; - nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + - (0x200*(nesvnic->logical_port&1)), u32temp); + (0x200 * (nesdev->mac_index & 1))); + if (nesdev->nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G) { + u32temp |= 0x00200000; + nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + + (0x200 * (nesdev->mac_index & 1)), u32temp); + } + u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + - (0x200*(nesvnic->logical_port&1)) ); + (0x200 * (nesdev->mac_index & 1))); + if ((u32temp&0x0f1f0000) == 0x0f0f0000) { - if (nesdev->nesadapter->phy_type[nesvnic->logical_port] == NES_PHY_TYPE_IRIS) { + if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_IRIS) { nes_init_phy(nesdev); - nes_read_10G_phy_reg(nesdev, 1, - nesdev->nesadapter->phy_index[nesvnic->logical_port]); + nes_read_10G_phy_reg(nesdev, nesdev->nesadapter->phy_index[nesdev->mac_index], 1, 1); temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); u32temp = 20; do { - nes_read_10G_phy_reg(nesdev, 1, - nesdev->nesadapter->phy_index[nesvnic->logical_port]); + nes_read_10G_phy_reg(nesdev, nesdev->nesadapter->phy_index[nesdev->mac_index], 1, 1); phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); if ((phy_data == temp_phy_data) || (!(--u32temp))) @@ -1644,6 +1670,14 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev, nes_debug(NES_DBG_INIT, "The Link is UP!!.\n"); nesvnic->linkup = 1; } + } else if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) { + nes_debug(NES_DBG_INIT, "mac_index=%d, logical_port=%d, u32temp=0x%04X, PCI_FUNC=%d\n", + nesdev->mac_index, nesvnic->logical_port, u32temp, PCI_FUNC(nesdev->pcidev->devfn)); + if (((nesdev->mac_index < 2) && ((u32temp&0x01010000) == 0x01010000)) || + ((nesdev->mac_index > 1) && ((u32temp&0x02020000) == 0x02020000))) { + nes_debug(NES_DBG_INIT, "The Link is UP!!.\n"); + nesvnic->linkup = 1; + } } /* clear the MAC interrupt status, assumes direct logical to physical mapping */ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index)); diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index c6d5631a699..fe83d1b2b17 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -444,15 +444,13 @@ void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 /** * nes_write_10G_phy_reg */ -void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_reg, - u8 phy_addr, u16 data) +void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg, + u16 data) { - u32 dev_addr; u32 port_addr; u32 u32temp; u32 counter; - dev_addr = 1; port_addr = phy_addr; /* set address */ @@ -492,14 +490,12 @@ void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_reg, * This routine only issues the read, the data must be read * separately. */ -void nes_read_10G_phy_reg(struct nes_device *nesdev, u16 phy_reg, u8 phy_addr) +void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg) { - u32 dev_addr; u32 port_addr; u32 u32temp; u32 counter; - dev_addr = 1; port_addr = phy_addr; /* set address */ diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index ee74f7c7a6d..99b3c4ae86e 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1266,7 +1266,7 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, sq_size = init_attr->cap.max_send_wr; rq_size = init_attr->cap.max_recv_wr; - // check if the encoded sizes are OK or not... + /* check if the encoded sizes are OK or not... */ sq_encoded_size = nes_get_encoded_size(&sq_size); rq_encoded_size = nes_get_encoded_size(&rq_size); @@ -2377,7 +2377,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u8 single_page = 1; u8 stag_key; - region = ib_umem_get(pd->uobject->context, start, length, acc); + region = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(region)) { return (struct ib_mr *)region; } |