From owner-svn-src-all@FreeBSD.ORG Mon Oct 21 06:31:57 2013 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTP id F2C3E8A0; Mon, 21 Oct 2013 06:31:56 +0000 (UTC) (envelope-from np@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mx1.freebsd.org (Postfix) with ESMTPS id DF2902DB9; Mon, 21 Oct 2013 06:31:56 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.7/8.14.7) with ESMTP id r9L6VuAn078677; Mon, 21 Oct 2013 06:31:56 GMT (envelope-from np@svn.freebsd.org) Received: (from np@localhost) by svn.freebsd.org (8.14.7/8.14.5/Submit) id r9L6Vuj7078674; Mon, 21 Oct 2013 06:31:56 GMT (envelope-from np@svn.freebsd.org) Message-Id: <201310210631.r9L6Vuj7078674@svn.freebsd.org> From: Navdeep Parhar Date: Mon, 21 Oct 2013 06:31:56 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r256829 - stable/10/sys/contrib/rdma/krping X-SVN-Group: stable-10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 21 Oct 2013 06:31:57 -0000 Author: np Date: Mon Oct 21 06:31:56 2013 New Revision: 256829 URL: http://svnweb.freebsd.org/changeset/base/256829 Log: MFC r256470: Update krping to the latest upstream code. Move all the FreeBSD specific parts to krping_dev.c, which leaves the other files as close to their upstream versions as possible. Approved by: re (glebius) Modified: stable/10/sys/contrib/rdma/krping/getopt.c stable/10/sys/contrib/rdma/krping/krping.c stable/10/sys/contrib/rdma/krping/krping.h stable/10/sys/contrib/rdma/krping/krping_dev.c Directory Properties: stable/10/sys/ (props changed) Modified: stable/10/sys/contrib/rdma/krping/getopt.c ============================================================================== --- stable/10/sys/contrib/rdma/krping/getopt.c Mon Oct 21 06:27:20 2013 (r256828) +++ stable/10/sys/contrib/rdma/krping/getopt.c Mon Oct 21 06:31:56 2013 (r256829) @@ -5,9 +5,10 @@ #include __FBSDID("$FreeBSD$"); -#include -#include -#include +#include +#include +#include + #include "getopt.h" /** @@ -49,29 +50,29 @@ int krping_getopt(const char *caller, ch if (opts->has_arg & OPT_NOPARAM) { return opts->val; } - printf("%s: the %s option requires " + printk(KERN_INFO "%s: the %s option requires " "an argument\n", caller, token); return -EINVAL; } if (opts->has_arg & OPT_INT) { char* v; - *value = strtoul(val, &v, 0); + *value = simple_strtoul(val, &v, 0); if (!*v) { return opts->val; } - printf("%s: invalid numeric value " + printk(KERN_INFO "%s: invalid numeric value " "in %s=%s\n", caller, token, val); return -EDOM; } if (opts->has_arg & OPT_STRING) { return opts->val; } - printf("%s: unexpected argument %s to the " + printk(KERN_INFO "%s: unexpected argument %s to the " "%s option\n", caller, val, token); return -EINVAL; } } - printf("%s: Unrecognized option %s\n", caller, token); + printk(KERN_INFO "%s: Unrecognized option %s\n", caller, token); return -EOPNOTSUPP; } Modified: stable/10/sys/contrib/rdma/krping/krping.c ============================================================================== --- stable/10/sys/contrib/rdma/krping/krping.c Mon Oct 21 06:27:20 2013 (r256828) +++ stable/10/sys/contrib/rdma/krping/krping.c Mon Oct 21 06:31:56 2013 (r256829) @@ -1,6 +1,6 @@ /* * Copyright (c) 2005 Ammasso, Inc. All rights reserved. - * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2006-2009 Open Grid Computing, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -34,39 +34,52 @@ #include __FBSDID("$FreeBSD$"); -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include +#include -#include -#include - -#include +#include #include -#include "getopt.h" #include "krping.h" +#include "getopt.h" -#define PFX "krping: " +extern int krping_debug; +#define DEBUG_LOG(cb, x...) if (krping_debug) krping_printf((cb)->cookie, x) +#define PRINTF(cb, x...) krping_printf((cb)->cookie, x) -static int debug = 0; -#define DEBUG_LOG if (debug) printf +MODULE_AUTHOR("Steve Wise"); +MODULE_DESCRIPTION("RDMA ping client/server"); +MODULE_LICENSE("Dual BSD/GPL"); + +static __inline uint64_t +get_cycles(void) +{ + uint32_t low, high; + __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); + return (low | ((u_int64_t)high << 32)); +} + +typedef uint64_t cycles_t; + +enum mem_type { + DMA = 1, + FASTREG = 2, + MW = 3, + MR = 4 +}; static const struct krping_option krping_opts[] = { {"count", OPT_INT, 'C'}, @@ -77,23 +90,29 @@ static const struct krping_option krping {"validate", OPT_NOPARAM, 'V'}, {"server", OPT_NOPARAM, 's'}, {"client", OPT_NOPARAM, 'c'}, - {"dmamr", OPT_NOPARAM, 'D'}, - {"debug", OPT_NOPARAM, 'd'}, - {"wlat", OPT_NOPARAM, 'l'}, - {"rlat", OPT_NOPARAM, 'L'}, - {"bw", OPT_NOPARAM, 'B'}, - {"tx-depth", OPT_INT, 't'}, - {"poll", OPT_NOPARAM, 'P'}, - {"memlimit", OPT_INT, 'm'}, + {"mem_mode", OPT_STRING, 'm'}, + {"server_inv", OPT_NOPARAM, 'I'}, + {"wlat", OPT_NOPARAM, 'l'}, + {"rlat", OPT_NOPARAM, 'L'}, + {"bw", OPT_NOPARAM, 'B'}, + {"duplex", OPT_NOPARAM, 'd'}, + {"txdepth", OPT_INT, 'T'}, + {"poll", OPT_NOPARAM, 'P'}, + {"local_dma_lkey", OPT_NOPARAM, 'Z'}, + {"read_inv", OPT_NOPARAM, 'R'}, + {"fr", OPT_NOPARAM, 'f'}, {NULL, 0, 0} }; -struct mtx krping_mutex; +#define htonll(x) cpu_to_be64((x)) +#define ntohll(x) cpu_to_be64((x)) + +static struct mutex krping_mutex; /* * List of running krping threads. */ -struct krping_cb_list krping_cbs; +static LIST_HEAD(krping_cbs); /* * krping "ping/pong" loop: @@ -109,24 +128,118 @@ struct krping_cb_list krping_cbs; */ /* + * These states are used to signal events between the completion handler + * and the main client or server thread. + * + * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, + * and RDMA_WRITE_COMPLETE for each ping. + */ +enum test_state { + IDLE = 1, + CONNECT_REQUEST, + ADDR_RESOLVED, + ROUTE_RESOLVED, + CONNECTED, + RDMA_READ_ADV, + RDMA_READ_COMPLETE, + RDMA_WRITE_ADV, + RDMA_WRITE_COMPLETE, + ERROR +}; + +struct krping_rdma_info { + uint64_t buf; + uint32_t rkey; + uint32_t size; +}; + +/* * Default max buffer size for IO... */ #define RPING_BUFSIZE 128*1024 -#define RPING_SQ_DEPTH 32 +#define RPING_SQ_DEPTH 64 -static void krping_wait(struct krping_cb *cb, int state) -{ - int rc; - mtx_lock(&cb->lock); - while (cb->state < state) { - rc = msleep(cb, &cb->lock, PCATCH, "krping", 0); - if (rc && rc != ERESTART) { - cb->state = ERROR; - break; - } - } - mtx_unlock(&cb->lock); -} +/* + * Control block struct. + */ +struct krping_cb { + void *cookie; + int server; /* 0 iff client */ + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_qp *qp; + + enum mem_type mem; + struct ib_mr *dma_mr; + + struct ib_fast_reg_page_list *page_list; + int page_list_len; + struct ib_send_wr fastreg_wr; + struct ib_send_wr invalidate_wr; + struct ib_mr *fastreg_mr; + int server_invalidate; + int read_inv; + u8 key; + + struct ib_mw *mw; + struct ib_mw_bind bind_attr; + + struct ib_recv_wr rq_wr; /* recv work request record */ + struct ib_sge recv_sgl; /* recv single SGE */ + struct krping_rdma_info recv_buf;/* malloc'd buffer */ + u64 recv_dma_addr; + DECLARE_PCI_UNMAP_ADDR(recv_mapping) + struct ib_mr *recv_mr; + + struct ib_send_wr sq_wr; /* send work requrest record */ + struct ib_sge send_sgl; + struct krping_rdma_info send_buf;/* single send buf */ + u64 send_dma_addr; + DECLARE_PCI_UNMAP_ADDR(send_mapping) + struct ib_mr *send_mr; + + struct ib_send_wr rdma_sq_wr; /* rdma work request record */ + struct ib_sge rdma_sgl; /* rdma single SGE */ + char *rdma_buf; /* used as rdma sink */ + u64 rdma_dma_addr; + DECLARE_PCI_UNMAP_ADDR(rdma_mapping) + struct ib_mr *rdma_mr; + + uint32_t remote_rkey; /* remote guys RKEY */ + uint64_t remote_addr; /* remote guys TO */ + uint32_t remote_len; /* remote guys LEN */ + + char *start_buf; /* rdma read src */ + u64 start_dma_addr; + DECLARE_PCI_UNMAP_ADDR(start_mapping) + struct ib_mr *start_mr; + + enum test_state state; /* used for cond/signalling */ + wait_queue_head_t sem; + struct krping_stats stats; + + uint16_t port; /* dst port in NBO */ + struct in_addr addr; /* dst addr in NBO */ + char *addr_str; /* dst addr string */ + int verbose; /* verbose logging */ + int count; /* ping count */ + int size; /* ping data size */ + int validate; /* validate ping data */ + int wlat; /* run wlat test */ + int rlat; /* run rlat test */ + int bw; /* run bw test */ + int duplex; /* run bw full duplex test */ + int poll; /* poll or block for rlat test */ + int txdepth; /* SQ depth */ + int local_dma_lkey; /* use 0 for lkey */ + int frtest; /* fastreg test */ + + /* CM stuff */ + struct rdma_cm_id *cm_id; /* connection on client side,*/ + /* listener on server side. */ + struct rdma_cm_id *child_cm_id; /* connection on server side */ + struct list_head list; +}; static int krping_cma_event_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) @@ -134,39 +247,37 @@ static int krping_cma_event_handler(stru int ret; struct krping_cb *cb = cma_id->context; - DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, - (cma_id == cb->cm_id) ? "parent" : "child"); + DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event, + cma_id, (cma_id == cb->cm_id) ? "parent" : "child"); - mtx_lock(&cb->lock); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: cb->state = ADDR_RESOLVED; ret = rdma_resolve_route(cma_id, 2000); if (ret) { - log(LOG_ERR, "rdma_resolve_route error %d\n", - ret); - wakeup(cb); + PRINTF(cb, "rdma_resolve_route error %d\n", ret); + wake_up_interruptible(&cb->sem); } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: cb->state = ROUTE_RESOLVED; - wakeup(cb); + wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_CONNECT_REQUEST: cb->state = CONNECT_REQUEST; cb->child_cm_id = cma_id; - DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id); - wakeup(cb); + DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id); + wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ESTABLISHED: - DEBUG_LOG(PFX "ESTABLISHED\n"); + DEBUG_LOG(cb, "ESTABLISHED\n"); if (!cb->server) { cb->state = CONNECTED; - wakeup(cb); } + wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_ADDR_ERROR: @@ -174,40 +285,34 @@ static int krping_cma_event_handler(stru case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - log(LOG_ERR, "cma event %d, error %d\n", event->event, + PRINTF(cb, "cma event %d, error %d\n", event->event, event->status); cb->state = ERROR; - wakeup(cb); + wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DISCONNECTED: - DEBUG_LOG(PFX "DISCONNECT EVENT...\n"); + PRINTF(cb, "DISCONNECT EVENT...\n"); cb->state = ERROR; - wakeup(cb); + wake_up_interruptible(&cb->sem); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - DEBUG_LOG(PFX "cma detected device removal!!!!\n"); - cb->state = ERROR; - wakeup(cb); - mtx_unlock(&cb->lock); - krping_wait(cb, CLEANUP); - tsleep(cb, 0, "krping", 5000); - return 0; + PRINTF(cb, "cma detected device removal!!!!\n"); + break; default: - log(LOG_ERR, "oof bad type!\n"); - wakeup(cb); + PRINTF(cb, "oof bad type!\n"); + wake_up_interruptible(&cb->sem); break; } - mtx_unlock(&cb->lock); return 0; } static int server_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { - log(LOG_ERR, "Received bogus data, size %d\n", + PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } @@ -215,7 +320,7 @@ static int server_recv(struct krping_cb cb->remote_rkey = ntohl(cb->recv_buf.rkey); cb->remote_addr = ntohll(cb->recv_buf.buf); cb->remote_len = ntohl(cb->recv_buf.size); - DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n", + DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n", cb->remote_rkey, (unsigned long long)cb->remote_addr, cb->remote_len); @@ -230,7 +335,7 @@ static int server_recv(struct krping_cb static int client_recv(struct krping_cb *cb, struct ib_wc *wc) { if (wc->byte_len != sizeof(cb->recv_buf)) { - log(LOG_ERR, "Received bogus data, size %d\n", + PRINTF(cb, "Received bogus data, size %d\n", wc->byte_len); return -1; } @@ -250,11 +355,13 @@ static void krping_cq_event_handler(stru struct ib_recv_wr *bad_wr; int ret; - mtx_lock(&cb->lock); - KASSERT(cb->cq == cq, ("bad condition")); + BUG_ON(cb->cq != cq); if (cb->state == ERROR) { - log(LOG_ERR, "cq completion in ERROR state\n"); - mtx_unlock(&cb->lock); + PRINTF(cb, "cq completion in ERROR state\n"); + return; + } + if (cb->frtest) { + PRINTF(cb, "cq completion event in frtest!\n"); return; } if (!cb->wlat && !cb->rlat && !cb->bw) @@ -262,76 +369,77 @@ static void krping_cq_event_handler(stru while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { if (wc.status) { if (wc.status == IB_WC_WR_FLUSH_ERR) { - DEBUG_LOG("cq flushed\n"); + DEBUG_LOG(cb, "cq flushed\n"); continue; } else { - log(LOG_CRIT, "cq completion failed status %d\n", - wc.status); + PRINTF(cb, "cq completion failed with " + "wr_id %Lx status %d opcode %d vender_err %x\n", + wc.wr_id, wc.status, wc.opcode, wc.vendor_err); goto error; } } switch (wc.opcode) { case IB_WC_SEND: - DEBUG_LOG(PFX "send completion\n"); + DEBUG_LOG(cb, "send completion\n"); cb->stats.send_bytes += cb->send_sgl.length; cb->stats.send_msgs++; break; case IB_WC_RDMA_WRITE: - DEBUG_LOG(PFX "rdma write completion\n"); + DEBUG_LOG(cb, "rdma write completion\n"); cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.write_msgs++; cb->state = RDMA_WRITE_COMPLETE; - wakeup(cb); + wake_up_interruptible(&cb->sem); break; case IB_WC_RDMA_READ: - DEBUG_LOG(PFX "rdma read completion\n"); + DEBUG_LOG(cb, "rdma read completion\n"); cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; cb->stats.read_msgs++; cb->state = RDMA_READ_COMPLETE; - wakeup(cb); + wake_up_interruptible(&cb->sem); break; case IB_WC_RECV: - DEBUG_LOG(PFX "recv completion\n"); + DEBUG_LOG(cb, "recv completion\n"); cb->stats.recv_bytes += sizeof(cb->recv_buf); cb->stats.recv_msgs++; if (cb->wlat || cb->rlat || cb->bw) ret = server_recv(cb, &wc); else ret = cb->server ? server_recv(cb, &wc) : - client_recv(cb, &wc); + client_recv(cb, &wc); if (ret) { - log(LOG_ERR, "recv wc error: %d\n", ret); + PRINTF(cb, "recv wc error: %d\n", ret); goto error; } ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); if (ret) { - log(LOG_ERR, "post recv error: %d\n", + PRINTF(cb, "post recv error: %d\n", ret); goto error; } - wakeup(cb); + wake_up_interruptible(&cb->sem); break; default: - log(LOG_ERR, "unknown!!!!! completion\n"); + PRINTF(cb, + "%s:%d Unexpected opcode %d, Shutting down\n", + __func__, __LINE__, wc.opcode); goto error; } } if (ret) { - log(LOG_ERR, "poll error %d\n", ret); + PRINTF(cb, "poll error %d\n", ret); goto error; } - mtx_unlock(&cb->lock); return; error: cb->state = ERROR; - wakeup(cb); - mtx_unlock(&cb->lock); + wake_up_interruptible(&cb->sem); } static int krping_accept(struct krping_cb *cb) @@ -339,7 +447,7 @@ static int krping_accept(struct krping_c struct rdma_conn_param conn_param; int ret; - DEBUG_LOG(PFX "accepting client connection request\n"); + DEBUG_LOG(cb, "accepting client connection request\n"); memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; @@ -347,14 +455,15 @@ static int krping_accept(struct krping_c ret = rdma_accept(cb->child_cm_id, &conn_param); if (ret) { - log(LOG_ERR, "rdma_accept error: %d\n", ret); + PRINTF(cb, "rdma_accept error: %d\n", ret); return ret; } if (!cb->wlat && !cb->rlat && !cb->bw) { - krping_wait(cb, CONNECTED); + wait_event_interruptible(cb->sem, cb->state >= CONNECTED); if (cb->state == ERROR) { - log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); + PRINTF(cb, "wait for CONNECTED state %d\n", + cb->state); return -1; } } @@ -363,19 +472,22 @@ static int krping_accept(struct krping_c static void krping_setup_wr(struct krping_cb *cb) { - /* XXX X86 only here... not mapping for dma! */ - cb->recv_sgl.addr = vtophys(&cb->recv_buf); + cb->recv_sgl.addr = cb->recv_dma_addr; cb->recv_sgl.length = sizeof cb->recv_buf; - if (cb->use_dmamr) + if (cb->local_dma_lkey) + cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey; + else if (cb->mem == DMA) cb->recv_sgl.lkey = cb->dma_mr->lkey; else cb->recv_sgl.lkey = cb->recv_mr->lkey; cb->rq_wr.sg_list = &cb->recv_sgl; cb->rq_wr.num_sge = 1; - cb->send_sgl.addr = vtophys(&cb->send_buf); + cb->send_sgl.addr = cb->send_dma_addr; cb->send_sgl.length = sizeof cb->send_buf; - if (cb->use_dmamr) + if (cb->local_dma_lkey) + cb->send_sgl.lkey = cb->qp->device->local_dma_lkey; + else if (cb->mem == DMA) cb->send_sgl.lkey = cb->dma_mr->lkey; else cb->send_sgl.lkey = cb->send_mr->lkey; @@ -385,18 +497,39 @@ static void krping_setup_wr(struct krpin cb->sq_wr.sg_list = &cb->send_sgl; cb->sq_wr.num_sge = 1; - cb->rdma_addr = vtophys(cb->rdma_buf); - cb->rdma_sgl.addr = cb->rdma_addr; - if (cb->use_dmamr) - cb->rdma_sgl.lkey = cb->dma_mr->lkey; - else - cb->rdma_sgl.lkey = cb->rdma_mr->lkey; - cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; - cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; - cb->rdma_sq_wr.num_sge = 1; + if (cb->server || cb->wlat || cb->rlat || cb->bw) { + cb->rdma_sgl.addr = cb->rdma_dma_addr; + if (cb->mem == MR) + cb->rdma_sgl.lkey = cb->rdma_mr->lkey; + cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; + cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; + cb->rdma_sq_wr.num_sge = 1; + } + + switch(cb->mem) { + case FASTREG: + + /* + * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR. + * both unsignaled. The client uses them to reregister + * the rdma buffers with a new key each iteration. + */ + cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR; + cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + cb->fastreg_wr.wr.fast_reg.length = cb->size; + cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list; + cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len; - if (!cb->server || cb->wlat || cb->rlat || cb->bw) { - cb->start_addr = vtophys(cb->start_buf); + cb->invalidate_wr.next = &cb->fastreg_wr; + cb->invalidate_wr.opcode = IB_WR_LOCAL_INV; + break; + case MW: + cb->bind_attr.wr_id = 0xabbaabba; + cb->bind_attr.send_flags = 0; /* unsignaled */ + cb->bind_attr.length = cb->size; + break; + default: + break; } } @@ -406,134 +539,207 @@ static int krping_setup_buffers(struct k struct ib_phys_buf buf; u64 iovbase; - DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); + DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb); + + cb->recv_dma_addr = dma_map_single(cb->pd->device->dma_device, + &cb->recv_buf, + sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); + pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr); + cb->send_dma_addr = dma_map_single(cb->pd->device->dma_device, + &cb->send_buf, sizeof(cb->send_buf), + DMA_BIDIRECTIONAL); + pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr); - if (cb->use_dmamr) { + if (cb->mem == DMA) { cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE); if (IS_ERR(cb->dma_mr)) { - log(LOG_ERR, "reg_dmamr failed\n"); - return PTR_ERR(cb->dma_mr); + DEBUG_LOG(cb, "reg_dmamr failed\n"); + ret = PTR_ERR(cb->dma_mr); + goto bail; } } else { + if (!cb->local_dma_lkey) { + buf.addr = cb->recv_dma_addr; + buf.size = sizeof cb->recv_buf; + DEBUG_LOG(cb, "recv buf dma_addr %llx size %d\n", buf.addr, + (int)buf.size); + iovbase = cb->recv_dma_addr; + cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, + IB_ACCESS_LOCAL_WRITE, + &iovbase); + + if (IS_ERR(cb->recv_mr)) { + DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); + ret = PTR_ERR(cb->recv_mr); + goto bail; + } - buf.addr = vtophys(&cb->recv_buf); - buf.size = sizeof cb->recv_buf; - iovbase = vtophys(&cb->recv_buf); - cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - IB_ACCESS_LOCAL_WRITE, - &iovbase); - - if (IS_ERR(cb->recv_mr)) { - log(LOG_ERR, "recv_buf reg_mr failed\n"); - return PTR_ERR(cb->recv_mr); - } - - buf.addr = vtophys(&cb->send_buf); - buf.size = sizeof cb->send_buf; - iovbase = vtophys(&cb->send_buf); - cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, - 0, &iovbase); - - if (IS_ERR(cb->send_mr)) { - log(LOG_ERR, "send_buf reg_mr failed\n"); - ib_dereg_mr(cb->recv_mr); - return PTR_ERR(cb->send_mr); + buf.addr = cb->send_dma_addr; + buf.size = sizeof cb->send_buf; + DEBUG_LOG(cb, "send buf dma_addr %llx size %d\n", buf.addr, + (int)buf.size); + iovbase = cb->send_dma_addr; + cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, + 0, &iovbase); + + if (IS_ERR(cb->send_mr)) { + DEBUG_LOG(cb, "send_buf reg_mr failed\n"); + ret = PTR_ERR(cb->send_mr); + goto bail; + } } } - /* RNIC adapters have a limit upto which it can register physical memory - * If DMA-MR memory mode is set then normally driver registers maximum - * supported memory. After that if contigmalloc allocates memory beyond the - * specified RNIC limit then Krping may not work. - */ - if (cb->use_dmamr && cb->memlimit) - cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, cb->memlimit, - PAGE_SIZE, 0); - else - cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL, - PAGE_SIZE, 0); - + cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->rdma_buf) { - log(LOG_ERR, "rdma_buf malloc failed\n"); - ret = ENOMEM; - goto err1; - } - if (!cb->use_dmamr) { - - buf.addr = vtophys(cb->rdma_buf); - buf.size = cb->size; - iovbase = vtophys(cb->rdma_buf); - cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, + DEBUG_LOG(cb, "rdma_buf malloc failed\n"); + ret = -ENOMEM; + goto bail; + } + + cb->rdma_dma_addr = dma_map_single(cb->pd->device->dma_device, + cb->rdma_buf, cb->size, + DMA_BIDIRECTIONAL); + pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr); + if (cb->mem != DMA) { + switch (cb->mem) { + case FASTREG: + cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + + PAGE_SIZE) >> PAGE_SHIFT; + cb->page_list = ib_alloc_fast_reg_page_list( + cb->pd->device, + cb->page_list_len); + if (IS_ERR(cb->page_list)) { + DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); + ret = PTR_ERR(cb->page_list); + goto bail; + } + cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd, + cb->page_list->max_page_list_len); + if (IS_ERR(cb->fastreg_mr)) { + DEBUG_LOG(cb, "recv_buf reg_mr failed\n"); + ret = PTR_ERR(cb->fastreg_mr); + goto bail; + } + DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p" + " page_list_len %u\n", cb->fastreg_mr->rkey, + cb->page_list, cb->page_list_len); + break; + case MW: + cb->mw = ib_alloc_mw(cb->pd); + if (IS_ERR(cb->mw)) { + DEBUG_LOG(cb, "recv_buf alloc_mw failed\n"); + ret = PTR_ERR(cb->mw); + goto bail; + } + DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey); + /*FALLTHROUGH*/ + case MR: + buf.addr = cb->rdma_dma_addr; + buf.size = cb->size; + iovbase = cb->rdma_dma_addr; + cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE, &iovbase); - - if (IS_ERR(cb->rdma_mr)) { - log(LOG_ERR, "rdma_buf reg_mr failed\n"); - ret = PTR_ERR(cb->rdma_mr); - goto err2; + if (IS_ERR(cb->rdma_mr)) { + DEBUG_LOG(cb, "rdma_buf reg_mr failed\n"); + ret = PTR_ERR(cb->rdma_mr); + goto bail; + } + DEBUG_LOG(cb, "rdma buf dma_addr %llx size %d mr rkey 0x%x\n", + buf.addr, (int)buf.size, cb->rdma_mr->rkey); + break; + default: + ret = -EINVAL; + goto bail; + break; } } if (!cb->server || cb->wlat || cb->rlat || cb->bw) { - if (cb->use_dmamr && cb->memlimit) - cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, - 0, cb->memlimit, PAGE_SIZE, 0); - else - cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, - 0, -1UL, PAGE_SIZE, 0); + + cb->start_buf = kmalloc(cb->size, GFP_KERNEL); if (!cb->start_buf) { - log(LOG_ERR, "start_buf malloc failed\n"); - ret = ENOMEM; - goto err2; + DEBUG_LOG(cb, "start_buf malloc failed\n"); + ret = -ENOMEM; + goto bail; } - if (!cb->use_dmamr) { + + cb->start_dma_addr = dma_map_single(cb->pd->device->dma_device, + cb->start_buf, cb->size, + DMA_BIDIRECTIONAL); + pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr); + + if (cb->mem == MR || cb->mem == MW) { unsigned flags = IB_ACCESS_REMOTE_READ; - if (cb->wlat || cb->rlat || cb->bw) + if (cb->wlat || cb->rlat || cb->bw) flags |= IB_ACCESS_REMOTE_WRITE; - buf.addr = vtophys(cb->start_buf); + + buf.addr = cb->start_dma_addr; buf.size = cb->size; - iovbase = vtophys(cb->start_buf); + DEBUG_LOG(cb, "start buf dma_addr %llx size %d\n", + buf.addr, (int)buf.size); + iovbase = cb->start_dma_addr; cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, flags, &iovbase); if (IS_ERR(cb->start_mr)) { - log(LOG_ERR, "start_buf reg_mr failed\n"); + DEBUG_LOG(cb, "start_buf reg_mr failed\n"); ret = PTR_ERR(cb->start_mr); - goto err3; + goto bail; } } } krping_setup_wr(cb); - DEBUG_LOG(PFX "allocated & registered buffers...\n"); + DEBUG_LOG(cb, "allocated & registered buffers...\n"); return 0; -err3: - contigfree(cb->start_buf, cb->size, M_DEVBUF); - - if (!cb->use_dmamr) +bail: + if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr)) + ib_dereg_mr(cb->fastreg_mr); + if (cb->mw && !IS_ERR(cb->mw)) + ib_dealloc_mw(cb->mw); + if (cb->rdma_mr && !IS_ERR(cb->rdma_mr)) ib_dereg_mr(cb->rdma_mr); -err2: - contigfree(cb->rdma_buf, cb->size, M_DEVBUF); -err1: - if (cb->use_dmamr) + if (cb->page_list && !IS_ERR(cb->page_list)) + ib_free_fast_reg_page_list(cb->page_list); + if (cb->dma_mr && !IS_ERR(cb->dma_mr)) ib_dereg_mr(cb->dma_mr); - else { + if (cb->recv_mr && !IS_ERR(cb->recv_mr)) ib_dereg_mr(cb->recv_mr); + if (cb->send_mr && !IS_ERR(cb->send_mr)) ib_dereg_mr(cb->send_mr); - } + if (cb->rdma_buf) + kfree(cb->rdma_buf); + if (cb->start_buf) + kfree(cb->start_buf); return ret; } static void krping_free_buffers(struct krping_cb *cb) { - DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb); + DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb); -#if 0 + if (cb->dma_mr) + ib_dereg_mr(cb->dma_mr); + if (cb->send_mr) + ib_dereg_mr(cb->send_mr); + if (cb->recv_mr) + ib_dereg_mr(cb->recv_mr); + if (cb->rdma_mr) + ib_dereg_mr(cb->rdma_mr); + if (cb->start_mr) + ib_dereg_mr(cb->start_mr); + if (cb->fastreg_mr) + ib_dereg_mr(cb->fastreg_mr); + if (cb->mw) + ib_dealloc_mw(cb->mw); + dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, recv_mapping), sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); @@ -543,24 +749,12 @@ static void krping_free_buffers(struct k dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, rdma_mapping), cb->size, DMA_BIDIRECTIONAL); -#endif - contigfree(cb->rdma_buf, cb->size, M_DEVBUF); - if (!cb->server || cb->wlat || cb->rlat || cb->bw) { -#if 0 + kfree(cb->rdma_buf); + if (cb->start_buf) { dma_unmap_single(cb->pd->device->dma_device, pci_unmap_addr(cb, start_mapping), cb->size, DMA_BIDIRECTIONAL); -#endif - contigfree(cb->start_buf, cb->size, M_DEVBUF); - } - if (cb->use_dmamr) - ib_dereg_mr(cb->dma_mr); - else { - ib_dereg_mr(cb->send_mr); - ib_dereg_mr(cb->recv_mr); - ib_dereg_mr(cb->rdma_mr); - if (!cb->server) - ib_dereg_mr(cb->start_mr); + kfree(cb->start_buf); } } @@ -577,6 +771,7 @@ static int krping_create_qp(struct krpin init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = cb->cq; init_attr.recv_cq = cb->cq; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; if (cb->server) { ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); @@ -603,36 +798,36 @@ static int krping_setup_qp(struct krping int ret; cb->pd = ib_alloc_pd(cm_id->device); if (IS_ERR(cb->pd)) { - log(LOG_ERR, "ib_alloc_pd failed\n"); + PRINTF(cb, "ib_alloc_pd failed\n"); return PTR_ERR(cb->pd); } - DEBUG_LOG(PFX "created pd %p\n", cb->pd); + DEBUG_LOG(cb, "created pd %p\n", cb->pd); - strlcpy(cb->name, cb->pd->device->name, sizeof(cb->name)); + strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name)); cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, cb, cb->txdepth * 2, 0); if (IS_ERR(cb->cq)) { - log(LOG_ERR, "ib_create_cq failed\n"); + PRINTF(cb, "ib_create_cq failed\n"); ret = PTR_ERR(cb->cq); goto err1; } - DEBUG_LOG(PFX "created cq %p\n", cb->cq); + DEBUG_LOG(cb, "created cq %p\n", cb->cq); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***