Skip site navigation (1) | Skip section navigation (2)
Date:      Fri, 17 Aug 2012 00:49:30 +0000 (UTC)
From:      Navdeep Parhar <np@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r239344 - in head/sys: dev/cxgbe dev/cxgbe/common dev/cxgbe/tom modules/cxgbe/tom
Message-ID:  <201208170049.q7H0nUQc093196@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: np
Date: Fri Aug 17 00:49:29 2012
New Revision: 239344
URL: http://svn.freebsd.org/changeset/base/239344

Log:
  Support for TCP DDP (Direct Data Placement) in the T4 TOE module.
  
  Basically, this is automatic rx zero copy when feasible.  TCP payload is
  DMA'd directly into the userspace buffer described by the uio submitted
  in soreceive by an application.
  
  - Works with sockets that are being handled by the TCP offload engine
    of a T4 chip (you need t4_tom.ko module loaded after cxgbe, and an
    "ifconfig +toe" on the cxgbe interface).
  - Does not require any modification to the application.
  - Not enabled by default.  Use hw.t4nex.<X>.toe.ddp="1" to enable it.

Added:
  head/sys/dev/cxgbe/tom/t4_ddp.c   (contents, props changed)
Modified:
  head/sys/dev/cxgbe/common/t4_hw.h
  head/sys/dev/cxgbe/common/t4_msg.h
  head/sys/dev/cxgbe/offload.h
  head/sys/dev/cxgbe/tom/t4_connect.c
  head/sys/dev/cxgbe/tom/t4_cpl_io.c
  head/sys/dev/cxgbe/tom/t4_listen.c
  head/sys/dev/cxgbe/tom/t4_tom.c
  head/sys/dev/cxgbe/tom/t4_tom.h
  head/sys/modules/cxgbe/tom/Makefile

Modified: head/sys/dev/cxgbe/common/t4_hw.h
==============================================================================
--- head/sys/dev/cxgbe/common/t4_hw.h	Thu Aug 16 23:59:29 2012	(r239343)
+++ head/sys/dev/cxgbe/common/t4_hw.h	Fri Aug 17 00:49:29 2012	(r239344)
@@ -161,10 +161,12 @@ struct pagepod {
 #define S_PPOD_TAG    6
 #define M_PPOD_TAG    0xFFFFFF
 #define V_PPOD_TAG(x) ((x) << S_PPOD_TAG)
+#define G_PPOD_TAG(x) (((x) >> S_PPOD_TAG) & M_PPOD_TAG)
 
 #define S_PPOD_PGSZ    30
 #define M_PPOD_PGSZ    0x3
 #define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
+#define G_PPOD_PGSZ(x) (((x) >> S_PPOD_PGSZ) & M_PPOD_PGSZ)
 
 #define S_PPOD_TID    32
 #define M_PPOD_TID    0xFFFFFF

Modified: head/sys/dev/cxgbe/common/t4_msg.h
==============================================================================
--- head/sys/dev/cxgbe/common/t4_msg.h	Thu Aug 16 23:59:29 2012	(r239343)
+++ head/sys/dev/cxgbe/common/t4_msg.h	Fri Aug 17 00:49:29 2012	(r239344)
@@ -792,6 +792,14 @@ struct cpl_set_tcb_field {
 	__be64 val;
 };
 
+struct cpl_set_tcb_field_core {
+	union opcode_tid ot;
+	__be16 reply_ctrl;
+	__be16 word_cookie;
+	__be64 mask;
+	__be64 val;
+};
+
 /* cpl_set_tcb_field.word_cookie fields */
 #define S_WORD    0
 #define M_WORD    0x1F
@@ -1376,6 +1384,11 @@ struct cpl_rx_data_ack {
 	__be32 credit_dack;
 };
 
+struct cpl_rx_data_ack_core {
+	union opcode_tid ot;
+	__be32 credit_dack;
+};
+
 /* cpl_rx_data_ack.ack_seq fields */
 #define S_RX_CREDITS    0
 #define M_RX_CREDITS    0x3FFFFFF

Modified: head/sys/dev/cxgbe/offload.h
==============================================================================
--- head/sys/dev/cxgbe/offload.h	Thu Aug 16 23:59:29 2012	(r239343)
+++ head/sys/dev/cxgbe/offload.h	Fri Aug 17 00:49:29 2012	(r239344)
@@ -31,13 +31,16 @@
 #ifndef __T4_OFFLOAD_H__
 #define __T4_OFFLOAD_H__
 
-#define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \
-	(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
-	(w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
+#define INIT_ULPTX_WRH(w, wrlen, atomic, tid) do { \
+	(w)->wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \
+	(w)->wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
 			       V_FW_WR_FLOWID(tid)); \
-	(w)->wr.wr_lo = cpu_to_be64(0); \
+	(w)->wr_lo = cpu_to_be64(0); \
 } while (0)
 
+#define INIT_ULPTX_WR(w, wrlen, atomic, tid) \
+    INIT_ULPTX_WRH(&((w)->wr), wrlen, atomic, tid)
+
 #define INIT_TP_WR(w, tid) do { \
 	(w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | \
                               V_FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \

Modified: head/sys/dev/cxgbe/tom/t4_connect.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_connect.c	Thu Aug 16 23:59:29 2012	(r239343)
+++ head/sys/dev/cxgbe/tom/t4_connect.c	Fri Aug 17 00:49:29 2012	(r239344)
@@ -247,10 +247,14 @@ calc_opt2a(struct socket *so)
 	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
 	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
 
+#ifdef USE_DDP_RX_FLOW_CONTROL
+	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+		opt2 |= F_RX_FC_VALID | F_RX_FC_DDP;
+#endif
+
 	return (htobe32(opt2));
 }
 
-
 void
 t4_init_connect_cpl_handlers(struct adapter *sc)
 {
@@ -320,7 +324,10 @@ t4_connect(struct toedev *tod, struct so
 
 	toep->tid = atid;
 	toep->l2te = e;
-	toep->ulp_mode = ULP_MODE_NONE;
+	if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0)
+		set_tcpddp_ulp_mode(toep);
+	else
+		toep->ulp_mode = ULP_MODE_NONE;
 	SOCKBUF_LOCK(&so->so_rcv);
 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
 	toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);

Modified: head/sys/dev/cxgbe/tom/t4_cpl_io.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c	Thu Aug 16 23:59:29 2012	(r239343)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c	Fri Aug 17 00:49:29 2012	(r239344)
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
+#include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
@@ -299,12 +300,14 @@ make_established(struct toepcb *toep, ui
 }
 
 static int
-send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits)
+send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	struct wrqe *wr;
 	struct cpl_rx_data_ack *req;
 	uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 
+	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
+
 	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
 	if (wr == NULL)
 		return (0);
@@ -323,25 +326,28 @@ t4_rcvd(struct toedev *tod, struct tcpcb
 	struct adapter *sc = tod->tod_softc;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
-	struct sockbuf *so_rcv = &so->so_rcv;
+	struct sockbuf *sb = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
-	int must_send;
+	int credits;
 
 	INP_WLOCK_ASSERT(inp);
 
-	SOCKBUF_LOCK(so_rcv);
-	KASSERT(toep->enqueued >= so_rcv->sb_cc,
-	    ("%s: so_rcv->sb_cc > enqueued", __func__));
-	toep->rx_credits += toep->enqueued - so_rcv->sb_cc;
-	toep->enqueued = so_rcv->sb_cc;
-	SOCKBUF_UNLOCK(so_rcv);
-
-	must_send = toep->rx_credits + 16384 >= tp->rcv_wnd;
-	if (must_send || toep->rx_credits >= 15 * 1024) {
-		int credits;
+	SOCKBUF_LOCK(sb);
+	KASSERT(toep->sb_cc >= sb->sb_cc,
+	    ("%s: sb %p has more data (%d) than last time (%d).",
+	    __func__, sb, sb->sb_cc, toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sb->sb_cc;
+	toep->sb_cc = sb->sb_cc;
+	credits = toep->rx_credits;
+	SOCKBUF_UNLOCK(sb);
+
+	if (credits > 0 &&
+	    (credits + 16384 >= tp->rcv_wnd || credits >= 15 * 1024)) {
 
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
+		credits = send_rx_credits(sc, toep, credits);
+		SOCKBUF_LOCK(sb);
 		toep->rx_credits -= credits;
+		SOCKBUF_UNLOCK(sb);
 		tp->rcv_wnd += credits;
 		tp->rcv_adv += credits;
 	}
@@ -537,7 +543,8 @@ t4_push_frames(struct adapter *sc, struc
 	KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT),
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
-	if (toep->ulp_mode != ULP_MODE_NONE)
+	if (__predict_false(toep->ulp_mode != ULP_MODE_NONE &&
+	    toep->ulp_mode != ULP_MODE_TCPDDP))
 		CXGBE_UNIMPLEMENTED("ulp_mode");
 
 	/*
@@ -765,7 +772,8 @@ do_peer_close(struct sge_iq *iq, const s
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
-	struct socket *so = NULL;
+	struct socket *so;
+	struct sockbuf *sb;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
@@ -785,10 +793,35 @@ do_peer_close(struct sge_iq *iq, const s
 	if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN))
 		goto done;
 
+	tp->rcv_nxt++;	/* FIN */
+
 	so = inp->inp_socket;
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+	if (__predict_false(toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) {
+		m = m_get(M_NOWAIT, MT_DATA);
+		if (m == NULL)
+			CXGBE_UNIMPLEMENTED("mbuf alloc failure");
+
+		m->m_len = be32toh(cpl->rcv_nxt) - tp->rcv_nxt;
+		m->m_flags |= M_DDP;	/* Data is already where it should be */
+		m->m_data = "nothing to see here";
+		tp->rcv_nxt = be32toh(cpl->rcv_nxt);
+
+		toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE);
+
+		KASSERT(toep->sb_cc >= sb->sb_cc,
+		    ("%s: sb %p has more data (%d) than last time (%d).",
+		    __func__, sb, sb->sb_cc, toep->sb_cc));
+		toep->rx_credits += toep->sb_cc - sb->sb_cc;
+#ifdef USE_DDP_RX_FLOW_CONTROL
+		toep->rx_credits -= m->m_len;	/* adjust for F_RX_FC_DDP */
+#endif
+		sbappendstream_locked(sb, m);
+		toep->sb_cc = sb->sb_cc;
+	}
+	socantrcvmore_locked(so);	/* unlocks the sockbuf */
 
-	socantrcvmore(so);
-	tp->rcv_nxt++;	/* FIN */
 	KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 	    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 	    be32toh(cpl->rcv_nxt)));
@@ -1046,7 +1079,8 @@ do_rx_data(struct sge_iq *iq, const stru
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
-	struct sockbuf *so_rcv;
+	struct sockbuf *sb;
+	int len;
 
 	if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) {
 		/*
@@ -1064,11 +1098,12 @@ do_rx_data(struct sge_iq *iq, const stru
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
+	len = m->m_pkthdr.len;
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
-		    __func__, tid, m->m_pkthdr.len, inp->inp_flags);
+		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
@@ -1084,21 +1119,20 @@ do_rx_data(struct sge_iq *iq, const stru
 	}
 #endif
 
-	tp->rcv_nxt += m->m_pkthdr.len;
-	KASSERT(tp->rcv_wnd >= m->m_pkthdr.len,
-	    ("%s: negative window size", __func__));
-	tp->rcv_wnd -= m->m_pkthdr.len;
+	tp->rcv_nxt += len;
+	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
+	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	so = inp_inpcbtosocket(inp);
-	so_rcv = &so->so_rcv;
-	SOCKBUF_LOCK(so_rcv);
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
 
-	if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) {
+	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
-		    __func__, tid, m->m_pkthdr.len);
+		    __func__, tid, len);
 		m_freem(m);
-		SOCKBUF_UNLOCK(so_rcv);
+		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		INP_INFO_WLOCK(&V_tcbinfo);
@@ -1112,23 +1146,76 @@ do_rx_data(struct sge_iq *iq, const stru
 	}
 
 	/* receive buffer autosize */
-	if (so_rcv->sb_flags & SB_AUTOSIZE &&
+	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
-	    so_rcv->sb_hiwat < V_tcp_autorcvbuf_max &&
-	    m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) {
-		unsigned int hiwat = so_rcv->sb_hiwat;
+	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
+	    len > (sbspace(sb) / 8 * 7)) {
+		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
-		if (!sbreserve_locked(so_rcv, newsize, so, NULL))
-			so_rcv->sb_flags &= ~SB_AUTOSIZE;
+		if (!sbreserve_locked(sb, newsize, so, NULL))
+			sb->sb_flags &= ~SB_AUTOSIZE;
 		else
 			toep->rx_credits += newsize - hiwat;
 	}
-	toep->enqueued += m->m_pkthdr.len;
-	sbappendstream_locked(so_rcv, m);
+
+	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
+		int changed = !(toep->ddp_flags & DDP_ON) ^ cpl->ddp_off;
+
+		if (changed) {
+			if (__predict_false(!(toep->ddp_flags & DDP_SC_REQ))) {
+				/* XXX: handle this if legitimate */
+				panic("%s: unexpected DDP state change %d",
+				    __func__, cpl->ddp_off);
+			}
+			toep->ddp_flags ^= DDP_ON | DDP_SC_REQ;
+		}
+
+		if ((toep->ddp_flags & DDP_OK) == 0 &&
+		    time_uptime >= toep->ddp_disabled + DDP_RETRY_WAIT) {
+			toep->ddp_score = DDP_LOW_SCORE;
+			toep->ddp_flags |= DDP_OK;
+			CTR3(KTR_CXGBE, "%s: tid %u DDP_OK @ %u",
+			    __func__, tid, time_uptime);
+		}
+
+		if (toep->ddp_flags & DDP_ON) {
+
+			/*
+			 * CPL_RX_DATA with DDP on can only be an indicate.  Ask
+			 * soreceive to post a buffer or disable DDP.  The
+			 * payload that arrived in this indicate is appended to
+			 * the socket buffer as usual.
+			 */
+
+#if 0
+			CTR5(KTR_CXGBE,
+			    "%s: tid %u (0x%x) DDP indicate (seq 0x%x, len %d)",
+			    __func__, tid, toep->flags, be32toh(cpl->seq), len);
+#endif
+			sb->sb_flags |= SB_DDP_INDICATE;
+		} else if ((toep->ddp_flags & (DDP_OK|DDP_SC_REQ)) == DDP_OK &&
+		    tp->rcv_wnd > DDP_RSVD_WIN && len >= sc->tt.ddp_thres) {
+
+			/*
+			 * DDP allowed but isn't on (and a request to switch it
+			 * on isn't pending either), and conditions are ripe for
+			 * it to work.  Switch it on.
+			 */
+
+			enable_ddp(sc, toep);
+		}
+	}
+
+	KASSERT(toep->sb_cc >= sb->sb_cc,
+	    ("%s: sb %p has more data (%d) than last time (%d).",
+	    __func__, sb, sb->sb_cc, toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sb->sb_cc;
+	sbappendstream_locked(sb, m);
+	toep->sb_cc = sb->sb_cc;
 	sorwakeup_locked(so);
-	SOCKBUF_UNLOCK_ASSERT(so_rcv);
+	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	return (0);

Added: head/sys/dev/cxgbe/tom/t4_ddp.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/dev/cxgbe/tom/t4_ddp.c	Fri Aug 17 00:49:29 2012	(r239344)
@@ -0,0 +1,1223 @@
+/*-
+ * Copyright (c) 2012 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/domain.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/tcp_var.h>
+#define TCPSTATES
+#include <netinet/tcp_fsm.h>
+#include <netinet/toecore.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+
+#ifdef TCP_OFFLOAD
+#include "common/common.h"
+#include "common/t4_msg.h"
+#include "common/t4_regs.h"
+#include "common/t4_tcb.h"
+#include "tom/t4_tom.h"
+
+#define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
+#define PPOD_SIZE	(PPOD_SZ(1))
+
+/* XXX: must match A_ULP_RX_TDDP_PSZ */ 
+static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6};
+
+#if 0
+static void
+t4_dump_tcb(struct adapter *sc, int tid)
+{
+	uint32_t tcb_base, off, i, j;
+
+	/* Dump TCB for the tid */
+	tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
+	t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2),
+	    tcb_base + tid * TCB_SIZE);
+	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2));
+	off = 0;
+	printf("\n");
+	for (i = 0; i < 4; i++) {
+		uint32_t buf[8];
+		for (j = 0; j < 8; j++, off += 4)
+			buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off));
+
+		printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
+		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
+		    buf[7]);
+	}
+}
+#endif
+
+#define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
+static int
+alloc_ppods(struct tom_data *td, int n, struct ppod_region *pr)
+{
+	int ppod;
+
+	KASSERT(n > 0, ("%s: nonsense allocation (%d)", __func__, n));
+
+	mtx_lock(&td->ppod_lock);
+	if (n > td->nppods_free) {
+		mtx_unlock(&td->ppod_lock);
+		return (-1);
+	}
+
+	if (td->nppods_free_head >= n) {
+		td->nppods_free_head -= n;
+		ppod = td->nppods_free_head;
+		TAILQ_INSERT_HEAD(&td->ppods, pr, link);
+	} else {
+		struct ppod_region *p;
+
+		ppod = td->nppods_free_head;
+		TAILQ_FOREACH(p, &td->ppods, link) {
+			ppod += p->used + p->free;
+			if (n <= p->free) {
+				ppod -= n;
+				p->free -= n;
+				TAILQ_INSERT_AFTER(&td->ppods, p, pr, link);
+				goto allocated;
+			}
+		}
+
+		if (__predict_false(ppod != td->nppods)) {
+			panic("%s: ppods TAILQ (%p) corrupt."
+			    "  At %d instead of %d at the end of the queue.",
+			    __func__, &td->ppods, ppod, td->nppods);
+		}
+
+		mtx_unlock(&td->ppod_lock);
+		return (-1);
+	}
+
+allocated:
+	pr->used = n;
+	pr->free = 0;
+	td->nppods_free -= n;
+	mtx_unlock(&td->ppod_lock);
+
+	return (ppod);
+}
+
+static void
+free_ppods(struct tom_data *td, struct ppod_region *pr)
+{
+	struct ppod_region *p;
+
+	KASSERT(pr->used > 0, ("%s: nonsense free (%d)", __func__, pr->used));
+
+	mtx_lock(&td->ppod_lock);
+	p = TAILQ_PREV(pr, ppod_head, link);
+	if (p != NULL)
+		p->free += pr->used + pr->free;
+	else
+		td->nppods_free_head += pr->used + pr->free;
+	td->nppods_free += pr->used;
+	KASSERT(td->nppods_free <= td->nppods,
+	    ("%s: nppods_free (%d) > nppods (%d).  %d freed this time.",
+	    __func__, td->nppods_free, td->nppods, pr->used));
+	TAILQ_REMOVE(&td->ppods, pr, link);
+	mtx_unlock(&td->ppod_lock);
+}
+
+static inline int
+pages_to_nppods(int npages, int ddp_pgsz)
+{
+	int nsegs = npages * PAGE_SIZE / ddp_pgsz;
+
+	return (howmany(nsegs, PPOD_PAGES));
+}
+
+static void
+free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
+{
+
+	if (db == NULL)
+		return;
+
+	if (db->pages)
+		free(db->pages, M_CXGBE);
+
+	if (db->nppods > 0)
+		free_ppods(td, &db->ppod_region);
+
+	free(db, M_CXGBE);
+}
+
+void
+release_ddp_resources(struct toepcb *toep)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(toep->db); i++) {
+		if (toep->db[i] != NULL) {
+			free_ddp_buffer(toep->td, toep->db[i]);
+			toep->db[i] = NULL;
+		}
+	}
+}
+
+/* SET_TCB_FIELD sent as a ULP command looks like this */
+#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
+    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
+
+/* RX_DATA_ACK sent as a ULP command looks like this */
+#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
+    sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
+
+static inline void *
+mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
+    uint64_t word, uint64_t mask, uint64_t val)
+{
+	struct ulptx_idata *ulpsc;
+	struct cpl_set_tcb_field_core *req;
+
+	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
+	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
+
+	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
+	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
+	ulpsc->len = htobe32(sizeof(*req));
+
+	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
+	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
+	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
+	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
+	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
+        req->mask = htobe64(mask);
+        req->val = htobe64(val);
+
+	ulpsc = (struct ulptx_idata *)(req + 1);
+	if (LEN__SET_TCB_FIELD_ULP % 16) {
+		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
+		ulpsc->len = htobe32(0);
+		return (ulpsc + 1);
+	}
+	return (ulpsc);
+}
+
+static inline void *
+mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
+{
+	struct ulptx_idata *ulpsc;
+	struct cpl_rx_data_ack_core *req;
+
+	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
+	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
+
+	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
+	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
+	ulpsc->len = htobe32(sizeof(*req));
+
+	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
+	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
+	req->credit_dack = htobe32(F_RX_MODULATE_RX);
+
+	ulpsc = (struct ulptx_idata *)(req + 1);
+	if (LEN__RX_DATA_ACK_ULP % 16) {
+		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
+		ulpsc->len = htobe32(0);
+		return (ulpsc + 1);
+	}
+	return (ulpsc);
+}
+
+static inline uint64_t
+select_ddp_flags(struct socket *so, int flags, int db_idx)
+{
+	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
+	int waitall = flags & MSG_WAITALL;
+	int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);
+
+	KASSERT(db_idx == 0 || db_idx == 1,
+	    ("%s: bad DDP buffer index %d", __func__, db_idx));
+
+	if (db_idx == 0) {
+		ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
+		if (waitall)
+			ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
+		else if (nb)
+			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
+		else
+			ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
+	} else {
+		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
+		if (waitall)
+			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
+		else if (nb)
+			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
+		else
+			ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
+	}
+
+	return (ddp_flags);
+}
+
+static struct wrqe *
+mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
+    int offset, uint64_t ddp_flags)
+{
+	struct ddp_buffer *db = toep->db[db_idx];
+	struct wrqe *wr;
+	struct work_request_hdr *wrh;
+	struct ulp_txpkt *ulpmc;
+	int len;
+
+	KASSERT(db_idx == 0 || db_idx == 1,
+	    ("%s: bad DDP buffer index %d", __func__, db_idx));
+
+	/*
+	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
+	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
+	 *
+	 * The work request header is 16B and always ends at a 16B boundary.
+	 * The ULPTX master commands that follow must all end at 16B boundaries
+	 * too so we round up the size to 16.
+	 */
+	len = sizeof(*wrh) + 3 * roundup(LEN__SET_TCB_FIELD_ULP, 16) +
+	    roundup(LEN__RX_DATA_ACK_ULP, 16);
+
+	wr = alloc_wrqe(len, toep->ctrlq);
+	if (wr == NULL)
+		return (NULL);
+	wrh = wrtod(wr);
+	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
+	ulpmc = (struct ulp_txpkt *)(wrh + 1);
+
+	/* Write the buffer's tag */
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
+	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
+	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
+	    V_TCB_RX_DDP_BUF0_TAG(db->tag));
+
+	/* Update the current offset in the DDP buffer and its total length */
+	if (db_idx == 0)
+		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
+		    W_TCB_RX_DDP_BUF0_OFFSET,
+		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
+		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
+		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
+		    V_TCB_RX_DDP_BUF0_LEN(db->len));
+	else
+		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
+		    W_TCB_RX_DDP_BUF1_OFFSET,
+		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
+		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
+		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
+		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));
+
+	/* Update DDP flags */
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
+	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
+	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
+	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
+	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);
+
+	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
+	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
+
+	return (wr);
+}
+
+static void
+discourage_ddp(struct toepcb *toep)
+{
+
+	if (toep->ddp_score && --toep->ddp_score == 0) {
+		toep->ddp_flags &= ~DDP_OK;
+		toep->ddp_disabled = time_uptime;
+		CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
+		    __func__, toep->tid, time_uptime);
+	}
+}
+
+static int
+handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
+{
+	uint32_t report = be32toh(ddp_report);
+	unsigned int db_flag;
+	struct inpcb *inp = toep->inp;
+	struct tcpcb *tp;
+	struct socket *so;
+	struct sockbuf *sb;
+	struct mbuf *m;
+
+	db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
+
+	if (__predict_false(!(report & F_DDP_INV)))
+		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
+
+	INP_WLOCK(inp);
+	so = inp_inpcbtosocket(inp);
+	sb = &so->so_rcv;
+	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
+
+		/*
+		 * XXX: think a bit more.
+		 * tcpcb probably gone, but socket should still be around
+		 * because we always wait for DDP completion in soreceive no
+		 * matter what.  Just wake it up and let it clean up.
+		 */
+
+		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
+		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
+		SOCKBUF_LOCK(sb);
+		goto wakeup;
+	}
+
+	tp = intotcpcb(inp);
+	len += be32toh(rcv_nxt) - tp->rcv_nxt;
+	tp->rcv_nxt += len;
+	tp->t_rcvtime = ticks;
+#ifndef USE_DDP_RX_FLOW_CONTROL
+	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
+	tp->rcv_wnd -= len;
+#endif
+
+	m = m_get(M_NOWAIT, MT_DATA);
+	if (m == NULL)
+		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
+	m->m_len = len;
+	m->m_flags |= M_DDP;	/* Data is already where it should be */
+	m->m_data = "nothing to see here";
+
+	SOCKBUF_LOCK(sb);
+	if (report & F_DDP_BUF_COMPLETE)
+		toep->ddp_score = DDP_HIGH_SCORE;
+	else
+		discourage_ddp(toep);
+
+	KASSERT(toep->sb_cc >= sb->sb_cc,
+	    ("%s: sb %p has more data (%d) than last time (%d).",
+	    __func__, sb, sb->sb_cc, toep->sb_cc));
+	toep->rx_credits += toep->sb_cc - sb->sb_cc;
+#ifdef USE_DDP_RX_FLOW_CONTROL
+	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
+#endif
+	sbappendstream_locked(sb, m);
+	toep->sb_cc = sb->sb_cc;
+wakeup:
+	KASSERT(toep->ddp_flags & db_flag,
+	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
+	    __func__, toep, toep->ddp_flags, report));
+	toep->ddp_flags &= ~db_flag;
+	sorwakeup_locked(so);
+	SOCKBUF_UNLOCK_ASSERT(sb);
+
+	INP_WUNLOCK(inp);
+	return (0);
+}
+
+#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
+	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
+	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
+	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
+
+static int
+do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	uint32_t vld;
+	struct toepcb *toep = lookup_tid(sc, tid);
+
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
+	KASSERT(!toepcb_flag(toep, TPF_SYNQE),
+	    ("%s: toep %p claims to be a synq entry", __func__, toep));
+
+	vld = be32toh(cpl->ddpvld);
+	if (__predict_false(vld & DDP_ERR)) {
+		panic("%s: DDP error 0x%x (tid %d, toep %p)",
+		    __func__, vld, tid, toep);
+	}
+
+	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
+
+	return (0);
+}
+
+static int
+do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
+    struct mbuf *m)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
+	unsigned int tid = GET_TID(cpl);
+	struct toepcb *toep = lookup_tid(sc, tid);
+
+	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
+	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
+	KASSERT(!toepcb_flag(toep, TPF_SYNQE),
+	    ("%s: toep %p claims to be a synq entry", __func__, toep));
+
+	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
+
+	return (0);
+}
+
+void
+enable_ddp(struct adapter *sc, struct toepcb *toep)
+{
+
+	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
+	    ("%s: toep %p has bad ddp_flags 0x%x",
+	    __func__, toep, toep->ddp_flags));
+
+	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
+	    __func__, toep->tid, time_uptime);
+
+	toep->ddp_flags |= DDP_SC_REQ;
+	t4_set_tcb_field(sc, toep, W_TCB_RX_DDP_FLAGS,
+	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
+	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
+	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
+	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1));
+	t4_set_tcb_field(sc, toep, W_TCB_T_FLAGS,
+	    V_TF_RCV_COALESCE_ENABLE(1), 0);
+}
+
+static inline void
+disable_ddp(struct adapter *sc, struct toepcb *toep)
+{
+
+	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
+	    ("%s: toep %p has bad ddp_flags 0x%x",
+	    __func__, toep, toep->ddp_flags));
+
+	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
+	    __func__, toep->tid, time_uptime);
+
+	toep->ddp_flags |= DDP_SC_REQ;
+	t4_set_tcb_field(sc, toep, W_TCB_T_FLAGS,
+	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1));
+	t4_set_tcb_field(sc, toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
+	    V_TF_DDP_OFF(1));
+}
+
+static int
+hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
+{
+	struct vm_map *map;
+	struct iovec *iov;
+	vm_offset_t start, end;
+	vm_page_t *pp;
+	int n;
+
+	KASSERT(uio->uio_iovcnt == 1,
+	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
+	KASSERT(uio->uio_td->td_proc == curproc,
+	    ("%s: uio proc (%p) is not curproc (%p)",
+	    __func__, uio->uio_td->td_proc, curproc));
+
+	map = &curproc->p_vmspace->vm_map;
+	iov = &uio->uio_iov[0];
+	start = trunc_page((uintptr_t)iov->iov_base);
+	end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
+	n = howmany(end - start, PAGE_SIZE);
+
+	if (end - start > MAX_DDP_BUFFER_SIZE)
+		return (E2BIG);
+
+	pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
+	if (pp == NULL)
+		return (ENOMEM);
+
+	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
+	    iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
+		free(pp, M_CXGBE);
+		return (EFAULT);
+	}
+
+	*ppages = pp;
+	*pnpages = n;
+
+	return (0);
+}
+
+static int
+bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
+{
+	int i;
+
+	if (db == NULL || db->npages != npages || db->offset != offset ||
+	    db->len != len)
+		return (1);
+
+	for (i = 0; i < npages; i++) {
+		if (pages[i]->phys_addr != db->pages[i]->phys_addr)
+			return (1);
+	}
+
+	return (0);
+}
+
+static int
+calculate_hcf(int n1, int n2)
+{
+	int a, b, t;
+
+	if (n1 <= n2) {
+		a = n1;
+		b = n2;
+	} else {
+		a = n2;
+		b = n1;
+	}
+
+	while (a != 0) {
+		t = a;
+		a = b % a;
+		b = t;
+	}
+
+	return (b);
+}
+
+static struct ddp_buffer *
+alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset,
+    int len)
+{
+	int i, hcf, seglen, idx, ppod, nppods;
+	struct ddp_buffer *db;
+
+	/*
+	 * The DDP page size is unrelated to the VM page size.  We combine

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201208170049.q7H0nUQc093196>