Date: Wed, 23 Jan 2008 05:09:00 GMT
Message-Id: <200801230509.m0N590i0021831@repoman.freebsd.org>
From: Kip Macy <kmacy@freebsd.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 133915 for review

http://perforce.freebsd.org/chv.cgi?CH=133915

Change 133915 by kmacy@kmacy:storage:toehead on 2008/01/23 05:08:06

	basic zero-copy send and some infrastructure for DDP

Affected files ...

.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#3 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_tom.c#2 edit

Differences ...
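[Editor's note] As background for the page-pod setup in t3_setup_ppods() and the page holding in cxgb_hold_iovec_pages() below, here is a small, self-contained C sketch of the page and page-pod arithmetic this change relies on. It is illustrative only: buf_npages() is a hypothetical helper that mirrors the rounding done in cxgb_hold_iovec_pages(), and the PPOD_PAGES / NUM_SENTINEL_PPODS values are assumed for the example rather than taken from the driver headers.

/*
 * Illustrative sketch only -- not part of this change.  buf_npages() is a
 * hypothetical helper mirroring the rounding in cxgb_hold_iovec_pages();
 * PPOD_PAGES and NUM_SENTINEL_PPODS are assumed example values.
 */
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define PAGE_MASK               (PAGE_SIZE - 1)
#define PG_FRAME                (~PAGE_MASK)
#define PPOD_PAGES              4       /* pages per page pod (assumed) */
#define NUM_SENTINEL_PPODS      1       /* trailing sentinel pods (assumed) */

/* Pages spanned by the user buffer [addr, addr + len). */
static unsigned int
buf_npages(unsigned long addr, size_t len)
{
        unsigned long start = addr & PG_FRAME;
        unsigned long end = (addr + len + PAGE_MASK) & PG_FRAME;

        return ((end - start) >> PAGE_SHIFT);
}

/* Page pods needed for a page count, same formula as pages2ppods() below. */
static unsigned int
pages2ppods(unsigned int pages)
{
        return ((pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS);
}

int
main(void)
{
        unsigned long addr = 0x20001234UL;      /* page-unaligned user address */
        size_t len = 65536;                     /* 64KB buffer */
        unsigned int npages = buf_npages(addr, len);

        printf("%u pages -> %u page pods\n", npages, pages2ppods(npages));
        return (0);
}

For the 64KB buffer above, the unaligned start adds one extra page, so 17 pages are held and, at 4 pages per pod plus one sentinel, 6 page pods would be programmed.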
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#3 (text+ko) ==== @@ -3324,6 +3324,53 @@ SOCK_UNLOCK(lctx->lso); } + +int +t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color) +{ + unsigned int i, j, pidx; + struct pagepod *p; + struct mbuf *m; + struct ulp_mem_io *req; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + unsigned int tid = toep->tp_tid; + const struct tom_data *td = TOM_DATA(TOE_DEV(so)); + unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; + + for (i = 0; i < nppods; ++i) { + m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); + req = mtod(m, struct ulp_mem_io *); + m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | + V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); + + p = (struct pagepod *)(req + 1); + if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { + p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); + p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | + V_PPOD_COLOR(color)); + p->pp_max_offset = htonl(maxoff); + p->pp_page_offset = htonl(pg_off); + p->pp_rsvd = 0; + for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) + p->pp_addr[j] = pidx < gl->dgl_nelem ? + htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; + } else + p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ + send_or_defer(so, tp, m, 0); + ppod_addr += PPOD_SIZE; + } + return (0); +} + + void t3_init_wr_tab(unsigned int wr_len) { ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#2 (text+ko) ==== @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -94,13 +95,13 @@ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); -#ifdef notyet #define VM_HOLD_WRITEABLE 0x1 -static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, - int *count, int flags); -#endif +static int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags); static void vm_fault_unhold_pages(vm_page_t *m, int count); #define TMP_IOV_MAX 16 +#ifndef PG_FRAME +#define PG_FRAME ~PAGE_MASK +#endif void t3_init_socket_ops(void) @@ -123,7 +124,6 @@ #endif } - struct cxgb_dma_info { size_t cdi_mapped; int cdi_nsegs; @@ -182,21 +182,72 @@ } } +static void +cxgb_zero_copy_free(void *cl, void *arg) +{ + struct mbuf_vec *mv; + struct mbuf *m = (struct mbuf *)cl; + + mv = mtomv(m); + /* + * Physical addresses, don't try to free should be unheld separately from sbdrop + * + */ + mv->mv_count = 0; + m_free_iovec(m, m->m_type); +} -static void -cxgb_zero_copy_free(void *cl, void *arg) {} static int cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) { + struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + int err, i, count, totcount, maxcount, totbytes, npages, curbytes; + uint64_t start, end; + vm_page_t *mp; + + totbytes = totcount = 0; + maxcount = *held; + + mp = m; + for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { + count = maxcount - totcount; + + start = (uint64_t)iov->iov_base; + end = (uint64_t)((caddr_t)iov->iov_base + iov->iov_len); + start &= PG_FRAME; + end += PAGE_MASK; + end &= PG_FRAME; + npages = (end - start) >> PAGE_SHIFT; + + count = min(count, 
npages); - return (EINVAL); + err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); + if (err) { + vm_fault_unhold_pages(m, totcount); + return (err); + } + mp += count; + totcount += count; + curbytes = iov->iov_len; + if (count != npages) + curbytes = count*PAGE_SIZE - (((uint64_t)iov->iov_base)&PAGE_MASK); + totbytes += curbytes; + } + uio->uio_resid -= totbytes; + + return (0); } static void -cxgb_wait_dma_completion(struct toepcb *tp) +cxgb_wait_dma_completion(struct toepcb *toep) { + struct mtx *lock; + lock = &toep->tp_tp->t_inpcb->inp_mtx; + INP_LOCK(toep->tp_tp->t_inpcb); + cv_wait_unlock(&toep->tp_cv, lock); } static int @@ -233,7 +284,13 @@ mi_collapse_sge(mi, segs); *m = m0; - + + /* + * This appears to be a no-op at the moment + * as busdma is all or nothing need to make + * sure the tag values are large enough + * + */ if (cdi.cdi_mapped < uio->uio_resid) { uio->uio_resid -= cdi.cdi_mapped; } else @@ -304,10 +361,11 @@ } uio->uio_resid -= m->m_pkthdr.len; sent += m->m_pkthdr.len; - sbappend_locked(&so->so_snd, m); + sbappend(&so->so_snd, m); t3_push_frames(so, TRUE); iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); } + /* * Wait for pending I/O to be DMA'd to the card * @@ -454,51 +512,45 @@ * - hold all pages * - return number of pages in count */ -#ifdef notyet static int -vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags) +vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags) { vm_offset_t start, va; vm_paddr_t pa; int pageslen, faults, rv; - + struct thread *td; vm_map_t map; pmap_t pmap; vm_page_t m, *pages; vm_prot_t prot; - - start = addr & ~PAGE_MASK; - pageslen = roundup2(addr + len, PAGE_SIZE); - if (*count < (pageslen >> PAGE_SHIFT)) - return (EFBIG); - *count = pageslen >> PAGE_SHIFT; /* * Check that virtual address range is legal * This check is somewhat bogus as on some architectures kernel * and user do not share VA - however, it appears that all FreeBSD * architectures define it */ - if (addr + len > VM_MAXUSER_ADDRESS) + pageslen = count * PAGE_SIZE; + if (addr + pageslen > VM_MAXUSER_ADDRESS) return (EFAULT); - + td = curthread; map = &td->td_proc->p_vmspace->vm_map; pmap = &td->td_proc->p_vmspace->vm_pmap; pages = mp; prot = (flags & VM_HOLD_WRITEABLE) ? 
VM_PROT_WRITE : VM_PROT_READ; - bzero(pages, sizeof(vm_page_t *) * (*count)); + bzero(pages, sizeof(vm_page_t *) * count); retry: - + /* * First optimistically assume that all pages are resident (and R/W if for write) * if so just mark pages as held (and dirty if for write) and return */ vm_page_lock_queues(); - for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) { + for (start = addr, pages = mp, faults = 0, va = addr; va < start + pageslen; va += PAGE_SIZE, pages++) { /* * Assure that we only hold the page once */ @@ -514,9 +566,10 @@ faults++; continue; } + *pages = m; - if (flags & VM_HOLD_WRITEABLE) - vm_page_dirty(m); + if (flags & VM_HOLD_WRITEABLE) + vm_page_dirty(m); } } vm_page_unlock_queues(); @@ -546,13 +599,15 @@ error: vm_page_lock_queues(); - for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++) + for (pages = mp, + va = start; va < start + pageslen; + va += PAGE_SIZE, + pages++) if (*pages) vm_page_unhold(*pages); vm_page_unlock_queues(); return (EFAULT); } -#endif static void vm_fault_unhold_pages(vm_page_t *mp, int count) @@ -567,3 +622,276 @@ vm_page_unlock_queues(); } +/** + * t3_pin_pages - pin a user memory range and prepare it for DDP + * @addr - the starting address + * @len - the length of the range + * @newgl - contains the pages and physical addresses of the pinned range + * @gl - an existing gather list, may be %NULL + * + * Pins the pages in the user-space memory range [addr, addr + len) and + * maps them for DMA. Returns a gather list with the pinned pages and + * their physical addresses. If @gl is non NULL the pages it describes + * are compared against the pages for [addr, addr + len), and if the + * existing gather list already covers the range a new list is not + * allocated. Returns 0 on success, or a negative errno. On success if + * a new gather list was allocated it is returned in @newgl. + */ +static int +t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, unsigned long addr, + size_t len, struct ddp_gather_list **newgl, + const struct ddp_gather_list *gl) +{ + int i, err; + size_t pg_off; + unsigned int npages; + struct ddp_gather_list *p; + + if (addr >= VM_MAXUSER_ADDRESS) + return (EINVAL); +#if 0 + if (!access_ok(VERIFY_WRITE, addr, len)) + return (EFAULT); +#endif + pg_off = addr & ~PAGE_MASK; + npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), + M_DEVBUF, M_NOWAIT); + if (!p) + return (ENOMEM); + + + err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE); + + if (err) + goto free_gl; + + if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && + gl->dgl_length >= len) { + for (i = 0; i < npages; ++i) + if (p->dgl_pages[i] != gl->dgl_pages[i]) + goto different_gl; + err = 0; + goto unpin; + } + +different_gl: + p->dgl_length = len; + p->dgl_offset = pg_off; + p->dgl_nelem = npages; +#if 0 + p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, + PAGE_SIZE - pg_off, + PCI_DMA_FROMDEVICE) - pg_off; + for (i = 1; i < npages; ++i) + p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); +#endif + + *newgl = p; + return 0; +unpin: + vm_fault_unhold_pages(p->dgl_pages, npages); + +free_gl: + free(p, M_DEVBUF); + *newgl = NULL; + return err; +} + +/* + * Return the # of page pods needed to accommodate a # of pages. 
+ */ +static inline unsigned int +pages2ppods(unsigned int pages) +{ + return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; +} + +/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ +#define MAX_PPODS 64U + +/* + * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in + * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we + * try to allocate enough page pods to accommodate the whole buffer, subject to + * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page + * pods before failing entirely. + */ +static int +alloc_buf1_ppods(struct socket *so, struct ddp_state *p, + unsigned long addr, unsigned int len) +{ + int tag, npages, nppods; + struct tom_data *d = TOM_DATA(TOE_DEV(so)); + + npages = ((addr & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = min(pages2ppods(npages), MAX_PPODS); +#ifdef notyet + nppods = ALIGN(nppods, PPOD_CLUSTER_SIZE); +#endif + tag = t3_alloc_ppods(d, nppods); + if (tag < 0 && nppods > PPOD_CLUSTER_SIZE) { + nppods = PPOD_CLUSTER_SIZE; + tag = t3_alloc_ppods(d, nppods); + } + if (tag < 0) + return (ENOMEM); + + p->ubuf_nppods = nppods; + p->ubuf_tag = tag; +#if NUM_DDP_KBUF == 1 + t3_set_ddp_tag(so, 1, tag << 6); +#endif + return 0; +} + +/* + * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush + * won't block indefinitely if there's nothing to place (which should be rare). + */ +#define UBUF_OFFSET 1 + +static __inline unsigned long +select_ddp_flags(const struct socket *so, int buf_idx, + int nonblock, int rcv_flags) +{ + if (buf_idx == 1) { + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE(1) | + V_TF_DDP_PUSH_DISABLE_1(1); + if (nonblock) + return V_TF_DDP_BUF1_FLUSH(1); + + return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so), + ddp_push_wait)); + } + + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE(1) | + V_TF_DDP_PUSH_DISABLE_0(1); + if (nonblock) + return V_TF_DDP_BUF0_FLUSH(1); + + return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait)); +} + +/** + * setup_iovec_ppods - setup HW page pods for a user iovec + * @sk: the associated socket + * @iov: the iovec + * @oft: additional bytes to map before the start of the buffer + * + * Pins a user iovec and sets up HW page pods for DDP into it. We allocate + * page pods for user buffers on the first call per socket. Afterwards we + * limit the buffer length to whatever the existing page pods can accommodate. + * Returns a negative error code or the length of the mapped buffer. + * + * The current implementation handles iovecs with only one entry. 
+ */ +static int +setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft) +{ + int err; + unsigned int len; + struct ddp_gather_list *gl = NULL; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + unsigned long addr = (unsigned long)iov->iov_base - oft; + + if (__predict_false(!p->ubuf_nppods)) { + err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft); + if (err) + return err; + } + + len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; + len -= addr & ~PAGE_MASK; + if (len > M_TCB_RX_DDP_BUF0_LEN) + len = M_TCB_RX_DDP_BUF0_LEN; + len = min(len, sototcpcb(so)->rcv_wnd - 32768); + len = min(len, iov->iov_len + oft); + + if (len <= p->kbuf[0]->dgl_length) + return -EINVAL; + + err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); + if (err < 0) + return err; + if (gl) { + if (p->ubuf) + t3_free_ddp_gl(p->pdev, p->ubuf); + p->ubuf = gl; + t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, + gl->dgl_offset, 0); + } + return len; +} + +#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE(1) | \ + V_TF_DDP_BUF1_FLUSH(1) | \ + V_TF_DDP_BUF0_FLUSH(1) | \ + V_TF_DDP_PUSH_DISABLE_1(1) | \ + V_TF_DDP_PUSH_DISABLE_0(1) | \ + V_TF_DDP_INDICATE_OUT(1)) + +/* + * Post a user buffer as an overlay on top of the current kernel buffer. + */ +int +t3_overlay_ubuf(struct socket *so, const struct iovec *iov, + int nonblock, int rcv_flags, int modulate, int post_kbuf) +{ + int len, ubuf_idx; + unsigned long flags; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + + if (!p || !p->pdev) + return -1; + + len = setup_iovec_ppods(so, iov, 0); + if (len < 0) + return len; + + ubuf_idx = p->kbuf_idx; + p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; + /* Use existing offset */ + /* Don't need to update .gl, user buffer isn't copied. */ + p->cur_buf = ubuf_idx; + + flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags); + + if (post_kbuf) { + struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; + + dbs->cur_offset = 0; + dbs->flags = 0; + dbs->gl = p->kbuf[ubuf_idx ^ 1]; + p->kbuf_idx ^= 1; + flags |= p->kbuf_idx ? + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); + } + + if (ubuf_idx == 0) { + t3_overlay_ddpbuf(so, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, + len); + t3_setup_ddpbufs(so, 0, 0, p->kbuf[1]->dgl_length, 0, + flags, + OVERLAY_MASK | flags, 1); + } else { + t3_overlay_ddpbuf(so, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, + len); + t3_setup_ddpbufs(so, p->kbuf[0]->dgl_length, 0, 0, 0, + flags, + OVERLAY_MASK | flags, 1); + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " + " kbuf_idx %d", + p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); +#endif + return 0; +} ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#2 (text+ko) ==== @@ -1,4 +1,3 @@ - /************************************************************************** Copyright (c) 2007, Chelsio Inc. 
@@ -96,8 +95,7 @@ unsigned int dgl_length; unsigned int dgl_offset; unsigned int dgl_nelem; - vm_page_t *dgl_pages; - bus_addr_t dgl_phys_addr[0]; + vm_page_t dgl_pages[0]; }; struct ddp_buf_state { @@ -161,9 +159,6 @@ int t3_alloc_ppods(struct tom_data *td, unsigned int n); void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl); -int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len, - struct ddp_gather_list **newgl, - const struct ddp_gather_list *gl); int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to, int len); //void t3_repost_kbuf(struct socket *so, int modulate, int activate); ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_tom.c#2 (text+ko) ==== @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,7 @@ { bzero(toep, sizeof(*toep)); toep->tp_refcount = 1; + cv_init(&toep->tp_cv, "toep cv"); } void
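[Editor's note] On the new condition variable: the cxgb_tom.c hunk above initializes toep->tp_cv, and cxgb_wait_dma_completion() in cxgb_cpl_socket.c sleeps on it with cv_wait_unlock() under the inpcb lock, so the zero-copy send path can block until the card has DMA'd the pending data; the completion path is presumably expected to cv_signal() the same variable. As a rough userland analogue of that wait/signal pairing -- illustrative only, written against pthreads rather than the kernel cv(9)/mtx(9) API, with names invented for the example -- the pattern is:

/* Userland sketch of the wait-for-DMA-completion pattern (pthreads analogue). */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  dma_done_cv = PTHREAD_COND_INITIALIZER;
static int dma_done;

/* Completion side: what a DMA-done handler would do. */
static void *
dma_complete(void *arg)
{
        sleep(1);                       /* stand-in for the hardware finishing */
        pthread_mutex_lock(&lock);
        dma_done = 1;
        pthread_cond_signal(&dma_done_cv);
        pthread_mutex_unlock(&lock);
        return (NULL);
}

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, dma_complete, NULL);

        /* Waiter side: analogue of cxgb_wait_dma_completion(). */
        pthread_mutex_lock(&lock);
        while (!dma_done)
                pthread_cond_wait(&dma_done_cv, &lock);
        pthread_mutex_unlock(&lock);

        printf("DMA completion observed; pages can now be unheld\n");
        pthread_join(t, NULL);
        return (0);
}

Unlike pthread_cond_wait(), the cv_wait_unlock() variant used in the patch returns with the lock already released, which is why cxgb_wait_dma_completion() does not drop the inpcb lock afterwards.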