Date: Wed, 23 Jan 2008 05:09:00 GMT
Message-Id: <200801230509.m0N590i0021831@repoman.freebsd.org>
From: Kip Macy <kmacy@freebsd.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 133915 for review

http://perforce.freebsd.org/chv.cgi?CH=133915

Change 133915 by kmacy@kmacy:storage:toehead on 2008/01/23 05:08:06

	basic zero-copy send and some infrastructure for DDP

Affected files ...

.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#3 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_tom.c#2 edit

Differences ...
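[Editor's note] As background for the page-pod setup in t3_setup_ppods() and the page holding in cxgb_hold_iovec_pages() below, here is a small, self-contained C sketch of the page and page-pod arithmetic this change relies on. It is illustrative only: buf_npages() is a hypothetical helper that mirrors the rounding done in cxgb_hold_iovec_pages(), and the PPOD_PAGES / NUM_SENTINEL_PPODS values are assumed for the example rather than taken from the driver headers.

/*
 * Illustrative sketch only -- not part of this change.  buf_npages() is a
 * hypothetical helper mirroring the rounding in cxgb_hold_iovec_pages();
 * PPOD_PAGES and NUM_SENTINEL_PPODS are assumed example values.
 */
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define PAGE_MASK               (PAGE_SIZE - 1)
#define PG_FRAME                (~PAGE_MASK)
#define PPOD_PAGES              4       /* pages per page pod (assumed) */
#define NUM_SENTINEL_PPODS      1       /* trailing sentinel pods (assumed) */

/* Pages spanned by the user buffer [addr, addr + len). */
static unsigned int
buf_npages(unsigned long addr, size_t len)
{
        unsigned long start = addr & PG_FRAME;
        unsigned long end = (addr + len + PAGE_MASK) & PG_FRAME;

        return ((end - start) >> PAGE_SHIFT);
}

/* Page pods needed for a page count, same formula as pages2ppods() below. */
static unsigned int
pages2ppods(unsigned int pages)
{
        return ((pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS);
}

int
main(void)
{
        unsigned long addr = 0x20001234UL;      /* page-unaligned user address */
        size_t len = 65536;                     /* 64KB buffer */
        unsigned int npages = buf_npages(addr, len);

        printf("%u pages -> %u page pods\n", npages, pages2ppods(npages));
        return (0);
}

For the 64KB buffer above, the unaligned start adds one extra page, so 17 pages are held and, at 4 pages per pod plus one sentinel, 6 page pods would be programmed.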
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#3 (text+ko) ==== @@ -3324,6 +3324,53 @@ SOCK_UNLOCK(lctx->lso); } + +int +t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, + unsigned int nppods, unsigned int tag, unsigned int maxoff, + unsigned int pg_off, unsigned int color) +{ + unsigned int i, j, pidx; + struct pagepod *p; + struct mbuf *m; + struct ulp_mem_io *req; + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + unsigned int tid = toep->tp_tid; + const struct tom_data *td = TOM_DATA(TOE_DEV(so)); + unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; + + for (i = 0; i < nppods; ++i) { + m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); + m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); + req = mtod(m, struct ulp_mem_io *); + m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); + req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | + V_ULPTX_CMD(ULP_MEM_WRITE)); + req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | + V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); + + p = (struct pagepod *)(req + 1); + if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { + p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); + p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | + V_PPOD_COLOR(color)); + p->pp_max_offset = htonl(maxoff); + p->pp_page_offset = htonl(pg_off); + p->pp_rsvd = 0; + for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) + p->pp_addr[j] = pidx < gl->dgl_nelem ? + htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; + } else + p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ + send_or_defer(so, tp, m, 0); + ppod_addr += PPOD_SIZE; + } + return (0); +} + + void t3_init_wr_tab(unsigned int wr_len) { ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#2 (text+ko) ==== @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -94,13 +95,13 @@ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); -#ifdef notyet #define VM_HOLD_WRITEABLE 0x1 -static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, - int *count, int flags); -#endif +static int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags); static void vm_fault_unhold_pages(vm_page_t *m, int count); #define TMP_IOV_MAX 16 +#ifndef PG_FRAME +#define PG_FRAME ~PAGE_MASK +#endif void t3_init_socket_ops(void) @@ -123,7 +124,6 @@ #endif } - struct cxgb_dma_info { size_t cdi_mapped; int cdi_nsegs; @@ -182,21 +182,72 @@ } } +static void +cxgb_zero_copy_free(void *cl, void *arg) +{ + struct mbuf_vec *mv; + struct mbuf *m = (struct mbuf *)cl; + + mv = mtomv(m); + /* + * Physical addresses, don't try to free should be unheld separately from sbdrop + * + */ + mv->mv_count = 0; + m_free_iovec(m, m->m_type); +} -static void -cxgb_zero_copy_free(void *cl, void *arg) {} static int cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags) { + struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + int err, i, count, totcount, maxcount, totbytes, npages, curbytes; + uint64_t start, end; + vm_page_t *mp; + + totbytes = totcount = 0; + maxcount = *held; + + mp = m; + for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) { + count = maxcount - totcount; + + start = (uint64_t)iov->iov_base; + end = (uint64_t)((caddr_t)iov->iov_base + iov->iov_len); + start &= PG_FRAME; + end += PAGE_MASK; + end &= PG_FRAME; + npages = (end - start) >> PAGE_SHIFT; + + count = min(count, 
npages); - return (EINVAL); + err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags); + if (err) { + vm_fault_unhold_pages(m, totcount); + return (err); + } + mp += count; + totcount += count; + curbytes = iov->iov_len; + if (count != npages) + curbytes = count*PAGE_SIZE - (((uint64_t)iov->iov_base)&PAGE_MASK); + totbytes += curbytes; + } + uio->uio_resid -= totbytes; + + return (0); } static void -cxgb_wait_dma_completion(struct toepcb *tp) +cxgb_wait_dma_completion(struct toepcb *toep) { + struct mtx *lock; + lock = &toep->tp_tp->t_inpcb->inp_mtx; + INP_LOCK(toep->tp_tp->t_inpcb); + cv_wait_unlock(&toep->tp_cv, lock); } static int @@ -233,7 +284,13 @@ mi_collapse_sge(mi, segs); *m = m0; - + + /* + * This appears to be a no-op at the moment + * as busdma is all or nothing need to make + * sure the tag values are large enough + * + */ if (cdi.cdi_mapped < uio->uio_resid) { uio->uio_resid -= cdi.cdi_mapped; } else @@ -304,10 +361,11 @@ } uio->uio_resid -= m->m_pkthdr.len; sent += m->m_pkthdr.len; - sbappend_locked(&so->so_snd, m); + sbappend(&so->so_snd, m); t3_push_frames(so, TRUE); iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid); } + /* * Wait for pending I/O to be DMA'd to the card * @@ -454,51 +512,45 @@ * - hold all pages * - return number of pages in count */ -#ifdef notyet static int -vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags) +vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags) { vm_offset_t start, va; vm_paddr_t pa; int pageslen, faults, rv; - + struct thread *td; vm_map_t map; pmap_t pmap; vm_page_t m, *pages; vm_prot_t prot; - - start = addr & ~PAGE_MASK; - pageslen = roundup2(addr + len, PAGE_SIZE); - if (*count < (pageslen >> PAGE_SHIFT)) - return (EFBIG); - *count = pageslen >> PAGE_SHIFT; /* * Check that virtual address range is legal * This check is somewhat bogus as on some architectures kernel * and user do not share VA - however, it appears that all FreeBSD * architectures define it */ - if (addr + len > VM_MAXUSER_ADDRESS) + pageslen = count * PAGE_SIZE; + if (addr + pageslen > VM_MAXUSER_ADDRESS) return (EFAULT); - + td = curthread; map = &td->td_proc->p_vmspace->vm_map; pmap = &td->td_proc->p_vmspace->vm_pmap; pages = mp; prot = (flags & VM_HOLD_WRITEABLE) ? 
VM_PROT_WRITE : VM_PROT_READ; - bzero(pages, sizeof(vm_page_t *) * (*count)); + bzero(pages, sizeof(vm_page_t *) * count); retry: - + /* * First optimistically assume that all pages are resident (and R/W if for write) * if so just mark pages as held (and dirty if for write) and return */ vm_page_lock_queues(); - for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) { + for (start = addr, pages = mp, faults = 0, va = addr; va < start + pageslen; va += PAGE_SIZE, pages++) { /* * Assure that we only hold the page once */ @@ -514,9 +566,10 @@ faults++; continue; } + *pages = m; - if (flags & VM_HOLD_WRITEABLE) - vm_page_dirty(m); + if (flags & VM_HOLD_WRITEABLE) + vm_page_dirty(m); } } vm_page_unlock_queues(); @@ -546,13 +599,15 @@ error: vm_page_lock_queues(); - for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++) + for (pages = mp, + va = start; va < start + pageslen; + va += PAGE_SIZE, + pages++) if (*pages) vm_page_unhold(*pages); vm_page_unlock_queues(); return (EFAULT); } -#endif static void vm_fault_unhold_pages(vm_page_t *mp, int count) @@ -567,3 +622,276 @@ vm_page_unlock_queues(); } +/** + * t3_pin_pages - pin a user memory range and prepare it for DDP + * @addr - the starting address + * @len - the length of the range + * @newgl - contains the pages and physical addresses of the pinned range + * @gl - an existing gather list, may be %NULL + * + * Pins the pages in the user-space memory range [addr, addr + len) and + * maps them for DMA. Returns a gather list with the pinned pages and + * their physical addresses. If @gl is non NULL the pages it describes + * are compared against the pages for [addr, addr + len), and if the + * existing gather list already covers the range a new list is not + * allocated. Returns 0 on success, or a negative errno. On success if + * a new gather list was allocated it is returned in @newgl. + */ +static int +t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, unsigned long addr, + size_t len, struct ddp_gather_list **newgl, + const struct ddp_gather_list *gl) +{ + int i, err; + size_t pg_off; + unsigned int npages; + struct ddp_gather_list *p; + + if (addr >= VM_MAXUSER_ADDRESS) + return (EINVAL); +#if 0 + if (!access_ok(VERIFY_WRITE, addr, len)) + return (EFAULT); +#endif + pg_off = addr & ~PAGE_MASK; + npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), + M_DEVBUF, M_NOWAIT); + if (!p) + return (ENOMEM); + + + err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE); + + if (err) + goto free_gl; + + if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && + gl->dgl_length >= len) { + for (i = 0; i < npages; ++i) + if (p->dgl_pages[i] != gl->dgl_pages[i]) + goto different_gl; + err = 0; + goto unpin; + } + +different_gl: + p->dgl_length = len; + p->dgl_offset = pg_off; + p->dgl_nelem = npages; +#if 0 + p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, + PAGE_SIZE - pg_off, + PCI_DMA_FROMDEVICE) - pg_off; + for (i = 1; i < npages; ++i) + p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); +#endif + + *newgl = p; + return 0; +unpin: + vm_fault_unhold_pages(p->dgl_pages, npages); + +free_gl: + free(p, M_DEVBUF); + *newgl = NULL; + return err; +} + +/* + * Return the # of page pods needed to accommodate a # of pages. 
+ */ +static inline unsigned int +pages2ppods(unsigned int pages) +{ + return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; +} + +/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ +#define MAX_PPODS 64U + +/* + * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in + * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we + * try to allocate enough page pods to accommodate the whole buffer, subject to + * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page + * pods before failing entirely. + */ +static int +alloc_buf1_ppods(struct socket *so, struct ddp_state *p, + unsigned long addr, unsigned int len) +{ + int tag, npages, nppods; + struct tom_data *d = TOM_DATA(TOE_DEV(so)); + + npages = ((addr & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + nppods = min(pages2ppods(npages), MAX_PPODS); +#ifdef notyet + nppods = ALIGN(nppods, PPOD_CLUSTER_SIZE); +#endif + tag = t3_alloc_ppods(d, nppods); + if (tag < 0 && nppods > PPOD_CLUSTER_SIZE) { + nppods = PPOD_CLUSTER_SIZE; + tag = t3_alloc_ppods(d, nppods); + } + if (tag < 0) + return (ENOMEM); + + p->ubuf_nppods = nppods; + p->ubuf_tag = tag; +#if NUM_DDP_KBUF == 1 + t3_set_ddp_tag(so, 1, tag << 6); +#endif + return 0; +} + +/* + * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush + * won't block indefinitely if there's nothing to place (which should be rare). + */ +#define UBUF_OFFSET 1 + +static __inline unsigned long +select_ddp_flags(const struct socket *so, int buf_idx, + int nonblock, int rcv_flags) +{ + if (buf_idx == 1) { + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE(1) | + V_TF_DDP_PUSH_DISABLE_1(1); + if (nonblock) + return V_TF_DDP_BUF1_FLUSH(1); + + return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so), + ddp_push_wait)); + } + + if (__predict_false(rcv_flags & MSG_WAITALL)) + return V_TF_DDP_PSH_NO_INVALIDATE(1) | + V_TF_DDP_PUSH_DISABLE_0(1); + if (nonblock) + return V_TF_DDP_BUF0_FLUSH(1); + + return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait)); +} + +/** + * setup_iovec_ppods - setup HW page pods for a user iovec + * @sk: the associated socket + * @iov: the iovec + * @oft: additional bytes to map before the start of the buffer + * + * Pins a user iovec and sets up HW page pods for DDP into it. We allocate + * page pods for user buffers on the first call per socket. Afterwards we + * limit the buffer length to whatever the existing page pods can accommodate. + * Returns a negative error code or the length of the mapped buffer. + * + * The current implementation handles iovecs with only one entry. 
+ */ +static int +setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft) +{ + int err; + unsigned int len; + struct ddp_gather_list *gl = NULL; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + unsigned long addr = (unsigned long)iov->iov_base - oft; + + if (__predict_false(!p->ubuf_nppods)) { + err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft); + if (err) + return err; + } + + len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; + len -= addr & ~PAGE_MASK; + if (len > M_TCB_RX_DDP_BUF0_LEN) + len = M_TCB_RX_DDP_BUF0_LEN; + len = min(len, sototcpcb(so)->rcv_wnd - 32768); + len = min(len, iov->iov_len + oft); + + if (len <= p->kbuf[0]->dgl_length) + return -EINVAL; + + err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); + if (err < 0) + return err; + if (gl) { + if (p->ubuf) + t3_free_ddp_gl(p->pdev, p->ubuf); + p->ubuf = gl; + t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, + gl->dgl_offset, 0); + } + return len; +} + +#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE(1) | \ + V_TF_DDP_BUF1_FLUSH(1) | \ + V_TF_DDP_BUF0_FLUSH(1) | \ + V_TF_DDP_PUSH_DISABLE_1(1) | \ + V_TF_DDP_PUSH_DISABLE_0(1) | \ + V_TF_DDP_INDICATE_OUT(1)) + +/* + * Post a user buffer as an overlay on top of the current kernel buffer. + */ +int +t3_overlay_ubuf(struct socket *so, const struct iovec *iov, + int nonblock, int rcv_flags, int modulate, int post_kbuf) +{ + int len, ubuf_idx; + unsigned long flags; + struct toepcb *toep = sototcpcb(so)->t_toe; + struct ddp_state *p = &toep->tp_ddp_state; + + if (!p || !p->pdev) + return -1; + + len = setup_iovec_ppods(so, iov, 0); + if (len < 0) + return len; + + ubuf_idx = p->kbuf_idx; + p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; + /* Use existing offset */ + /* Don't need to update .gl, user buffer isn't copied. */ + p->cur_buf = ubuf_idx; + + flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags); + + if (post_kbuf) { + struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; + + dbs->cur_offset = 0; + dbs->flags = 0; + dbs->gl = p->kbuf[ubuf_idx ^ 1]; + p->kbuf_idx ^= 1; + flags |= p->kbuf_idx ? + V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); + } + + if (ubuf_idx == 0) { + t3_overlay_ddpbuf(so, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, + len); + t3_setup_ddpbufs(so, 0, 0, p->kbuf[1]->dgl_length, 0, + flags, + OVERLAY_MASK | flags, 1); + } else { + t3_overlay_ddpbuf(so, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, + len); + t3_setup_ddpbufs(so, p->kbuf[0]->dgl_length, 0, 0, 0, + flags, + OVERLAY_MASK | flags, 1); + } +#ifdef T3_TRACE + T3_TRACE5(TIDTB(sk), + "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " + " kbuf_idx %d", + p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); +#endif + return 0; +} ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#2 (text+ko) ==== @@ -1,4 +1,3 @@ - /************************************************************************** Copyright (c) 2007, Chelsio Inc. 
@@ -96,8 +95,7 @@ unsigned int dgl_length; unsigned int dgl_offset; unsigned int dgl_nelem; - vm_page_t *dgl_pages; - bus_addr_t dgl_phys_addr[0]; + vm_page_t dgl_pages[0]; }; struct ddp_buf_state { @@ -161,9 +159,6 @@ int t3_alloc_ppods(struct tom_data *td, unsigned int n); void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl); -int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len, - struct ddp_gather_list **newgl, - const struct ddp_gather_list *gl); int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to, int len); //void t3_repost_kbuf(struct socket *so, int modulate, int activate); ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_tom.c#2 (text+ko) ==== @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,7 @@ { bzero(toep, sizeof(*toep)); toep->tp_refcount = 1; + cv_init(&toep->tp_cv, "toep cv"); } void
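[Editor's note] On the new condition variable: the cxgb_tom.c hunk above initializes toep->tp_cv, and cxgb_wait_dma_completion() in cxgb_cpl_socket.c sleeps on it with cv_wait_unlock() under the inpcb lock, so the zero-copy send path can block until the card has DMA'd the pending data; the completion path is presumably expected to cv_signal() the same variable. As a rough userland analogue of that wait/signal pairing -- illustrative only, written against pthreads rather than the kernel cv(9)/mtx(9) API, with names invented for the example -- the pattern is:

/* Userland sketch of the wait-for-DMA-completion pattern (pthreads analogue). */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  dma_done_cv = PTHREAD_COND_INITIALIZER;
static int dma_done;

/* Completion side: what a DMA-done handler would do. */
static void *
dma_complete(void *arg)
{
        sleep(1);                       /* stand-in for the hardware finishing */
        pthread_mutex_lock(&lock);
        dma_done = 1;
        pthread_cond_signal(&dma_done_cv);
        pthread_mutex_unlock(&lock);
        return (NULL);
}

int
main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, dma_complete, NULL);

        /* Waiter side: analogue of cxgb_wait_dma_completion(). */
        pthread_mutex_lock(&lock);
        while (!dma_done)
                pthread_cond_wait(&dma_done_cv, &lock);
        pthread_mutex_unlock(&lock);

        printf("DMA completion observed; pages can now be unheld\n");
        pthread_join(t, NULL);
        return (0);
}

Unlike pthread_cond_wait(), the cv_wait_unlock() variant used in the patch returns with the lock already released, which is why cxgb_wait_dma_completion() does not drop the inpcb lock afterwards.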