From owner-p4-projects@FreeBSD.ORG Fri Jan 25 07:08:25 2008 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id DCD4216A421; Fri, 25 Jan 2008 07:08:24 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 8860416A41A for ; Fri, 25 Jan 2008 07:08:24 +0000 (UTC) (envelope-from kmacy@freebsd.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id 7759E13C474 for ; Fri, 25 Jan 2008 07:08:24 +0000 (UTC) (envelope-from kmacy@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.1/8.14.1) with ESMTP id m0P78Ov5085019 for ; Fri, 25 Jan 2008 07:08:24 GMT (envelope-from kmacy@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.1/8.14.1/Submit) id m0P78Odc085016 for perforce@freebsd.org; Fri, 25 Jan 2008 07:08:24 GMT (envelope-from kmacy@freebsd.org) Date: Fri, 25 Jan 2008 07:08:24 GMT Message-Id: <200801250708.m0P78Odc085016@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to kmacy@freebsd.org using -f From: Kip Macy To: Perforce Change Reviews Cc: Subject: PERFORCE change 134065 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 25 Jan 2008 07:08:25 -0000 http://perforce.freebsd.org/chv.cgi?CH=134065 Change 134065 by kmacy@kmacy:storage:toehead on 2008/01/25 07:07:45 split ddp support and vm functions in to separate files Affected files ... .. //depot/projects/toehead/sys/dev/cxgb/sys/mbufq.h#3 edit .. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#7 edit .. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#6 edit .. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#6 edit .. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h#3 edit .. //depot/projects/toehead/sys/modules/cxgb/tom/Makefile#2 edit Differences ... ==== //depot/projects/toehead/sys/dev/cxgb/sys/mbufq.h#3 (text+ko) ==== @@ -103,7 +103,7 @@ } static __inline struct mbuf * -mbufq_peek(struct mbuf_head *l) +mbufq_peek(const struct mbuf_head *l) { return (l->head); } ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#7 (text+ko) ==== @@ -490,12 +490,10 @@ m = m_gethdr_nofail(sizeof(*req)); -#ifdef notyet - req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req)); + req = mtod(m, struct cpl_rx_data_ack *); req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); -#else - req = mtod(m, struct cpl_rx_data_ack *); -#endif + m->m_pkthdr.len = m->m_len = sizeof(*req); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | V_RX_DACK_MODE(1) | @@ -1163,6 +1161,20 @@ return V_FLAVORS_VALID(flv_valid) | V_CONG_CONTROL_FLAVOR(flv_valid ? 
TOM_TUNABLE(dev, cong_alg) : 0); } + +#if DEBUG_WR > 1 +static int +count_pending_wrs(const struct toepcb *toep) +{ + const struct mbuf *m; + int n = 0; + + wr_queue_walk(toep, m) + n += m->m_pkthdr.csum_data; + return (n); +} +#endif + #if 0 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) #endif @@ -2132,7 +2144,7 @@ unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ if (tp->rcv_nxt == rcv_nxt) /* no data */ - return 0; + return (0); if (__predict_false(so_no_receive(so))) { handle_excess_rx(toep, m); @@ -2189,7 +2201,6 @@ goto out; } - if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { keep = handle_peer_close_data(so, m); if (keep < 0) @@ -3533,7 +3544,6 @@ if (__predict_false(credits < p->m_pkthdr.csum_data)) { #if DEBUG_WR > 1 struct tx_data_wr *w = cplhdr(p); -#ifdef notyet log(LOG_ERR, "TID %u got %u WR credits, need %u, len %u, " "main body %u, frags %u, seq # %u, ACK una %u," @@ -3541,8 +3551,7 @@ toep->tp_tid, credits, p->csum, p->len, p->len - p->data_len, skb_shinfo(p)->nr_frags, ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), - WR_AVAIL(tp), count_pending_wrs(tp) - credits); -#endif + toep->tp_wr_avail, count_pending_wrs(tp) - credits); #endif p->m_pkthdr.csum_data -= credits; break; @@ -3880,11 +3889,9 @@ wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); m = m_gethdr_nofail(wrlen); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); -#ifdef notyet - wr = (struct work_request_hdr *)__skb_put(skb, wrlen); -#else wr = mtod(m, struct work_request_hdr *); -#endif + m->m_pkthdr.len = m->m_len = wrlen; + /* Set the ATOMIC flag to make sure that TP processes the following * CPLs in an atomic manner and no wire segments can be interleaved. */ @@ -3955,12 +3962,10 @@ (modulate ? sizeof(struct cpl_rx_data_ack) : 0); m = m_gethdr_nofail(wrlen); m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); -#ifdef notyet - wr = (struct work_request_hdr *)__skb_put(skb, wrlen); + wr = mtod(m, struct work_request_hdr *); wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); -#else - wr = mtod(m, struct work_request_hdr *); -#endif + m->m_pkthdr.len = m->m_len = wrlen; + req = (struct cpl_set_tcb_field *)(wr + 1); if (len0) { /* program buffer 0 offset and length */ mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#6 (text+ko) ==== @@ -73,6 +73,7 @@ #include #include #include + #include #include #include @@ -86,6 +87,7 @@ #include #include #include +#include static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, @@ -95,9 +97,6 @@ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); -#define VM_HOLD_WRITEABLE 0x1 -static int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags); -static void vm_fault_unhold_pages(vm_page_t *m, int count); #define TMP_IOV_MAX 16 #ifndef PG_FRAME #define PG_FRAME ~PAGE_MASK @@ -240,6 +239,29 @@ return (0); } +/* + * Returns whether a connection should enable DDP. 
This happens when all of + * the following conditions are met: + * - the connection's ULP mode is DDP + * - DDP is not already enabled + * - the last receive was above the DDP threshold + * - receive buffers are in user space + * - receive side isn't shutdown (handled by caller) + * - the connection's receive window is big enough so that sizable buffers + * can be posted without closing the window in the middle of DDP (checked + * when the connection is offloaded) + */ +#ifdef notyet +static int +so_should_ddp(const struct toepcb *toep, int last_recv_len) +{ + return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.ubuf == NULL) && + last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && + toep->tp_tp->rcv_wnd > + (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); +} +#endif + static void cxgb_wait_dma_completion(struct toepcb *toep) { @@ -501,749 +523,3 @@ so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend; so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive; } - -/* - * This routine takes a user address range and does the following: - * - validate that the user has access to those pages (flags indicates read or write) - if not fail - * - validate that count is enough to hold range number of pages - if not fail - * - fault in any non-resident pages - * - if the user is doing a read force a write fault for any COWed pages - * - if the user is doing a read mark all pages as dirty - * - hold all pages - * - return number of pages in count - */ -static int -vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags) -{ - - vm_offset_t start, va; - vm_paddr_t pa; - int pageslen, faults, rv; - - struct thread *td; - vm_map_t map; - pmap_t pmap; - vm_page_t m, *pages; - vm_prot_t prot; - - /* - * Check that virtual address range is legal - * This check is somewhat bogus as on some architectures kernel - * and user do not share VA - however, it appears that all FreeBSD - * architectures define it - */ - pageslen = count * PAGE_SIZE; - if (addr + pageslen > VM_MAXUSER_ADDRESS) - return (EFAULT); - - td = curthread; - map = &td->td_proc->p_vmspace->vm_map; - pmap = &td->td_proc->p_vmspace->vm_pmap; - pages = mp; - - prot = (flags & VM_HOLD_WRITEABLE) ? 
VM_PROT_WRITE : VM_PROT_READ; - bzero(pages, sizeof(vm_page_t *) * count); -retry: - - /* - * First optimistically assume that all pages are resident (and R/W if for write) - * if so just mark pages as held (and dirty if for write) and return - */ - vm_page_lock_queues(); - for (start = addr, pages = mp, faults = 0, va = addr; va < start + pageslen; va += PAGE_SIZE, pages++) { - /* - * Assure that we only hold the page once - */ - if (*pages == NULL) { - /* - * page queue mutex is recursable so this is OK - * it would be really nice if we had an unlocked version of this so - * we were only acquiring the pmap lock 1 time as opposed to potentially - * many dozens of times - */ - m = pmap_extract_and_hold(pmap, va, prot); - if (m == NULL) { - faults++; - continue; - } - - *pages = m; - if (flags & VM_HOLD_WRITEABLE) - vm_page_dirty(m); - } - } - vm_page_unlock_queues(); - - if (faults == 0) - return (0); - /* - * Pages either have insufficient permissions or are not present - * trigger a fault where neccessary - * - */ - for (va = start; va < pageslen; va += PAGE_SIZE) { - m = NULL; - pa = pmap_extract(pmap, va); - rv = 0; - if (pa) - m = PHYS_TO_VM_PAGE(pa); - if (flags & VM_HOLD_WRITEABLE) { - if (m == NULL || (m->flags & PG_WRITEABLE) == 0) - rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); - } else if (m == NULL) - rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL); - if (rv) - goto error; - } - goto retry; - -error: - vm_page_lock_queues(); - for (pages = mp, - va = start; va < start + pageslen; - va += PAGE_SIZE, - pages++) - if (*pages) - vm_page_unhold(*pages); - vm_page_unlock_queues(); - return (EFAULT); -} - -static void -vm_fault_unhold_pages(vm_page_t *mp, int count) -{ - - KASSERT(count >= 0, ("negative count %d", count)); - vm_page_lock_queues(); - while (count--) { - vm_page_unhold(*mp); - mp++; - } - vm_page_unlock_queues(); -} - -/** - * t3_pin_pages - pin a user memory range and prepare it for DDP - * @addr - the starting address - * @len - the length of the range - * @newgl - contains the pages and physical addresses of the pinned range - * @gl - an existing gather list, may be %NULL - * - * Pins the pages in the user-space memory range [addr, addr + len) and - * maps them for DMA. Returns a gather list with the pinned pages and - * their physical addresses. If @gl is non NULL the pages it describes - * are compared against the pages for [addr, addr + len), and if the - * existing gather list already covers the range a new list is not - * allocated. Returns 0 on success, or a negative errno. On success if - * a new gather list was allocated it is returned in @newgl. 
- */ -static int -t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, unsigned long addr, - size_t len, struct ddp_gather_list **newgl, - const struct ddp_gather_list *gl) -{ - int i, err; - size_t pg_off; - unsigned int npages; - struct ddp_gather_list *p; - - if (addr >= VM_MAXUSER_ADDRESS) - return (EINVAL); -#if 0 - if (!access_ok(VERIFY_WRITE, addr, len)) - return (EFAULT); -#endif - pg_off = addr & ~PAGE_MASK; - npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *), - M_DEVBUF, M_NOWAIT); - if (!p) - return (ENOMEM); - - - err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE); - - if (err) - goto free_gl; - - if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages && - gl->dgl_length >= len) { - for (i = 0; i < npages; ++i) - if (p->dgl_pages[i] != gl->dgl_pages[i]) - goto different_gl; - err = 0; - goto unpin; - } - -different_gl: - p->dgl_length = len; - p->dgl_offset = pg_off; - p->dgl_nelem = npages; -#ifdef notyet - p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off, - PAGE_SIZE - pg_off, - PCI_DMA_FROMDEVICE) - pg_off; - for (i = 1; i < npages; ++i) - p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE, - PCI_DMA_FROMDEVICE); -#endif - - *newgl = p; - return 0; -unpin: - vm_fault_unhold_pages(p->dgl_pages, npages); - -free_gl: - free(p, M_DEVBUF); - *newgl = NULL; - return err; -} - -/* - * Return the # of page pods needed to accommodate a # of pages. - */ -static inline unsigned int -pages2ppods(unsigned int pages) -{ - return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS; -} - - -static void -unmap_ddp_gl(const struct ddp_gather_list *gl) -{ -#ifdef notyet - int i; - - if (!gl->nelem) - return; - - pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset, - PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE); - for (i = 1; i < gl->nelem; ++i) - pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE, - PCI_DMA_FROMDEVICE); - -#endif -} - -static void -ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty) -{ -#ifdef notyet - int i; - - for (i = 0; i < gl->nelem; ++i) { - if (dirty) - set_page_dirty_lock(gl->pages[i]); - put_page(gl->pages[i]); - } -#endif -} - -void -t3_free_ddp_gl(struct ddp_gather_list *gl) -{ - unmap_ddp_gl(gl); - ddp_gl_free_pages(gl, 0); - free(gl, M_DEVBUF); -} - -/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */ -#define MAX_PPODS 64U - -/* - * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in - * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we - * try to allocate enough page pods to accommodate the whole buffer, subject to - * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page - * pods before failing entirely. 
- */ -static int -alloc_buf1_ppods(struct socket *so, struct ddp_state *p, - unsigned long addr, unsigned int len) -{ - int tag, npages, nppods; - struct tom_data *d = TOM_DATA(TOE_DEV(so)); - - npages = ((addr & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - nppods = min(pages2ppods(npages), MAX_PPODS); -#ifdef notyet - nppods = ALIGN(nppods, PPOD_CLUSTER_SIZE); -#endif - tag = t3_alloc_ppods(d, nppods); - if (tag < 0 && nppods > PPOD_CLUSTER_SIZE) { - nppods = PPOD_CLUSTER_SIZE; - tag = t3_alloc_ppods(d, nppods); - } - if (tag < 0) - return (ENOMEM); - - p->ubuf_nppods = nppods; - p->ubuf_tag = tag; -#if NUM_DDP_KBUF == 1 - t3_set_ddp_tag(so, 1, tag << 6); -#endif - return 0; -} - - - -/* - * Reposts the kernel DDP buffer after it has been previously become full and - * invalidated. We just need to reset the offset and adjust the DDP flags. - * Conveniently, we can set the flags and the offset with a single message. - * Note that this function does not set the buffer length. Again conveniently - * our kernel buffer is of fixed size. If the length needs to be changed it - * needs to be done separately. - */ -static void -t3_repost_kbuf(struct socket *so, unsigned int bufidx, int modulate, - int activate) -{ - struct toepcb *toep = sototcpcb(so)->t_toe; - struct ddp_state *p = &toep->tp_ddp_state; - - p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset; - p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0; - p->buf_state[bufidx].gl = p->kbuf[bufidx]; - p->cur_buf = bufidx; - p->kbuf_idx = bufidx; - if (!bufidx) - t3_setup_ddpbufs(toep, 0, 0, 0, 0, - V_TF_DDP_PSH_NO_INVALIDATE(p->kbuf_noinval) | - V_TF_DDP_BUF0_VALID(1), - V_TF_DDP_PSH_NO_INVALIDATE(1) | V_TF_DDP_OFF(1) | - V_TF_DDP_BUF0_VALID(1) | - V_TF_DDP_ACTIVE_BUF(activate), modulate); - else - t3_setup_ddpbufs(toep, 0, 0, 0, 0, - V_TF_DDP_PSH_NO_INVALIDATE(p->kbuf_noinval) | - V_TF_DDP_BUF1_VALID(1) | - V_TF_DDP_ACTIVE_BUF(activate), - V_TF_DDP_PSH_NO_INVALIDATE(1) | V_TF_DDP_OFF(1) | - V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), - modulate); - -} - -/* - * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush - * won't block indefinitely if there's nothing to place (which should be rare). - */ -#define UBUF_OFFSET 1 - -static __inline unsigned long -select_ddp_flags(const struct socket *so, int buf_idx, - int nonblock, int rcv_flags) -{ - if (buf_idx == 1) { - if (__predict_false(rcv_flags & MSG_WAITALL)) - return V_TF_DDP_PSH_NO_INVALIDATE(1) | - V_TF_DDP_PUSH_DISABLE_1(1); - if (nonblock) - return V_TF_DDP_BUF1_FLUSH(1); - - return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so), - ddp_push_wait)); - } - - if (__predict_false(rcv_flags & MSG_WAITALL)) - return V_TF_DDP_PSH_NO_INVALIDATE(1) | - V_TF_DDP_PUSH_DISABLE_0(1); - if (nonblock) - return V_TF_DDP_BUF0_FLUSH(1); - - return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait)); -} - -/** - * setup_iovec_ppods - setup HW page pods for a user iovec - * @sk: the associated socket - * @iov: the iovec - * @oft: additional bytes to map before the start of the buffer - * - * Pins a user iovec and sets up HW page pods for DDP into it. We allocate - * page pods for user buffers on the first call per socket. Afterwards we - * limit the buffer length to whatever the existing page pods can accommodate. - * Returns a negative error code or the length of the mapped buffer. - * - * The current implementation handles iovecs with only one entry. 
- */ -static int -setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft) -{ - int err; - unsigned int len; - struct ddp_gather_list *gl = NULL; - struct toepcb *toep = sototcpcb(so)->t_toe; - struct ddp_state *p = &toep->tp_ddp_state; - unsigned long addr = (unsigned long)iov->iov_base - oft; - - if (__predict_false(!p->ubuf_nppods)) { - err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft); - if (err) - return err; - } - - len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE; - len -= addr & ~PAGE_MASK; - if (len > M_TCB_RX_DDP_BUF0_LEN) - len = M_TCB_RX_DDP_BUF0_LEN; - len = min(len, sototcpcb(so)->rcv_wnd - 32768); - len = min(len, iov->iov_len + oft); - - if (len <= p->kbuf[0]->dgl_length) - return -EINVAL; - - err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf); - if (err < 0) - return err; - if (gl) { - if (p->ubuf) - t3_free_ddp_gl(p->ubuf); - p->ubuf = gl; - t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len, - gl->dgl_offset, 0); - } - return len; -} - -#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE(1) | \ - V_TF_DDP_BUF1_FLUSH(1) | \ - V_TF_DDP_BUF0_FLUSH(1) | \ - V_TF_DDP_PUSH_DISABLE_1(1) | \ - V_TF_DDP_PUSH_DISABLE_0(1) | \ - V_TF_DDP_INDICATE_OUT(1)) - -/* - * Post a user buffer as an overlay on top of the current kernel buffer. - */ -int -t3_overlay_ubuf(struct socket *so, const struct iovec *iov, - int nonblock, int rcv_flags, int modulate, int post_kbuf) -{ - int len, ubuf_idx; - unsigned long flags; - struct toepcb *toep = sototcpcb(so)->t_toe; - struct ddp_state *p = &toep->tp_ddp_state; - - if (!p) - return -1; - - len = setup_iovec_ppods(so, iov, 0); - if (len < 0) - return len; - - ubuf_idx = p->kbuf_idx; - p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP; - /* Use existing offset */ - /* Don't need to update .gl, user buffer isn't copied. */ - p->cur_buf = ubuf_idx; - - flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags); - - if (post_kbuf) { - struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1]; - - dbs->cur_offset = 0; - dbs->flags = 0; - dbs->gl = p->kbuf[ubuf_idx ^ 1]; - p->kbuf_idx ^= 1; - flags |= p->kbuf_idx ? - V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) : - V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0); - } - - if (ubuf_idx == 0) { - t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6, - len); - t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0, - flags, - OVERLAY_MASK | flags, 1); - } else { - t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6, - len); - t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0, - flags, - OVERLAY_MASK | flags, 1); - } -#ifdef T3_TRACE - T3_TRACE5(TIDTB(so), - "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d " - " kbuf_idx %d", - p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx); -#endif - return 0; -} - - - -/* - * Returns whether a connection should enable DDP. 
This happens when all of - * the following conditions are met: - * - the connection's ULP mode is DDP - * - DDP is not already enabled - * - the last receive was above the DDP threshold - * - receive buffers are in user space - * - receive side isn't shutdown (handled by caller) - * - the connection's receive window is big enough so that sizable buffers - * can be posted without closing the window in the middle of DDP (checked - * when the connection is offloaded) - */ -#ifdef notyet -static int -so_should_ddp(const struct toepcb *toep, int last_recv_len) -{ - return toep->tp_ulp_mode == ULP_MODE_TCPDDP && !toep->tp_dpp_state.cur_buf && - last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) && - toep->tp_tp->rcv_wnd > - (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + - DDP_RSVD_WIN); -} - -static inline int -is_ddp(const struct mbuf *m) -{ - return (m->m_flags & M_DDP); -} - -static inline int -is_ddp_psh(const struct mbuf *m) -{ - return is_ddp(skb) && (m->m_pkthdr.csum_flags & DDP_BF_PSH); -} - -/* - * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the - * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a - * DDP buffer. - */ -static inline int -copy_data(const struct mbuf *m, int offset, struct iovec *to, int len) -{ - if (__predict_true(!is_ddp(m))) /* RX_DATA */ - return mbuf_copy_datagram_iovec(m, offset, to, len); - if (__predict_true(m->pkthdr.csum_flags & DDP_BF_NOCOPY)) { /* user DDP */ - to->iov_len -= len; - to->iov_base += len; - return 0; - } - return t3_ddp_copy(m, offset, to, len); /* kernel DDP */ -} - - -#endif -/* - * Clean up DDP state that needs to survive until socket close time, such as the - * DDP buffers. The buffers are already unmapped at this point as unmapping - * needs the PCI device and a socket may close long after the device is removed. - */ -void -t3_cleanup_ddp(struct socket *so) -{ - struct toepcb *toep = sototcpcb(so)->t_toe; - struct ddp_state *p = &toep->tp_ddp_state; - int idx; - - for (idx = 0; idx < NUM_DDP_KBUF; idx++) - if (p->kbuf[idx]) { - ddp_gl_free_pages(p->kbuf[idx], 0); - free(p->kbuf[idx], M_DEVBUF); - } - - if (p->ubuf) { - ddp_gl_free_pages(p->ubuf, 0); - free(p->ubuf, M_DEVBUF); - p->ubuf = NULL; - } - toep->tp_ulp_mode = 0; -} - -/* - * This is a companion to t3_cleanup_ddp() and releases the HW resources - * associated with a connection's DDP state, such as the page pods. - * It's called when HW is done with a connection. The rest of the state - * remains available until both HW and the app are done with the connection. - */ -void -t3_release_ddp_resources(struct toepcb *toep) -{ - struct ddp_state *p = &toep->tp_ddp_state; - struct tom_data *d = TOM_DATA(toep->tp_toedev); - int idx; - - for (idx = 0; idx < NUM_DDP_KBUF; idx++) { - t3_free_ppods(d, p->kbuf_tag[idx], - p->kbuf_nppods[idx]); - unmap_ddp_gl(p->kbuf[idx]); - } - - if (p->ubuf_nppods) { - t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods); - p->ubuf_nppods = 0; - } - if (p->ubuf) - unmap_ddp_gl(p->ubuf); - -} - -void -t3_post_kbuf(struct socket *so, int modulate) -{ - struct toepcb *toep = sototcpcb(so)->t_toe; - struct ddp_state *p = &toep->tp_ddp_state; - - t3_set_ddp_tag(so, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6); - t3_set_ddp_buf(so, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length); - t3_repost_kbuf(so, p->cur_buf, modulate, 1); - -#ifdef T3_TRACE - T3_TRACE1(TIDTB(so), - "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf); -#endif -} - -/* - * Prepare a socket for DDP. 
Must be called when the socket is known to be - * open. - */ -int -t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall) -{ - int err = ENOMEM; - unsigned int nppods, kbuf_pages, idx = 0; - struct toepcb *toep = sototcpcb(so)->t_toe; - struct ddp_state *p = &toep->tp_ddp_state; - struct tom_data *d = TOM_DATA(toep->tp_toedev); - - if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN) - return (EINVAL); - - kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - nppods = pages2ppods(kbuf_pages); - - p->kbuf_noinval = !!waitall; - - p->kbuf_tag[NUM_DDP_KBUF - 1] = -1; - for (idx = 0; idx < NUM_DDP_KBUF; idx++) { - p->kbuf[idx] = - malloc(sizeof (struct ddp_gather_list) + kbuf_pages * - sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO); - if (!p->kbuf[idx]) - goto err; - - p->kbuf_tag[idx] = t3_alloc_ppods(d, nppods); - if (p->kbuf_tag[idx] < 0) - goto err; - - p->kbuf_nppods[idx] = nppods; - p->kbuf[idx]->dgl_length = kbuf_size; - p->kbuf[idx]->dgl_offset = 0; - p->kbuf[idx]->dgl_nelem = kbuf_pages; -#ifdef notyet - p->kbuf[idx]->pages = - (struct page **)&p->kbuf[idx]->phys_addr[kbuf_pages]; - - - for (i = 0; i < kbuf_pages; ++i) { - - p->kbuf[idx]->pages[i] = alloc_page(sk->sk_allocation); - if (!p->kbuf[idx]->pages[i]) { - p->kbuf[idx]->nelem = i; - goto err; - } - - } - - for (i = 0; i < kbuf_pages; ++i) - p->kbuf[idx]->phys_addr[i] = - pci_map_page(p->pdev, p->kbuf[idx]->pages[i], - 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); -#endif - t3_setup_ppods(so, p->kbuf[idx], nppods, p->kbuf_tag[idx], - p->kbuf[idx]->dgl_length, 0, 0); - } - t3_set_ddp_tag(so, 0, p->kbuf_tag[0] << 6); - t3_set_ddp_buf(so, 0, 0, p->kbuf[0]->dgl_length); - t3_repost_kbuf(so, 0, 0, 1); - t3_set_rcv_coalesce_enable(so, - TOM_TUNABLE(TOE_DEV(so), ddp_rcvcoalesce)); - -#ifdef T3_TRACE - T3_TRACE4(TIDTB(so), - "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d", - kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]); -#endif - - return 0; - -err: - t3_release_ddp_resources(toep); - t3_cleanup_ddp(so); - return err; -} - -int -t3_ddp_copy(const struct mbuf *m, int offset, struct iovec *to, int len) -{ -#ifdef notyet - int err, page_no, page_off; - struct ddp_gather_list *gl = (struct ddp_gather_list *)skb->mac.raw; - - if (!gl->pages) { - dump_stack(); - BUG_ON(1); - } - - offset += gl->offset + TCP_SKB_CB(skb)->when; - page_no = offset >> PAGE_SHIFT; - page_off = offset & ~PAGE_MASK; - - while (len) { - int copy = min_t(int, len, PAGE_SIZE - page_off); - - err = memcpy_toiovec(to, page_address(gl->pages[page_no]) + - page_off, copy); - if (err) - return -EFAULT; - page_no++; - page_off = 0; - len -= copy; - } -#endif - return 0; -} - -/* - * Allocate n page pods. Returns -1 on failure or the page pod tag. 
- */ -int t3_alloc_ppods(struct tom_data *td, unsigned int n) -{ - unsigned int i, j; - - if (__predict_false(!td->ppod_map)) - return -1; - - mtx_lock(&td->ppod_map_lock); - for (i = 0; i < td->nppods; ) { - for (j = 0; j < n; ++j) /* scan ppod_map[i..i+n-1] */ - if (td->ppod_map[i + j]) { - i = i + j + 1; - goto next; - } - - memset(&td->ppod_map[i], 1, n); /* allocate range */ - mtx_unlock(&td->ppod_map_lock); - return i; -next: ; - } - mtx_unlock(&td->ppod_map_lock); - return (0); -} - -void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n) -{ - /* No need to take ppod_lock here */ - memset(&td->ppod_map[tag], 0, n); -} ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#6 (text+ko) ==== @@ -150,20 +150,20 @@ int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl, unsigned int nppods, unsigned int tag, unsigned int maxoff, unsigned int pg_off, unsigned int color); -int t3_alloc_ppods(struct tom_data *td, unsigned int n); +int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag); void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n); void t3_free_ddp_gl(struct ddp_gather_list *gl); -int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to, +int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len); //void t3_repost_kbuf(struct socket *so, int modulate, int activate); void t3_post_kbuf(struct socket *so, int modulate); int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock, int rcv_flags, int modulate, int post_kbuf); -void t3_cancel_ubuf(struct socket *so); +void t3_cancel_ubuf(struct toepcb *toep); int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock, int rcv_flags, int modulate, int post_kbuf); int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall); -void t3_cleanup_ddp(struct socket *so); +void t3_cleanup_ddp(struct toepcb *toep); void t3_release_ddp_resources(struct toepcb *toep); void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx); void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0, ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h#3 (text+ko) ==== @@ -99,7 +99,7 @@ } static inline struct mbuf * -peek_wr(struct toepcb *toep) +peek_wr(const struct toepcb *toep) { return (mbufq_peek(&toep->wr_list)); @@ -112,5 +112,10 @@ return (mbufq_dequeue(&toep->wr_list)); } +#define wr_queue_walk(toep, m) \ + for (m = peek_wr(toep); m; m = m->m_nextpkt) + + + #endif ==== //depot/projects/toehead/sys/modules/cxgb/tom/Makefile#2 (text+ko) ==== @@ -4,7 +4,7 @@ KMOD= tom SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c -#SRCS+= cxgb_tcp_subr.c cxgb_tcp_usrreq.c +SRCS+= cxgb_ddp.c cxgb_vm.c >>> TRUNCATED FOR MAIL (1000 lines) <<<
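
For reference, the cxgb_t3_ddp.h hunk above changes the allocator prototype from "int t3_alloc_ppods(struct tom_data *td, unsigned int n)" to "int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag)". Passing the tag back through an out-parameter sidesteps an ambiguity in the implementation removed from cxgb_cpl_socket.c, where the trailing "return (0)" taken when no free run is found is indistinguishable from a successful allocation of tag 0 (despite the comment promising -1 on failure). The real replacement lives in the new cxgb_ddp.c, which falls in the truncated portion of this mail; what follows is only an illustrative sketch of how the new signature could be satisfied, assuming the same ppod_map byte-map scan, ppod_map_lock mutex, and nppods count as the removed code, with the scan additionally bounded so it cannot read past the end of the map:

	/*
	 * Sketch only, not the contents of cxgb_ddp.c: allocate a run of n
	 * consecutive page pods.  Returns 0 and stores the starting tag in
	 * *tag on success, or ENOMEM if no run of n free pods exists.
	 */
	int
	t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag)
	{
		unsigned int i, j;

		if (__predict_false(td->ppod_map == NULL))
			return (ENOMEM);

		mtx_lock(&td->ppod_map_lock);
		for (i = 0; i + n <= td->nppods; ) {
			for (j = 0; j < n; ++j)	/* scan ppod_map[i..i+n-1] */
				if (td->ppod_map[i + j]) {
					i = i + j + 1;
					goto next;
				}
			memset(&td->ppod_map[i], 1, n);	/* claim the range */
			mtx_unlock(&td->ppod_map_lock);
			*tag = i;
			return (0);
	next:		;
		}
		mtx_unlock(&td->ppod_map_lock);
		return (ENOMEM);
	}

A caller such as alloc_buf1_ppods() would then test the return value rather than the sign of the tag, for example (hypothetical, following the fallback-to-PPOD_CLUSTER_SIZE pattern in the removed code):

	int tag;

	if (t3_alloc_ppods(d, nppods, &tag) != 0) {
		if (nppods <= PPOD_CLUSTER_SIZE)
			return (ENOMEM);
		nppods = PPOD_CLUSTER_SIZE;
		if (t3_alloc_ppods(d, nppods, &tag) != 0)
			return (ENOMEM);
	}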