Date: Fri, 23 Nov 2007 23:32:22 GMT
From: Kip Macy <kmacy@FreeBSD.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 129436 for review
Message-ID: <200711232332.lANNWMpf039948@repoman.freebsd.org>
http://perforce.freebsd.org/chv.cgi?CH=129436

Change 129436 by kmacy@kmacy:storage:toestack on 2007/11/23 23:32:18

        add initial DDP infrastructure

Affected files ...

.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#23 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#3 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.c#12 edit

Differences ...

==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#23 (text+ko) ====

@@ -80,6 +80,7 @@
 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
 
 /*
  * For ULP connections HW may add headers, e.g., for digests, that aren't part
@@ -1155,7 +1156,6 @@
                m_free(m);
        }
        INP_UNLOCK(inp);
-
 }
 
 /*
@@ -1444,7 +1444,213 @@
        return (0);
 }
 
+static void
+new_rx_data_ddp(struct socket *so, struct mbuf *m)
+{
+        struct tcpcb *tp = sototcpcb(so);
+        struct toepcb *toep = tp->t_toe;
+        struct ddp_state *q;
+        struct ddp_buf_state *bsp;
+        struct cpl_rx_data_ddp *hdr;
+        unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
+
+#ifdef notyet
+        if (unlikely(sk_no_receive(sk))) {
+                handle_excess_rx(so, m);
+                return;
+        }
+#endif
+        tp = sototcpcb(so);
+        q = &toep->tp_ddp_state;
+        hdr = cplhdr(m);
+        ddp_report = ntohl(hdr->u.ddp_report);
+        buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+        bsp = &q->buf_state[buf_idx];
+
+#ifdef T3_TRACE
+        T3_TRACE5(TIDTB(sk),
+                  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
+                  "hdr seq 0x%x len %u offset %u",
+                  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
+                  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
+        T3_TRACE1(TIDTB(sk),
+                  "new_rx_data_ddp: ddp_report 0x%x",
+                  ddp_report);
+#endif
+
+        ddp_len = ntohs(hdr->len);
+        rcv_nxt = ntohl(hdr->seq) + ddp_len;
+
+        /*
+         * Overloaded to store the old rcv_nxt.
+         */
+        m->m_pkthdr.csum_data = tp->rcv_nxt;
+        tp->rcv_nxt = rcv_nxt;
+
+        /*
+         * Store the length in m->m_len.  We are changing the meaning of
+         * m->m_len here, so we need to be very careful that nothing from now
+         * on interprets ->len of this packet the usual way.
+         */
+        m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;
+
+        /*
+         * Figure out where the new data was placed in the buffer and store
+         * it in 'when'.  This assumes the buffer offset starts at 0; the
+         * consumer needs to account for the page pod's pg_offset.
+         */
+        end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
+#ifdef notyet
+        TCP_SKB_CB(skb)->when = end_offset - skb->len;
+
+        /*
+         * We store in mac.raw the address of the gather list where the
+         * placement happened.
+         */
+        skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+        bsp->cur_offset = end_offset;
+
+        /*
+         * Bit 0 of flags stores whether the DDP buffer is completed.
+         * Note that other parts of the code depend on this being in bit 0.
+         */
+        if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->length) {
+#if 0
+                TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
+#endif
+                panic("spurious ddp completion");
+        } else {
+                m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
+                if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
+                        q->cur_buf ^= 1;                /* flip buffers */
+        }
+
+        if (bsp->flags & DDP_BF_NOCOPY) {
+                m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
+                bsp->flags &= ~DDP_BF_NOCOPY;
+        }
+
+        if (ddp_report & F_DDP_PSH)
+                m->m_pkthdr.csum_flags |= DDP_BF_PSH;
+
+        tp->t_rcvtime = ticks;
+        sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+        if (!sock_flag(sk, SOCK_DEAD))
+                sk->sk_data_ready(sk, 0);
+#endif
+}
+
+#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
+                 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
+                 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
+                 F_DDP_INVALID_PPOD)
+
+/*
+ * Handler for RX_DATA_DDP CPL messages.
+ */
+static int
+do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+        struct toepcb *toep = ctx;
+        struct socket *so = toeptoso(toep);
+        const struct cpl_rx_data_ddp *hdr = cplhdr(m);
+
+        VALIDATE_SOCK(so);
+
+        if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
+                log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
+                    GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
+                return (CPL_RET_BUF_DONE);
+        }
+#if 0
+        skb->h.th = tcphdr_skb->h.th;
+#endif
+        new_rx_data_ddp(so, m);
+        return (0);
+}
+
+static void
+process_ddp_complete(struct socket *so, struct mbuf *m)
+{
+        struct tcpcb *tp = sototcpcb(so);
+        struct toepcb *toep = tp->t_toe;
+        struct ddp_state *q;
+        struct ddp_buf_state *bsp;
+        struct cpl_rx_ddp_complete *hdr;
+        unsigned int ddp_report, buf_idx, when;
+
+#ifdef notyet
+        if (unlikely(sk_no_receive(sk))) {
+                handle_excess_rx(sk, skb);
+                return;
+        }
+#endif
+        q = &toep->tp_ddp_state;
+        hdr = cplhdr(m);
+        ddp_report = ntohl(hdr->ddp_report);
+        buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
+        bsp = &q->buf_state[buf_idx];
+
+        when = bsp->cur_offset;
+        m->m_len = G_DDP_OFFSET(ddp_report) - when;
+
+#ifdef T3_TRACE
+        T3_TRACE5(TIDTB(sk),
+                  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+                  "ddp_report 0x%x offset %u, len %u",
+                  tp->rcv_nxt, bsp->cur_offset, ddp_report,
+                  G_DDP_OFFSET(ddp_report), skb->len);
+#endif
+
+        bsp->cur_offset += m->m_len;
+
+        if (!(bsp->flags & DDP_BF_NOFLIP))
+                q->cur_buf ^= 1;                /* flip buffers */
+
+#ifdef T3_TRACE
+        T3_TRACE4(TIDTB(sk),
+                  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
+                  "ddp_report %u offset %u",
+                  tp->rcv_nxt, bsp->cur_offset, ddp_report,
+                  G_DDP_OFFSET(ddp_report));
+#endif
+#if 0
+        skb->mac.raw = (unsigned char *)bsp->gl;
+#endif
+        m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
+        if (bsp->flags & DDP_BF_NOCOPY)
+                bsp->flags &= ~DDP_BF_NOCOPY;
+        m->m_pkthdr.csum_data = tp->rcv_nxt;
+        tp->rcv_nxt += m->m_len;
+
+        tp->t_rcvtime = ticks;
+        sbappendstream_locked(&so->so_rcv, m);
+#ifdef notyet
+        if (!sock_flag(sk, SOCK_DEAD))
+                sk->sk_data_ready(sk, 0);
+#endif
+}
+
+/*
+ * Handler for RX_DDP_COMPLETE CPL messages.
+ */
+static int
+do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+        struct toepcb *toep = ctx;
+        struct socket *so = toeptoso(toep);
+
+        VALIDATE_SOCK(so);
+#if 0
+        skb->h.th = tcphdr_skb->h.th;
+#endif
+        process_ddp_complete(so, m);
+        return (0);
+}
+
+/*
  * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
  * socket state before calling tcp_time_wait to comply with its expectations.
  */
@@ -1902,8 +2108,7 @@
        struct toepcb *toep = tp->t_toe;
 
        if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
-               toep->tp_flags |= TP_ABORT_REQ_RCVD;
-               toep->tp_flags |= TP_ABORT_SHUTDOWN;
+               toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
                m_free(m);
                return;
        }
@@ -2055,7 +2260,7 @@
        UNIMPLEMENTED();
 
 #ifdef notyet
-       struct sock *newso;
+       struct socket *newso;
        struct l2t_entry *e;
        struct rtentry *dst;
        struct tcpcb *newtp;
@@ -3008,11 +3213,10 @@
        t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
        t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
        t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
+       t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
+       t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 #ifdef notyet
-       t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
-       t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
-       t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
        t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
        t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
 #endif

==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#3 (text+ko) ====

@@ -70,12 +70,14 @@
 #include <dev/cxgb/cxgb_l2t.h>
 #include <dev/cxgb/cxgb_offload.h>
 #include <vm/vm.h>
+#include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <dev/cxgb/sys/mvec.h>
 #include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
 
 static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
     struct uio *uio, struct mbuf *top, struct mbuf *control,
@@ -85,6 +87,7 @@
     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
     int *flagsp);
 
+#define TMP_IOV_MAX 16
 
 void
 t3_init_socket_ops(void)
@@ -96,30 +99,311 @@
        pru_soreceive = prp->pr_usrreqs->pru_soreceive;
 }
 
+
+struct cxgb_dma_info {
+        size_t                  cdi_mapped;
+        int                     cdi_nsegs;
+        bus_dma_segment_t       *cdi_segs;
+};
+
+static void
+cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
+    bus_size_t mapsize, int error)
+{
+        struct cxgb_dma_info *cdi = arg;
+
+        cdi->cdi_mapped = mapsize;
+        cdi->cdi_nsegs = nsegs;
+        cdi->cdi_segs = segs;
+}
+
+/*
+ * Advance the iovec array past count bytes (count > 0) or trim count
+ * bytes off its tail (count < 0).  count must be signed for the
+ * trimming case to be reachable.
+ */
+static void
+iov_adj(struct iovec **iov, int *iovcnt, ssize_t count)
+{
+        struct iovec *iovtmp;
+        int iovcnttmp;
+        caddr_t ptmp;
+
+        if (count > 0) {
+                iovtmp = *iov;
+                iovcnttmp = *iovcnt;
+                while (count > 0) {
+                        if (count < iovtmp->iov_len) {
+                                ptmp = iovtmp->iov_base;
+                                ptmp += count;
+                                iovtmp->iov_base = ptmp;
+                                iovtmp->iov_len -= count;
+                                break;
+                        } else
+                                count -= iovtmp->iov_len;
+                        iovtmp++;
+                        iovcnttmp--;
+                }
+                *iov = iovtmp;
+                *iovcnt = iovcnttmp;
+        } else if (count < 0) {
+                iovtmp = &(*iov)[*iovcnt - 1];
+                iovcnttmp = *iovcnt;
+                while (count < 0) {
+                        if (-count < iovtmp->iov_len) {
+                                iovtmp->iov_len += count;
+                                break;
+                        } else
+                                count += iovtmp->iov_len;
+                        iovtmp--;
+                        iovcnttmp--;
+                }
+                *iovcnt = iovcnttmp;
+        }
+}
+
+static void
+cxgb_zero_copy_free(void *cl, void *arg)
+{
+}
+
+static int
+cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
+{
+
+        return (EINVAL);
+}
+
+static void
+cxgb_wait_dma_completion(struct toepcb *tp)
+{
+
+}
+
+static int
+cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
+{
+        int i, seg_count, err, type;
+        struct mbuf *m0;
+        struct cxgb_dma_info cdi;
+        struct mbuf_vec *mv;
+        struct mbuf_iovec *mi;
+        bus_dma_segment_t *segs;
+
+        err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
+            cxgb_dma_callback, &cdi, 0);
+
+        if (err)
+                return (err);
+        seg_count = cdi.cdi_nsegs;
+        if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
+                bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
+                return (ENOMEM);
+        }
+        segs = cdi.cdi_segs;
+        m0->m_type = type;
+        m0->m_flags = (M_EXT|M_NOFREE);
+        m0->m_ext.ext_type = EXT_EXTREF;
+        m0->m_ext.ext_free = cxgb_zero_copy_free;
+        m0->m_ext.ext_args = NULL;
+
+        mv = mtomv(m0);
+        mv->mv_count = seg_count;
+        mv->mv_first = 0;
+        for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
+                mi_collapse_sge(mi, segs);
+
+        *m = m0;
+
+        if (cdi.cdi_mapped < uio->uio_resid)
+                uio->uio_resid -= cdi.cdi_mapped;
+        else
+                uio->uio_resid = 0;
+
+        return (0);
+}
+
+static int
+t3_sosend(struct socket *so, struct uio *uio)
+{
+        int rv, count, hold_resid, sent, iovcnt;
+        struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+        struct tcpcb *tp = sototcpcb(so);
+        struct toepcb *toep = tp->t_toe;
+        struct mbuf *m;
+        struct uio uiotmp;
+
+        /*
+         * Events requiring iteration:
+         *  - number of pages exceeds max hold pages for process or system
+         *  - number of pages exceeds maximum sg entries for a single WR
+         *
+         * We're limited to holding 128 pages at once - and we're limited to
+         * 34 SG entries per work request, but each SG entry can be any number
+         * of contiguous pages.
+         */
+        uiotmp = *uio;
+        iovcnt = uio->uio_iovcnt;
+        iov = uio->uio_iov;
+        sent = 0;
+sendmore:
+        /*
+         * Make sure we don't exceed the socket buffer.
+         */
+        count = min(toep->tp_page_count,
+            (sbspace(&so->so_snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
+        rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
+        hold_resid = uiotmp.uio_resid;
+        if (rv)
+                return (rv);
+
+        /*
+         * Bump past what has already been sent and shave off the
+         * unheld amount.
+         */
+        if (hold_resid > 0) {
+                iovtmpp = iovtmp;
+                memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+                if (sent)
+                        iov_adj(&iovtmpp, &iovcnt, sent);
+                iov_adj(&iovtmpp, &iovcnt, -hold_resid);
+                uiotmp.uio_iov = iovtmpp;
+                uiotmp.uio_iovcnt = iovcnt;
+        }
+        uiotmp.uio_resid = uio->uio_resid - hold_resid;
+
+        /*
+         * Push off all held pages.
+         */
+        while (uiotmp.uio_resid > 0) {
+                rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
+                if (rv) {
+                        vm_fault_unhold_pages(toep->tp_pages, count);
+                        return (rv);
+                }
+                uio->uio_resid -= m->m_pkthdr.len;
+                sent += m->m_pkthdr.len;
+                sbappend_locked(&so->so_snd, m);
+                t3_push_frames(so, TRUE);
+                iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
+        }
+        /*
+         * Wait for pending I/O to be DMA'd to the card.
+         */
+        cxgb_wait_dma_completion(toep);
+        vm_fault_unhold_pages(toep->tp_pages, count);
+        /*
+         * If there is more data to send, adjust the local copy of the
+         * iov to point to the start of the unsent data.
+         */
+        if (hold_resid) {
+                iovtmpp = iovtmp;
+                memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
+                iov_adj(&iovtmpp, &iovcnt, sent);
+                uiotmp = *uio;
+                uiotmp.uio_iov = iovtmpp;
+                uiotmp.uio_iovcnt = iovcnt;
+                goto sendmore;
+        }
+
+        return (0);
+}
+
 static int
 cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
+        struct toedev *tdev = TOE_DEV(so);
+        int zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
+        int zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);
+        int rv;
+        struct tcpcb *tp = sototcpcb(so);
+
        /*
-        * punt it back to the stack if the overhead of copying is thought to
-        * be less than the VM and DMA overhead of setting up page pods
+        * In order to use DMA direct from userspace the following
+        * conditions must be met:
+        *  - the connection is currently offloaded
+        *  - ddp is enabled
+        *  - the number of bytes to be transferred exceeds the threshold
+        *  - the number of bytes currently in flight won't exceed the
+        *    in-flight threshold XXX TODO
+        *  - vm_fault_hold_user_pages succeeds
+        *  - blocking socket XXX for now
+        */
+       if ((tp->t_flags & TF_TOE) && (uio->uio_resid > zcopy_thres) &&
+           (uio->uio_iovcnt < TMP_IOV_MAX) &&
+           ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) {
+               rv = t3_sosend(so, uio);
+               if (rv != EAGAIN)
+                       return (rv);
+       }
+
+       return pru_sosend(so, addr, uio, top, control, flags, td);
+}
+
+static int
+t3_soreceive(struct socket *so, struct uio *uio)
+{
+#ifdef notyet
+        int i, rv, count, hold_resid, sent, iovcnt;
+        struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
+        struct tcpcb *tp = sototcpcb(so);
+        struct toepcb *toep = tp->t_toe;
+        struct mbuf *m;
+        struct uio uiotmp;
+
+        /*
+         * Events requiring iteration:
+         *  - number of pages exceeds max hold pages for process or system
+         *  - number of pages exceeds maximum sg entries for a single WR
+         *
+         * We're limited to holding 128 pages at once - and we're limited to
+         * 34 SG entries per work request, but each SG entry can be any number
+         * of contiguous pages.
+         *
         */
-#ifdef notyet
-       if (uio->uio_resid < (40 << 10) /* XXX use tunable */)
-#endif
-       return pru_sosend(so, addr, uio, top, control, flags, td);
-
+        uiotmp = *uio;
+        iovcnt = uio->uio_iovcnt;
+        iov = uio->uio_iov;
+        sent = 0;
+        re;
+#endif
+        return (0);
 }
 
 static int
 cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
-#ifdef notyet
-       if (uio->uio_resid < (40 << 10) /* XXX use tunable */)
-#endif
-       return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
+        struct toedev *tdev = TOE_DEV(so);
+        int zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
+        int zcopy_enabled = TOM_TUNABLE(tdev, ddp);
+        int rv;
+        struct tcpcb *tp = sototcpcb(so);
+
+        /*
+         * In order to use DMA direct from userspace the following
+         * conditions must be met:
+         *  - the connection is currently offloaded
+         *  - ddp is enabled
+         *  - the number of bytes to be transferred exceeds the threshold
+         *  - the number of bytes currently in flight won't exceed the
+         *    in-flight threshold XXX TODO
+         *  - vm_fault_hold_user_pages succeeds
+         *  - blocking socket XXX for now
+         *  - iovcnt is 1
+         */
+        if ((tp->t_flags & TF_TOE) && (uio->uio_resid > zcopy_thres) &&
+            (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0) &&
+            zcopy_enabled) {
+                rv = t3_soreceive(so, uio);
+                if (rv != EAGAIN)
+                        return (rv);
+        }
+
+        return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
 }

==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.c#12 (text+ko) ====

@@ -77,6 +77,7 @@
 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
+#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
 
 static int activated = 1;
 TUNABLE_INT("hw.t3toe.activated", &activated);
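
Notes ...

The RX handlers above hand DDP state to the socket layer by overloading
mbuf header fields that normally carry checksum information:
m_pkthdr.csum_data holds the pre-update rcv_nxt, m_len holds the number
of bytes the hardware placed (not the number of bytes resident in the
mbuf itself), and bit 0 of m_pkthdr.csum_flags reports DDP buffer
completion.  A minimal consumer-side sketch follows; ddp_mbuf_decode()
is a hypothetical helper, not part of this change:

/*
 * Decode an mbuf queued by new_rx_data_ddp()/process_ddp_complete().
 * Nothing here touches m->m_data: the payload was placed directly in
 * the posted user buffer by the hardware.
 */
static void
ddp_mbuf_decode(const struct mbuf *m)
{
        uint32_t prev_rcv_nxt = m->m_pkthdr.csum_data; /* old tp->rcv_nxt */
        int placed = m->m_len;                         /* bytes DDP'd */
        int complete = m->m_pkthdr.csum_flags & 1;     /* buffer complete */
        int pushed = m->m_pkthdr.csum_flags & DDP_BF_PSH;

        printf("ddp: seq 0x%x, %d bytes placed%s%s\n", prev_rcv_nxt, placed,
            complete ? ", complete" : "", pushed ? ", psh" : "");
}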
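
iov_adj() advances an iovec array past count bytes when count is
positive and trims count bytes off the tail when it is negative;
t3_sosend() uses both directions (bump past 'sent', shave off the
unheld 'hold_resid').  A throwaway userland harness demonstrating the
semantics; iov_adj() is assumed to be pasted in verbatim from
cxgb_cpl_socket.c above, only the harness around it is new:

#include <sys/types.h>
#include <sys/uio.h>
#include <stdio.h>

/* iov_adj() exactly as defined in the patch above goes here. */

int
main(void)
{
        char a[10], b[10];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        struct iovec *iovp = iov;
        int iovcnt = 2;

        /* Skip the first 12 bytes, e.g. data already sent. */
        iov_adj(&iovp, &iovcnt, 12);
        printf("%d iovec(s) left, first len %zu\n", iovcnt, iovp->iov_len);
        /* prints: 1 iovec(s) left, first len 8 */

        /* Trim 3 bytes off the tail, e.g. pages that could not be held. */
        iov_adj(&iovp, &iovcnt, -3);
        printf("%d iovec(s) left, last len %zu\n", iovcnt, iovp->iov_len);
        /* prints: 1 iovec(s) left, last len 5 */

        return (0);
}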
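
cxgb_hold_iovec_pages() is still a stub that fails with EINVAL, which
keeps both zero-copy paths disabled until the VM support lands.  One
plausible shape for the single-iovec case, assuming a
vm_fault_hold_user_pages(addr, pages, count, flags) helper paired with
the vm_fault_unhold_pages() calls above; the helper and its exact
signature are assumptions here, not part of this change:

static int
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
{
        struct iovec *iov = uio->uio_iov;       /* single-iovec case only */
        vm_offset_t start = trunc_page((vm_offset_t)iov->iov_base);
        int npages = (round_page((vm_offset_t)iov->iov_base +
            iov->iov_len) - start) >> PAGE_SHIFT;

        /* Respect the caller's cap on how many pages may be held. */
        if (npages > *held)
                npages = *held;

        /*
         * Wire the user pages backing the iovec into m[]; they stay
         * resident until vm_fault_unhold_pages() releases them.
         */
        if (vm_fault_hold_user_pages(start, m, npages, flags))
                return (EFAULT);
        *held = npages;
        return (0);
}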