From owner-p4-projects@FreeBSD.ORG  Sat Jan 26 08:00:17 2008
Date: Sat, 26 Jan 2008 08:00:16 GMT
Message-Id: <200801260800.m0Q80G5u048397@repoman.freebsd.org>
From: Kip Macy <kmacy@freebsd.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 134128 for review
List-Id: p4 projects tree changes

http://perforce.freebsd.org/chv.cgi?CH=134128

Change 134128 by kmacy@kmacy:storage:toehead on 2008/01/26 07:59:26

	first cut at implementing zero copy soreceive

Affected files ...

.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#7 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_ddp.c#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#7 edit

Differences ...

==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#7 (text+ko) ====

@@ -101,6 +101,7 @@
 #ifndef PG_FRAME
 #define PG_FRAME	~PAGE_MASK
 #endif
+#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
 
 void
 t3_init_socket_ops(void)
@@ -251,7 +252,6 @@
  * can be posted without closing the window in the middle of DDP (checked
  * when the connection is offloaded)
  */
-#ifdef notyet
 static int
 so_should_ddp(const struct toepcb *toep, int last_recv_len)
 {
@@ -260,7 +260,69 @@
 	    toep->tp_tp->rcv_wnd > 
 	    (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
 }
-#endif
+
+static inline int
+is_ddp(const struct mbuf *m)
+{
+	return (m->m_flags & M_DDP);
+}
+
+static inline int
+is_ddp_psh(const struct mbuf *m)
+{
+	return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
+}
+
+static int
+m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+	int curlen, err = 0;
+	caddr_t buf;
+
+	while (m && len) {
+		buf = mtod(m, caddr_t);
+		curlen = m->m_len;
+		if (offset < curlen) {
+			curlen -= offset;
+			buf += offset;
+			offset = 0;
+		} else {
+			offset -= curlen;
+			m = m->m_next;
+			continue;
+		}
+
+		/* uiomove() copies at the current uio position; uiomove_frombuf() would misuse uio_offset */
+		err = uiomove(buf, min(len, curlen), uio);
+		if (err)
+			return (err);
+		len -= min(len, curlen);
+		m = m->m_next;
+	}
+	return (err);
+}
+
+/*
+ * Copy data from an mbuf chain to an iovec.  Deals with RX_DATA, which
+ * carries the data in the mbuf body, and with RX_DATA_DDP, which places
+ * the data in a DDP buffer.
+ */
+static inline int
+copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+	struct iovec *to = uio->uio_iov;
+
+	if (__predict_true(!is_ddp(m)))	/* RX_DATA */
+		return m_uiomove(m, offset, len, uio);
+	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
+		to->iov_len -= len;
+		to->iov_base = ((caddr_t)to->iov_base) + len;
+		uio->uio_iov = to;
+		uio->uio_resid -= len;	/* data already landed in the user buffer */
+		return (0);
+	}
+	return t3_ddp_copy(m, offset, uio, len);	/* kernel DDP */
+}
 
 static void
 cxgb_wait_dma_completion(struct toepcb *toep)
@@ -449,34 +511,258 @@
 
 static int
-t3_soreceive(struct socket *so, struct uio *uio)
+t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
 {
-#ifdef notyet
-	int i, rv, count, hold_resid, sent, iovcnt;
-	struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
 	struct tcpcb *tp = sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct mbuf *m;
-	struct uio uiotmp;
+	uint32_t offset;
+	int err, flags, avail, len, buffers_freed = 0, copied = 0;
+	int target;		/* Read at least this many bytes */
+	long timeo;
+	int user_ddp_ok, user_ddp_pending = 0;
+	struct ddp_state *p;
+	struct inpcb *inp = sotoinpcb(so);
+
+	flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
+
+	err = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+	if (err)
+		return (err);
+restart:
+	SOCKBUF_LOCK(&so->so_rcv);
+	len = uio->uio_resid;
+	m = so->so_rcv.sb_mb;
+	target = (flags & MSG_WAITALL) ? min(len, so->so_rcv.sb_hiwat) : so->so_rcv.sb_lowat;
+	timeo = so->so_rcv.sb_timeo;
+	p = &toep->tp_ddp_state;
+	user_ddp_ok = p->ubuf_ddp_ready;
+	p->cancel_ubuf = 0;
+
+	/*
+	 * XXX check timeo/signal/urgent
+	 */
+	if (m)
+		goto got_mbuf;
+
+	/* empty receive queue */
+	if (copied >= target && /* !sk->sk_backlog.tail && */
+	    !user_ddp_pending)
+		goto done;
+	if (copied) {
+		if (so->so_error || tp->t_state == TCPS_CLOSED ||
+		    (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
+			goto done;
+	} else {
+		if (so->so_state & SS_NOFDREF)
+			goto done;
+		if (so->so_error) {
+			err = so->so_error;
+			so->so_error = 0;
+			goto done;
+		}
+		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
+			goto done;
+		if (tp->t_state == TCPS_CLOSED) {
+			err = ENOTCONN;
+			goto done;
+		}
+	}
+	if (so->so_rcv.sb_mb && !user_ddp_pending) {
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		INP_LOCK(inp);
+		t3_cleanup_rbuf(tp);
+		INP_UNLOCK(inp);
+		goto restart;
+	}
+	if (p->ubuf && user_ddp_ok && !user_ddp_pending &&
+	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
+	    p->ubuf_ddp_ready) {
+		user_ddp_pending =
+		    !t3_overlay_ubuf(so, uio, (so->so_state & SS_NBIO), flags, 1, 1);
+		if (user_ddp_pending) {
+			p->kbuf_posted++;
+			user_ddp_ok = 0;
+		}
+	}
+	if (user_ddp_pending) {
+		/* One shot at DDP if we already have enough data */
+		if (copied >= target)
+			user_ddp_ok = 0;
+		if ((err = sbwait(&so->so_rcv)) != 0)
+			goto done;
+		/* XXX await_ddp_completion(sk, flags, &timeo) is needed for timers to work */
+	} else if (copied >= target)
+		goto done;
+	else {
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		INP_LOCK(inp);
+		t3_cleanup_rbuf(tp);
+		INP_UNLOCK(inp);
+		if ((err = sbwait(&so->so_rcv)) != 0)
+			goto done;
+	}
+	goto restart;
+got_mbuf:
+	if (m->m_pkthdr.len == 0) {
+		if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
+			panic("empty mbuf and NOCOPY not set");
+		user_ddp_pending = 0;
+		sbfree(&so->so_rcv, m);
+		m = so->so_rcv.sb_mb = m_free(m);
+		goto done;
+	}
+	offset = toep->tp_copied_seq - m->m_seq;
+	if (offset > m->m_pkthdr.len)
+		panic("t3_soreceive: BUG: OFFSET > LEN seq 0x%x "
+		    "m_pkthdr.len %d ddp_flags 0x%x", m->m_seq,
+		    m->m_pkthdr.len, m->m_ddp_flags);
+	avail = m->m_pkthdr.len - offset;
+	if (len < avail) {
+		if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY))
+			panic("bad state in t3_soreceive");
+		avail = len;
+	}
+#ifdef notyet
 	/*
-	 * Events requiring iteration:
-	 *  - number of pages exceeds max hold pages for process or system
-	 *  - number of pages exceeds maximum sg entries for a single WR
-	 *
-	 * We're limited to holding 128 pages at once - and we're limited to
-	 * 34 SG entries per work request, but each SG entry can be any number
-	 * of contiguous pages
-	 *
+	 * Check if the data we are preparing to copy contains urgent
+	 * data.  Either stop short of urgent data or skip it if it's
+	 * first and we are not delivering urgent data inline.
+	 */
+	if (unlikely(tp->urg_data)) {
+		u32 urg_offset = tp->urg_seq - tp->copied_seq;
+
+		if (urg_offset < avail) {
+			if (urg_offset) {
+				/* stop short of the urgent data */
+				avail = urg_offset;
+			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
+				/* First byte is urgent, skip */
+				tp->copied_seq++;
+				offset++;
+				avail--;
+				if (!avail)
+					goto skip_copy;
+			}
+		}
+	}
+#endif
+	if (is_ddp_psh(m) || offset) {
+		user_ddp_ok = 0;
+#ifdef T3_TRACE
+		T3_TRACE0(TIDTB(so), "t3_soreceive: PSH");
+#endif
+	}
+
+	if (user_ddp_ok && !user_ddp_pending &&
+	    /*
+	     * XXX
+	     */
+#ifdef notyet
+	    uio->uio_iovlen > p->kbuf[0]->length &&
+#endif
+	    p->ubuf_ddp_ready) {
+		user_ddp_pending =
+		    !t3_overlay_ubuf(so, uio, (so->so_state & SS_NBIO), flags, 1, 1);
+		if (user_ddp_pending) {
+			p->kbuf_posted++;
+			user_ddp_ok = 0;
+		}
+	}
+
+	/*
+	 * If MSG_TRUNC is specified the data is discarded.
+	 * XXX need to check pr_atomic
+	 */
+	if (__predict_true(!(flags & MSG_TRUNC))) {
+		if ((err = copy_data(m, offset, avail, uio)) != 0) {
+			err = EFAULT;
+			goto done;
+		}
+	}
+
+	toep->tp_copied_seq += avail;
+	copied += avail;
+	len -= avail;
+#ifdef notyet
+skip_copy:
+	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
+		tp->urg_data = 0;
+#endif
+	/*
+	 * If the buffer is fully consumed free it.  If it's a DDP
+	 * buffer also handle any events it indicates.
+	 */
+	if (avail + offset >= m->m_pkthdr.len) {
+		unsigned int fl = m->m_ddp_flags;
+		int got_psh = 0;
+
+		if (p->ubuf != NULL && is_ddp(m) && (fl & 1)) {
+			if (is_ddp_psh(m) && user_ddp_pending)
+				got_psh = 1;
+
+			if (fl & DDP_BF_NOCOPY)
+				user_ddp_pending = 0;
+			else {
+				p->kbuf_posted--;
+				p->ubuf_ddp_ready = 1;
+			}
+		}
+		sbfree(&so->so_rcv, m);
+		m = so->so_rcv.sb_mb = m_free(m);
+		buffers_freed++;
+
+		if ((so->so_rcv.sb_mb == NULL) && got_psh)
+			goto done;
+	}
+	if (len > 0)
+		goto restart;
+
+done:
+	/*
+	 * If we can still receive decide what to do in preparation for the
+	 * next receive.  Note that RCV_SHUTDOWN is set if the connection
+	 * transitioned to CLOSE but not if it was in that state to begin with.
+	 */
+	if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
+		if (user_ddp_pending) {
+			user_ddp_ok = 0;
+			t3_cancel_ubuf(toep);
+			if (so->so_rcv.sb_mb) {
+				if (copied < 0)
+					copied = 0;
+				goto restart;
+			}
+			user_ddp_pending = 0;
+		}
+		if (p->kbuf_posted == 0) {
+#ifdef T3_TRACE
+			T3_TRACE0(TIDTB(so),
+			    "t3_soreceive: about to exit, repost kbuf");
+#endif
+
+			t3_post_kbuf(so, 1);
+			p->kbuf_posted++;
+		} else if (so_should_ddp(toep, copied)) {
+			t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so),
+				ddp_copy_limit), 0);
+			p->kbuf_posted = 1;
+		}
+	}
+	if (buffers_freed)
+		t3_cleanup_rbuf(tp);
+#ifdef T3_TRACE
+	T3_TRACE5(TIDTB(so),
+	    "t3_soreceive <-: copied %d len %d buffers_freed %d "
+	    "kbuf_posted %d user_ddp_pending %u",
+	    copied, len, buffers_freed, p ? p->kbuf_posted : -1,
+	    user_ddp_pending);
+#endif
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	sbunlock(&so->so_rcv);
 
-	uiotmp = *uio;
-	iovcnt = uio->uio_iovcnt;
-	iov = uio->uio_iov;
-	sent = 0;
-	re;
-#endif
-	return (0);
+	return (err);
 }
 
 static int
@@ -484,9 +770,11 @@
 	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct toedev *tdev;
-	int rv, zcopy_thres, zcopy_enabled;
+	int rv, zcopy_thres, zcopy_enabled, flags;
 	struct tcpcb *tp = sototcpcb(so);
 
+	flags = flagsp ? *flagsp &~ MSG_EOR : 0;
+
 	/*
 	 * In order to use DMA direct from userspace the following
 	 * conditions must be met:
@@ -500,14 +788,16 @@
 	 *  - iovcnt is 1
 	 *
 	 */
-	if (tp->t_flags & TF_TOE) {
+	if ((tp->t_flags & TF_TOE) && ((flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0)
+	    && ((so->so_state & SS_NBIO) == 0) && (uio->uio_iovcnt == 1) &&
+	    ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) && (mp0 == NULL)) {
 		tdev = TOE_DEV(so);
 		zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
 		zcopy_enabled = TOM_TUNABLE(tdev, ddp);
 		if ((uio->uio_resid > zcopy_thres) &&
 		    (uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0)
 		    && zcopy_enabled) {
-			rv = t3_soreceive(so, uio);
+			rv = t3_soreceive(so, flagsp, uio);
 			if (rv != EAGAIN)
 				return (rv);
 		}
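For illustration only, not part of the change: a userland receive pattern that
satisfies the zero-copy conditions checked in t3_pru_soreceive() above -- a
blocking socket, a single large contiguous buffer (iovcnt == 1), and none of
MSG_WAITALL/MSG_OOB/MSG_PEEK/MSG_DONTWAIT.  The function name and the 256KB
size below are made up for the sketch; the real size cutoff is the ddp_thres
tunable.

/*
 * Illustrative sketch: each large residual recv() on an offloaded (TF_TOE)
 * connection may be DMA'd directly into 'buf' instead of being copied.
 */
#include <sys/types.h>
#include <sys/socket.h>

#define RXBUF_SIZE	(256 * 1024)	/* assumed to exceed ddp_thres */

static ssize_t
bulk_read(int s, char *buf, size_t len)
{
	size_t done = 0;
	ssize_t n;

	while (done < len) {
		/* flags == 0: eligible for the zero-copy path */
		n = recv(s, buf + done, len - done, 0);
		if (n == 0)
			break;		/* peer closed the connection */
		if (n == -1)
			return (-1);
		done += n;
	}
	return ((ssize_t)done);
}

A call such as bulk_read(fd, buf, RXBUF_SIZE) is what the uio_resid
comparison against zcopy_thres sees on entry.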
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_ddp.c#2 (text+ko) ====

@@ -326,9 +326,9 @@
 }
 
 /**
- *	setup_iovec_ppods - setup HW page pods for a user iovec
+ *	setup_uio_ppods - setup HW page pods for a user iovec
- *	@sk: the associated socket
+ *	@so: the associated socket
- *	@iov: the iovec
+ *	@uio: the uio describing the user buffer
  *	@oft: additional bytes to map before the start of the buffer
  *
  * Pins a user iovec and sets up HW page pods for DDP into it.  We allocate
@@ -339,13 +339,14 @@
  * The current implementation handles iovecs with only one entry.
  */
 static int
-setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft, int *length)
+setup_uio_ppods(struct socket *so, const struct uio *uio, int oft, int *length)
 {
 	int err;
 	unsigned int len;
 	struct ddp_gather_list *gl = NULL;
 	struct toepcb *toep = sototcpcb(so)->t_toe;
 	struct ddp_state *p = &toep->tp_ddp_state;
+	struct iovec *iov = uio->uio_iov;
 	unsigned long addr = (unsigned long)iov->iov_base - oft;
 
 	if (__predict_false(!p->ubuf_nppods)) {
@@ -424,7 +425,7 @@
  * Post a user buffer as an overlay on top of the current kernel buffer.
  */
 int
-t3_overlay_ubuf(struct socket *so, const struct iovec *iov,
+t3_overlay_ubuf(struct socket *so, const struct uio *uio,
 	int nonblock, int rcv_flags, int modulate, int post_kbuf)
 {
 	int err, len, ubuf_idx;
@@ -435,7 +436,7 @@
 	if (p->ubuf == NULL)
 		return (EINVAL);
 
-	err = setup_iovec_ppods(so, iov, 0, &len);
+	err = setup_uio_ppods(so, uio, 0, &len);
 	if (err)
 		return (err);
@@ -481,67 +482,6 @@
 	return (0);
 }
 
-static inline int
-is_ddp(const struct mbuf *m)
-{
-	return (m->m_flags & M_DDP);
-}
-
-static inline int
-is_ddp_psh(const struct mbuf *m)
-{
-	return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
-}
-
-static int
-m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
-{
-	int curlen, err = 0;
-	caddr_t buf;
-
-	while (m && len) {
-		buf = mtod(m, caddr_t);
-		curlen = m->m_len;
-		if (offset < curlen) {
-			curlen -= offset;
-			buf += offset;
-			offset = 0;
-		} else {
-			offset -= curlen;
-			m = m->m_next;
-			continue;
-		}
-
-		err = uiomove_frombuf(buf, min(len, curlen), uio);
-		if (err)
-			return (err);
-		len -= min(len, m->m_len);
-		m = m->m_next;
-	}
-	return (err);
-}
-
-/*
- * Copy data from an sk_buff to an iovec.  Deals with RX_DATA, which carry the
- * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a
- * DDP buffer.
- */
-static inline int
-copy_data(const struct mbuf *m, int offset, struct uio *uio, int len)
-{
-	struct iovec *to = uio->uio_iov;
-
-	if (__predict_true(!is_ddp(m)))	/* RX_DATA */
-		return m_uiomove(m, offset, len, uio);
-	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
-		to->iov_len -= len;
-		to->iov_base = ((caddr_t)to->iov_base) + len;
-		uio->uio_iov = to;
-		return (0);
-	}
-	return t3_ddp_copy(m, offset, uio, len);	/* kernel DDP */
-}
-
 /*
  * Clean up DDP state that needs to survive until socket close time, such as the
  * DDP buffers.  The buffers are already unmapped at this point as unmapping

==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#7 (text+ko) ====

@@ -153,14 +153,13 @@
 int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag);
 void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
 void t3_free_ddp_gl(struct ddp_gather_list *gl);
-int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio,
-    int len);
+int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len);
 //void t3_repost_kbuf(struct socket *so, int modulate, int activate);
 void t3_post_kbuf(struct socket *so, int modulate);
-int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+int t3_post_ubuf(struct socket *so, const struct uio *uio, int nonblock,
     int rcv_flags, int modulate, int post_kbuf);
 void t3_cancel_ubuf(struct toepcb *toep);
-int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+int t3_overlay_ubuf(struct socket *so, const struct uio *uio, int nonblock,
     int rcv_flags, int modulate, int post_kbuf);
 int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall);
 void t3_cleanup_ddp(struct toepcb *toep);
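For illustration only, not part of the change: the re-enter-DDP policy of
so_should_ddp() as a standalone sketch.  The hunk above shows only the
receive-window half of the predicate, so the last_recv_len comparison here is
an assumption, and both constants are placeholders for
TOM_TUNABLE(dev, ddp_copy_limit) and DDP_RSVD_WIN.

/*
 * Standalone sketch of the DDP-entry policy, under assumed tunable values.
 */
#include <stdio.h>

#define DDP_COPY_LIMIT	8192		/* placeholder for ddp_copy_limit */
#define DDP_RSVD_WIN	(16 * 1024)	/* placeholder reserved window */

static int
should_ddp(int last_recv_len, unsigned int rcv_wnd)
{
	/* Large recent receive, and enough window to keep the DDP reserve. */
	return (last_recv_len > DDP_COPY_LIMIT &&
	    rcv_wnd > DDP_COPY_LIMIT + DDP_RSVD_WIN);
}

int
main(void)
{
	printf("4KB recv, 64KB wnd  -> %d\n", should_ddp(4096, 65536));
	printf("16KB recv, 64KB wnd -> %d\n", should_ddp(16384, 65536));
	printf("16KB recv, 16KB wnd -> %d\n", should_ddp(16384, 16384));
	return (0);
}

Compiled and run, only the middle case (a 16KB receive with a 64KB window)
enters DDP; small receives stay on the copy path, and a collapsed window
keeps DDP off regardless of receive size.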