From owner-dev-commits-src-branches@freebsd.org Mon Mar 15 03:01:27 2021 Return-Path: Delivered-To: dev-commits-src-branches@mailman.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 3E8B356C8DB; Mon, 15 Mar 2021 03:01:27 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "R3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4DzLm61p16z3QK5; Mon, 15 Mar 2021 03:01:26 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org (gitrepo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:5]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id D81DD1201; Mon, 15 Mar 2021 03:01:25 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org ([127.0.1.44]) by gitrepo.freebsd.org (8.16.1/8.16.1) with ESMTP id 12F31PN9097953; Mon, 15 Mar 2021 03:01:25 GMT (envelope-from git@gitrepo.freebsd.org) Received: (from git@localhost) by gitrepo.freebsd.org (8.16.1/8.16.1/Submit) id 12F31Pt1097952; Mon, 15 Mar 2021 03:01:25 GMT (envelope-from git) Date: Mon, 15 Mar 2021 03:01:25 GMT Message-Id: <202103150301.12F31Pt1097952@gitrepo.freebsd.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org From: Alexander Motin Subject: git: 6469aab051de - stable/12 - Coalesce socket reads in software iSCSI. MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-Git-Committer: mav X-Git-Repository: src X-Git-Refname: refs/heads/stable/12 X-Git-Reftype: branch X-Git-Commit: 6469aab051dee71c88e27aa906c18efa09e77189 Auto-Submitted: auto-generated X-BeenThere: dev-commits-src-branches@freebsd.org X-Mailman-Version: 2.1.34 Precedence: list List-Id: Commits to the stable branches of the FreeBSD src repository List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 15 Mar 2021 03:01:27 -0000 The branch stable/12 has been updated by mav: URL: https://cgit.FreeBSD.org/src/commit/?id=6469aab051dee71c88e27aa906c18efa09e77189 commit 6469aab051dee71c88e27aa906c18efa09e77189 Author: Alexander Motin AuthorDate: 2021-02-22 17:23:35 +0000 Commit: Alexander Motin CommitDate: 2021-03-15 02:45:25 +0000 Coalesce socket reads in software iSCSI. Instead of 2-4 socket reads per PDU this can do as low as one read per megabyte, dramatically reducing TCP overhead and lock contention. With this on iSCSI target I can write more than 4GB/s through a single connection. MFC after: 1 month (cherry picked from commit 6895f89fe54e0858aea70d2bd2a9651f45d7998e) --- sys/dev/iscsi/icl_soft.c | 258 ++++++++++++++++------------------------------- 1 file changed, 89 insertions(+), 169 deletions(-) diff --git a/sys/dev/iscsi/icl_soft.c b/sys/dev/iscsi/icl_soft.c index dd9210642ffa..caeddc9247ba 100644 --- a/sys/dev/iscsi/icl_soft.c +++ b/sys/dev/iscsi/icl_soft.c @@ -143,68 +143,6 @@ icl_conn_fail(struct icl_conn *ic) (ic->ic_error)(ic); } -static struct mbuf * -icl_conn_receive(struct icl_conn *ic, size_t len) -{ - struct uio uio; - struct socket *so; - struct mbuf *m; - int error, flags; - - so = ic->ic_socket; - - memset(&uio, 0, sizeof(uio)); - uio.uio_resid = len; - - flags = MSG_DONTWAIT; - error = soreceive(so, NULL, &uio, &m, NULL, &flags); - if (error != 0) { - ICL_DEBUG("soreceive error %d", error); - return (NULL); - } - if (uio.uio_resid != 0) { - m_freem(m); - ICL_DEBUG("short read"); - return (NULL); - } - - return (m); -} - -static int -icl_conn_receive_buf(struct icl_conn *ic, void *buf, size_t len) -{ - struct iovec iov[1]; - struct uio uio; - struct socket *so; - int error, flags; - - so = ic->ic_socket; - - memset(&uio, 0, sizeof(uio)); - iov[0].iov_base = buf; - iov[0].iov_len = len; - uio.uio_iov = iov; - uio.uio_iovcnt = 1; - uio.uio_offset = 0; - uio.uio_resid = len; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = UIO_READ; - - flags = MSG_DONTWAIT; - error = soreceive(so, NULL, &uio, NULL, NULL, &flags); - if (error != 0) { - ICL_DEBUG("soreceive error %d", error); - return (-1); - } - if (uio.uio_resid != 0) { - ICL_DEBUG("short read"); - return (-1); - } - - return (0); -} - static void icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) { @@ -318,37 +256,28 @@ icl_pdu_size(const struct icl_pdu *response) return (len); } -static int -icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep) +static void +icl_soft_receive_buf(struct mbuf **r, size_t *rs, void *buf, size_t s) { - if (icl_conn_receive_buf(request->ip_conn, - request->ip_bhs, sizeof(struct iscsi_bhs))) { - ICL_DEBUG("failed to receive BHS"); - return (-1); - } - - *availablep -= sizeof(struct iscsi_bhs); - return (0); + m_copydata(*r, 0, s, buf); + m_adj(*r, s); + while ((*r) != NULL && (*r)->m_len == 0) + *r = m_free(*r); + *rs -= s; } -static int -icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep) +static void +icl_pdu_receive_ahs(struct icl_pdu *request, struct mbuf **r, size_t *rs) { request->ip_ahs_len = icl_pdu_ahs_length(request); if (request->ip_ahs_len == 0) - return (0); - - request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn, - request->ip_ahs_len); - if (request->ip_ahs_mbuf == NULL) { - ICL_DEBUG("failed to receive AHS"); - return (-1); - } + return; - *availablep -= request->ip_ahs_len; - return (0); + request->ip_ahs_mbuf = *r; + *r = m_split(request->ip_ahs_mbuf, request->ip_ahs_len, M_WAITOK); + *rs -= request->ip_ahs_len; } static uint32_t @@ -367,7 +296,7 @@ icl_mbuf_to_crc32c(const struct mbuf *m0) } static int -icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) +icl_pdu_check_header_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs) { uint32_t received_digest, valid_digest; @@ -375,12 +304,7 @@ icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep) return (0); CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); - if (icl_conn_receive_buf(request->ip_conn, - &received_digest, ISCSI_HEADER_DIGEST_SIZE)) { - ICL_DEBUG("failed to receive header digest"); - return (-1); - } - *availablep -= ISCSI_HEADER_DIGEST_SIZE; + icl_soft_receive_buf(r, rs, &received_digest, ISCSI_HEADER_DIGEST_SIZE); /* Temporary attach AHS to BHS to calculate header digest. */ request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf; @@ -448,8 +372,8 @@ icl_pdu_data_segment_receive_len(const struct icl_pdu *request) } static int -icl_pdu_receive_data_segment(struct icl_pdu *request, - size_t *availablep, bool *more_neededp) +icl_pdu_receive_data_segment(struct icl_pdu *request, struct mbuf **r, + size_t *rs, bool *more_neededp) { struct icl_conn *ic; size_t len, padding = 0; @@ -473,7 +397,7 @@ icl_pdu_receive_data_segment(struct icl_pdu *request, KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); len -= request->ip_data_len; - if (len + padding > *availablep) { + if (len + padding > *rs) { /* * Not enough data in the socket buffer. Receive as much * as we can. Don't receive padding, since, obviously, it's @@ -481,9 +405,9 @@ icl_pdu_receive_data_segment(struct icl_pdu *request, */ #if 0 ICL_DEBUG("limited from %zd to %zd", - len + padding, *availablep - padding)); + len + padding, *rs - padding)); #endif - len = *availablep - padding; + len = *rs - padding; *more_neededp = true; padding = 0; } @@ -493,11 +417,9 @@ icl_pdu_receive_data_segment(struct icl_pdu *request, * of actual data segment. */ if (len > 0) { - m = icl_conn_receive(request->ip_conn, len + padding); - if (m == NULL) { - ICL_DEBUG("failed to receive data segment"); - return (-1); - } + m = *r; + *r = m_split(m, len + padding, M_WAITOK); + *rs -= len + padding; if (request->ip_data_mbuf == NULL) request->ip_data_mbuf = m; @@ -505,7 +427,6 @@ icl_pdu_receive_data_segment(struct icl_pdu *request, m_cat(request->ip_data_mbuf, m); request->ip_data_len += len; - *availablep -= len + padding; } else ICL_DEBUG("len 0"); @@ -517,7 +438,7 @@ icl_pdu_receive_data_segment(struct icl_pdu *request, } static int -icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) +icl_pdu_check_data_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs) { uint32_t received_digest, valid_digest; @@ -528,12 +449,7 @@ icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) return (0); CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); - if (icl_conn_receive_buf(request->ip_conn, - &received_digest, ISCSI_DATA_DIGEST_SIZE)) { - ICL_DEBUG("failed to receive data digest"); - return (-1); - } - *availablep -= ISCSI_DATA_DIGEST_SIZE; + icl_soft_receive_buf(r, rs, &received_digest, ISCSI_DATA_DIGEST_SIZE); /* * Note that ip_data_mbuf also contains padding; since digest @@ -555,16 +471,13 @@ icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep) * "part" of PDU at a time; call it repeatedly until it returns non-NULL. */ static struct icl_pdu * -icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) +icl_conn_receive_pdu(struct icl_conn *ic, struct mbuf **r, size_t *rs) { struct icl_pdu *request; - struct socket *so; size_t len; - int error; + int error = 0; bool more_needed; - so = ic->ic_socket; - if (ic->ic_receive_state == ICL_CONN_STATE_BHS) { KASSERT(ic->ic_receive_pdu == NULL, ("ic->ic_receive_pdu != NULL")); @@ -582,23 +495,11 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) request = ic->ic_receive_pdu; } - if (*availablep < ic->ic_receive_len) { -#if 0 - ICL_DEBUG("not enough data; need %zd, " - "have %zd", ic->ic_receive_len, *availablep); -#endif - return (NULL); - } - switch (ic->ic_receive_state) { case ICL_CONN_STATE_BHS: //ICL_DEBUG("receiving BHS"); - error = icl_pdu_receive_bhs(request, availablep); - if (error != 0) { - ICL_DEBUG("failed to receive BHS; " - "dropping connection"); - break; - } + icl_soft_receive_buf(r, rs, request->ip_bhs, + sizeof(struct iscsi_bhs)); /* * We don't enforce any limit for AHS length; @@ -622,12 +523,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) case ICL_CONN_STATE_AHS: //ICL_DEBUG("receiving AHS"); - error = icl_pdu_receive_ahs(request, availablep); - if (error != 0) { - ICL_DEBUG("failed to receive AHS; " - "dropping connection"); - break; - } + icl_pdu_receive_ahs(request, r, rs); ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST; if (ic->ic_header_crc32c == false) ic->ic_receive_len = 0; @@ -637,7 +533,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) case ICL_CONN_STATE_HEADER_DIGEST: //ICL_DEBUG("receiving header digest"); - error = icl_pdu_check_header_digest(request, availablep); + error = icl_pdu_check_header_digest(request, r, rs); if (error != 0) { ICL_DEBUG("header digest failed; " "dropping connection"); @@ -651,7 +547,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) case ICL_CONN_STATE_DATA: //ICL_DEBUG("receiving data segment"); - error = icl_pdu_receive_data_segment(request, availablep, + error = icl_pdu_receive_data_segment(request, r, rs, &more_needed); if (error != 0) { ICL_DEBUG("failed to receive data segment;" @@ -671,7 +567,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) case ICL_CONN_STATE_DATA_DIGEST: //ICL_DEBUG("receiving data digest"); - error = icl_pdu_check_data_digest(request, availablep); + error = icl_pdu_check_data_digest(request, r, rs); if (error != 0) { ICL_DEBUG("data digest failed; " "dropping connection"); @@ -703,44 +599,27 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep) } static void -icl_conn_receive_pdus(struct icl_conn *ic, size_t available) +icl_conn_receive_pdus(struct icl_conn *ic, struct mbuf **r, size_t *rs) { struct icl_pdu *response; - struct socket *so; - - so = ic->ic_socket; - - /* - * This can never happen; we're careful to only mess with ic->ic_socket - * pointer when the send/receive threads are not running. - */ - KASSERT(so != NULL, ("NULL socket")); for (;;) { if (ic->ic_disconnecting) return; - if (so->so_error != 0) { - ICL_DEBUG("connection error %d; " - "dropping connection", so->so_error); - icl_conn_fail(ic); - return; - } - /* * Loop until we have a complete PDU or there is not enough * data in the socket buffer. */ - if (available < ic->ic_receive_len) { + if (*rs < ic->ic_receive_len) { #if 0 - ICL_DEBUG("not enough data; have %zd, " - "need %zd", available, - ic->ic_receive_len); + ICL_DEBUG("not enough data; have %zd, need %zd", + *rs, ic->ic_receive_len); #endif return; } - response = icl_conn_receive_pdu(ic, &available); + response = icl_conn_receive_pdu(ic, r, rs); if (response == NULL) continue; @@ -761,15 +640,19 @@ static void icl_receive_thread(void *arg) { struct icl_conn *ic; - size_t available; + size_t available, read = 0; struct socket *so; + struct mbuf *m, *r = NULL; + struct uio uio; + int error, flags; ic = arg; so = ic->ic_socket; for (;;) { + SOCKBUF_LOCK(&so->so_rcv); if (ic->ic_disconnecting) { - //ICL_DEBUG("terminating"); + SOCKBUF_UNLOCK(&so->so_rcv); break; } @@ -779,18 +662,50 @@ icl_receive_thread(void *arg) * to avoid unnecessary wakeups until there * is enough data received to read the PDU. */ - SOCKBUF_LOCK(&so->so_rcv); available = sbavail(&so->so_rcv); - if (available < ic->ic_receive_len) { - so->so_rcv.sb_lowat = ic->ic_receive_len; + if (read + available < ic->ic_receive_len) { + so->so_rcv.sb_lowat = ic->ic_receive_len - read; cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx); - } else so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1; + available = sbavail(&so->so_rcv); + } SOCKBUF_UNLOCK(&so->so_rcv); - icl_conn_receive_pdus(ic, available); + if (available == 0) { + if (so->so_error != 0) { + ICL_DEBUG("connection error %d; " + "dropping connection", so->so_error); + icl_conn_fail(ic); + break; + } + continue; + } + + memset(&uio, 0, sizeof(uio)); + uio.uio_resid = available; + flags = MSG_DONTWAIT; + error = soreceive(so, NULL, &uio, &m, NULL, &flags); + if (error != 0) { + ICL_DEBUG("soreceive error %d", error); + break; + } + if (uio.uio_resid != 0) { + m_freem(m); + ICL_DEBUG("short read"); + break; + } + if (r) + m_cat(r, m); + else + r = m; + read += available; + + icl_conn_receive_pdus(ic, &r, &read); } + if (r) + m_freem(r); + ICL_CONN_LOCK(ic); ic->ic_receive_running = false; cv_signal(&ic->ic_send_cv); @@ -1366,12 +1281,17 @@ icl_soft_conn_close(struct icl_conn *ic) struct icl_pdu *pdu; struct socket *so; - ICL_CONN_LOCK(ic); - /* * Wake up the threads, so they can properly terminate. + * Receive thread sleeps on so->so_rcv lock, send on ic->ic_lock. */ - ic->ic_disconnecting = true; + ICL_CONN_LOCK(ic); + if (!ic->ic_disconnecting) { + so = ic->ic_socket; + SOCKBUF_LOCK(&so->so_rcv); + ic->ic_disconnecting = true; + SOCKBUF_UNLOCK(&so->so_rcv); + } while (ic->ic_receive_running || ic->ic_send_running) { cv_signal(&ic->ic_receive_cv); cv_signal(&ic->ic_send_cv);