Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 15 Mar 2021 02:39:48 GMT
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject:   git: 31d41a6a9b65 - stable/13 - Coalesce socket reads in software iSCSI.
Message-ID:  <202103150239.12F2dmDB061629@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch stable/13 has been updated by mav:

URL: https://cgit.FreeBSD.org/src/commit/?id=31d41a6a9b65e1ff1f71250fd900ab2d451902f0

commit 31d41a6a9b65e1ff1f71250fd900ab2d451902f0
Author:     Alexander Motin <mav@FreeBSD.org>
AuthorDate: 2021-02-22 17:23:35 +0000
Commit:     Alexander Motin <mav@FreeBSD.org>
CommitDate: 2021-03-15 02:34:28 +0000

    Coalesce socket reads in software iSCSI.
    
    Instead of 2-4 socket reads per PDU this can do as low as one read
    per megabyte, dramatically reducing TCP overhead and lock contention.
    
    With this on iSCSI target I can write more than 4GB/s through a
    single connection.
    
    MFC after:      1 month
    
    (cherry picked from commit 6895f89fe54e0858aea70d2bd2a9651f45d7998e)
---
 sys/dev/iscsi/icl_soft.c | 258 ++++++++++++++++-------------------------------
 1 file changed, 89 insertions(+), 169 deletions(-)

diff --git a/sys/dev/iscsi/icl_soft.c b/sys/dev/iscsi/icl_soft.c
index a8986b3d4253..a5696647169a 100644
--- a/sys/dev/iscsi/icl_soft.c
+++ b/sys/dev/iscsi/icl_soft.c
@@ -165,68 +165,6 @@ icl_conn_fail(struct icl_conn *ic)
 	(ic->ic_error)(ic);
 }
 
-static struct mbuf *
-icl_conn_receive(struct icl_conn *ic, size_t len)
-{
-	struct uio uio;
-	struct socket *so;
-	struct mbuf *m;
-	int error, flags;
-
-	so = ic->ic_socket;
-
-	memset(&uio, 0, sizeof(uio));
-	uio.uio_resid = len;
-
-	flags = MSG_DONTWAIT;
-	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
-	if (error != 0) {
-		ICL_DEBUG("soreceive error %d", error);
-		return (NULL);
-	}
-	if (uio.uio_resid != 0) {
-		m_freem(m);
-		ICL_DEBUG("short read");
-		return (NULL);
-	}
-
-	return (m);
-}
-
-static int
-icl_conn_receive_buf(struct icl_conn *ic, void *buf, size_t len)
-{
-	struct iovec iov[1];
-	struct uio uio;
-	struct socket *so;
-	int error, flags;
-
-	so = ic->ic_socket;
-
-	memset(&uio, 0, sizeof(uio));
-	iov[0].iov_base = buf;
-	iov[0].iov_len = len;
-	uio.uio_iov = iov;
-	uio.uio_iovcnt = 1;
-	uio.uio_offset = 0;
-	uio.uio_resid = len;
-	uio.uio_segflg = UIO_SYSSPACE;
-	uio.uio_rw = UIO_READ;
-
-	flags = MSG_DONTWAIT;
-	error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
-	if (error != 0) {
-		ICL_DEBUG("soreceive error %d", error);
-		return (-1);
-	}
-	if (uio.uio_resid != 0) {
-		ICL_DEBUG("short read");
-		return (-1);
-	}
-
-	return (0);
-}
-
 static void
 icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
 {
@@ -384,37 +322,28 @@ icl_pdu_size(const struct icl_pdu *response)
 	return (len);
 }
 
-static int
-icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
+static void
+icl_soft_receive_buf(struct mbuf **r, size_t *rs, void *buf, size_t s)
 {
 
-	if (icl_conn_receive_buf(request->ip_conn,
-	    request->ip_bhs, sizeof(struct iscsi_bhs))) {
-		ICL_DEBUG("failed to receive BHS");
-		return (-1);
-	}
-
-	*availablep -= sizeof(struct iscsi_bhs);
-	return (0);
+	m_copydata(*r, 0, s, buf);
+	m_adj(*r, s);
+	while ((*r) != NULL && (*r)->m_len == 0)
+		*r = m_free(*r);
+	*rs -= s;
 }
 
-static int
-icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
+static void
+icl_pdu_receive_ahs(struct icl_pdu *request, struct mbuf **r, size_t *rs)
 {
 
 	request->ip_ahs_len = icl_pdu_ahs_length(request);
 	if (request->ip_ahs_len == 0)
-		return (0);
-
-	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
-	    request->ip_ahs_len);
-	if (request->ip_ahs_mbuf == NULL) {
-		ICL_DEBUG("failed to receive AHS");
-		return (-1);
-	}
+		return;
 
-	*availablep -= request->ip_ahs_len;
-	return (0);
+	request->ip_ahs_mbuf = *r;
+	*r = m_split(request->ip_ahs_mbuf, request->ip_ahs_len, M_WAITOK);
+	*rs -= request->ip_ahs_len;
 }
 
 static uint32_t
@@ -433,7 +362,7 @@ icl_mbuf_to_crc32c(const struct mbuf *m0)
 }
 
 static int
-icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
+icl_pdu_check_header_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs)
 {
 	uint32_t received_digest, valid_digest;
 
@@ -441,12 +370,7 @@ icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
 		return (0);
 
 	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
-	if (icl_conn_receive_buf(request->ip_conn,
-	    &received_digest, ISCSI_HEADER_DIGEST_SIZE)) {
-		ICL_DEBUG("failed to receive header digest");
-		return (-1);
-	}
-	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
+	icl_soft_receive_buf(r, rs, &received_digest, ISCSI_HEADER_DIGEST_SIZE);
 
 	/* Temporary attach AHS to BHS to calculate header digest. */
 	request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf;
@@ -514,8 +438,8 @@ icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
 }
 
 static int
-icl_pdu_receive_data_segment(struct icl_pdu *request,
-    size_t *availablep, bool *more_neededp)
+icl_pdu_receive_data_segment(struct icl_pdu *request, struct mbuf **r,
+    size_t *rs, bool *more_neededp)
 {
 	struct icl_conn *ic;
 	size_t len, padding = 0;
@@ -539,7 +463,7 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
 	len -= request->ip_data_len;
 
-	if (len + padding > *availablep) {
+	if (len + padding > *rs) {
 		/*
 		 * Not enough data in the socket buffer.  Receive as much
 		 * as we can.  Don't receive padding, since, obviously, it's
@@ -547,9 +471,9 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
 		 */
 #if 0
 		ICL_DEBUG("limited from %zd to %zd",
-		    len + padding, *availablep - padding));
+		    len + padding, *rs - padding));
 #endif
-		len = *availablep - padding;
+		len = *rs - padding;
 		*more_neededp = true;
 		padding = 0;
 	}
@@ -559,11 +483,9 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
 	 * of actual data segment.
 	 */
 	if (len > 0) {
-		m = icl_conn_receive(request->ip_conn, len + padding);
-		if (m == NULL) {
-			ICL_DEBUG("failed to receive data segment");
-			return (-1);
-		}
+		m = *r;
+		*r = m_split(m, len + padding, M_WAITOK);
+		*rs -= len + padding;
 
 		if (request->ip_data_mbuf == NULL)
 			request->ip_data_mbuf = m;
@@ -571,7 +493,6 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
 			m_cat(request->ip_data_mbuf, m);
 
 		request->ip_data_len += len;
-		*availablep -= len + padding;
 	} else
 		ICL_DEBUG("len 0");
 
@@ -583,7 +504,7 @@ icl_pdu_receive_data_segment(struct icl_pdu *request,
 }
 
 static int
-icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
+icl_pdu_check_data_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs)
 {
 	uint32_t received_digest, valid_digest;
 
@@ -594,12 +515,7 @@ icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
 		return (0);
 
 	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
-	if (icl_conn_receive_buf(request->ip_conn,
-	    &received_digest, ISCSI_DATA_DIGEST_SIZE)) {
-		ICL_DEBUG("failed to receive data digest");
-		return (-1);
-	}
-	*availablep -= ISCSI_DATA_DIGEST_SIZE;
+	icl_soft_receive_buf(r, rs, &received_digest, ISCSI_DATA_DIGEST_SIZE);
 
 	/*
 	 * Note that ip_data_mbuf also contains padding; since digest
@@ -621,16 +537,13 @@ icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
  */
 static struct icl_pdu *
-icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
+icl_conn_receive_pdu(struct icl_conn *ic, struct mbuf **r, size_t *rs)
 {
 	struct icl_pdu *request;
-	struct socket *so;
 	size_t len;
-	int error;
+	int error = 0;
 	bool more_needed;
 
-	so = ic->ic_socket;
-
 	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
 		KASSERT(ic->ic_receive_pdu == NULL,
 		    ("ic->ic_receive_pdu != NULL"));
@@ -648,23 +561,11 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
 		request = ic->ic_receive_pdu;
 	}
 
-	if (*availablep < ic->ic_receive_len) {
-#if 0
-		ICL_DEBUG("not enough data; need %zd, "
-		    "have %zd", ic->ic_receive_len, *availablep);
-#endif
-		return (NULL);
-	}
-
 	switch (ic->ic_receive_state) {
 	case ICL_CONN_STATE_BHS:
 		//ICL_DEBUG("receiving BHS");
-		error = icl_pdu_receive_bhs(request, availablep);
-		if (error != 0) {
-			ICL_DEBUG("failed to receive BHS; "
-			    "dropping connection");
-			break;
-		}
+		icl_soft_receive_buf(r, rs, request->ip_bhs,
+		    sizeof(struct iscsi_bhs));
 
 		/*
 		 * We don't enforce any limit for AHS length;
@@ -686,12 +587,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
 
 	case ICL_CONN_STATE_AHS:
 		//ICL_DEBUG("receiving AHS");
-		error = icl_pdu_receive_ahs(request, availablep);
-		if (error != 0) {
-			ICL_DEBUG("failed to receive AHS; "
-			    "dropping connection");
-			break;
-		}
+		icl_pdu_receive_ahs(request, r, rs);
 		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
 		if (ic->ic_header_crc32c == false)
 			ic->ic_receive_len = 0;
@@ -701,7 +597,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
 
 	case ICL_CONN_STATE_HEADER_DIGEST:
 		//ICL_DEBUG("receiving header digest");
-		error = icl_pdu_check_header_digest(request, availablep);
+		error = icl_pdu_check_header_digest(request, r, rs);
 		if (error != 0) {
 			ICL_DEBUG("header digest failed; "
 			    "dropping connection");
@@ -715,7 +611,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
 
 	case ICL_CONN_STATE_DATA:
 		//ICL_DEBUG("receiving data segment");
-		error = icl_pdu_receive_data_segment(request, availablep,
+		error = icl_pdu_receive_data_segment(request, r, rs,
 		    &more_needed);
 		if (error != 0) {
 			ICL_DEBUG("failed to receive data segment;"
@@ -735,7 +631,7 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
 
 	case ICL_CONN_STATE_DATA_DIGEST:
 		//ICL_DEBUG("receiving data digest");
-		error = icl_pdu_check_data_digest(request, availablep);
+		error = icl_pdu_check_data_digest(request, r, rs);
 		if (error != 0) {
 			ICL_DEBUG("data digest failed; "
 			    "dropping connection");
@@ -767,44 +663,27 @@ icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
 }
 
 static void
-icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
+icl_conn_receive_pdus(struct icl_conn *ic, struct mbuf **r, size_t *rs)
 {
 	struct icl_pdu *response;
-	struct socket *so;
-
-	so = ic->ic_socket;
-
-	/*
-	 * This can never happen; we're careful to only mess with ic->ic_socket
-	 * pointer when the send/receive threads are not running.
-	 */
-	KASSERT(so != NULL, ("NULL socket"));
 
 	for (;;) {
 		if (ic->ic_disconnecting)
 			return;
 
-		if (so->so_error != 0) {
-			ICL_DEBUG("connection error %d; "
-			    "dropping connection", so->so_error);
-			icl_conn_fail(ic);
-			return;
-		}
-
 		/*
 		 * Loop until we have a complete PDU or there is not enough
 		 * data in the socket buffer.
 		 */
-		if (available < ic->ic_receive_len) {
+		if (*rs < ic->ic_receive_len) {
 #if 0
-			ICL_DEBUG("not enough data; have %zd, "
-			    "need %zd", available,
-			    ic->ic_receive_len);
+			ICL_DEBUG("not enough data; have %zd, need %zd",
+			    *rs, ic->ic_receive_len);
 #endif
 			return;
 		}
 
-		response = icl_conn_receive_pdu(ic, &available);
+		response = icl_conn_receive_pdu(ic, r, rs);
 		if (response == NULL)
 			continue;
 
@@ -825,15 +704,19 @@ static void
 icl_receive_thread(void *arg)
 {
 	struct icl_conn *ic;
-	size_t available;
+	size_t available, read = 0;
 	struct socket *so;
+	struct mbuf *m, *r = NULL;
+	struct uio uio;
+	int error, flags;
 
 	ic = arg;
 	so = ic->ic_socket;
 
 	for (;;) {
+		SOCKBUF_LOCK(&so->so_rcv);
 		if (ic->ic_disconnecting) {
-			//ICL_DEBUG("terminating");
+			SOCKBUF_UNLOCK(&so->so_rcv);
 			break;
 		}
 
@@ -843,18 +726,50 @@ icl_receive_thread(void *arg)
 		 * to avoid unnecessary wakeups until there
 		 * is enough data received to read the PDU.
 		 */
-		SOCKBUF_LOCK(&so->so_rcv);
 		available = sbavail(&so->so_rcv);
-		if (available < ic->ic_receive_len) {
-			so->so_rcv.sb_lowat = ic->ic_receive_len;
+		if (read + available < ic->ic_receive_len) {
+			so->so_rcv.sb_lowat = ic->ic_receive_len - read;
 			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
-		} else
 			so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
+			available = sbavail(&so->so_rcv);
+		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 
-		icl_conn_receive_pdus(ic, available);
+		if (available == 0) {
+			if (so->so_error != 0) {
+				ICL_DEBUG("connection error %d; "
+				    "dropping connection", so->so_error);
+				icl_conn_fail(ic);
+				break;
+			}
+			continue;
+		}
+
+		memset(&uio, 0, sizeof(uio));
+		uio.uio_resid = available;
+		flags = MSG_DONTWAIT;
+		error = soreceive(so, NULL, &uio, &m, NULL, &flags);
+		if (error != 0) {
+			ICL_DEBUG("soreceive error %d", error);
+			break;
+		}
+		if (uio.uio_resid != 0) {
+			m_freem(m);
+			ICL_DEBUG("short read");
+			break;
+		}
+		if (r)
+			m_cat(r, m);
+		else
+			r = m;
+		read += available;
+
+		icl_conn_receive_pdus(ic, &r, &read);
 	}
 
+	if (r)
+		m_freem(r);
+
 	ICL_CONN_LOCK(ic);
 	ic->ic_receive_running = false;
 	cv_signal(&ic->ic_send_cv);
@@ -1440,12 +1355,17 @@ icl_soft_conn_close(struct icl_conn *ic)
 	struct icl_pdu *pdu;
 	struct socket *so;
 
-	ICL_CONN_LOCK(ic);
-
 	/*
 	 * Wake up the threads, so they can properly terminate.
+	 * Receive thread sleeps on so->so_rcv lock, send on ic->ic_lock.
 	 */
-	ic->ic_disconnecting = true;
+	ICL_CONN_LOCK(ic);
+	if (!ic->ic_disconnecting) {
+		so = ic->ic_socket;
+		SOCKBUF_LOCK(&so->so_rcv);
+		ic->ic_disconnecting = true;
+		SOCKBUF_UNLOCK(&so->so_rcv);
+	}
 	while (ic->ic_receive_running || ic->ic_send_running) {
 		cv_signal(&ic->ic_receive_cv);
 		cv_signal(&ic->ic_send_cv);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202103150239.12F2dmDB061629>