Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 7 Feb 2001 18:58:07 -0800 (PST)
From:      Luigi Rizzo <rizzo@aciri.org>
To:        net@freebsd.org
Cc:        rizzo@aciri.org
Subject:   [patch] fast sbappend*, please try...
Message-ID:  <200102080258.f182w7c08367@iguana.aciri.org>

next in thread | raw e-mail | index | archive | help
Hi,

I would be grateful if someone could test the attached patch which
deals with the following problem:

	on all *BSD version, socket buffers contain a list of
	incoming and/or outgoing mbufs. Unfortunately the list only
	has a pointer to the head, meaning that all append operations
	require to scan the full list. The overhead can be very
	bad in some cases (e.g. small UDP packets), and becomes
	worse and worse as the socket buffer size increases (which
	is what one would commonly do when expecting a lot of
	traffic!).

The attached patch implements a tail pointer to the list, so that
you can append in constant time. By default, the code works exactly
as before -- the tail of the list is reached with the usual linear
scan, and the pointer to the tail is only used for comparison
purposes to make sure that it yields the same value.

If you enable the fast behaviour with

	sysctl -w kern.ipc.fastscan=1

then the new code takes over and linear scans are replaced by
dereferences of the tail pointer.

Apart from the obvious benefits of using O(1) instead of O(n)
algorithms, your mileage may vary.

When the socket buffer is almost always empty (fast receivers) then
you have no gain. When the socket buffer is almost always full you
also have no gain, because the decision to drop the packet only
requires a comparison. However, this code can really avoid trashing
in those cases where the queue size oscillates.

I'd like to commit this (or similar) code after proper testing,
so i'd like people to try it out -- I am reasonably confident
about it, and have done a fair amount of testing under heavy
udp load, but want to be sure that there are no side effects.

The attached patch applies to 4.2 (and hopefully to CURRENT
as well).

	cheers
	luigi


Index: sys/socketvar.h
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/sys/socketvar.h,v
retrieving revision 1.46.2.4
diff -u -r1.46.2.4 socketvar.h
--- sys/socketvar.h	2000/11/26 02:30:08	1.46.2.4
+++ sys/socketvar.h	2001/02/08 00:58:31
@@ -93,6 +93,7 @@
 		u_long	sb_mbmax;	/* max chars of mbufs to use */
 		long	sb_lowat;	/* low water mark */
 		struct	mbuf *sb_mb;	/* the mbuf chain */
+		struct	mbuf *sb_mb_tail; /* last pkt in chain (if sb_mb != NULL) */
 		struct	selinfo sb_sel;	/* process selecting read/write */
 		short	sb_flags;	/* flags, see below */
 		short	sb_timeo;	/* timeout for read/write */
Index: kern/uipc_socket.c
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.68.2.10
diff -u -r1.68.2.10 uipc_socket.c
--- kern/uipc_socket.c	2000/11/17 19:47:27	1.68.2.10
+++ kern/uipc_socket.c	2001/02/08 01:40:04
@@ -772,6 +772,12 @@
 		goto restart;
 	}
 dontblock:
+	/* 
+	 * On entry here, m points to the first record on the socket buffer.
+	 * While we process the initial mbufs containing address and control
+	 * info we save a copy of m->m_nextpkt into nextrecord. We do need
+	 * to take care of sb_mb_tail until later.
+	 */
 	if (uio->uio_procp)
 		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
 	nextrecord = m->m_nextpkt;
@@ -815,9 +821,17 @@
 			controlp = &(*controlp)->m_next;
 		}
 	}
+	/*
+	 * If m is non-null, we have some data to read. From now on, make
+	 * sure to keep sb_mb_tail consistent when working on the last
+	 * packet on the chain (nextrecord==NULL) and we change m->m_nextpkt.
+	 */
 	if (m) {
-		if ((flags & MSG_PEEK) == 0)
+		if ((flags & MSG_PEEK) == 0) {
 			m->m_nextpkt = nextrecord;
+			if (nextrecord == NULL)
+				so->so_rcv.sb_mb_tail = m ;
+		}
 		type = m->m_type;
 		if (type == MT_OOBDATA)
 			flags |= MSG_OOB;
@@ -873,8 +887,11 @@
 					MFREE(m, so->so_rcv.sb_mb);
 					m = so->so_rcv.sb_mb;
 				}
-				if (m)
+				if (m) {
 					m->m_nextpkt = nextrecord;
+					if (nextrecord == NULL)
+						so->so_rcv.sb_mb_tail = m ;
+				}
 			}
 		} else {
 			if (flags & MSG_PEEK)
@@ -931,8 +948,11 @@
 			(void) sbdroprecord(&so->so_rcv);
 	}
 	if ((flags & MSG_PEEK) == 0) {
-		if (m == 0)
+		if (m == 0) {
 			so->so_rcv.sb_mb = nextrecord;
+			if (nextrecord == NULL || nextrecord->m_nextpkt == NULL)
+				so->so_rcv.sb_mb_tail = nextrecord ;
+		}
 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
 	}
Index: kern/uipc_socket2.c
===================================================================
RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.55.2.7
diff -u -r1.55.2.7 uipc_socket2.c
--- kern/uipc_socket2.c	2000/09/07 19:13:37	1.55.2.7
+++ kern/uipc_socket2.c	2001/02/08 01:44:53
@@ -55,6 +55,8 @@
 
 int	maxsockets;
 
+int fastscan; /* XXX see below */
+
 /*
  * Primitive routines for operating on sockets and socket buffers
  */
@@ -477,7 +479,51 @@
  * or sbdroprecord() when the data is acknowledged by the peer.
  */
 
+static struct mbuf *
+sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0);
 /*
+ * sbgettail returns a pointer to the last record of the socketbuffer.
+ * If m0 is non-null, it also appends m0 to the chain.
+ */
+static struct mbuf *
+sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0)
+{
+    struct mbuf *m = sb->sb_mb;
+
+    if (m == NULL)
+	goto done;
+    if (sb->sb_mb_tail == NULL)
+	printf("%s: null tail\n", msg);
+    if (fastscan && sb->sb_mb_tail != NULL) {
+	m = sb->sb_mb_tail ;
+	if (m == NULL)
+	    panic ("sbgettail returns NULL");
+	if (m->m_nextpkt == NULL) /* ok ... */
+	    goto done;
+	/* otherwise continue scan */
+	printf("%s: sbgettail m_nextpkt != NULL\n", msg);
+    }
+    while (m->m_nextpkt)
+	m = m->m_nextpkt ;
+    if (m != sb->sb_mb_tail) {
+	if (sb->sb_mb_tail != NULL)
+	    printf("%s: bad tail 0x%p instead of 0x%p\n",
+		msg, sb->sb_mb_tail, m);
+	sb->sb_mb_tail = m ;
+    }
+done:
+    if (m0) {
+	if (m)
+	    m->m_nextpkt = m0;
+	else
+	    sb->sb_mb = m0;
+	sb->sb_mb_tail = m0 ;
+	m = m0 ;
+    }
+    return m ;
+}
+
+/*
  * Append mbuf chain m to the last record in the
  * socket buffer sb.  The additional space associated
  * the mbuf chain is recorded in sb.  Empty mbufs are
@@ -492,10 +538,8 @@
 
 	if (m == 0)
 		return;
-	n = sb->sb_mb;
+	n = sbgettail(sb, "sbappend", NULL);
 	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
 		do {
 			if (n->m_flags & M_EOR) {
 				sbappendrecord(sb, m); /* XXXXXX!!!! */
@@ -545,19 +589,13 @@
 
 	if (m0 == 0)
 		return;
-	m = sb->sb_mb;
-	if (m)
-		while (m->m_nextpkt)
-			m = m->m_nextpkt;
 	/*
 	 * Put the first mbuf on the queue.
 	 * Note this permits zero length records.
 	 */
 	sballoc(sb, m0);
-	if (m)
-		m->m_nextpkt = m0;
-	else
-		sb->sb_mb = m0;
+	m = sbgettail(sb, "sbappendrecord", m0);
+	if (m0->m_nextpkt != NULL) printf("ouch! sbappendrecord nextpkt!=NULL\n");
 	m = m0->m_next;
 	m0->m_next = 0;
 	if (m && (m0->m_flags & M_EOR)) {
@@ -603,6 +641,8 @@
 	 */
 	sballoc(sb, m0);
 	m0->m_nextpkt = *mp;
+	if (*mp == NULL) /* m0 is actually the new tail */
+		sb->sb_mb_tail = m0 ;
 	*mp = m0;
 	m = m0->m_next;
 	m0->m_next = 0;
@@ -653,13 +693,7 @@
 	m->m_next = control;
 	for (n = m; n; n = n->m_next)
 		sballoc(sb, n);
-	n = sb->sb_mb;
-	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
-		n->m_nextpkt = m;
-	} else
-		sb->sb_mb = m;
+	n = sbgettail(sb, "sbappendaddr", m);
 	return (1);
 }
 
@@ -686,13 +720,7 @@
 	n->m_next = m0;			/* concatenate data to control */
 	for (m = control; m; m = m->m_next)
 		sballoc(sb, m);
-	n = sb->sb_mb;
-	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
-		n->m_nextpkt = control;
-	} else
-		sb->sb_mb = control;
+	n = sbgettail(sb, "sbappendcontrol", control);
 	return (1);
 }
 
@@ -731,7 +759,7 @@
 		if (n)
 			n->m_next = m;
 		else
-			sb->sb_mb = m;
+			sb->sb_mb = sb->sb_mb_tail = m;
 		sballoc(sb, m);
 		n = m;
 		m->m_flags &= ~M_EOR;
@@ -1003,6 +1031,8 @@
 SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
     &sb_efficiency, 0, "");
 
+SYSCTL_INT(_kern_ipc, OID_AUTO, fastscan, CTLFLAG_RW,
+    &fastscan, 0, "Fast scanning of socket queues for append");
 /*
  * Initialise maxsockets 
  */


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-net" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200102080258.f182w7c08367>