From owner-freebsd-net Wed Feb 7 18:58:32 2001 Delivered-To: freebsd-net@freebsd.org Received: from iguana.aciri.org (iguana.aciri.org [192.150.187.36]) by hub.freebsd.org (Postfix) with ESMTP id 2B71637B65D for ; Wed, 7 Feb 2001 18:58:07 -0800 (PST) Received: (from rizzo@localhost) by iguana.aciri.org (8.11.1/8.11.1) id f182w7c08367; Wed, 7 Feb 2001 18:58:07 -0800 (PST) (envelope-from rizzo) From: Luigi Rizzo Message-Id: <200102080258.f182w7c08367@iguana.aciri.org> Subject: [patch] fast sbappend*, please try... To: net@freebsd.org Date: Wed, 7 Feb 2001 18:58:07 -0800 (PST) Cc: rizzo@aciri.org X-Mailer: ELM [version 2.4ME+ PL43 (25)] MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: owner-freebsd-net@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.org Hi, I would be grateful if someone could test the attached patch which deals with the following problem: on all *BSD version, socket buffers contain a list of incoming and/or outgoing mbufs. Unfortunately the list only has a pointer to the head, meaning that all append operations require to scan the full list. The overhead can be very bad in some cases (e.g. small UDP packets), and becomes worse and worse as the socket buffer size increases (which is what one would commonly do when expecting a lot of traffic!). The attached patch implements a tail pointer to the list, so that you can append in constant time. By default, the code works exactly as before -- the tail of the list is reached with the usual linear scan, and the pointer to the tail is only used for comparison purposes to make sure that it yields the same value. If you enable the fast behaviour with sysctl -w kern.ipc.fastscan=1 then the new code takes over and linear scans are replaced by dereferences of the tail pointer. Apart from the obvious benefits of using O(1) instead of O(n) algorithms, your mileage may vary. When the socket buffer is almost always empty (fast receivers) then you have no gain. When the socket buffer is almost always full you also have no gain, because the decision to drop the packet only requires a comparison. However, this code can really avoid trashing in those cases where the queue size oscillates. I'd like to commit this (or similar) code after proper testing, so i'd like people to try it out -- I am reasonably confident about it, and have done a fair amount of testing under heavy udp load, but want to be sure that there are no side effects. The attached patch applies to 4.2 (and hopefully to CURRENT as well). cheers luigi Index: sys/socketvar.h =================================================================== RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/sys/socketvar.h,v retrieving revision 1.46.2.4 diff -u -r1.46.2.4 socketvar.h --- sys/socketvar.h 2000/11/26 02:30:08 1.46.2.4 +++ sys/socketvar.h 2001/02/08 00:58:31 @@ -93,6 +93,7 @@ u_long sb_mbmax; /* max chars of mbufs to use */ long sb_lowat; /* low water mark */ struct mbuf *sb_mb; /* the mbuf chain */ + struct mbuf *sb_mb_tail; /* last pkt in chain (if sb_mb != NULL) */ struct selinfo sb_sel; /* process selecting read/write */ short sb_flags; /* flags, see below */ short sb_timeo; /* timeout for read/write */ Index: kern/uipc_socket.c =================================================================== RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.68.2.10 diff -u -r1.68.2.10 uipc_socket.c --- kern/uipc_socket.c 2000/11/17 19:47:27 1.68.2.10 +++ kern/uipc_socket.c 2001/02/08 01:40:04 @@ -772,6 +772,12 @@ goto restart; } dontblock: + /* + * On entry here, m points to the first record on the socket buffer. + * While we process the initial mbufs containing address and control + * info we save a copy of m->m_nextpkt into nextrecord. We do need + * to take care of sb_mb_tail until later. + */ if (uio->uio_procp) uio->uio_procp->p_stats->p_ru.ru_msgrcv++; nextrecord = m->m_nextpkt; @@ -815,9 +821,17 @@ controlp = &(*controlp)->m_next; } } + /* + * If m is non-null, we have some data to read. From now on, make + * sure to keep sb_mb_tail consistent when working on the last + * packet on the chain (nextrecord==NULL) and we change m->m_nextpkt. + */ if (m) { - if ((flags & MSG_PEEK) == 0) + if ((flags & MSG_PEEK) == 0) { m->m_nextpkt = nextrecord; + if (nextrecord == NULL) + so->so_rcv.sb_mb_tail = m ; + } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; @@ -873,8 +887,11 @@ MFREE(m, so->so_rcv.sb_mb); m = so->so_rcv.sb_mb; } - if (m) + if (m) { m->m_nextpkt = nextrecord; + if (nextrecord == NULL) + so->so_rcv.sb_mb_tail = m ; + } } } else { if (flags & MSG_PEEK) @@ -931,8 +948,11 @@ (void) sbdroprecord(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { - if (m == 0) + if (m == 0) { so->so_rcv.sb_mb = nextrecord; + if (nextrecord == NULL || nextrecord->m_nextpkt == NULL) + so->so_rcv.sb_mb_tail = nextrecord ; + } if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreqs->pru_rcvd)(so, flags); } Index: kern/uipc_socket2.c =================================================================== RCS file: /home/iguana/u0/rizzo/ncvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.55.2.7 diff -u -r1.55.2.7 uipc_socket2.c --- kern/uipc_socket2.c 2000/09/07 19:13:37 1.55.2.7 +++ kern/uipc_socket2.c 2001/02/08 01:44:53 @@ -55,6 +55,8 @@ int maxsockets; +int fastscan; /* XXX see below */ + /* * Primitive routines for operating on sockets and socket buffers */ @@ -477,7 +479,51 @@ * or sbdroprecord() when the data is acknowledged by the peer. */ +static struct mbuf * +sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0); /* + * sbgettail returns a pointer to the last record of the socketbuffer. + * If m0 is non-null, it also appends m0 to the chain. + */ +static struct mbuf * +sbgettail(struct sockbuf *sb, char *msg, struct mbuf *m0) +{ + struct mbuf *m = sb->sb_mb; + + if (m == NULL) + goto done; + if (sb->sb_mb_tail == NULL) + printf("%s: null tail\n", msg); + if (fastscan && sb->sb_mb_tail != NULL) { + m = sb->sb_mb_tail ; + if (m == NULL) + panic ("sbgettail returns NULL"); + if (m->m_nextpkt == NULL) /* ok ... */ + goto done; + /* otherwise continue scan */ + printf("%s: sbgettail m_nextpkt != NULL\n", msg); + } + while (m->m_nextpkt) + m = m->m_nextpkt ; + if (m != sb->sb_mb_tail) { + if (sb->sb_mb_tail != NULL) + printf("%s: bad tail 0x%p instead of 0x%p\n", + msg, sb->sb_mb_tail, m); + sb->sb_mb_tail = m ; + } +done: + if (m0) { + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + sb->sb_mb_tail = m0 ; + m = m0 ; + } + return m ; +} + +/* * Append mbuf chain m to the last record in the * socket buffer sb. The additional space associated * the mbuf chain is recorded in sb. Empty mbufs are @@ -492,10 +538,8 @@ if (m == 0) return; - n = sb->sb_mb; + n = sbgettail(sb, "sbappend", NULL); if (n) { - while (n->m_nextpkt) - n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord(sb, m); /* XXXXXX!!!! */ @@ -545,19 +589,13 @@ if (m0 == 0) return; - m = sb->sb_mb; - if (m) - while (m->m_nextpkt) - m = m->m_nextpkt; /* * Put the first mbuf on the queue. * Note this permits zero length records. */ sballoc(sb, m0); - if (m) - m->m_nextpkt = m0; - else - sb->sb_mb = m0; + m = sbgettail(sb, "sbappendrecord", m0); + if (m0->m_nextpkt != NULL) printf("ouch! sbappendrecord nextpkt!=NULL\n"); m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { @@ -603,6 +641,8 @@ */ sballoc(sb, m0); m0->m_nextpkt = *mp; + if (*mp == NULL) /* m0 is actually the new tail */ + sb->sb_mb_tail = m0 ; *mp = m0; m = m0->m_next; m0->m_next = 0; @@ -653,13 +693,7 @@ m->m_next = control; for (n = m; n; n = n->m_next) sballoc(sb, n); - n = sb->sb_mb; - if (n) { - while (n->m_nextpkt) - n = n->m_nextpkt; - n->m_nextpkt = m; - } else - sb->sb_mb = m; + n = sbgettail(sb, "sbappendaddr", m); return (1); } @@ -686,13 +720,7 @@ n->m_next = m0; /* concatenate data to control */ for (m = control; m; m = m->m_next) sballoc(sb, m); - n = sb->sb_mb; - if (n) { - while (n->m_nextpkt) - n = n->m_nextpkt; - n->m_nextpkt = control; - } else - sb->sb_mb = control; + n = sbgettail(sb, "sbappendcontrol", control); return (1); } @@ -731,7 +759,7 @@ if (n) n->m_next = m; else - sb->sb_mb = m; + sb->sb_mb = sb->sb_mb_tail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; @@ -1003,6 +1031,8 @@ SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, fastscan, CTLFLAG_RW, + &fastscan, 0, "Fast scanning of socket queues for append"); /* * Initialise maxsockets */ To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-net" in the body of the message