Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 13 Mar 2014 18:42:12 +0000 (UTC)
From:      Alan Somers <asomers@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r263116 - in head: sys/kern sys/sys tests/sys/kern
Message-ID:  <201403131842.s2DIgCcM032500@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: asomers
Date: Thu Mar 13 18:42:12 2014
New Revision: 263116
URL: http://svnweb.freebsd.org/changeset/base/263116

Log:
  Replace 4.4BSD Lite's unix domain socket backpressure hack with a cleaner
  mechanism, based on the new SB_STOP sockbuf flag.  The old hack dynamically
  changed the sending sockbuf's high water mark whenever adding or removing
  data from the receiving sockbuf.  It worked for stream sockets, but it never
  worked for SOCK_SEQPACKET sockets because of their atomic nature.  If the
  sockbuf was partially full, it might return EMSGSIZE instead of blocking.
  
  The new solution is based on DragonFlyBSD's fix from commit
  3a6117bbe0ed6a87605c1e43e12a1438d8844380 on 2008-05-27.  It adds an SB_STOP
  flag to sockbufs.  Whenever uipc_send surpasses the socket's size limit, it
  sets SB_STOP on the sending sockbuf.  sbspace() will then return 0 for that
  sockbuf, causing sosend_generic and friends to block.  uipc_rcvd will
  likewise clear SB_STOP.  There are two fringe benefits: uipc_{send,rcvd} no
  longer need to call chgsbsize() on every send and receive because they don't
  change the sockbuf's high water mark.  Also, uipc_sense no longer needs to
  acquire the UIPC linkage lock, because it's simpler to compute the
  st_blksizes.
  
  There is one drawback: since sbspace() will only ever return 0 or the
  maximum, sosend_generic will allow the sockbuf to exceed its nominal maximum
  size by at most one packet of size less than the max.  I don't think that's
  a serious problem.  In fact, I'm not even positive that FreeBSD guarantees a
  socket will always stay within its nominal size limit.
  
  sys/sys/sockbuf.h
  	Add the SB_STOP flag and adjust sbspace()
  
  sys/sys/unpcb.h
  	Delete the obsolete unp_cc and unp_mbcnt fields from struct unpcb.
  
  sys/kern/uipc_usrreq.c
  	Adjust uipc_rcvd, uipc_send, and uipc_sense to use the SB_STOP
  	backpressure mechanism.  Removing obsolete unpcb fields from
  	db_show_unpcb.
  
  tests/sys/kern/unix_seqpacket_test.c
  	Clear expected failures from ATF.
  
  Obtained from:	DragonFly BSD
  PR:		kern/185812
  Reviewed by:	silence from freebsd-net@ and rwatson@
  MFC after:	3 weeks
  Sponsored by:	Spectra Logic Corporation

Modified:
  head/sys/kern/uipc_usrreq.c
  head/sys/sys/sockbuf.h
  head/sys/sys/unpcb.h
  head/tests/sys/kern/unix_seqpacket_test.c

Modified: head/sys/kern/uipc_usrreq.c
==============================================================================
--- head/sys/kern/uipc_usrreq.c	Thu Mar 13 18:17:18 2014	(r263115)
+++ head/sys/kern/uipc_usrreq.c	Thu Mar 13 18:42:12 2014	(r263116)
@@ -51,7 +51,6 @@
  *
  * TODO:
  *	RDM
- *	distinguish datagram size limits from flow control limits in SEQPACKET
  *	rethink name space problems
  *	need a proper out-of-band
  */
@@ -789,7 +788,6 @@ uipc_rcvd(struct socket *so, int flags)
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
-	u_long newhiwat;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
@@ -811,6 +809,15 @@ uipc_rcvd(struct socket *so, int flags)
 	mbcnt = so->so_rcv.sb_mbcnt;
 	sbcc = so->so_rcv.sb_cc;
 	SOCKBUF_UNLOCK(&so->so_rcv);
+	/*
+	 * There is a benign race condition at this point.  If we're planning to
+	 * clear SB_STOP, but uipc_send is called on the connected socket at
+	 * this instant, it might add data to the sockbuf and set SB_STOP.  Then
+	 * we would erroneously clear SB_STOP below, even though the sockbuf is
+	 * full.  The race is benign because the only ill effect is to allow the
+	 * sockbuf to exceed its size limit, and the size limits are not
+	 * strictly guaranteed anyway.
+	 */
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 == NULL) {
@@ -819,13 +826,9 @@ uipc_rcvd(struct socket *so, int flags)
 	}
 	so2 = unp2->unp_socket;
 	SOCKBUF_LOCK(&so2->so_snd);
-	so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
-	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
-	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
-	    newhiwat, RLIM_INFINITY);
+	if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
+		so2->so_snd.sb_flags &= ~SB_STOP;
 	sowwakeup_locked(so2);
-	unp->unp_mbcnt = mbcnt;
-	unp->unp_cc = sbcc;
 	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
@@ -836,8 +839,7 @@ uipc_send(struct socket *so, int flags, 
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
-	u_int mbcnt_delta, sbcc;
-	u_int newhiwat;
+	u_int mbcnt, sbcc;
 	int error = 0;
 
 	unp = sotounpcb(so);
@@ -991,27 +993,21 @@ uipc_send(struct socket *so, int flags, 
 			}
 		}
 
-		/*
-		 * XXXRW: While fine for SOCK_STREAM, this conflates maximum
-		 * datagram size and back-pressure for SOCK_SEQPACKET, which
-		 * can lead to undesired return of EMSGSIZE on send instead
-		 * of more desirable blocking.
-		 */
-		mbcnt_delta = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
-		unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+		mbcnt = so2->so_rcv.sb_mbcnt;
 		sbcc = so2->so_rcv.sb_cc;
 		sorwakeup_locked(so2);
 
+		/*
+		 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
+		 * it would be possible for uipc_rcvd to be called at this
+		 * point, drain the receiving sockbuf, clear SB_STOP, and then
+		 * we would set SB_STOP below.  That could lead to an empty
+		 * sockbuf having SB_STOP set
+		 */
 		SOCKBUF_LOCK(&so->so_snd);
-		if ((int)so->so_snd.sb_hiwat >= (int)(sbcc - unp2->unp_cc))
-			newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
-		else
-			newhiwat = 0;
-		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
-		    newhiwat, RLIM_INFINITY);
-		so->so_snd.sb_mbmax -= mbcnt_delta;
+		if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
+			so->so_snd.sb_flags |= SB_STOP;
 		SOCKBUF_UNLOCK(&so->so_snd);
-		unp2->unp_cc = sbcc;
 		UNP_PCB_UNLOCK(unp2);
 		m = NULL;
 		break;
@@ -1049,27 +1045,18 @@ release:
 static int
 uipc_sense(struct socket *so, struct stat *sb)
 {
-	struct unpcb *unp, *unp2;
-	struct socket *so2;
+	struct unpcb *unp;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
 
 	sb->st_blksize = so->so_snd.sb_hiwat;
-	UNP_LINK_RLOCK();
 	UNP_PCB_LOCK(unp);
-	unp2 = unp->unp_conn;
-	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) &&
-	    unp2 != NULL) {
-		so2 = unp2->unp_socket;
-		sb->st_blksize += so2->so_rcv.sb_cc;
-	}
 	sb->st_dev = NODEV;
 	if (unp->unp_ino == 0)
 		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
 	sb->st_ino = unp->unp_ino;
 	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_RUNLOCK();
 	return (0);
 }
 
@@ -2497,8 +2484,7 @@ DB_SHOW_COMMAND(unpcb, db_show_unpcb)
 	/* XXXRW: Would be nice to print the full address, if any. */
 	db_printf("unp_addr: %p\n", unp->unp_addr);
 
-	db_printf("unp_cc: %d   unp_mbcnt: %d   unp_gencnt: %llu\n",
-	    unp->unp_cc, unp->unp_mbcnt,
+	db_printf("unp_gencnt: %llu\n",
 	    (unsigned long long)unp->unp_gencnt);
 
 	db_printf("unp_flags: %x (", unp->unp_flags);

Modified: head/sys/sys/sockbuf.h
==============================================================================
--- head/sys/sys/sockbuf.h	Thu Mar 13 18:17:18 2014	(r263115)
+++ head/sys/sys/sockbuf.h	Thu Mar 13 18:42:12 2014	(r263116)
@@ -52,6 +52,7 @@
 #define	SB_NOCOALESCE	0x200		/* don't coalesce new data into existing mbufs */
 #define	SB_IN_TOE	0x400		/* socket buffer is in the middle of an operation */
 #define	SB_AUTOSIZE	0x800		/* automatically size socket buffer */
+#define	SB_STOP		0x1000		/* backpressure indicator */
 
 #define	SBS_CANTSENDMORE	0x0010	/* can't send more data to peer */
 #define	SBS_CANTRCVMORE		0x0020	/* can't receive more data from peer */
@@ -168,9 +169,19 @@ void	sbunlock(struct sockbuf *sb);
  * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
  * overflow and return 0.  Should use "lmin" but it doesn't exist now.
  */
-#define	sbspace(sb) \
-    ((long) imin((int)((sb)->sb_hiwat - (sb)->sb_cc), \
-	 (int)((sb)->sb_mbmax - (sb)->sb_mbcnt)))
+static __inline
+long
+sbspace(struct sockbuf *sb)
+{
+	long bleft;
+	long mleft;
+
+	if (sb->sb_flags & SB_STOP)
+		return(0);
+	bleft = sb->sb_hiwat - sb->sb_cc;
+	mleft = sb->sb_mbmax - sb->sb_mbcnt;
+	return((bleft < mleft) ? bleft : mleft);
+}
 
 /* adjust counters in sb reflecting allocation of m */
 #define	sballoc(sb, m) { \

Modified: head/sys/sys/unpcb.h
==============================================================================
--- head/sys/sys/unpcb.h	Thu Mar 13 18:17:18 2014	(r263115)
+++ head/sys/sys/unpcb.h	Thu Mar 13 18:42:12 2014	(r263116)
@@ -74,8 +74,8 @@ struct unpcb {
 	struct	unp_head unp_refs;	/* referencing socket linked list */
 	LIST_ENTRY(unpcb) unp_reflink;	/* link in unp_refs list */
 	struct	sockaddr_un *unp_addr;	/* bound address of socket */
-	int	unp_cc;			/* copy of rcv.sb_cc */
-	int	unp_mbcnt;		/* copy of rcv.sb_mbcnt */
+	int	reserved1;
+	int	reserved2;
 	unp_gen_t unp_gencnt;		/* generation count of this instance */
 	short	unp_flags;		/* flags */
 	short	unp_gcflag;		/* Garbage collector flags. */

Modified: head/tests/sys/kern/unix_seqpacket_test.c
==============================================================================
--- head/tests/sys/kern/unix_seqpacket_test.c	Thu Mar 13 18:17:18 2014	(r263115)
+++ head/tests/sys/kern/unix_seqpacket_test.c	Thu Mar 13 18:42:12 2014	(r263116)
@@ -844,25 +844,21 @@ ATF_TC_BODY(emsgsize_nonblocking, tc)
 ATF_TC_WITHOUT_HEAD(eagain_8k_8k);
 ATF_TC_BODY(eagain_8k_8k, tc)
 {
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_eagain(8192, 8192);
 }
 ATF_TC_WITHOUT_HEAD(eagain_8k_128k);
 ATF_TC_BODY(eagain_8k_128k, tc)
 {
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_eagain(8192, 131072);
 }
 ATF_TC_WITHOUT_HEAD(eagain_128k_8k);
 ATF_TC_BODY(eagain_128k_8k, tc)
 {
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_eagain(131072, 8192);
 }
 ATF_TC_WITHOUT_HEAD(eagain_128k_128k);
 ATF_TC_BODY(eagain_128k_128k, tc)
 {
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_eagain(131072, 131072);
 }
 
@@ -969,37 +965,24 @@ ATF_TC_BODY(pipe_simulator_128k_128k, tc
 ATF_TC_WITHOUT_HEAD(pipe_8k_8k);
 ATF_TC_BODY(pipe_8k_8k, tc)
 {
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_pipe(8192, 8192);
 }
 
 ATF_TC_WITHOUT_HEAD(pipe_8k_128k);
 ATF_TC_BODY(pipe_8k_128k, tc)
 {
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_pipe(8192, 131072);
 }
 
 ATF_TC_WITHOUT_HEAD(pipe_128k_8k);
 ATF_TC_BODY(pipe_128k_8k, tc)
 {
-	/* 
-	 * kern/185812 causes this test case to both fail and timeout.  The
-	 * atf-c-api(3) doesn't have a way to set such an expectation.
-	 * If you use atf_tc_expect_fail, then it will timeout.  If you use
-	 * atf_tc_expect_timeout, then it will fail.  If you use both, then it
-	 * will show up as an unexpected pass, which is much worse
-	 *
-	 * https://code.google.com/p/kyua/issues/detail?id=76
-	 */
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_pipe(131072, 8192);
 }
 
 ATF_TC_WITHOUT_HEAD(pipe_128k_128k);
 ATF_TC_BODY(pipe_128k_128k, tc)
 {
-	atf_tc_expect_fail("PR kern/185812 send(2) on a UNIX domain SEQPACKET socket returns EMSGSIZE instead of EAGAIN");
 	test_pipe(131072, 131072);
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201403131842.s2DIgCcM032500>