Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 30 Jun 2006 00:14:32 +0100 (BST)
From:      Robert Watson <rwatson@FreeBSD.org>
To:        performance@FreeBSD.org
Subject:   Updated fine-grain locking patch for UNIX domain sockets
Message-ID:  <20060630001142.Y67344@fledge.watson.org>

next in thread | raw e-mail | index | archive | help

Attached, and at the below URL, find an updated copy of the UNIX domain socket 
fine-grained locking patch.  Since the last revision, I've updated the patch 
to close several race conditions historically present in UNIX domain sockets 
(which should be merged regardless of the rest of the patch), as well as move 
to an rwlock for the global lock.

     http://www.watson.org/~robert/freebsd/netperf/20060630-uds-fine-grain.diff

This patch increases locking overhead, but decreases contention.  Depending on 
the number of CPUs, it may improve (or not) performance to varying degrees; 
very good reports on sun4v; middling reports on 2-proc, etc.  Stability and 
performance results for UNIX domain socket sensitive workloads, such as MySQL, 
X11, etc, would be appreciated.  Micro-benchmark performance should show a 
small loss, but load under contention and scalability are (ideally) improved.

Robert N M Watson
Computer Laboratory
University of Cambridge

--- kern/uipc_usrreq.c.old	2006/06/26 16:20:56
+++ kern/uipc_usrreq.c	2006/06/29 23:09:34
@@ -54,6 +54,7 @@
  #include <sys/proc.h>
  #include <sys/protosw.h>
  #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
  #include <sys/socket.h>
  #include <sys/socketvar.h>
  #include <sys/signalvar.h>
@@ -88,32 +89,124 @@
  struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);

  /*
- * Currently, UNIX domain sockets are protected by a single subsystem lock,
- * which covers global data structures and variables, the contents of each
- * per-socket unpcb structure, and the so_pcb field in sockets attached to
- * the UNIX domain.  This provides for a moderate degree of paralellism, as
- * receive operations on UNIX domain sockets do not need to acquire the
- * subsystem lock.  Finer grained locking to permit send() without acquiring
- * a global lock would be a logical next step.
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering
+ * for stream sockets, although the total for sender and receiver is
+ * actually only PIPSIZ.
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace.  Their recvspace should
+ * be large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define	PIPSIZ	8192
+#endif
+static u_long	unpst_sendspace = PIPSIZ;
+static u_long	unpst_recvspace = PIPSIZ;
+static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
+static u_long	unpdg_recvspace = 4*1024;
+
+static int	unp_rights;			/* file descriptors in flight */
+
+SYSCTL_DECL(_net_local_stream);
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+	   &unpst_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+	   &unpst_recvspace, 0, "");
+SYSCTL_DECL(_net_local_dgram);
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+	   &unpdg_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+	   &unpdg_recvspace, 0, "");
+SYSCTL_DECL(_net_local);
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
+
+/*-
+ * Locking and synchronization:
+ *
+ * A global UNIX domain socket rwlock protects all global variables in the
+ * implementation, as well as the linked lists tracking the set of allocated
+ * UNIX domain sockets.  These variables/fields may be read lockless using
+ * atomic operations if stale or mutually inconsistent values are
+ * permissible; otherwise the global rwlock is required to read or
+ * read-modify-write.  The global rwlock also serves to prevent deadlock when
+ * multiple PCB locks may be acquired at once (see below).  Finally, the
+ * global rwlock protects uncounted references from vnodes to sockets bound
+ * to those vnodes: to safely dereference the v_socket pointer, the global
+ * rwlock must be held while a full reference is acquired.  Some cases:
+ *
+ * - For consistent read-modify-write of global state, hold the global lock
+ *   writable.
+ *
+ * - For consistent multiple-read and non-staleness of global state, hold the
+ *   global lock writable.
+ *
+ * - To prevent changes to the global linkage, hold the global lock readable.
+ *
+ * - When modifying global linkage, hold the global lock writable.
+ *
+ * - When acquiring multiple unpcb locks at a time, hold the global lock
+ *   writable.
+ *
+ * UNIX domain sockets each have one unpcb PCB associated with them from
+ * pru_attach() to pru_detach() via the so_pcb pointer.  The validity of that
+ * reference is an invariant for the lifetime of the socket, so no lock is
+ * required to dereference the so_pcb pointer if a valid socket reference is
+ * held.
+ *
+ * Each PCB has a back-pointer to its socket, unp_socket.  This pointer may
+ * only be safely dereferenced as long as a valid reference to the PCB is
+ * held.  Typically, this reference will be from the socket, or from another
+ * PCB when the referring PCB's lock is held (in order that the reference not
+ * be invalidated during use).  In particular, to follow
+ * unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn.
+ *
+ * Fields of PCBs are locked using a per-unpcb lock, unp_mtx.  Individual
+ * atomic reads without the lock may be performed "lockless", but more
+ * complex reads and read-modify-writes require the mutex to be held.  No
+ * lock order is defined between PCB locks -- multiple PCB locks may be
+ * acquired at the same time only when holding the global UNIX domain socket
+ * rwlock, which prevents deadlocks.  To prevent inter-PCB references from
+ * becoming invalid, either the per-unpcb lock of the unpcb holding the
+ * reference must be held, or the global rwlock must be held readable for the
+ * lifetime of the use of the reference.
   *
- * The UNIX domain socket lock preceds all socket layer locks, including the
- * socket lock and socket buffer lock, permitting UNIX domain socket code to
- * call into socket support routines without releasing its locks.
+ * Blocking with UNIX domain sockets is a tricky issue: unlike most network
+ * protocols, bind() is a non-atomic operation, and connect() requires
+ * potential sleeping in the protocol, due to potentially waiting on local or
+ * distributed file systems.  We try to separate "lookup" operations, which
+ * may sleep, and the IPC operations themselves, which typically can occur
+ * with relative atomicity as locks can be held over the entire operation.
   *
- * Some caution is required in areas where the UNIX domain socket code enters
- * VFS in order to create or find rendezvous points.  This results in
- * dropping of the UNIX domain socket subsystem lock, acquisition of the
- * Giant lock, and potential sleeping.  This increases the chances of races,
- * and exposes weaknesses in the socket->protocol API by offering poor
- * failure modes.
+ * Another tricky issue is simultaneous multi-threaded or multi-process
+ * access to a single UNIX domain socket.  These are handled by the flags
+ * UNP_CONNECTING and UNP_BINDING.
   */
-static struct mtx unp_mtx;
-#define	UNP_LOCK_INIT() \
-	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
-#define	UNP_LOCK()		mtx_lock(&unp_mtx)
-#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
-#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
-#define	UNP_UNLOCK_ASSERT()	mtx_assert(&unp_mtx, MA_NOTOWNED)
+static struct rwlock	unp_global_rwlock;
+
+#define	UNP_GLOBAL_LOCK_INIT()		rw_init(&unp_global_rwlock,	\
+					    "unp_global_rwlock")
+
+#define	UNP_GLOBAL_LOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_LOCKED)
+#define	UNP_GLOBAL_UNLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_UNLOCKED)
+
+#define	UNP_GLOBAL_WLOCK()		rw_wlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_WUNLOCK()		rw_wunlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_WLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_WLOCKED)
+
+#define	UNP_GLOBAL_RLOCK()		rw_rlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_RUNLOCK()		rw_runlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_RLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_RLOCKED)
+
+#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
+					    "unp_mtx", "unp_mtx",	\
+					    MTX_DUPOK|MTX_DEF|MTX_RECURSE)
+#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
+#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
+#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
+#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)

  /*
   * Garbage collection of cyclic file descriptor/socket references occurs
@@ -123,12 +216,10 @@
   */
  static struct task	unp_gc_task;

-static int     unp_attach(struct socket *);
  static void    unp_detach(struct unpcb *);
-static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
  static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
  static int     unp_connect2(struct socket *so, struct socket *so2, int);
-static void    unp_disconnect(struct unpcb *);
+static void    unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
  static void    unp_shutdown(struct unpcb *);
  static void    unp_drop(struct unpcb *, int);
  static void    unp_gc(__unused void *, int);
@@ -137,8 +228,6 @@
  static void    unp_discard(struct file *);
  static void    unp_freerights(struct file **, int);
  static int     unp_internalize(struct mbuf **, struct thread *);
-static int     unp_listen(struct socket *, struct unpcb *, int,
-		   struct thread *);

  static void
  uipc_abort(struct socket *so)
@@ -147,54 +236,203 @@

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
-	UNP_LOCK();
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
  	unp_drop(unp, ECONNABORTED);
  	unp_detach(unp);
-	UNP_UNLOCK_ASSERT();
+	UNP_GLOBAL_UNLOCK_ASSERT();
  }

  static int
  uipc_accept(struct socket *so, struct sockaddr **nam)
  {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
  	const struct sockaddr *sa;

  	/*
-	 * Pass back name of connected socket,
-	 * if it was bound and we are still connected
-	 * (our peer may have closed already!).
+	 * Pass back name of connected socket, if it was bound and we are
+	 * still connected (our peer may have closed already!).
  	 */
  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
+
  	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_LOCK();
-	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
-		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
-	else
+	UNP_GLOBAL_RLOCK();
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL && unp2->unp_addr != NULL) {
+		UNP_PCB_LOCK(unp2);
+		sa = (struct sockaddr *) unp2->unp_addr;
+		bcopy(sa, *nam, sa->sa_len);
+		UNP_PCB_UNLOCK(unp2);
+	} else {
  		sa = &sun_noname;
-	bcopy(sa, *nam, sa->sa_len);
-	UNP_UNLOCK();
+		bcopy(sa, *nam, sa->sa_len);
+	}
+	UNP_GLOBAL_RUNLOCK();
  	return (0);
  }

  static int
  uipc_attach(struct socket *so, int proto, struct thread *td)
  {
+	u_long sendspace, recvspace;
+	struct unpcb *unp;
+	int error;
+
+	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
+	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+		switch (so->so_type) {
+		case SOCK_STREAM:
+			sendspace = unpst_sendspace;
+			recvspace = unpst_recvspace;
+			break;
+
+		case SOCK_DGRAM:
+			sendspace = unpdg_sendspace;
+			recvspace = unpdg_recvspace;
+			break;

-	return (unp_attach(so));
+		default:
+			panic("uipc_attach");
+		}
+		error = soreserve(so, sendspace, recvspace);
+		if (error)
+			return (error);
+	}
+	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
+	if (unp == NULL)
+		return (ENOBUFS);
+	LIST_INIT(&unp->unp_refs);
+	UNP_PCB_LOCK_INIT(unp);
+	unp->unp_socket = so;
+	so->so_pcb = unp;
+
+	UNP_GLOBAL_WLOCK();
+	unp->unp_gencnt = ++unp_gencnt;
+	unp_count++;
+	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
+			 : &unp_shead, unp, unp_link);
+	UNP_GLOBAL_WUNLOCK();
+
+	return (0);
  }

  static int
  uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
  {
+	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+	struct vnode *vp;
+	struct mount *mp;
+	struct vattr vattr;
+	int error, namelen;
+	struct nameidata nd;
  	struct unpcb *unp;
-	int error;
+	char *buf;

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
-	UNP_LOCK();
-	error = unp_bind(unp, nam, td);
-	UNP_UNLOCK();
+
+	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
+	if (namelen <= 0)
+		return (EINVAL);
+
+	/*
+	 * We don't allow simultaneous bind() calls on a single UNIX domain
+	 * socket, so flag in-progress operations, and return an error if an
+	 * operation is already in progress.
+	 *
+	 * Historically, we have not allowed a socket to be rebound, so this
+	 * also returns an error.  Not allowing re-binding certainly
+	 * simplifies the implementation and avoids a great many possible
+	 * failure modes.
+	 */
+	UNP_PCB_LOCK(unp);
+	if (unp->unp_vnode != NULL) {
+		UNP_PCB_UNLOCK(unp);
+		return (EINVAL);
+	}
+	if (unp->unp_flags & UNP_BINDING) {
+		UNP_PCB_UNLOCK(unp);
+		return (EALREADY);
+	}
+	unp->unp_flags |= UNP_BINDING;
+	UNP_PCB_UNLOCK(unp);
+
+	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
+	strlcpy(buf, soun->sun_path, namelen + 1);
+
+	mtx_lock(&Giant);
+restart:
+	mtx_assert(&Giant, MA_OWNED);
+	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
+	    buf, td);
+/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
+	error = namei(&nd);
+	if (error)
+		goto error;
+	vp = nd.ni_vp;
+	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+		NDFREE(&nd, NDF_ONLY_PNBUF);
+		if (nd.ni_dvp == vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		if (vp != NULL) {
+			vrele(vp);
+			error = EADDRINUSE;
+			goto error;
+		}
+		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+		if (error)
+			goto error;
+		goto restart;
+	}
+	VATTR_NULL(&vattr);
+	vattr.va_type = VSOCK;
+	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
+#ifdef MAC
+	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+	    &vattr);
+#endif
+	if (error == 0) {
+		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+	}
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	vput(nd.ni_dvp);
+	if (error) {
+		vn_finished_write(mp);
+		goto error;
+	}
+	vp = nd.ni_vp;
+	ASSERT_VOP_LOCKED(vp, "uipc_bind");
+	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
+
+	/*
+	 * XXXRW: handle race against another consumer also frobbing
+	 * v_socket?  Or not.
+	 */
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	vp->v_socket = unp->unp_socket;
+	unp->unp_vnode = vp;
+	unp->unp_addr = soun;
+	unp->unp_flags &= ~UNP_BINDING;
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
+	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
+	mtx_unlock(&Giant);
+	free(buf, M_TEMP);
+	return (0);
+
+error:
+	UNP_PCB_LOCK(unp);
+	unp->unp_flags &= ~UNP_BINDING;
+	UNP_PCB_UNLOCK(unp);
+	mtx_unlock(&Giant);
+	free(buf, M_TEMP);
  	return (error);
  }

@@ -204,23 +442,30 @@
  	int error;

  	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
-	UNP_LOCK();
+
+	UNP_GLOBAL_WLOCK();
  	error = unp_connect(so, nam, td);
-	UNP_UNLOCK();
+	UNP_GLOBAL_WUNLOCK();
  	return (error);
  }

  int
  uipc_connect2(struct socket *so1, struct socket *so2)
  {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
  	int error;

-	unp = sotounpcb(so1);
+	UNP_GLOBAL_WLOCK();
+	unp = so1->so_pcb;
  	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
-	UNP_LOCK();
+	UNP_PCB_LOCK(unp);
+	unp2 = so2->so_pcb;
+	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
+	UNP_PCB_LOCK(unp2);
  	error = unp_connect2(so1, so2, PRU_CONNECT2);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp2);
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
  	return (error);
  }

@@ -233,21 +478,31 @@

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
-	UNP_LOCK();
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
  	unp_detach(unp);
-	UNP_UNLOCK_ASSERT();
+	UNP_GLOBAL_UNLOCK_ASSERT();
  }

  static int
  uipc_disconnect(struct socket *so)
  {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
-	UNP_LOCK();
-	unp_disconnect(unp);
-	UNP_UNLOCK();
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		unp_disconnect(unp, unp2);
+		UNP_PCB_UNLOCK(unp2);
+	}
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
  	return (0);
  }

@@ -259,81 +514,108 @@

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
-	UNP_LOCK();
+
+	UNP_PCB_LOCK(unp);
  	if (unp->unp_vnode == NULL) {
-		UNP_UNLOCK();
+		UNP_PCB_UNLOCK(unp);
  		return (EINVAL);
  	}
-	error = unp_listen(so, unp, backlog, td);
-	UNP_UNLOCK();
+
+	SOCK_LOCK(so);
+	error = solisten_proto_check(so);
+	if (error == 0) {
+		cru2x(td->td_ucred, &unp->unp_peercred);
+		unp->unp_flags |= UNP_HAVEPCCACHED;
+		solisten_proto(so, backlog);
+	}
+	SOCK_UNLOCK(so);
+	UNP_PCB_UNLOCK(unp);
  	return (error);
  }

  static int
  uipc_peeraddr(struct socket *so, struct sockaddr **nam)
  {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
  	const struct sockaddr *sa;

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
+
  	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_LOCK();
-	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
-		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
-	else {
-		/*
-		 * XXX: It seems that this test always fails even when
-		 * connection is established.  So, this else clause is
-		 * added as workaround to return PF_LOCAL sockaddr.
-		 */
+	UNP_PCB_LOCK(unp);
+	/*
+	 * XXX: It seems that this test always fails even when connection is
+	 * established.  So, this else clause is added as workaround to
+	 * return PF_LOCAL sockaddr.
+	 */
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		if (unp2->unp_addr != NULL)
+			sa = (struct sockaddr *) unp->unp_conn->unp_addr;
+		else
+			sa = &sun_noname;
+		bcopy(sa, *nam, sa->sa_len);
+		UNP_PCB_UNLOCK(unp2);
+	} else {
  		sa = &sun_noname;
+		bcopy(sa, *nam, sa->sa_len);
  	}
-	bcopy(sa, *nam, sa->sa_len);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
  	return (0);
  }

  static int
  uipc_rcvd(struct socket *so, int flags)
  {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
  	struct socket *so2;
+	u_int mbcnt, sbcc;
  	u_long newhiwat;

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
-	UNP_LOCK();
-	switch (so->so_type) {
-	case SOCK_DGRAM:
+
+	if (so->so_type == SOCK_DGRAM)
  		panic("uipc_rcvd DGRAM?");
-		/*NOTREACHED*/

-	case SOCK_STREAM:
-		if (unp->unp_conn == NULL)
-			break;
-		so2 = unp->unp_conn->unp_socket;
-		SOCKBUF_LOCK(&so2->so_snd);
-		SOCKBUF_LOCK(&so->so_rcv);
-		/*
-		 * Adjust backpressure on sender
-		 * and wakeup any waiting to write.
-		 */
-		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
-		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
-		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
-		    so->so_rcv.sb_cc;
-		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
-		    newhiwat, RLIM_INFINITY);
-		unp->unp_cc = so->so_rcv.sb_cc;
-		SOCKBUF_UNLOCK(&so->so_rcv);
-		sowwakeup_locked(so2);
-		break;
+	if (so->so_type != SOCK_STREAM)
+		panic("uipc_rcvd unknown socktype");

-	default:
-		panic("uipc_rcvd unknown socktype");
+	/*
+	 * Adjust backpressure on sender and wakeup any waiting to write.
+	 *
+	 * The consistency requirements here are a bit complex: we must
+	 * acquire the lock for our own unpcb in order to prevent it from
+	 * disconnecting while in use, changing the unp_conn peer.  We do not
+	 * need unp2's lock, since the unp2->unp_socket pointer will remain
+	 * static as long as the unp2 pcb is valid, which it will be until we
+	 * release unp's lock to allow a disconnect.  We do need socket
+	 * mutexes for both socket endpoints since we manipulate fields in
+	 * both; we hold both locks at once since we access both
+	 * simultaneously.
+	 */
+	SOCKBUF_LOCK(&so->so_rcv);
+	mbcnt = so->so_rcv.sb_mbcnt;
+	sbcc = so->so_rcv.sb_cc;
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 == NULL) {
+		UNP_PCB_UNLOCK(unp);
+		return (0);
  	}
-	UNP_UNLOCK();
+	so2 = unp2->unp_socket;
+	SOCKBUF_LOCK(&so2->so_snd);
+	so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
+	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
+	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
+	    newhiwat, RLIM_INFINITY);
+	sowwakeup_locked(so2);
+	unp->unp_cc = sbcc;
+	unp->unp_mbcnt = mbcnt;
+	UNP_PCB_UNLOCK(unp);
  	return (0);
  }

@@ -343,13 +625,15 @@
  uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
      struct mbuf *control, struct thread *td)
  {
-	int error = 0;
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
  	struct socket *so2;
+	u_int mbcnt, sbcc;
  	u_long newhiwat;
+	int error = 0;

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
+
  	if (flags & PRUS_OOB) {
  		error = EOPNOTSUPP;
  		goto release;
@@ -358,32 +642,44 @@
  	if (control != NULL && (error = unp_internalize(&control, td)))
  		goto release;

-	UNP_LOCK();
+	if ((nam != NULL) || (flags & PRUS_EOF))
+		UNP_GLOBAL_WLOCK();
+	else
+		UNP_GLOBAL_RLOCK();
+
  	switch (so->so_type) {
  	case SOCK_DGRAM:
  	{
  		const struct sockaddr *from;

+		unp2 = unp->unp_conn;
  		if (nam != NULL) {
-			if (unp->unp_conn != NULL) {
+			if (unp2 != NULL) {
  				error = EISCONN;
+				UNP_PCB_LOCK(unp);
  				break;
  			}
  			error = unp_connect(so, nam, td);
+			UNP_PCB_LOCK(unp);
  			if (error)
  				break;
+			unp2 = unp->unp_conn;
  		} else {
-			if (unp->unp_conn == NULL) {
+			UNP_PCB_LOCK(unp);
+			if (unp2 == NULL) {
  				error = ENOTCONN;
+				UNP_PCB_LOCK(unp);
  				break;
  			}
  		}
-		so2 = unp->unp_conn->unp_socket;
+		UNP_PCB_LOCK_ASSERT(unp);
+		UNP_PCB_LOCK(unp2);
+		so2 = unp2->unp_socket;
  		if (unp->unp_addr != NULL)
  			from = (struct sockaddr *)unp->unp_addr;
  		else
  			from = &sun_noname;
-		if (unp->unp_conn->unp_flags & UNP_WANTCRED)
+		if (unp2->unp_flags & UNP_WANTCRED)
  			control = unp_addsockcred(td, control);
  		SOCKBUF_LOCK(&so2->so_rcv);
  		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
@@ -395,43 +691,57 @@
  			error = ENOBUFS;
  		}
  		if (nam != NULL)
-			unp_disconnect(unp);
+			unp_disconnect(unp, unp2);
+		UNP_PCB_UNLOCK(unp2);
  		break;
  	}

  	case SOCK_STREAM:
  		/* Connect if not connected yet. */
  		/*
-		 * Note: A better implementation would complain
-		 * if not equal to the peer's address.
+		 * Note: A better implementation would complain if not equal
+		 * to the peer's address.
  		 */
  		if ((so->so_state & SS_ISCONNECTED) == 0) {
  			if (nam != NULL) {
  				error = unp_connect(so, nam, td);
+				UNP_PCB_LOCK(unp);
  				if (error)
  					break;	/* XXX */
  			} else {
  				error = ENOTCONN;
+				UNP_PCB_LOCK(unp);
  				break;
  			}
-		}
+		} else
+			UNP_PCB_LOCK(unp);
+		UNP_PCB_LOCK_ASSERT(unp);

-		SOCKBUF_LOCK(&so->so_snd);
  		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
-			SOCKBUF_UNLOCK(&so->so_snd);
  			error = EPIPE;
  			break;
  		}
-		if (unp->unp_conn == NULL)
-			panic("uipc_send connected but no connection?");
-		so2 = unp->unp_conn->unp_socket;
+		/*
+		 * Lock order here has to be handled carefully: we hold the
+		 * global lock, so acquiring two unpcb locks is OK.  We must
+		 * acquire both before acquiring any socket mutexes.  We must
+		 * also acquire the local socket send mutex before the remote
+		 * socket receive mutex.  The only tricky thing is making
+		 * sure to acquire the unp2 lock before the local socket send
+		 * lock, or we will experience deadlocks.
+		 */
+		unp2 = unp->unp_conn;
+		KASSERT(unp2 != NULL,
+		    ("uipc_send connected but no connection?"));
+		UNP_PCB_LOCK(unp2);
+		so2 = unp2->unp_socket;
  		SOCKBUF_LOCK(&so2->so_rcv);
-		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
+		if (unp2->unp_flags & UNP_WANTCRED) {
  			/*
  			 * Credentials are passed only once on
  			 * SOCK_STREAM.
  			 */
-			unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
+			unp2->unp_flags &= ~UNP_WANTCRED;
  			control = unp_addsockcred(td, control);
  		}
  		/*
@@ -442,19 +752,20 @@
  		if (control != NULL) {
  			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
  				control = NULL;
-		} else {
+		} else
  			sbappend_locked(&so2->so_rcv, m);
-		}
-		so->so_snd.sb_mbmax -=
-			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
-		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
-		newhiwat = so->so_snd.sb_hiwat -
-		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
+		mbcnt = so2->so_rcv.sb_mbcnt;
+		sbcc = so2->so_rcv.sb_cc;
+		sorwakeup_locked(so2);
+		SOCKBUF_LOCK(&so->so_snd);
+		newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
  		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
  		    newhiwat, RLIM_INFINITY);
+		so->so_snd.sb_mbmax -= mbcnt - unp->unp_conn->unp_mbcnt;
  		SOCKBUF_UNLOCK(&so->so_snd);
-		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
-		sorwakeup_locked(so2);
+		unp2->unp_mbcnt = mbcnt;
+		unp2->unp_cc = sbcc;
+		UNP_PCB_UNLOCK(unp2);
  		m = NULL;
  		break;

@@ -470,7 +781,12 @@
  		socantsendmore(so);
  		unp_shutdown(unp);
  	}
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
+
+	if ((nam != NULL) || (flags & PRUS_EOF))
+		UNP_GLOBAL_WUNLOCK();
+	else
+		UNP_GLOBAL_RUNLOCK();

  	if (control != NULL && error != 0)
  		unp_dispose(control);
@@ -486,22 +802,26 @@
  static int
  uipc_sense(struct socket *so, struct stat *sb)
  {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
  	struct socket *so2;

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
-	UNP_LOCK();
+
  	sb->st_blksize = so->so_snd.sb_hiwat;
-	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
-		so2 = unp->unp_conn->unp_socket;
+	UNP_GLOBAL_RLOCK();
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (so->so_type == SOCK_STREAM && unp2 != NULL) {
+		so2 = unp2->unp_socket;
  		sb->st_blksize += so2->so_rcv.sb_cc;
  	}
  	sb->st_dev = NODEV;
  	if (unp->unp_ino == 0)
  		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
  	sb->st_ino = unp->unp_ino;
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_RUNLOCK();
  	return (0);
  }

@@ -512,10 +832,13 @@

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
-	UNP_LOCK();
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
  	socantsendmore(so);
  	unp_shutdown(unp);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
  	return (0);
  }

@@ -527,14 +850,15 @@

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
+
  	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_LOCK();
+	UNP_PCB_LOCK(unp);
  	if (unp->unp_addr != NULL)
  		sa = (struct sockaddr *) unp->unp_addr;
  	else
  		sa = &sun_noname;
  	bcopy(sa, *nam, sa->sa_len);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
  	return (0);
  }

@@ -571,12 +895,13 @@

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
-	UNP_LOCK();
+
  	error = 0;
  	switch (sopt->sopt_dir) {
  	case SOPT_GET:
  		switch (sopt->sopt_name) {
  		case LOCAL_PEERCRED:
+			UNP_PCB_LOCK(unp);
  			if (unp->unp_flags & UNP_HAVEPC)
  				xu = unp->unp_peercred;
  			else {
@@ -585,22 +910,31 @@
  				else
  					error = EINVAL;
  			}
+			UNP_PCB_UNLOCK(unp);
  			if (error == 0)
  				error = sooptcopyout(sopt, &xu, sizeof(xu));
  			break;
+
  		case LOCAL_CREDS:
+			UNP_PCB_LOCK(unp);
  			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
+			UNP_PCB_UNLOCK(unp);
  			error = sooptcopyout(sopt, &optval, sizeof(optval));
  			break;
+
  		case LOCAL_CONNWAIT:
+			UNP_PCB_LOCK(unp);
  			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
+			UNP_PCB_UNLOCK(unp);
  			error = sooptcopyout(sopt, &optval, sizeof(optval));
  			break;
+
  		default:
  			error = EOPNOTSUPP;
  			break;
  		}
  		break;
+
  	case SOPT_SET:
  		switch (sopt->sopt_name) {
  		case LOCAL_CREDS:
@@ -610,19 +944,24 @@
  			if (error)
  				break;

-#define	OPTSET(bit) \
-	if (optval) \
-		unp->unp_flags |= bit; \
-	else \
-		unp->unp_flags &= ~bit;
+#define	OPTSET(bit) do {						\
+	UNP_PCB_LOCK(unp);						\
+	if (optval)							\
+		unp->unp_flags |= bit;					\
+	else								\
+		unp->unp_flags &= ~bit;					\
+	UNP_PCB_UNLOCK(unp);						\
+} while (0)

  			switch (sopt->sopt_name) {
  			case LOCAL_CREDS:
  				OPTSET(UNP_WANTCRED);
  				break;
+
  			case LOCAL_CONNWAIT:
  				OPTSET(UNP_CONNWAIT);
  				break;
+
  			default:
  				break;
  			}
@@ -633,117 +972,60 @@
  			break;
  		}
  		break;
+
  	default:
  		error = EOPNOTSUPP;
  		break;
  	}
-	UNP_UNLOCK();
  	return (error);
  }

-/*
- * Both send and receive buffers are allocated PIPSIZ bytes of buffering
- * for stream sockets, although the total for sender and receiver is
- * actually only PIPSIZ.
- * Datagram sockets really use the sendspace as the maximum datagram size,
- * and don't really want to reserve the sendspace.  Their recvspace should
- * be large enough for at least one max-size datagram plus address.
- */
-#ifndef PIPSIZ
-#define	PIPSIZ	8192
-#endif
-static u_long	unpst_sendspace = PIPSIZ;
-static u_long	unpst_recvspace = PIPSIZ;
-static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
-static u_long	unpdg_recvspace = 4*1024;
-
-static int	unp_rights;			/* file descriptors in flight */
-
-SYSCTL_DECL(_net_local_stream);
-SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
-	   &unpst_sendspace, 0, "");
-SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
-	   &unpst_recvspace, 0, "");
-SYSCTL_DECL(_net_local_dgram);
-SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
-	   &unpdg_sendspace, 0, "");
-SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
-	   &unpdg_recvspace, 0, "");
-SYSCTL_DECL(_net_local);
-SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
-
-static int
-unp_attach(struct socket *so)
-{
-	struct unpcb *unp;
-	int error;
-
-	KASSERT(so->so_pcb == NULL, ("unp_attach: so_pcb != NULL"));
-	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
-		switch (so->so_type) {
-
-		case SOCK_STREAM:
-			error = soreserve(so, unpst_sendspace, unpst_recvspace);
-			break;
-
-		case SOCK_DGRAM:
-			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
-			break;
-
-		default:
-			panic("unp_attach");
-		}
-		if (error)
-			return (error);
-	}
-	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
-	if (unp == NULL)
-		return (ENOBUFS);
-	LIST_INIT(&unp->unp_refs);
-	unp->unp_socket = so;
-	so->so_pcb = unp;
-
-	UNP_LOCK();
-	unp->unp_gencnt = ++unp_gencnt;
-	unp_count++;
-	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
-			 : &unp_shead, unp, unp_link);
-	UNP_UNLOCK();
-
-	return (0);
-}
-
  static void
  unp_detach(struct unpcb *unp)
  {
+	int local_unp_rights;
  	struct vnode *vp;
-	int local_unp_rights;
+	struct unpcb *unp2;

-	UNP_LOCK_ASSERT();
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);

  	LIST_REMOVE(unp, unp_link);
  	unp->unp_gencnt = ++unp_gencnt;
  	--unp_count;
+
+	/*
+	 * XXXRW: What if v_socket != our soket?
+	 */
  	if ((vp = unp->unp_vnode) != NULL) {
-		/*
-		 * XXXRW: should v_socket be frobbed only while holding
-		 * Giant?
-		 */
  		unp->unp_vnode->v_socket = NULL;
  		unp->unp_vnode = NULL;
  	}
-	if (unp->unp_conn != NULL)
-		unp_disconnect(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		unp_disconnect(unp, unp2);
+		UNP_PCB_UNLOCK(unp2);
+	}
+
+	/*
+	 * We hold the global lock, so it's OK to acquire multiple pcb locks
+	 * at a time.
+	 */
  	while (!LIST_EMPTY(&unp->unp_refs)) {
  		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
+
+		UNP_PCB_LOCK(ref);
  		unp_drop(ref, ECONNRESET);
+		UNP_PCB_UNLOCK(ref);
  	}
+	UNP_GLOBAL_WUNLOCK();
  	soisdisconnected(unp->unp_socket);
  	unp->unp_socket->so_pcb = NULL;
  	local_unp_rights = unp_rights;
-	UNP_UNLOCK();
  	if (unp->unp_addr != NULL)
  		FREE(unp->unp_addr, M_SONAME);
+	UNP_PCB_LOCK_DESTROY(unp);
  	uma_zfree(unp_zone, unp);
  	if (vp) {
  		int vfslocked;
@@ -757,97 +1039,6 @@
  }

  static int
-unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
-{
-	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
-	struct vnode *vp;
-	struct mount *mp;
-	struct vattr vattr;
-	int error, namelen;
-	struct nameidata nd;
-	char *buf;
-
-	UNP_LOCK_ASSERT();
-
-	/*
-	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
-	 * unlocked read here is fine, but the value of unp_vnode needs
-	 * to be tested again after we do all the lookups to see if the
-	 * pcb is still unbound?
-	 */
-	if (unp->unp_vnode != NULL)
-		return (EINVAL);
-
-	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
-	if (namelen <= 0)
-		return (EINVAL);
-
-	UNP_UNLOCK();
-
-	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
-	strlcpy(buf, soun->sun_path, namelen + 1);
-
-	mtx_lock(&Giant);
-restart:
-	mtx_assert(&Giant, MA_OWNED);
-	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
-	    buf, td);
-/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
-	error = namei(&nd);
-	if (error)
-		goto done;
-	vp = nd.ni_vp;
-	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
-		NDFREE(&nd, NDF_ONLY_PNBUF);
-		if (nd.ni_dvp == vp)
-			vrele(nd.ni_dvp);
-		else
-			vput(nd.ni_dvp);
-		if (vp != NULL) {
-			vrele(vp);
-			error = EADDRINUSE;
-			goto done;
-		}
-		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
-		if (error)
-			goto done;
-		goto restart;
-	}
-	VATTR_NULL(&vattr);
-	vattr.va_type = VSOCK;
-	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
-#ifdef MAC
-	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
-	    &vattr);
-#endif
-	if (error == 0) {
-		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
-		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
-	}
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-	vput(nd.ni_dvp);
-	if (error) {
-		vn_finished_write(mp);
-		goto done;
-	}
-	vp = nd.ni_vp;
-	ASSERT_VOP_LOCKED(vp, "unp_bind");
-	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
-	UNP_LOCK();
-	vp->v_socket = unp->unp_socket;
-	unp->unp_vnode = vp;
-	unp->unp_addr = soun;
-	UNP_UNLOCK();
-	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
-done:
-	mtx_unlock(&Giant);
-	free(buf, M_TEMP);
-	UNP_LOCK();
-	return (error);
-}
-
-static int
  unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
  {
  	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
@@ -859,15 +1050,25 @@
  	char buf[SOCK_MAXADDRLEN];
  	struct sockaddr *sa;

-	UNP_LOCK_ASSERT();
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_GLOBAL_WUNLOCK();

  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
  	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
  	if (len <= 0)
  		return (EINVAL);
  	strlcpy(buf, soun->sun_path, len + 1);
-	UNP_UNLOCK();
+
+	UNP_PCB_LOCK(unp);
+	if (unp->unp_flags & UNP_CONNECTING) {
+		UNP_PCB_UNLOCK(unp);
+		return (EALREADY);
+	}
+	unp->unp_flags |= UNP_CONNECTING;
+	UNP_PCB_UNLOCK(unp);
+
  	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
  	mtx_lock(&Giant);
  	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
@@ -889,9 +1090,15 @@
  	if (error)
  		goto bad;
  	mtx_unlock(&Giant);
-	UNP_LOCK();
+
  	unp = sotounpcb(so);
  	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
+	/*
+	 * Lock global lock for two reasons: make sure v_socket is stable,
+	 * and to protect simultaneous locking of multiple pcbs.
+	 */
+	UNP_GLOBAL_WLOCK();
  	so2 = vp->v_socket;
  	if (so2 == NULL) {
  		error = ECONNREFUSED;
@@ -904,14 +1111,18 @@
  	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
  		if (so2->so_options & SO_ACCEPTCONN) {
  			/*
-			 * NB: drop locks here so unp_attach is entered
+			 * NB: drop locks here so uipc_attach is entered
  			 *     w/o locks; this avoids a recursive lock
  			 *     of the head and holding sleep locks across
  			 *     a (potentially) blocking malloc.
+			 *
+			 * XXXRW: This is actually a non-blocking alloc.
+			 * Lock order issue is real.  Releasing lock here
+			 * may invalidate so2 pointer, however.
  			 */
-			UNP_UNLOCK();
+			UNP_GLOBAL_WUNLOCK();
  			so3 = sonewconn(so2, 0);
-			UNP_LOCK();
+			UNP_GLOBAL_WLOCK();
  		} else
  			so3 = NULL;
  		if (so3 == NULL) {
@@ -921,6 +1132,9 @@
  		unp = sotounpcb(so);
  		unp2 = sotounpcb(so2);
  		unp3 = sotounpcb(so3);
+		UNP_PCB_LOCK(unp);
+		UNP_PCB_LOCK(unp2);
+		UNP_PCB_LOCK(unp3);
  		if (unp2->unp_addr != NULL) {
  			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
  			unp3->unp_addr = (struct sockaddr_un *) sa;
@@ -938,7 +1152,7 @@
  		/*
  		 * The receiver's (server's) credentials are copied
  		 * from the unp_peercred member of socket on which the
-		 * former called listen(); unp_listen() cached that
+		 * former called listen(); uipc_listen() cached that
  		 * process's credentials at that time so we can use
  		 * them now.
  		 */
@@ -949,6 +1163,9 @@
  		unp->unp_flags |= UNP_HAVEPC;
  		if (unp2->unp_flags & UNP_WANTCRED)
  			unp3->unp_flags |= UNP_WANTCRED;
+		UNP_PCB_UNLOCK(unp3);
+		UNP_PCB_UNLOCK(unp2);
+		UNP_PCB_UNLOCK(unp);
  #ifdef MAC
  		SOCK_LOCK(so);
  		mac_set_socket_peer_from_socket(so, so3);
@@ -958,9 +1175,17 @@

  		so2 = so3;
  	}
+	unp = sotounpcb(so);
+	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+	unp2 = sotounpcb(so2);
+	KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
+	UNP_PCB_LOCK(unp);
+	UNP_PCB_LOCK(unp2);
  	error = unp_connect2(so, so2, PRU_CONNECT);
+	UNP_PCB_UNLOCK(unp2);
+	UNP_PCB_UNLOCK(unp);
  bad2:
-	UNP_UNLOCK();
+	UNP_GLOBAL_WUNLOCK();
  	mtx_lock(&Giant);
  bad:
  	mtx_assert(&Giant, MA_OWNED);
@@ -968,25 +1193,33 @@
  		vput(vp);
  	mtx_unlock(&Giant);
  	free(sa, M_SONAME);
-	UNP_LOCK();
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	unp->unp_flags &= ~UNP_CONNECTING;
+	UNP_PCB_UNLOCK(unp);
  	return (error);
  }

  static int
  unp_connect2(struct socket *so, struct socket *so2, int req)
  {
-	struct unpcb *unp = sotounpcb(so);
+	struct unpcb *unp;
  	struct unpcb *unp2;

-	UNP_LOCK_ASSERT();
+	unp = sotounpcb(so);
+	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
+	unp2 = sotounpcb(so2);
+	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
+
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);
+	UNP_PCB_LOCK_ASSERT(unp2);

  	if (so2->so_type != so->so_type)
  		return (EPROTOTYPE);
-	unp2 = sotounpcb(so2);
-	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
  	unp->unp_conn = unp2;
+
  	switch (so->so_type) {
-
  	case SOCK_DGRAM:
  		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
  		soisconnected(so);
@@ -1009,15 +1242,16 @@
  }

  static void
-unp_disconnect(struct unpcb *unp)
+unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
  {
-	struct unpcb *unp2 = unp->unp_conn;
  	struct socket *so;

-	UNP_LOCK_ASSERT();
+	KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
+
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);
+	UNP_PCB_LOCK_ASSERT(unp2);

-	if (unp2 == NULL)
-		return;
  	unp->unp_conn = NULL;
  	switch (unp->unp_socket->so_type) {
  	case SOCK_DGRAM:
@@ -1074,10 +1308,10 @@
  	 * OK, now we're committed to doing something.
  	 */
  	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
-	UNP_LOCK();
+	UNP_GLOBAL_RLOCK();
  	gencnt = unp_gencnt;
  	n = unp_count;
-	UNP_UNLOCK();
+	UNP_GLOBAL_RUNLOCK();

  	xug->xug_len = sizeof *xug;
  	xug->xug_count = n;
@@ -1091,23 +1325,34 @@

  	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);

-	UNP_LOCK();
+	/*
+	 * XXXRW: Note, this code relies very explicitly in pcb's being type
+	 * stable.
+	 */
+	UNP_GLOBAL_RLOCK();
  	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
  	     unp = LIST_NEXT(unp, unp_link)) {
+		UNP_PCB_LOCK(unp);
  		if (unp->unp_gencnt <= gencnt) {
  			if (cr_cansee(req->td->td_ucred,
  			    unp->unp_socket->so_cred))
  				continue;
  			unp_list[i++] = unp;
  		}
+		UNP_PCB_UNLOCK(unp);
  	}
-	UNP_UNLOCK();
+	UNP_GLOBAL_RUNLOCK();
  	n = i;			/* in case we lost some during malloc */

+	/*
+	 * XXXRW: The logic below asumes that it is OK to lock a mutex in
+	 * an unpcb that may have been freed.
+	 */
  	error = 0;
  	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
  	for (i = 0; i < n; i++) {
  		unp = unp_list[i];
+		UNP_PCB_LOCK(unp);
  		if (unp->unp_gencnt <= gencnt) {
  			xu->xu_len = sizeof *xu;
  			xu->xu_unpp = unp;
@@ -1125,8 +1370,10 @@
  				      unp->unp_conn->unp_addr->sun_len);
  			bcopy(unp, &xu->xu_unp, sizeof *unp);
  			sotoxsocket(unp->unp_socket, &xu->xu_socket);
+			UNP_PCB_UNLOCK(unp);
  			error = SYSCTL_OUT(req, xu, sizeof *xu);
-		}
+		} else
+			UNP_PCB_UNLOCK(unp);
  	}
  	free(xu, M_TEMP);
  	if (!error) {
@@ -1157,24 +1404,37 @@
  static void
  unp_shutdown(struct unpcb *unp)
  {
+	struct unpcb *unp2;
  	struct socket *so;

-	UNP_LOCK_ASSERT();
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);

-	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
-	    (so = unp->unp_conn->unp_socket))
-		socantrcvmore(so);
+	unp2 = unp->unp_conn;
+	if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) {
+		so = unp2->unp_socket;
+		if (so != NULL)
+			socantrcvmore(so);
+	}
  }

  static void
  unp_drop(struct unpcb *unp, int errno)
  {
  	struct socket *so = unp->unp_socket;
+	struct unpcb *unp2;

-	UNP_LOCK_ASSERT();
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);

  	so->so_error = errno;
-	unp_disconnect(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 == NULL)
+		return;
+
+	UNP_PCB_LOCK(unp2);
+	unp_disconnect(unp, unp2);
+	UNP_PCB_UNLOCK(unp2);
  }

  static void
@@ -1184,7 +1444,6 @@
  	struct file *fp;

  	for (i = 0; i < fdcount; i++) {
-		fp = *rp;
  		/*
  		 * zero the pointer before calling
  		 * unp_discard since it may end up
@@ -1192,7 +1451,8 @@
  		 *
  		 * XXXRW: This is less true than it used to be.
  		 */
-		*rp++ = 0;
+		fp = *rp;
+		*rp++ = NULL;
  		unp_discard(fp);
  	}
  }
@@ -1212,7 +1472,7 @@
  	int f;
  	u_int newlen;

-	UNP_UNLOCK_ASSERT();
+	UNP_GLOBAL_UNLOCK_ASSERT();

  	error = 0;
  	if (controlp != NULL) /* controlp == NULL => free control messages */
@@ -1317,6 +1577,7 @@
  void
  unp_init(void)
  {
+
  	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
  	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
  	if (unp_zone == NULL)
@@ -1327,7 +1588,7 @@
  	LIST_INIT(&unp_dhead);
  	LIST_INIT(&unp_shead);
  	TASK_INIT(&unp_gc_task, 0, unp_gc, NULL);
-	UNP_LOCK_INIT();
+	UNP_GLOBAL_LOCK_INIT();
  }

  static int
@@ -1347,7 +1608,7 @@
  	int error, oldfds;
  	u_int newlen;

-	UNP_UNLOCK_ASSERT();
+	UNP_GLOBAL_UNLOCK_ASSERT();

  	error = 0;
  	*controlp = NULL;
@@ -1741,25 +2002,6 @@
  		unp_scan(m, unp_discard);
  }

-static int
-unp_listen(struct socket *so, struct unpcb *unp, int backlog,
-    struct thread *td)
-{
-	int error;
-
-	UNP_LOCK_ASSERT();
-
-	SOCK_LOCK(so);
-	error = solisten_proto_check(so);
-	if (error == 0) {
-		cru2x(td->td_ucred, &unp->unp_peercred);
-		unp->unp_flags |= UNP_HAVEPCCACHED;
-		solisten_proto(so, backlog);
-	}
-	SOCK_UNLOCK(so);
-	return (error);
-}
-
  static void
  unp_scan(struct mbuf *m0, void (*op)(struct file *))
  {
@@ -1812,6 +2054,9 @@
  static void
  unp_mark(struct file *fp)
  {
+
+	/* XXXRW: Should probably assert file list lock here. */
+
  	if (fp->f_gcflag & FMARK)
  		return;
  	unp_defer++;
@@ -1821,11 +2066,12 @@
  static void
  unp_discard(struct file *fp)
  {
-	UNP_LOCK();
+
+	UNP_GLOBAL_WLOCK();
  	FILE_LOCK(fp);
  	fp->f_msgcount--;
  	unp_rights--;
  	FILE_UNLOCK(fp);
-	UNP_UNLOCK();
+	UNP_GLOBAL_WUNLOCK();
  	(void) closef(fp, (struct thread *)NULL);
  }
--- sys/unpcb.h.old	2005/04/13 00:05:41
+++ sys/unpcb.h	2006/04/27 20:47:29
@@ -78,6 +78,7 @@
  	unp_gen_t unp_gencnt;		/* generation count of this instance */
  	int	unp_flags;		/* flags */
  	struct	xucred unp_peercred;	/* peer credentials, if applicable */
+	struct	mtx unp_mtx;		/* mutex */
  };

  /*
@@ -98,6 +99,14 @@
  #define	UNP_WANTCRED			0x004	/* credentials wanted */
  #define	UNP_CONNWAIT			0x008	/* connect blocks until accepted */

+/*
+ * These flags are used to handle non-atomicity in connect() and bind()
+ * operations on a socket: in particular, to avoid races between multiple
+ * threads or processes operating simultaneously on the same socket.
+ */
+#define	UNP_CONNECTING			0x010	/* Currently connecting. */
+#define	UNP_BINDING			0x020	/* Currently binding. */
+
  #define	sotounpcb(so)	((struct unpcb *)((so)->so_pcb))

  /* Hack alert -- this structure depends on <sys/socketvar.h>. */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20060630001142.Y67344>