Skip site navigation (1) Skip section navigation (2)
Date:      Fri, 9 Jun 2006 04:28:27 GMT
From:      Kip Macy <kmacy@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 98847 for review
Message-ID:  <200606090428.k594SR38018446@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=98847

Change 98847 by kmacy@kmacy_storage:sun4v_work_ifc on 2006/06/09 04:28:08

	revert changes that were not meant to go in in the last commit

Affected files ...

.. //depot/projects/kmacy_sun4v/src/sys/kern/sys_socket.c#4 edit
.. //depot/projects/kmacy_sun4v/src/sys/kern/uipc_usrreq.c#6 edit

Differences ...

==== //depot/projects/kmacy_sun4v/src/sys/kern/sys_socket.c#4 (text+ko) ====

@@ -63,7 +63,7 @@
 	.fo_kqfilter = soo_kqfilter,
 	.fo_stat = soo_stat,
 	.fo_close = soo_close,
-	.fo_flags = DFLAG_PASSABLE | DFLAG_MPSAFE
+	.fo_flags = DFLAG_PASSABLE
 };
 
 /* ARGSUSED */

==== //depot/projects/kmacy_sun4v/src/sys/kern/uipc_usrreq.c#6 (text+ko) ====

@@ -88,100 +88,33 @@
 struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
 
 /*
- * Both send and receive buffers are allocated PIPSIZ bytes of buffering
- * for stream sockets, although the total for sender and receiver is
- * actually only PIPSIZ.
- * Datagram sockets really use the sendspace as the maximum datagram size,
- * and don't really want to reserve the sendspace.  Their recvspace should
- * be large enough for at least one max-size datagram plus address.
- */
-#ifndef PIPSIZ
-#define	PIPSIZ	8192
-#endif
-static u_long	unpst_sendspace = PIPSIZ;
-static u_long	unpst_recvspace = PIPSIZ;
-static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
-static u_long	unpdg_recvspace = 4*1024;
-
-static int	unp_rights;			/* file descriptors in flight */
-
-SYSCTL_DECL(_net_local_stream);
-SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
-	   &unpst_sendspace, 0, "");
-SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
-	   &unpst_recvspace, 0, "");
-SYSCTL_DECL(_net_local_dgram);
-SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
-	   &unpdg_sendspace, 0, "");
-SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
-	   &unpdg_recvspace, 0, "");
-SYSCTL_DECL(_net_local);
-SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
-
-/*
- * Locking and synchronization:
+ * Currently, UNIX domain sockets are protected by a single subsystem lock,
+ * which covers global data structures and variables, the contents of each
+ * per-socket unpcb structure, and the so_pcb field in sockets attached to
+ * the UNIX domain.  This provides for a moderate degree of parallelism, as
+ * receive operations on UNIX domain sockets do not need to acquire the
+ * subsystem lock.  Finer grained locking to permit send() without acquiring
+ * a global lock would be a logical next step.
  *
- * A global UNIX domain socket mutex protects all global variables in the
- * implementation, as well as the linked lists tracking the set of allocated
- * UNIX domain sockets.  These variables/fields may be read lockless using
- * atomic operations if stale values are permissible; otherwise the global
- * mutex is required to read or read-modify-write.  The global mutex also
- * serves to prevent deadlock when multiple PCB locks may be acquired at once
- * (see below).  Finally, the global mutex protects uncounted references from
- * vnodes to sockets bound to those vnodes: to safely dereference the
- * v_socket pointer, the global mutex must be held while a full reference is
- * acquired.
+ * The UNIX domain socket lock precedes all socket layer locks, including the
+ * socket lock and socket buffer lock, permitting UNIX domain socket code to
+ * call into socket support routines without releasing its locks.
  *
- * UNIX domain sockets each have one unpcb PCB associated with them from
- * pru_attach() to pru_detach() via the so_pcb pointer.  The validity of that
- * reference is an invariant for the lifetime of the socket, so no lock is
- * required to dereference the so_pcb pointer if a valid socket reference is
- * held.
- *
- * Each PCB has a back-pointer to its socket, unp_socket.  This pointer may
- * only be safely dereferenced as long as a valid reference to the PCB is
- * held.  Typically, this reference will be from the socket, or from another
- * PCB when the referring PCB's lock is held (in order that the reference not
- * be invalidated during use).  In particular, to follow
- * unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn.
- *
- * Fields of PCBs are locked using a per-unpcb lock, unp_mtx.  Individual
- * atomic reads without the lock may be performed "lockless", but more
- * complex reads and read-modify-writes require the mutex to be held.  No
- * lock order is defined between PCB locks -- multiple PCB locks may be
- * acquired at the same time only when holding the global UNIX domain socket
- * mutex, which prevents deadlocks.  To prevent inter-PCB references from
- * becoming invalid, the lock protecting the reference must be held for the
- * lifetime of use of the reference.
- *
- * Blocking with UNIX domain sockets is a tricky issue: unlike most network
- * protocols, bind() is a non-atomic operation, and connect() requires
- * potential sleeping in the protocol, due to potentially waiting on local or
- * distributed file systems.  We try to separate "lookup" operations, which
- * may sleep, and the IPC operations themselves, which typically can occur
- * with relative atomicity as locks can be held over the entire operation.
- *
- * Another tricky issue is simultaneous multi-threaded or multi-process
- * access to a single UNIX domain socket.  These are handled by the flags
- * UNP_CONNECTING and UNP_BINDING.
+ * Some caution is required in areas where the UNIX domain socket code enters
+ * VFS in order to create or find rendezvous points.  This results in
+ * dropping of the UNIX domain socket subsystem lock, acquisition of the
+ * Giant lock, and potential sleeping.  This increases the chances of races,
+ * and exposes weaknesses in the socket->protocol API by offering poor
+ * failure modes.
  */
-static struct mtx	unp_global_mtx;
-
-#define	UNP_GLOBAL_LOCK_INIT()		mtx_init(&unp_global_mtx,	\
-					    "unp_global_mtx", NULL, MTX_DEF)
-#define	UNP_GLOBAL_LOCK()		mtx_lock(&unp_global_mtx)
-#define	UNP_GLOBAL_UNLOCK()		mtx_unlock(&unp_global_mtx)
-#define	UNP_GLOBAL_UNLOCK_ASSERT()	mtx_assert(&unp_global_mtx, MA_NOTOWNED)
-#define	UNP_GLOBAL_LOCK_ASSERT()	mtx_assert(&unp_global_mtx, MA_OWNED)
+static struct mtx unp_mtx;
+#define	UNP_LOCK_INIT() \
+	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
+#define	UNP_LOCK()		mtx_lock(&unp_mtx)
+#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
+#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
+#define	UNP_UNLOCK_ASSERT()	mtx_assert(&unp_mtx, MA_NOTOWNED)
 
-#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
-					    "unp_mtx", "unp_mtx",	\
-					    MTX_DUPOK|MTX_DEF|MTX_RECURSE)
-#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
-#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
-#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
-#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
-
 /*
  * Garbage collection of cyclic file descriptor/socket references occurs
  * asynchronously in a taskqueue context in order to avoid recursion and
@@ -190,10 +123,12 @@
  */
 static struct task	unp_gc_task;
 
+static int     unp_attach(struct socket *);
 static void    unp_detach(struct unpcb *);
+static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
 static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
 static int     unp_connect2(struct socket *so, struct socket *so2, int);
-static void    unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
+static void    unp_disconnect(struct unpcb *);
 static void    unp_shutdown(struct unpcb *);
 static void    unp_drop(struct unpcb *, int);
 static void    unp_gc(__unused void *, int);
@@ -202,6 +137,8 @@
 static void    unp_discard(struct file *);
 static void    unp_freerights(struct file **, int);
 static int     unp_internalize(struct mbuf **, struct thread *);
+static int     unp_listen(struct socket *, struct unpcb *, int,
+		   struct thread *);
 
 static void
 uipc_abort(struct socket *so)
@@ -210,238 +147,83 @@
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
-
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
+	UNP_LOCK();
 	unp_drop(unp, ECONNABORTED);
 	unp_detach(unp);
-	UNP_GLOBAL_UNLOCK_ASSERT();
+	UNP_UNLOCK_ASSERT();
 }
 
 static int
 uipc_accept(struct socket *so, struct sockaddr **nam)
 {
-	struct unpcb *unp, *unp2;
+	struct unpcb *unp;
 	const struct sockaddr *sa;
 
 	/*
-	 * Pass back name of connected socket, if it was bound and we are
-	 * still connected (our peer may have closed already!).
+	 * Pass back name of connected socket,
+	 * if it was bound and we are still connected
+	 * (our peer may have closed already!).
 	 */
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
-
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
-	unp2 = unp->unp_conn;
-	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) {
-		UNP_PCB_LOCK(unp2);
+	UNP_LOCK();
+	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
 		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
-		bcopy(sa, *nam, sa->sa_len);
-		UNP_PCB_UNLOCK(unp2);
-	} else {
+	else
 		sa = &sun_noname;
-		bcopy(sa, *nam, sa->sa_len);
-	}
-	UNP_PCB_UNLOCK(unp);
-	UNP_GLOBAL_UNLOCK();
+	bcopy(sa, *nam, sa->sa_len);
+	UNP_UNLOCK();
 	return (0);
 }
 
 static int
 uipc_attach(struct socket *so, int proto, struct thread *td)
 {
-	u_long sendspace, recvspace;
-	struct unpcb *unp;
-	int error;
 
-	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
-	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
-		switch (so->so_type) {
-		case SOCK_STREAM:
-			sendspace = unpst_sendspace;
-			recvspace = unpst_recvspace;
-			break;
-
-		case SOCK_DGRAM:
-			sendspace = unpdg_sendspace;
-			recvspace = unpdg_recvspace;
-			break;
-
-		default:
-			panic("uipc_attach");
-		}
-		error = soreserve(so, sendspace, recvspace);
-		if (error)
-			return (error);
-	}
-	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
-	if (unp == NULL)
-		return (ENOBUFS);
-	LIST_INIT(&unp->unp_refs);
-	UNP_PCB_LOCK_INIT(unp);
-	unp->unp_socket = so;
-	so->so_pcb = unp;
-
-	UNP_GLOBAL_LOCK();
-	unp->unp_gencnt = ++unp_gencnt;
-	unp_count++;
-	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
-			 : &unp_shead, unp, unp_link);
-	UNP_GLOBAL_UNLOCK();
-
-	return (0);
+	return (unp_attach(so));
 }
 
 static int
 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
-	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
-	struct vnode *vp;
-	struct mount *mp;
-	struct vattr vattr;
-	int error, namelen;
-	struct nameidata nd;
 	struct unpcb *unp;
-	char *buf;
+	int error;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
-
-	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
-	if (namelen <= 0)
-		return (EINVAL);
-
-	/*
-	 * We don't allow simultaneous bind() calls on a single UNIX domain
-	 * socket, so flag in-progress operations, and return an error if an
-	 * operation is already in progress.
-	 *
-	 * Historically, we have not allowed a socket to be rebound, so this
-	 * also returns an error.  Not allowing re-binding certainly
-	 * simplifies the implementation and avoids a great many possible
-	 * failure modes.
-	 */
-	UNP_PCB_LOCK(unp);
-	if (unp->unp_vnode != NULL) {
-		UNP_PCB_UNLOCK(unp);
-		return (EINVAL);
-	}
-	if (unp->unp_flags & UNP_BINDING) {
-		UNP_PCB_UNLOCK(unp);
-		return (EALREADY);
-	}
-	unp->unp_flags |= UNP_BINDING;
-	UNP_PCB_UNLOCK(unp);
-
-	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
-	strlcpy(buf, soun->sun_path, namelen + 1);
-
-	mtx_lock(&Giant);
-restart:
-	mtx_assert(&Giant, MA_OWNED);
-	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
-	    buf, td);
-/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
-	error = namei(&nd);
-	if (error)
-		goto error;
-	vp = nd.ni_vp;
-	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
-		NDFREE(&nd, NDF_ONLY_PNBUF);
-		if (nd.ni_dvp == vp)
-			vrele(nd.ni_dvp);
-		else
-			vput(nd.ni_dvp);
-		if (vp != NULL) {
-			vrele(vp);
-			error = EADDRINUSE;
-			goto error;
-		}
-		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
-		if (error)
-			goto error;
-		goto restart;
-	}
-	VATTR_NULL(&vattr);
-	vattr.va_type = VSOCK;
-	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
-#ifdef MAC
-	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
-	    &vattr);
-#endif
-	if (error == 0) {
-		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
-		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
-	}
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-	vput(nd.ni_dvp);
-	if (error) {
-		vn_finished_write(mp);
-		goto error;
-	}
-	vp = nd.ni_vp;
-	ASSERT_VOP_LOCKED(vp, "uipc_bind");
-	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
-
-	/*
-	 * XXXRW: handle race against another consumer also frobbing
-	 * v_socket?  Or not.
-	 */
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
-	vp->v_socket = unp->unp_socket;
-	unp->unp_vnode = vp;
-	unp->unp_addr = soun;
-	unp->unp_flags &= ~UNP_BINDING;
-	UNP_PCB_UNLOCK(unp);
-	UNP_GLOBAL_UNLOCK();
-	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
-	mtx_unlock(&Giant);
-	free(buf, M_TEMP);
-	return (0);
-
-error:
-	UNP_PCB_LOCK(unp);
-	unp->unp_flags &= ~UNP_BINDING;
-	UNP_PCB_UNLOCK(unp);
-	mtx_unlock(&Giant);
-	free(buf, M_TEMP);
+	UNP_LOCK();
+	error = unp_bind(unp, nam, td);
+	UNP_UNLOCK();
 	return (error);
 }
 
 static int
 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
+	struct unpcb *unp;
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
-
-	UNP_GLOBAL_LOCK();
+	unp = sotounpcb(so);
+	KASSERT(unp != NULL, ("uipc_connect: unp == NULL"));
+	UNP_LOCK();
 	error = unp_connect(so, nam, td);
-	UNP_GLOBAL_UNLOCK();
+	UNP_UNLOCK();
 	return (error);
 }
 
 int
 uipc_connect2(struct socket *so1, struct socket *so2)
 {
-	struct unpcb *unp, *unp2;
+	struct unpcb *unp;
 	int error;
 
-	UNP_GLOBAL_LOCK();
-	unp = so1->so_pcb;
+	unp = sotounpcb(so1);
 	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
-	UNP_PCB_LOCK(unp);
-	unp2 = so2->so_pcb;
-	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
-	UNP_PCB_LOCK(unp2);
+	UNP_LOCK();
 	error = unp_connect2(so1, so2, PRU_CONNECT2);
-	UNP_PCB_UNLOCK(unp2);
-	UNP_PCB_UNLOCK(unp);
-	UNP_GLOBAL_UNLOCK();
+	UNP_UNLOCK();
 	return (error);
 }
 
@@ -454,31 +236,21 @@
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
-
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
+	UNP_LOCK();
 	unp_detach(unp);
-	UNP_GLOBAL_UNLOCK_ASSERT();
+	UNP_UNLOCK_ASSERT();
 }
 
 static int
 uipc_disconnect(struct socket *so)
 {
-	struct unpcb *unp, *unp2;
+	struct unpcb *unp;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
-
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
-	unp2 = unp->unp_conn;
-	if (unp2 != NULL) {
-		UNP_PCB_LOCK(unp2);
-		unp_disconnect(unp, unp2);
-		UNP_PCB_UNLOCK(unp2);
-	}
-	UNP_PCB_UNLOCK(unp);
-	UNP_GLOBAL_UNLOCK();
+	UNP_LOCK();
+	unp_disconnect(unp);
+	UNP_UNLOCK();
 	return (0);
 }
 
@@ -490,105 +262,81 @@
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
-
-	UNP_PCB_LOCK(unp);
+	UNP_LOCK();
 	if (unp->unp_vnode == NULL) {
-		UNP_PCB_UNLOCK(unp);
+		UNP_UNLOCK();
 		return (EINVAL);
 	}
-
-	SOCK_LOCK(so);
-	error = solisten_proto_check(so);
-	if (error == 0) {
-		cru2x(td->td_ucred, &unp->unp_peercred);
-		unp->unp_flags |= UNP_HAVEPCCACHED;
-		solisten_proto(so, backlog);
-	}
-	SOCK_UNLOCK(so);
-	UNP_PCB_UNLOCK(unp);
+	error = unp_listen(so, unp, backlog, td);
+	UNP_UNLOCK();
 	return (error);
 }
 
 static int
 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
 {
-	struct unpcb *unp, *unp2;
+	struct unpcb *unp;
 	const struct sockaddr *sa;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
-
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_PCB_LOCK(unp);
-	/*
-	 * XXX: It seems that this test always fails even when connection is
-	 * established.  So, this else clause is added as workaround to
-	 * return PF_LOCAL sockaddr.
-	 */
-	unp2 = unp->unp_conn;
-	if (unp2 != NULL) {
-		UNP_PCB_LOCK(unp2);
-		if (unp2->unp_addr != NULL)
-			sa = (struct sockaddr *) unp->unp_conn->unp_addr;
-		else
-			sa = &sun_noname;
-		bcopy(sa, *nam, sa->sa_len);
-		UNP_PCB_UNLOCK(unp2);
-	} else {
+	UNP_LOCK();
+	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
+		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
+	else {
+		/*
+		 * XXX: It seems that this test always fails even when
+		 * connection is established.  So, this else clause is
+		 * added as workaround to return PF_LOCAL sockaddr.
+		 */
 		sa = &sun_noname;
-		bcopy(sa, *nam, sa->sa_len);
 	}
-	UNP_PCB_UNLOCK(unp);
+	bcopy(sa, *nam, sa->sa_len);
+	UNP_UNLOCK();
 	return (0);
 }
 
 static int
 uipc_rcvd(struct socket *so, int flags)
 {
-	struct unpcb *unp, *unp2;
+	struct unpcb *unp;
 	struct socket *so2;
 	u_long newhiwat;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
+	UNP_LOCK();
+	switch (so->so_type) {
+	case SOCK_DGRAM:
+		panic("uipc_rcvd DGRAM?");
+		/*NOTREACHED*/
 
-	if (so->so_type == SOCK_DGRAM)
-		panic("uipc_rcvd DGRAM?");
+	case SOCK_STREAM:
+		if (unp->unp_conn == NULL)
+			break;
+		so2 = unp->unp_conn->unp_socket;
+		SOCKBUF_LOCK(&so2->so_snd);
+		SOCKBUF_LOCK(&so->so_rcv);
+		/*
+		 * Adjust backpressure on sender
+		 * and wakeup any waiting to write.
+		 */
+		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
+		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
+		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
+		    so->so_rcv.sb_cc;
+		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
+		    newhiwat, RLIM_INFINITY);
+		unp->unp_cc = so->so_rcv.sb_cc;
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		sowwakeup_locked(so2);
+		break;
 
-	if (so->so_type != SOCK_STREAM)
+	default:
 		panic("uipc_rcvd unknown socktype");
-
-	/*
-	 * Adjust backpressure on sender and wakeup any waiting to write.
-	 *
-	 * The consistency requirements here are a bit complex: we must
-	 * acquire the lock for our own unpcb in order to prevent it from
-	 * disconnecting while in use, changing the unp_conn peer.  We do not
-	 * need unp2's lock, since the unp2->unp_socket pointer will remain
-	 * static as long as the unp2 pcb is valid, which it will be until we
-	 * release unp's lock to allow a disconnect.  We do need socket
-	 * mutexes for both socket endpoints since we manipulate fields in
-	 * both; we hold both locks at once since we access both
-	 * simultaneously.
-	 */
-	UNP_PCB_LOCK(unp);
-	unp2 = unp->unp_conn;
-	if (unp2 == NULL) {
-		UNP_PCB_UNLOCK(unp);
-		return (0);
 	}
-	so2 = unp2->unp_socket;
-	SOCKBUF_LOCK(&so2->so_snd);
-	SOCKBUF_LOCK(&so->so_rcv);
-	so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
-	unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
-	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - so->so_rcv.sb_cc;
-	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
-	    newhiwat, RLIM_INFINITY);
-	unp->unp_cc = so->so_rcv.sb_cc;
-	SOCKBUF_UNLOCK(&so->so_rcv);
-	sowwakeup_locked(so2);
-	UNP_PCB_UNLOCK(unp);
+	UNP_UNLOCK();
 	return (0);
 }
 
@@ -598,14 +346,13 @@
 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
-	struct unpcb *unp, *unp2;
+	int error = 0;
+	struct unpcb *unp;
 	struct socket *so2;
 	u_long newhiwat;
-	int error = 0;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
-
 	if (flags & PRUS_OOB) {
 		error = EOPNOTSUPP;
 		goto release;
@@ -614,38 +361,32 @@
 	if (control != NULL && (error = unp_internalize(&control, td)))
 		goto release;
 
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
+	UNP_LOCK();
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 	{
 		const struct sockaddr *from;
 
-		unp2 = unp->unp_conn;
 		if (nam != NULL) {
-			if (unp2 != NULL) {
+			if (unp->unp_conn != NULL) {
 				error = EISCONN;
 				break;
 			}
-			UNP_PCB_UNLOCK(unp);
 			error = unp_connect(so, nam, td);
-			UNP_PCB_LOCK(unp);
 			if (error)
 				break;
-			unp2 = unp->unp_conn;
 		} else {
-			if (unp2 == NULL) {
+			if (unp->unp_conn == NULL) {
 				error = ENOTCONN;
 				break;
 			}
 		}
-		UNP_PCB_LOCK(unp2);
-		so2 = unp2->unp_socket;
+		so2 = unp->unp_conn->unp_socket;
 		if (unp->unp_addr != NULL)
 			from = (struct sockaddr *)unp->unp_addr;
 		else
 			from = &sun_noname;
-		if (unp2->unp_flags & UNP_WANTCRED)
+		if (unp->unp_conn->unp_flags & UNP_WANTCRED)
 			control = unp_addsockcred(td, control);
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
@@ -657,22 +398,19 @@
 			error = ENOBUFS;
 		}
 		if (nam != NULL)
-			unp_disconnect(unp, unp2);
-		UNP_PCB_UNLOCK(unp2);
+			unp_disconnect(unp);
 		break;
 	}
 
 	case SOCK_STREAM:
 		/* Connect if not connected yet. */
 		/*
-		 * Note: A better implementation would complain if not equal
-		 * to the peer's address.
+		 * Note: A better implementation would complain
+		 * if not equal to the peer's address.
 		 */
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			if (nam != NULL) {
-				UNP_PCB_UNLOCK(unp);
 				error = unp_connect(so, nam, td);
-				UNP_PCB_LOCK(unp);
 				if (error)
 					break;	/* XXX */
 			} else {
@@ -681,34 +419,22 @@
 			}
 		}
 
-		/*
-		 * Lock order here has to be handled carefully: we hold the
-		 * global lock, so acquiring two unpcb locks is OK.  We must
-		 * acquire both before acquiring any socket mutexes.  We must
-		 * also acquire the local socket send mutex before the remote
-		 * socket receive mutex.  The only tricky thing is making
-		 * sure to acquire the unp2 lock before the local socket send
-		 * lock, or we will experience deadlocks.
-		 */
-		unp2 = unp->unp_conn;
-		KASSERT(unp2 != NULL,
-		    ("uipc_send connected but no connection?"));
-		UNP_PCB_LOCK(unp2);
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			SOCKBUF_UNLOCK(&so->so_snd);
-			UNP_PCB_UNLOCK(unp2);
 			error = EPIPE;
 			break;
 		}
-		so2 = unp2->unp_socket;
+		if (unp->unp_conn == NULL)
+			panic("uipc_send connected but no connection?");
+		so2 = unp->unp_conn->unp_socket;
 		SOCKBUF_LOCK(&so2->so_rcv);
-		if (unp2->unp_flags & UNP_WANTCRED) {
+		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
 			/*
 			 * Credentials are passed only once on
 			 * SOCK_STREAM.
 			 */
-			unp2->unp_flags &= ~UNP_WANTCRED;
+			unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
 			control = unp_addsockcred(td, control);
 		}
 		/*
@@ -719,19 +445,19 @@
 		if (control != NULL) {
 			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
 				control = NULL;
-		} else
+		} else {
 			sbappend_locked(&so2->so_rcv, m);
+		}
 		so->so_snd.sb_mbmax -=
 			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
-		unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
 		newhiwat = so->so_snd.sb_hiwat -
-		    (so2->so_rcv.sb_cc - unp2->unp_cc);
+		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
 		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
 		    newhiwat, RLIM_INFINITY);
 		SOCKBUF_UNLOCK(&so->so_snd);
-		unp2->unp_cc = so2->so_rcv.sb_cc;
+		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
 		sorwakeup_locked(so2);
-		UNP_PCB_UNLOCK(unp2);
 		m = NULL;
 		break;
 
@@ -747,8 +473,7 @@
 		socantsendmore(so);
 		unp_shutdown(unp);
 	}
-	UNP_PCB_UNLOCK(unp);
-	UNP_GLOBAL_UNLOCK();
+	UNP_UNLOCK();
 
 	if (control != NULL && error != 0)
 		unp_dispose(control);
@@ -764,28 +489,22 @@
 static int
 uipc_sense(struct socket *so, struct stat *sb)
 {
-	struct unpcb *unp, *unp2;
+	struct unpcb *unp;
 	struct socket *so2;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
-
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
+	UNP_LOCK();
 	sb->st_blksize = so->so_snd.sb_hiwat;
-	unp2 = unp->unp_conn;
-	if (so->so_type == SOCK_STREAM && unp2 != NULL) {
-		UNP_PCB_LOCK(unp2);
-		so2 = unp2->unp_socket;
+	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
+		so2 = unp->unp_conn->unp_socket;
 		sb->st_blksize += so2->so_rcv.sb_cc;
-		UNP_PCB_UNLOCK(unp2);
 	}
 	sb->st_dev = NODEV;
 	if (unp->unp_ino == 0)
 		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
 	sb->st_ino = unp->unp_ino;
-	UNP_PCB_UNLOCK(unp);
-	UNP_GLOBAL_UNLOCK();
+	UNP_UNLOCK();
 	return (0);
 }
 
@@ -796,13 +515,10 @@
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
-
-	UNP_GLOBAL_LOCK();
-	UNP_PCB_LOCK(unp);
+	UNP_LOCK();
 	socantsendmore(so);
 	unp_shutdown(unp);
-	UNP_PCB_UNLOCK(unp);
-	UNP_GLOBAL_UNLOCK();
+	UNP_UNLOCK();
 	return (0);
 }
 
@@ -814,15 +530,14 @@
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
-
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_PCB_LOCK(unp);
+	UNP_LOCK();
 	if (unp->unp_addr != NULL)
 		sa = (struct sockaddr *) unp->unp_addr;
 	else
 		sa = &sun_noname;
 	bcopy(sa, *nam, sa->sa_len);
-	UNP_PCB_UNLOCK(unp);
+	UNP_UNLOCK();
 	return (0);
 }
 
@@ -859,13 +574,12 @@
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
-
+	UNP_LOCK();
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case LOCAL_PEERCRED:
-			UNP_PCB_LOCK(unp);
 			if (unp->unp_flags & UNP_HAVEPC)
 				xu = unp->unp_peercred;
 			else {
@@ -874,31 +588,22 @@
 				else
 					error = EINVAL;
 			}
-			UNP_PCB_UNLOCK(unp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &xu, sizeof(xu));
 			break;
-
 		case LOCAL_CREDS:
-			UNP_PCB_LOCK(unp);
 			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
-			UNP_PCB_UNLOCK(unp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
-
 		case LOCAL_CONNWAIT:
-			UNP_PCB_LOCK(unp);
 			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
-			UNP_PCB_UNLOCK(unp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
-
 		default:
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
-
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case LOCAL_CREDS:
@@ -908,24 +613,19 @@
 			if (error)
 				break;
 
-#define	OPTSET(bit) do {						\
-	UNP_PCB_LOCK(unp);						\
-	if (optval)							\
-		unp->unp_flags |= bit;					\
-	else								\
-		unp->unp_flags &= ~bit;					\
-	UNP_PCB_UNLOCK(unp);						\
-} while (0)
+#define	OPTSET(bit) \
+	if (optval) \
+		unp->unp_flags |= bit; \
+	else \
+		unp->unp_flags &= ~bit;
 
 			switch (sopt->sopt_name) {
 			case LOCAL_CREDS:
 				OPTSET(UNP_WANTCRED);
 				break;
-
 			case LOCAL_CONNWAIT:
 				OPTSET(UNP_CONNWAIT);
 				break;
-
 			default:
 				break;
 			}
@@ -936,60 +636,117 @@
 			break;
 		}
 		break;
-
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
+	UNP_UNLOCK();
 	return (error);
 }
 
+/*
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering
+ * for stream sockets, although the total for sender and receiver is
+ * actually only PIPSIZ.
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace.  Their recvspace should
+ * be large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define	PIPSIZ	8192
+#endif
+static u_long	unpst_sendspace = PIPSIZ;
+static u_long	unpst_recvspace = PIPSIZ;
+static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
+static u_long	unpdg_recvspace = 4*1024;
+
+static int	unp_rights;			/* file descriptors in flight */
+
+SYSCTL_DECL(_net_local_stream);
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+	   &unpst_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+	   &unpst_recvspace, 0, "");
+SYSCTL_DECL(_net_local_dgram);
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+	   &unpdg_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+	   &unpdg_recvspace, 0, "");
+SYSCTL_DECL(_net_local);
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
+
+static int
+unp_attach(struct socket *so)
+{
+	struct unpcb *unp;
+	int error;
+
+	KASSERT(so->so_pcb == NULL, ("unp_attach: so_pcb != NULL"));
+	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+		switch (so->so_type) {
+
+		case SOCK_STREAM:
+			error = soreserve(so, unpst_sendspace, unpst_recvspace);
+			break;
+
+		case SOCK_DGRAM:
+			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
+			break;
+
+		default:
+			panic("unp_attach");
+		}
+		if (error)
+			return (error);
+	}
+	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);

>>> TRUNCATED FOR MAIL (1000 lines) <<<



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200606090428.k594SR38018446>