Skip site navigation (1) | Skip section navigation (2)
Date:      Mon, 23 May 2011 19:32:02 +0000 (UTC)
From:      Robert Watson <rwatson@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r222217 - head/sys/netinet
Message-ID:  <201105231932.p4NJW2ms034573@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rwatson
Date: Mon May 23 19:32:02 2011
New Revision: 222217
URL: http://svn.freebsd.org/changeset/base/222217

Log:
  Continue to refine inpcb reference counting and locking, in preparation for
  reworking of inpcbinfo locking:
  
  (1) Convert inpcb reference counting from manually manipulated integers to
      the refcount(9) KPI.  This allows the refcount to be managed atomically
      with an inpcb read lock rather than a write lock, or even with no inpcb
      lock at all.  As a result, in_pcbref() also no longer requires an inpcb
      lock, so it can be performed while holding only the lock used to look
      up the inpcb.
  
  (2) Shift more inpcb freeing activity from the in_pcbrele() context (via
      in_pcbfree_internal()) to the explicit in_pcbfree() context.  This means
      that the inpcb refcount is increasingly used only to maintain memory
      stability, not to actually defer the cleanup of inpcb protocol parts.
      This is desirable as many of those protocol parts require the pcbinfo
      lock, which we'd like not to acquire in in_pcbrele() contexts.  Document
      this better in comments.
  
  (3) Introduce new read-locked and write-locked in_pcbrele() variations,
      in_pcbrele_rlocked() and in_pcbrele_wlocked(), which allow the inpcb to
      be properly unlocked as needed.  in_pcbrele() is a wrapper around the
      latter, and should probably go away at some point.  This makes it
      easier to use this weak reference model when holding only a read lock,
      as will happen in the future.
  
  This may well be safe to MFC, but some more KBI analysis is required.
  
  Reviewed by:    bz
  MFC after:      3 weeks
  Sponsored by:   Juniper Networks, Inc.

Modified:
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c	Mon May 23 16:40:44 2011	(r222216)
+++ head/sys/netinet/in_pcb.c	Mon May 23 19:32:02 2011	(r222217)
@@ -2,8 +2,12 @@
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -50,6 +54,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/socketvar.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
@@ -287,7 +292,7 @@ in_pcballoc(struct socket *so, struct in
 #endif
 	INP_WLOCK(inp);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
-	inp->inp_refcount = 1;	/* Reference from the inpcbinfo */
+	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
 #if defined(IPSEC) || defined(MAC)
 out:
 	if (error != 0) {
@@ -1028,56 +1033,18 @@ in_pcbdetach(struct inpcb *inp)
 }
 
 /*
- * in_pcbfree_internal() frees an inpcb that has been detached from its
- * socket, and whose reference count has reached 0.  It will also remove the
- * inpcb from any global lists it might remain on.
- */
-static void
-in_pcbfree_internal(struct inpcb *inp)
-{
-	struct inpcbinfo *ipi = inp->inp_pcbinfo;
-
-	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-	KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
-
-	INP_INFO_WLOCK_ASSERT(ipi);
-	INP_WLOCK_ASSERT(inp);
-
-#ifdef IPSEC
-	if (inp->inp_sp != NULL)
-		ipsec_delete_pcbpolicy(inp);
-#endif /* IPSEC */
-	inp->inp_gencnt = ++ipi->ipi_gencnt;
-	in_pcbremlists(inp);
-#ifdef INET6
-	if (inp->inp_vflag & INP_IPV6PROTO) {
-		ip6_freepcbopts(inp->in6p_outputopts);
-		if (inp->in6p_moptions != NULL)
-			ip6_freemoptions(inp->in6p_moptions);
-	}
-#endif
-	if (inp->inp_options)
-		(void)m_free(inp->inp_options);
-#ifdef INET
-	if (inp->inp_moptions != NULL)
-		inp_freemoptions(inp->inp_moptions);
-#endif
-	inp->inp_vflag = 0;
-	crfree(inp->inp_cred);
-
-#ifdef MAC
-	mac_inpcb_destroy(inp);
-#endif
-	INP_WUNLOCK(inp);
-	uma_zfree(ipi->ipi_zone, inp);
-}
-
-/*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released.  This
  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
  * but where the inpcb lock is already held.
  *
+ * in_pcbref() should be used only to provide brief memory stability, and
+ * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
+ * garbage collect the inpcb if it has been in_pcbfree()'d from another
+ * context.  Until in_pcbrele() has returned that the inpcb is still valid,
+ * lock and rele are the *only* safe operations that may be performed on the
+ * inpcb.
+ *
  * While the inpcb will not be freed, releasing the inpcb lock means that the
  * connection's state may change, so the caller should be careful to
  * revalidate any cached state on reacquiring the lock.  Drop the reference
@@ -1091,7 +1058,7 @@ in_pcbref(struct inpcb *inp)
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
-	inp->inp_refcount++;
+	refcount_acquire(&inp->inp_refcount);
 }
 
 /*
@@ -1099,47 +1066,108 @@ in_pcbref(struct inpcb *inp)
  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
  * return a flag indicating whether or not the inpcb remains valid.  If it is
  * valid, we return with the inpcb lock held.
+ *
+ * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
+ * reference on an inpcb.  Historically more work was done here (actually, in
+ * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
+ * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
+ * about memory stability (and continued use of the write lock).
  */
 int
-in_pcbrele(struct inpcb *inp)
+in_pcbrele_rlocked(struct inpcb *inp)
 {
-#ifdef INVARIANTS
-	struct inpcbinfo *ipi = inp->inp_pcbinfo;
-#endif
+	struct inpcbinfo *pcbinfo;
+
+	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+
+	INP_RLOCK_ASSERT(inp);
+
+	if (refcount_release(&inp->inp_refcount) == 0)
+		return (0);
+
+	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+
+	INP_RUNLOCK(inp);
+	pcbinfo = inp->inp_pcbinfo;
+	uma_zfree(pcbinfo->ipi_zone, inp);
+	return (1);
+}
+
+int
+in_pcbrele_wlocked(struct inpcb *inp)
+{
+	struct inpcbinfo *pcbinfo;
 
 	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 
-	INP_INFO_WLOCK_ASSERT(ipi);
 	INP_WLOCK_ASSERT(inp);
 
-	inp->inp_refcount--;
-	if (inp->inp_refcount > 0)
+	if (refcount_release(&inp->inp_refcount) == 0)
 		return (0);
-	in_pcbfree_internal(inp);
+
+	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+
+	INP_WUNLOCK(inp);
+	pcbinfo = inp->inp_pcbinfo;
+	uma_zfree(pcbinfo->ipi_zone, inp);
 	return (1);
 }
 
 /*
+ * Temporary wrapper.
+ */
+int
+in_pcbrele(struct inpcb *inp)
+{
+
+	return (in_pcbrele_wlocked(inp));
+}
+
+/*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
- * released using in_pcbrele(), but the inpcb is still unlocked.
+ * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
+ * work, including removal from global lists, is done in this context, where
+ * the pcbinfo lock is held.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
-#ifdef INVARIANTS
-	struct inpcbinfo *ipi = inp->inp_pcbinfo;
-#endif
+	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 
-	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
-	    __func__));
+	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 
-	INP_INFO_WLOCK_ASSERT(ipi);
+	INP_INFO_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
-	if (!in_pcbrele(inp))
+	/* XXXRW: Do as much as possible here. */
+#ifdef IPSEC
+	if (inp->inp_sp != NULL)
+		ipsec_delete_pcbpolicy(inp);
+#endif /* IPSEC */
+	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+	in_pcbremlists(inp);
+#ifdef INET6
+	if (inp->inp_vflag & INP_IPV6PROTO) {
+		ip6_freepcbopts(inp->in6p_outputopts);
+		if (inp->in6p_moptions != NULL)
+			ip6_freemoptions(inp->in6p_moptions);
+	}
+#endif
+	if (inp->inp_options)
+		(void)m_free(inp->inp_options);
+#ifdef INET
+	if (inp->inp_moptions != NULL)
+		inp_freemoptions(inp->inp_moptions);
+#endif
+	inp->inp_vflag = 0;
+	crfree(inp->inp_cred);
+#ifdef MAC
+	mac_inpcb_destroy(inp);
+#endif
+	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 }
 

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Mon May 23 16:40:44 2011	(r222216)
+++ head/sys/netinet/in_pcb.h	Mon May 23 19:32:02 2011	(r222217)
@@ -534,6 +534,8 @@ void	in_pcbnotifyall(struct inpcbinfo *p
 void	in_pcbref(struct inpcb *);
 void	in_pcbrehash(struct inpcb *);
 int	in_pcbrele(struct inpcb *);
+int	in_pcbrele_rlocked(struct inpcb *);
+int	in_pcbrele_wlocked(struct inpcb *);
 void	in_pcbsetsolabel(struct socket *so);
 int	in_getpeeraddr(struct socket *so, struct sockaddr **nam);
 int	in_getsockaddr(struct socket *so, struct sockaddr **nam);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201105231932.p4NJW2ms034573>