Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 25 Feb 2019 09:13:33 +0000 (UTC)
From:      Vincenzo Maffione <vmaffione@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r344509 - stable/11/sys/dev/netmap
Message-ID:  <201902250913.x1P9DXfH022498@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: vmaffione
Date: Mon Feb 25 09:13:33 2019
New Revision: 344509
URL: https://svnweb.freebsd.org/changeset/base/344509

Log:
  MFC r343579, r344253
  
  netmap: fix lock order reversal related to kqueue usage
  
  When using poll(), select() or kevent() on netmap file descriptors,
  netmap executes the equivalent of NIOCTXSYNC and NIOCRXSYNC commands,
  before collecting the events that are ready. In other words, the
  poll/kevent callback has side effects. This is done to avoid the
  overhead of two system call per iteration (e.g., poll() + ioctl(NIOC*XSYNC)).
  
  When the kqueue subsystem invokes the kqueue(9) f_event callback
  (netmap_knrw), it holds the lock of the struct knlist object associated
  to the netmap port (the lock is provided at initialization, by calling
  knlist_init_mtx).
  However, netmap_knrw() may need to wake up another netmap port (or even
  the same one), which means that it may need to call knote().
  Since knote() needs the lock of the struct knlist object associated to
  the to-be-wake-up netmap port, it is possible to have a lock order reversal
  problem (AB/BA deadlock).
  
  This change prevents the deadlock by executing the knote() call in a
  per-selinfo taskqueue, where it is possible to hold a mutex.
  The change also adds a counter (kqueue_users) to keep track of how many
  kqueue users are referencing a given struct nm_selinfo.
  In this way, nm_os_selwakeup() can schedule the kevent notification
  task only when kqueue is actually being used.
  This is important to avoid wasting CPU in the common case where
  kqueue is not used.
  
  Reviewed by:    aleksandr.fedorov_itglobal.com
  Differential Revision:  https://reviews.freebsd.org/D18956

Modified:
  stable/11/sys/dev/netmap/netmap.c
  stable/11/sys/dev/netmap/netmap_freebsd.c
  stable/11/sys/dev/netmap/netmap_kern.h
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/dev/netmap/netmap.c
==============================================================================
--- stable/11/sys/dev/netmap/netmap.c	Mon Feb 25 08:50:25 2019	(r344508)
+++ stable/11/sys/dev/netmap/netmap.c	Mon Feb 25 09:13:33 2019	(r344509)
@@ -831,6 +831,7 @@ netmap_krings_create(struct netmap_adapter *na, u_int 
 	struct netmap_kring *kring;
 	u_int n[NR_TXRX];
 	enum txrx t;
+	int err = 0;
 
 	if (na->tx_rings != NULL) {
 		if (netmap_debug & NM_DEBUG_ON)
@@ -870,7 +871,6 @@ netmap_krings_create(struct netmap_adapter *na, u_int 
 		for (i = 0; i < n[t]; i++) {
 			kring = NMR(na, t)[i];
 			bzero(kring, sizeof(*kring));
-			kring->na = na;
 			kring->notify_na = na;
 			kring->ring_id = i;
 			kring->tx = t;
@@ -896,13 +896,21 @@ netmap_krings_create(struct netmap_adapter *na, u_int 
 					nm_txrx2str(t), i);
 			nm_prdis("ktx %s h %d c %d t %d",
 				kring->name, kring->rhead, kring->rcur, kring->rtail);
+			err = nm_os_selinfo_init(&kring->si, kring->name);
+			if (err) {
+				netmap_krings_delete(na);
+				return err;
+			}
 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
-			nm_os_selinfo_init(&kring->si);
+			kring->na = na;	/* setting this field marks the mutex as initialized */
 		}
-		nm_os_selinfo_init(&na->si[t]);
+		err = nm_os_selinfo_init(&na->si[t], na->name);
+		if (err) {
+			netmap_krings_delete(na);
+			return err;
+		}
 	}
 
-
 	return 0;
 }
 
@@ -926,7 +934,8 @@ netmap_krings_delete(struct netmap_adapter *na)
 
 	/* we rely on the krings layout described above */
 	for ( ; kring != na->tailroom; kring++) {
-		mtx_destroy(&(*kring)->q_lock);
+		if ((*kring)->na != NULL)
+			mtx_destroy(&(*kring)->q_lock);
 		nm_os_selinfo_uninit(&(*kring)->si);
 	}
 	nm_os_free(na->tx_rings);

Modified: stable/11/sys/dev/netmap/netmap_freebsd.c
==============================================================================
--- stable/11/sys/dev/netmap/netmap_freebsd.c	Mon Feb 25 08:50:25 2019	(r344508)
+++ stable/11/sys/dev/netmap/netmap_freebsd.c	Mon Feb 25 09:13:33 2019	(r344509)
@@ -56,6 +56,7 @@
 #include <sys/unistd.h> /* RFNOWAIT */
 #include <sys/sched.h> /* sched_bind() */
 #include <sys/smp.h> /* mp_maxid */
+#include <sys/taskqueue.h> /* taskqueue_enqueue(), taskqueue_create(), ... */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h> /* IFT_ETHER */
@@ -73,16 +74,49 @@
 
 /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */
 
-void nm_os_selinfo_init(NM_SELINFO_T *si) {
-	struct mtx *m = &si->m;
-	mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);
-	knlist_init_mtx(&si->si.si_note, m);
+static void
+nm_kqueue_notify(void *opaque, int pending)
+{
+	struct nm_selinfo *si = opaque;
+
+	/* We use a non-zero hint to distinguish this notification call
+	 * from the call done in kqueue_scan(), which uses hint=0.
+	 */
+	KNOTE_UNLOCKED(&si->si.si_note, /*hint=*/0x100);
 }
 
+int nm_os_selinfo_init(NM_SELINFO_T *si, const char *name) {
+	int err;
+
+	TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si);
+	si->ntfytq = taskqueue_create(name, M_NOWAIT,
+	    taskqueue_thread_enqueue, &si->ntfytq);
+	if (si->ntfytq == NULL)
+		return -ENOMEM;
+	err = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name);
+	if (err) {
+		taskqueue_free(si->ntfytq);
+		si->ntfytq = NULL;
+		return err;
+	}
+
+	snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name);
+	mtx_init(&si->m, si->mtxname, NULL, MTX_DEF);
+	knlist_init_mtx(&si->si.si_note, &si->m);
+	si->kqueue_users = 0;
+
+	return (0);
+}
+
 void
 nm_os_selinfo_uninit(NM_SELINFO_T *si)
 {
-	/* XXX kqueue(9) needed; these will mirror knlist_init. */
+	if (si->ntfytq == NULL) {
+		return;	/* si was not initialized */
+	}
+	taskqueue_drain(si->ntfytq, &si->ntfytask);
+	taskqueue_free(si->ntfytq);
+	si->ntfytq = NULL;
 	knlist_delete(&si->si.si_note, curthread, /*islocked=*/0);
 	knlist_destroy(&si->si.si_note);
 	/* now we don't need the mutex anymore */
@@ -1290,13 +1324,18 @@ nm_os_kctx_destroy(struct nm_kctx *nmk)
 
 /*
  * In addition to calling selwakeuppri(), nm_os_selwakeup() also
- * needs to call KNOTE to wake up kqueue listeners.
- * We use a non-zero 'hint' argument to inform the netmap_knrw()
- * function that it is being called from 'nm_os_selwakeup'; this
- * is necessary because when netmap_knrw() is called by the kevent
- * subsystem (i.e. kevent_scan()) we also need to call netmap_poll().
- * The knote uses a private mutex associated to the 'si' (see struct
- * selinfo, struct nm_selinfo, and nm_os_selinfo_init).
+ * needs to call knote() to wake up kqueue listeners.
+ * This operation is deferred to a taskqueue in order to avoid possible
+ * lock order reversals; these may happen because knote() grabs a
+ * private lock associated to the 'si' (see struct selinfo,
+ * struct nm_selinfo, and nm_os_selinfo_init), and nm_os_selwakeup()
+ * can be called while holding the lock associated to a different
+ * 'si'.
+ * When calling knote() we use a non-zero 'hint' argument to inform
+ * the netmap_knrw() function that it is being called from
+ * 'nm_os_selwakeup'; this is necessary because when netmap_knrw() is
+ * called by the kevent subsystem (i.e. kevent_scan()) we also need to
+ * call netmap_poll().
  *
  * The netmap_kqfilter() function registers one or another f_event
  * depending on read or write mode. A pointer to the struct
@@ -1311,11 +1350,9 @@ void
 nm_os_selwakeup(struct nm_selinfo *si)
 {
 	selwakeuppri(&si->si, PI_NET);
-	/* We use a non-zero hint to distinguish this notification call
-	 * from the call done in kqueue_scan(), which uses hint=0.
-	 */
-	KNOTE(&si->si.si_note, /*hint=*/0x100,
-	    mtx_owned(&si->m) ? KNF_LISTLOCKED : 0);
+	if (si->kqueue_users > 0) {
+		taskqueue_enqueue(si->ntfytq, &si->ntfytask);
+	}
 }
 
 void
@@ -1328,20 +1365,28 @@ static void
 netmap_knrdetach(struct knote *kn)
 {
 	struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook;
-	struct selinfo *si = &priv->np_si[NR_RX]->si;
+	struct nm_selinfo *si = priv->np_si[NR_RX];
 
-	nm_prinf("remove selinfo %p", si);
-	knlist_remove(&si->si_note, kn, /*islocked=*/0);
+	knlist_remove(&si->si.si_note, kn, /*islocked=*/0);
+	NMG_LOCK();
+	KASSERT(si->kqueue_users > 0, ("kqueue_user underflow on %s",
+	    si->mtxname));
+	si->kqueue_users--;
+	nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users);
+	NMG_UNLOCK();
 }
 
 static void
 netmap_knwdetach(struct knote *kn)
 {
 	struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook;
-	struct selinfo *si = &priv->np_si[NR_TX]->si;
+	struct nm_selinfo *si = priv->np_si[NR_TX];
 
-	nm_prinf("remove selinfo %p", si);
-	knlist_remove(&si->si_note, kn, /*islocked=*/0);
+	knlist_remove(&si->si.si_note, kn, /*islocked=*/0);
+	NMG_LOCK();
+	si->kqueue_users--;
+	nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users);
+	NMG_UNLOCK();
 }
 
 /*
@@ -1429,6 +1474,10 @@ netmap_kqfilter(struct cdev *dev, struct knote *kn)
 	kn->kn_fop = (ev == EVFILT_WRITE) ?
 		&netmap_wfiltops : &netmap_rfiltops;
 	kn->kn_hook = priv;
+	NMG_LOCK();
+	si->kqueue_users++;
+	nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users);
+	NMG_UNLOCK();
 	knlist_add(&si->si.si_note, kn, /*islocked=*/0);
 
 	return 0;

Modified: stable/11/sys/dev/netmap/netmap_kern.h
==============================================================================
--- stable/11/sys/dev/netmap/netmap_kern.h	Mon Feb 25 08:50:25 2019	(r344508)
+++ stable/11/sys/dev/netmap/netmap_kern.h	Mon Feb 25 09:13:33 2019	(r344509)
@@ -130,8 +130,14 @@ struct netmap_adapter *netmap_getna(if_t ifp);
 #define MBUF_QUEUED(m)		1
 
 struct nm_selinfo {
+	/* Support for select(2) and poll(2). */
 	struct selinfo si;
+	/* Support for kqueue(9). See comments in netmap_freebsd.c */
+	struct taskqueue *ntfytq;
+	struct task ntfytask;
 	struct mtx m;
+	char mtxname[32];
+	int kqueue_users;
 };
 
 
@@ -288,7 +294,7 @@ struct netmap_priv_d;
 struct nm_bdg_args;
 
 /* os-specific NM_SELINFO_T initialzation/destruction functions */
-void nm_os_selinfo_init(NM_SELINFO_T *);
+int nm_os_selinfo_init(NM_SELINFO_T *, const char *name);
 void nm_os_selinfo_uninit(NM_SELINFO_T *);
 
 const char *nm_dump_buf(char *p, int len, int lim, char *dst);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201902250913.x1P9DXfH022498>