Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 26 Dec 2016 19:33:40 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r310617 - in head/sys: kern sys
Message-ID:  <201612261933.uBQJXe7T071333@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kib
Date: Mon Dec 26 19:33:40 2016
New Revision: 310617
URL: https://svnweb.freebsd.org/changeset/base/310617

Log:
  Make knote KN_INFLUX state counted.  This is final fix for the issue
  closed by r310302 for knote().
  
  If KN_INFLUX | KN_SCAN flags are set for the note passed to knote() or
  knote_fork(), i.e. the knote is scanned, we might erronously clear
  INFLUX when finishing notification.  For normal knote() it was fixed
  in r310302 simply by remembering the fact that we do not own
  KN_INFLUX, since there we own knlist lock and scan thread cannot clear
  KN_INFLUX until we drop the lock.  For knote_fork(), the situation is
  more complicated, e must drop knlist lock AKA the process lock, since
  we need to register new knotes.
  
  Change KN_INFLUX into counter and allow shared ownership of the
  in-flux state between scan and knote_fork() or knote().  Both in-flux
  setters need to ensure that knote is not dropped in parallel.  Added
  assert about kn_influx == 1 in knote_drop() verifies that in-flux state
  is not shared when knote is destroyed.
  
  Since KBI of the struct knote is changed by addition of the int
  kn_influx field, reorder kn_hook and kn_hookid to fill pad on LP64
  arches [1].  This keeps sizeof(struct knote) to same 128 bytes as it
  was before addition of kn_influx, on amd64.
  
  Reviewed by:	markj
  Suggested by:	markj [1]
  Tested by:	pho (previous version)
  Sponsored by:	The FreeBSD Foundation
  Differential revision:	https://reviews.freebsd.org/D8898

Modified:
  head/sys/kern/kern_event.c
  head/sys/sys/event.h

Modified: head/sys/kern/kern_event.c
==============================================================================
--- head/sys/kern/kern_event.c	Mon Dec 26 19:29:04 2016	(r310616)
+++ head/sys/kern/kern_event.c	Mon Dec 26 19:33:40 2016	(r310617)
@@ -193,7 +193,7 @@ static unsigned int 	kq_calloutmax = 4 *
 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 
-/* XXX - ensure not KN_INFLUX?? */
+/* XXX - ensure not influx ? */
 #define KNOTE_ACTIVATE(kn, islock) do { 				\
 	if ((islock))							\
 		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
@@ -254,6 +254,32 @@ kn_list_unlock(struct knlist *knl)
 	}
 }
 
+static bool
+kn_in_flux(struct knote *kn)
+{
+
+	return (kn->kn_influx > 0);
+}
+
+static void
+kn_enter_flux(struct knote *kn)
+{
+
+	KQ_OWNED(kn->kn_kq);
+	MPASS(kn->kn_influx < INT_MAX);
+	kn->kn_influx++;
+}
+
+static bool
+kn_leave_flux(struct knote *kn)
+{
+
+	KQ_OWNED(kn->kn_kq);
+	MPASS(kn->kn_influx > 0);
+	kn->kn_influx--;
+	return (kn->kn_influx == 0);
+}
+
 #define	KNL_ASSERT_LOCK(knl, islocked) do {				\
 	if (islocked)							\
 		KNL_ASSERT_LOCKED(knl);				\
@@ -498,7 +524,7 @@ knote_fork(struct knlist *list, int pid)
 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
-		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
+		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
@@ -521,7 +547,7 @@ knote_fork(struct knlist *list, int pid)
 		 * track the child. Drop the locks in preparation for
 		 * the call to kqueue_register().
 		 */
-		kn->kn_status |= KN_INFLUX;
+		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		list->kl_unlock(list->kl_lockarg);
 
@@ -561,7 +587,7 @@ knote_fork(struct knlist *list, int pid)
 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
 			KNOTE_ACTIVATE(kn, 0);
 		KQ_LOCK(kq);
-		kn->kn_status &= ~KN_INFLUX;
+		kn_leave_flux(kn);
 		KQ_UNLOCK_FLUX(kq);
 		list->kl_lock(list->kl_lockarg);
 	}
@@ -1262,7 +1288,7 @@ findkn:
 	}
 
 	/* knote is in the process of changing, wait for it to stabilize. */
-	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+	if (kn != NULL && kn_in_flux(kn)) {
 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 		if (filedesc_unlock) {
 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
@@ -1306,7 +1332,8 @@ findkn:
 			kn->kn_kevent = *kev;
 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
 			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
-			kn->kn_status = KN_INFLUX|KN_DETACHED;
+			kn->kn_status = KN_DETACHED;
+			kn_enter_flux(kn);
 
 			error = knote_attach(kn, kq);
 			KQ_UNLOCK(kq);
@@ -1330,7 +1357,7 @@ findkn:
 	}
 	
 	if (kev->flags & EV_DELETE) {
-		kn->kn_status |= KN_INFLUX;
+		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		if (!(kn->kn_status & KN_DETACHED))
 			kn->kn_fop->f_detach(kn);
@@ -1348,7 +1375,8 @@ findkn:
 	 * but doing so will not reset any filter which has already been
 	 * triggered.
 	 */
-	kn->kn_status |= KN_INFLUX | KN_SCAN;
+	kn->kn_status |= KN_SCAN;
+	kn_enter_flux(kn);
 	KQ_UNLOCK(kq);
 	knl = kn_list_lock(kn);
 	kn->kn_kevent.udata = kev->udata;
@@ -1383,7 +1411,8 @@ done_ev_add:
 	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
 	    KN_ACTIVE)
 		knote_enqueue(kn);
-	kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
+	kn->kn_status &= ~KN_SCAN;
+	kn_leave_flux(kn);
 	kn_list_unlock(knl);
 	KQ_UNLOCK_FLUX(kq);
 
@@ -1546,7 +1575,7 @@ kqueue_task(void *arg, int pending)
 
 /*
  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
- * We treat KN_MARKER knotes as if they are INFLUX.
+ * We treat KN_MARKER knotes as if they are in flux.
  */
 static int
 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
@@ -1620,7 +1649,7 @@ retry:
 		kn = TAILQ_FIRST(&kq->kq_head);
 
 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
-		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+		    kn_in_flux(kn)) {
 			if (influx) {
 				influx = 0;
 				KQ_FLUX_WAKEUP(kq);
@@ -1643,17 +1672,17 @@ retry:
 				goto retry;
 			goto done;
 		}
-		KASSERT((kn->kn_status & KN_INFLUX) == 0,
-		    ("KN_INFLUX set when not suppose to be"));
+		KASSERT(!kn_in_flux(kn),
+		    ("knote %p is unexpectedly in flux", kn));
 
 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
 			kn->kn_status &= ~KN_QUEUED;
-			kn->kn_status |= KN_INFLUX;
+			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
-			 * We don't need to lock the list since we've marked
-			 * it _INFLUX.
+			 * We don't need to lock the list since we've
+			 * marked it as in flux.
 			 */
 			if (!(kn->kn_status & KN_DETACHED))
 				kn->kn_fop->f_detach(kn);
@@ -1662,12 +1691,12 @@ retry:
 			continue;
 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
 			kn->kn_status &= ~KN_QUEUED;
-			kn->kn_status |= KN_INFLUX;
+			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
-			 * We don't need to lock the list since we've marked
-			 * it _INFLUX.
+			 * We don't need to lock the list since we've
+			 * marked the knote as being in flux.
 			 */
 			*kevp = kn->kn_kevent;
 			if (!(kn->kn_status & KN_DETACHED))
@@ -1676,7 +1705,8 @@ retry:
 			KQ_LOCK(kq);
 			kn = NULL;
 		} else {
-			kn->kn_status |= KN_INFLUX | KN_SCAN;
+			kn->kn_status |= KN_SCAN;
+			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
@@ -1684,9 +1714,9 @@ retry:
 			if (kn->kn_fop->f_event(kn, 0) == 0) {
 				KQ_LOCK(kq);
 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
-				kn->kn_status &=
-				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
+				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
 				    KN_SCAN);
+				kn_leave_flux(kn);
 				kq->kq_count--;
 				kn_list_unlock(knl);
 				influx = 1;
@@ -1716,7 +1746,8 @@ retry:
 			} else
 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 			
-			kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
+			kn->kn_status &= ~KN_SCAN;
+			kn_leave_flux(kn);
 			kn_list_unlock(knl);
 			influx = 1;
 		}
@@ -1864,12 +1895,12 @@ kqueue_drain(struct kqueue *kq, struct t
 
 	for (i = 0; i < kq->kq_knlistsize; i++) {
 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
-			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+			if (kn_in_flux(kn)) {
 				kq->kq_state |= KQ_FLUXWAIT;
 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
 				continue;
 			}
-			kn->kn_status |= KN_INFLUX;
+			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			if (!(kn->kn_status & KN_DETACHED))
 				kn->kn_fop->f_detach(kn);
@@ -1880,13 +1911,13 @@ kqueue_drain(struct kqueue *kq, struct t
 	if (kq->kq_knhashmask != 0) {
 		for (i = 0; i <= kq->kq_knhashmask; i++) {
 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
-				if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+				if (kn_in_flux(kn)) {
 					kq->kq_state |= KQ_FLUXWAIT;
 					msleep(kq, &kq->kq_lock, PSOCK,
 					       "kqclo2", 0);
 					continue;
 				}
-				kn->kn_status |= KN_INFLUX;
+				kn_enter_flux(kn);
 				KQ_UNLOCK(kq);
 				if (!(kn->kn_status & KN_DETACHED))
 					kn->kn_fop->f_detach(kn);
@@ -2010,7 +2041,6 @@ knote(struct knlist *list, long hint, in
 	struct kqueue *kq;
 	struct knote *kn, *tkn;
 	int error;
-	bool own_influx;
 
 	if (list == NULL)
 		return;
@@ -2021,7 +2051,7 @@ knote(struct knlist *list, long hint, in
 		list->kl_lock(list->kl_lockarg); 
 
 	/*
-	 * If we unlock the list lock (and set KN_INFLUX), we can
+	 * If we unlock the list lock (and enter influx), we can
 	 * eliminate the kqueue scheduling, but this will introduce
 	 * four lock/unlock's for each knote to test.  Also, marker
 	 * would be needed to keep iteration position, since filters
@@ -2030,7 +2060,7 @@ knote(struct knlist *list, long hint, in
 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
-		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
+		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			/*
 			 * Do not process the influx notes, except for
 			 * the influx coming from the kq unlock in the
@@ -2041,14 +2071,11 @@ knote(struct knlist *list, long hint, in
 			 */
 			KQ_UNLOCK(kq);
 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
-			own_influx = (kn->kn_status & KN_INFLUX) == 0;
-			if (own_influx)
-				kn->kn_status |= KN_INFLUX;
+			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			error = kn->kn_fop->f_event(kn, hint);
 			KQ_LOCK(kq);
-			if (own_influx)
-				kn->kn_status &= ~KN_INFLUX;
+			kn_leave_flux(kn);
 			if (error)
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK_FLUX(kq);
@@ -2070,10 +2097,12 @@ knote(struct knlist *list, long hint, in
 void
 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
 {
+
 	KNL_ASSERT_LOCK(knl, islocked);
 	KQ_NOTOWNED(kn->kn_kq);
-	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
-	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
+	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
+	KASSERT((kn->kn_status & KN_DETACHED) != 0,
+	    ("knote %p was not detached", kn));
 	if (!islocked)
 		knl->kl_lock(knl->kl_lockarg);
 	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
@@ -2089,12 +2118,13 @@ static void
 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
     int kqislocked)
 {
-	KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
+
+	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
 	KNL_ASSERT_LOCK(knl, knlislocked);
 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
-	if (!kqislocked)
-		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
-    ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
+	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
+	KASSERT((kn->kn_status & KN_DETACHED) == 0,
+	    ("knote %p was already detached", kn));
 	if (!knlislocked)
 		knl->kl_lock(knl->kl_lockarg);
 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
@@ -2287,30 +2317,30 @@ again:		/* need to reacquire lock since 
 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
-		if ((kn->kn_status & KN_INFLUX)) {
+		if (kn_in_flux(kn)) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
 		knlist_remove_kq(knl, kn, 1, 1);
 		if (killkn) {
-			kn->kn_status |= KN_INFLUX | KN_DETACHED;
+			kn->kn_status |= KN_DETACHED;
+			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			knote_drop(kn, td);
 		} else {
 			/* Make sure cleared knotes disappear soon */
-			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+			kn->kn_flags |= EV_EOF | EV_ONESHOT;
 			KQ_UNLOCK(kq);
 		}
 		kq = NULL;
 	}
 
 	if (!SLIST_EMPTY(&knl->kl_list)) {
-		/* there are still KN_INFLUX remaining */
+		/* there are still in flux knotes remaining */
 		kn = SLIST_FIRST(&knl->kl_list);
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
-		KASSERT(kn->kn_status & KN_INFLUX,
-		    ("knote removed w/o list lock"));
+		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
 		knl->kl_unlock(knl->kl_lockarg);
 		kq->kq_state |= KQ_FLUXWAIT;
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
@@ -2352,7 +2382,7 @@ again:
 		influx = 0;
 		while (kq->kq_knlistsize > fd &&
 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
-			if (kn->kn_status & KN_INFLUX) {
+			if (kn_in_flux(kn)) {
 				/* someone else might be waiting on our knote */
 				if (influx)
 					wakeup(kq);
@@ -2360,7 +2390,7 @@ again:
 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
 				goto again;
 			}
-			kn->kn_status |= KN_INFLUX;
+			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			if (!(kn->kn_status & KN_DETACHED))
 				kn->kn_fop->f_detach(kn);
@@ -2377,7 +2407,7 @@ knote_attach(struct knote *kn, struct kq
 {
 	struct klist *list;
 
-	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
+	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
 	KQ_OWNED(kq);
 
 	if (kn->kn_fop->f_isfd) {
@@ -2395,7 +2425,7 @@ knote_attach(struct knote *kn, struct kq
 
 /*
  * knote must already have been detached using the f_detach method.
- * no lock need to be held, it is assumed that the KN_INFLUX flag is set
+ * no lock need to be held, it is assumed that the influx state is set
  * to prevent other removal.
  */
 static void
@@ -2407,10 +2437,10 @@ knote_drop(struct knote *kn, struct thre
 	kq = kn->kn_kq;
 
 	KQ_NOTOWNED(kq);
-	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
-	    ("knote_drop called without KN_INFLUX set in kn_status"));
-
 	KQ_LOCK(kq);
+	KASSERT(kn->kn_influx == 1,
+	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));
+
 	if (kn->kn_fop->f_isfd)
 		list = &kq->kq_knlist[kn->kn_id];
 	else

Modified: head/sys/sys/event.h
==============================================================================
--- head/sys/sys/event.h	Mon Dec 26 19:29:04 2016	(r310616)
+++ head/sys/sys/event.h	Mon Dec 26 19:33:40 2016	(r310617)
@@ -202,8 +202,11 @@ struct filterops {
 };
 
 /*
- * Setting the KN_INFLUX flag enables you to unlock the kq that this knote
- * is on, and modify kn_status as if you had the KQ lock.
+ * An in-flux knote cannot be dropped from its kq while the kq is
+ * unlocked.  If the KN_SCAN flag is not set, a thread can only set
+ * kn_influx when it is exclusive owner of the knote state, and can
+ * modify kn_status as if it had the KQ lock.  KN_SCAN must not be set
+ * on a knote which is already in flux.
  *
  * kn_sfflags, kn_sdata, and kn_kevent are protected by the knlist lock.
  */
@@ -214,16 +217,18 @@ struct knote {
 	TAILQ_ENTRY(knote)	kn_tqe;
 	struct			kqueue *kn_kq;	/* which queue we are on */
 	struct 			kevent kn_kevent;
+	void			*kn_hook;
+	int			kn_hookid;
 	int			kn_status;	/* protected by kq lock */
 #define KN_ACTIVE	0x01			/* event has been triggered */
 #define KN_QUEUED	0x02			/* event is on queue */
 #define KN_DISABLED	0x04			/* event is disabled */
 #define KN_DETACHED	0x08			/* knote is detached */
-#define KN_INFLUX	0x10			/* knote is in flux */
 #define KN_MARKER	0x20			/* ignore this knote */
 #define KN_KQUEUE	0x40			/* this knote belongs to a kq */
 #define KN_HASKQLOCK	0x80			/* for _inevent */
 #define	KN_SCAN		0x100			/* flux set in kqueue_scan() */
+	int			kn_influx;
 	int			kn_sfflags;	/* saved filter flags */
 	intptr_t		kn_sdata;	/* saved data field */
 	union {
@@ -234,8 +239,6 @@ struct knote {
 		void		*p_v;		/* generic other pointer */
 	} kn_ptr;
 	struct			filterops *kn_fop;
-	void			*kn_hook;
-	int			kn_hookid;
 
 #define kn_id		kn_kevent.ident
 #define kn_filter	kn_kevent.filter



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201612261933.uBQJXe7T071333>