Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 15 Apr 2004 23:40:28 -0400
From:      "Brian F. Feldman" <green@freebsd.org>
To:        John-Mark Gurney <gurney_j@efn.org>, Robert Watson <rwatson@freebsd.org>, Seigo Tanimura <tanimura@tanimura.dyndns.org>, Pawel Jakub Dawidek <pjd@freebsd.org>, freebsd-arch@freebsd.org
Subject:    Re: locking down kqueue (was some other completely unrelated topic) 
Message-ID:  <200404160340.i3G3eTKi004092@green.homeunix.org>
In-Reply-To: Message from John-Mark Gurney <gurney_j@efn.org>  of "Thu, 15 Apr 2004 15:39:20 PDT." <20040415223920.GT567@funkthat.com> 

next in thread | previous in thread | raw e-mail | index | archive | help
BTW, I'll enclose the current (broken) implementation.  It crashes during 
boot, which I really can't track down due to not having a serial console, but 
other than that, the only KNOWN issue with the design should be the fact 
that in many places where KNOTE() is called, it is still called with locks held.

Index: cam/scsi/scsi_target.c
===================================================================
RCS file: /usr/ncvs/src/sys/cam/scsi/scsi_target.c,v
retrieving revision 1.60
diff -u -r1.60 scsi_target.c
--- cam/scsi/scsi_target.c	21 Feb 2004 21:10:39 -0000	1.60
+++ cam/scsi/scsi_target.c	15 Apr 2004 18:32:10 -0000
@@ -337,7 +337,7 @@
 	kn->kn_hook = (caddr_t)softc;
 	kn->kn_fop = &targread_filtops;
 	TARG_LOCK(softc);
-	SLIST_INSERT_HEAD(&softc->read_select.si_note, kn, kn_selnext);
+	klist_add(&softc->read_select.si_note, kn);
 	TARG_UNLOCK(softc);
 	return (0);
 }
@@ -349,7 +349,7 @@
 
 	softc = (struct targ_softc *)kn->kn_hook;
 	TARG_LOCK(softc);
-	SLIST_REMOVE(&softc->read_select.si_note, kn, knote, kn_selnext);
+	klist_remove(&softc->read_select.si_note, kn);
 	TARG_UNLOCK(softc);
 }
 
Index: fs/fifofs/fifo_vnops.c
===================================================================
RCS file: /usr/ncvs/src/sys/fs/fifofs/fifo_vnops.c,v
retrieving revision 1.92
diff -u -r1.92 fifo_vnops.c
--- fs/fifofs/fifo_vnops.c	31 Mar 2004 01:41:29 -0000	1.92
+++ fs/fifofs/fifo_vnops.c	15 Apr 2004 18:32:30 -0000
@@ -407,6 +407,8 @@
 	return (0);
 }
 
+/* XXX None of the kqueue functions do their own klist/socket locking. */
+
 /* ARGSUSED */
 static int
 fifo_kqfilter(ap)
@@ -436,7 +438,7 @@
 
 	ap->a_kn->kn_hook = (caddr_t)so;
 
-	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, ap->a_kn, kn_selnext);
+	klist_add(&sb->sb_sel.si_note, ap->a_kn);
 	sb->sb_flags |= SB_KNOTE;
 
 	return (0);
@@ -447,7 +449,7 @@
 {
 	struct socket *so = (struct socket *)kn->kn_hook;
 
-	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
+	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
 	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
 		so->so_rcv.sb_flags &= ~SB_KNOTE;
 }
@@ -471,7 +473,7 @@
 {
 	struct socket *so = (struct socket *)kn->kn_hook;
 
-	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
+	klist_remove(&so->so_snd.sb_sel.si_note, kn);
 	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
 		so->so_snd.sb_flags &= ~SB_KNOTE;
 }
Index: gnu/ext2fs/ext2_vnops.c
===================================================================
RCS file: /usr/ncvs/src/sys/gnu/ext2fs/ext2_vnops.c,v
retrieving revision 1.82
diff -u -r1.82 ext2_vnops.c
--- gnu/ext2fs/ext2_vnops.c	11 Mar 2004 16:33:10 -0000	1.82
+++ gnu/ext2fs/ext2_vnops.c	15 Apr 2004 18:31:10 -0000
@@ -1899,7 +1899,7 @@
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
-	SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext);
+	klist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 
 	return (0);
@@ -1912,8 +1912,7 @@
 
 	KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo"));
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
-	SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note,
-	    kn, knote, kn_selnext);
+	klist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 }
 
Index: kern/kern_event.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/kern_event.c,v
retrieving revision 1.67
diff -u -r1.67 kern_event.c
--- kern/kern_event.c	20 Feb 2004 04:00:48 -0000	1.67
+++ kern/kern_event.c	16 Apr 2004 03:06:01 -0000
@@ -58,7 +58,6 @@
 static int	kqueue_scan(struct file *fp, int maxevents,
 		    struct kevent *ulistp, const struct timespec *timeout,
 		    struct thread *td);
-static void 	kqueue_wakeup(struct kqueue *kq);
 
 static fo_rdwr_t	kqueue_read;
 static fo_rdwr_t	kqueue_write;
@@ -78,10 +77,12 @@
 	.fo_close = kqueue_close,
 };
 
-static void 	knote_attach(struct knote *kn, struct filedesc *fdp);
-static void 	knote_drop(struct knote *kn, struct thread *td);
-static void 	knote_enqueue(struct knote *kn);
-static void 	knote_dequeue(struct knote *kn);
+static void 	knote_attach(struct kqueue *kq, struct knote *kn,
+	    struct filedesc *fdp);
+static void 	knote_drop(struct kqueue *kq, struct knote *kn,
+	    struct thread *td);
+static void 	knote_enqueue(struct kqueue *kq, struct knote *kn);
+static void 	knote_dequeue(struct kqueue *kq, struct knote *kn);
 static void 	knote_init(void);
 static struct 	knote *knote_alloc(void);
 static void 	knote_free(struct knote *kn);
@@ -107,20 +108,47 @@
 	{ 0, filt_timerattach, filt_timerdetach, filt_timer };
 
 static uma_zone_t	knote_zone;
+static struct mtx	klist_mtx;
+static struct mtx	knote_mtx;
 static int 		kq_ncallouts = 0;
 static int 		kq_calloutmax = (4 * 1024);
 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 
-#define KNOTE_ACTIVATE(kn) do { 					\
+#define KNOTE_ACTIVATE(kn, enqueued) do {				\
 	kn->kn_status |= KN_ACTIVE;					\
-	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
-		knote_enqueue(kn);					\
+	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {		\
+		knote_enqueue(kn->kn_kq, kn);				\
+		enqueued = 1;						\
+	}								\
 } while(0)
 
 #define	KN_HASHSIZE		64		/* XXX should be tunable */
 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
 
+void
+klist_add(struct klist *list, struct knote *note)
+{	/* Link note at the head of list through its kn_selnext entry. */
+	mtx_assert(&klist_mtx, MA_OWNED);	/* caller holds klist_mtx */
+	SLIST_INSERT_HEAD(list, note, kn_selnext);
+}
+
+void
+klist_remove(struct klist *list, struct knote *note)
+{	/* Unlink note from list (linked through kn_selnext). */
+	mtx_assert(&klist_mtx, MA_OWNED);	/* caller holds klist_mtx */
+	SLIST_REMOVE(list, note, knote, kn_selnext);
+}
+
+void
+klist_disappearing(struct klist *list)
+{	/* Empty list; knotes are only unlinked here, not detached or freed. */
+	mtx_lock(&klist_mtx);		/* takes klist_mtx itself */
+	while (SLIST_FIRST(list))
+		SLIST_REMOVE_HEAD(list, kn_selnext);
+	mtx_unlock(&klist_mtx);
+}
+
 static int
 filt_nullattach(struct knote *kn)
 {
@@ -164,7 +192,7 @@
 		return (1);
 
 	kn->kn_fop = &kqread_filtops;
-	SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
+	klist_add(&kq->kq_sel.si_note, kn);
 	return (0);
 }
 
@@ -173,7 +201,7 @@
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
-	SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
+	klist_remove(&kq->kq_sel.si_note, kn);
 }
 
 /*ARGSUSED*/
@@ -205,6 +233,8 @@
 		PROC_UNLOCK(p);
 		return (error);
 	}
+	PHOLD(p);
+	PROC_UNLOCK(p);
 
 	kn->kn_ptr.p_proc = p;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
@@ -219,17 +249,18 @@
 	}
 
 	if (immediate == 0)
-		SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
+		klist_add(&p->p_klist, kn);
+
+	PRELE(p);
 
 	/*
 	 * Immediately activate any exit notes if the target process is a
 	 * zombie.  This is necessary to handle the case where the target
 	 * process, e.g. a child, dies before the kevent is registered.
+	 * The side-effect of filt_proc() here is KNOTE_ACTIVATE().
 	 */
-	if (immediate && filt_proc(kn, NOTE_EXIT))
-		KNOTE_ACTIVATE(kn);
-
-	PROC_UNLOCK(p);
+	if (immediate)
+		(void)filt_proc(kn, NOTE_EXIT);
 
 	return (0);
 }
@@ -246,13 +277,16 @@
 filt_procdetach(struct knote *kn)
 {
 	struct proc *p = kn->kn_ptr.p_proc;
+	struct knote *ckn;
 
+	while ((ckn = SLIST_FIRST(&kn_forklist(kn))) != NULL) {
+		SLIST_REMOVE_HEAD(&kn_forklist(kn), kn_link);
+		knote_free(ckn);
+	}
 	if (kn->kn_status & KN_DETACHED)
 		return;
 
-	PROC_LOCK(p);
-	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
-	PROC_UNLOCK(p);
+	klist_remove(&p->p_klist, kn);
 }
 
 static int
@@ -285,24 +319,28 @@
 	 * so attach a new knote to it, and immediately report an
 	 * event with the parent's pid.
 	 */
-	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
-		struct kevent kev;
-		int error;
+	if (event == NOTE_FORK && kn->kn_sfflags & NOTE_TRACK) {
+		struct knote *ckn;
 
-		/*
-		 * register knote with new process.
-		 */
-		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
-		kev.filter = kn->kn_filter;
-		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
-		kev.fflags = kn->kn_sfflags;
-		kev.data = kn->kn_id;			/* parent */
-		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
-		error = kqueue_register(kn->kn_kq, &kev, NULL);
-		if (error)
+		/* Lazy-attach this new knote. */
+		ckn = knote_alloc();
+		if (ckn == NULL) {
 			kn->kn_fflags |= NOTE_TRACKERR;
+			goto out;
+		}
+		ckn->kn_kq = kn->kn_kq;
+		ckn->kn_fop = kn->kn_fop;
+		ckn->kn_sfflags = kn->kn_sfflags;
+		ckn->kn_sdata = kn->kn_id;		/* parent */
+		ckn->kn_id = hint & NOTE_PDATAMASK;	/* pid */
+		ckn->kn_filter = kn->kn_filter;
+		ckn->kn_flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
+		ckn->kn_kevent.udata =			/* preserve udata */
+		    kn->kn_kevent.udata;
+		SLIST_INSERT_HEAD(&kn_forklist(kn), ckn, kn_link);
 	}
 
+out:
 	return (kn->kn_fflags != 0);
 }
 
@@ -312,10 +350,13 @@
 	struct knote *kn = knx;
 	struct callout *calloutp;
 	struct timeval tv;
-	int tticks;
+	int tticks, enqueued = 0;
 
+	mtx_lock(&klist_mtx);
+	mtx_lock(&kn->kn_kq->kq_mtx);
+	mtx_lock(&knote_mtx);
 	kn->kn_data++;
-	KNOTE_ACTIVATE(kn);
+	KNOTE_ACTIVATE(kn, enqueued);
 
 	if ((kn->kn_flags & EV_ONESHOT) == 0) {
 		tv.tv_sec = kn->kn_sdata / 1000;
@@ -324,6 +365,11 @@
 		calloutp = (struct callout *)kn->kn_hook;
 		callout_reset(calloutp, tticks, filt_timerexpire, kn);
 	}
+	mtx_unlock(&knote_mtx);
+	mtx_unlock(&kn->kn_kq->kq_mtx);
+	mtx_unlock(&klist_mtx);
+	if (enqueued)
+		KNOTE(&kn->kn_kq->kq_sel.si_note, 0);
 }
 
 /*
@@ -338,7 +384,7 @@
 
 	if (kq_ncallouts >= kq_calloutmax)
 		return (ENOMEM);
-	kq_ncallouts++;
+	kq_ncallouts++;			/* protected by klist_mtx */
 
 	tv.tv_sec = kn->kn_sdata / 1000;
 	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
@@ -362,7 +408,7 @@
 	calloutp = (struct callout *)kn->kn_hook;
 	callout_stop(calloutp);
 	FREE(calloutp, M_KQUEUE);
-	kq_ncallouts--;
+	kq_ncallouts--;			/* protected by klist_mtx */
 }
 
 static int
@@ -383,7 +429,6 @@
 	struct file *fp;
 	int fd, error;
 
-	mtx_lock(&Giant);
 	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd);
 	if (error)
@@ -391,6 +436,8 @@
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&kq->kq_head);
+	mtx_init(&kq->kq_mtx, "kqueue mutex", NULL, MTX_DEF);
+	kq->kq_fdp = fdp;
 	FILE_LOCK(fp);
 	fp->f_flag = FREAD | FWRITE;
 	fp->f_type = DTYPE_KQUEUE;
@@ -403,9 +450,7 @@
 	if (fdp->fd_knlistsize < 0)
 		fdp->fd_knlistsize = 0;		/* this process has a kq */
 	FILEDESC_UNLOCK(fdp);
-	kq->kq_fdp = fdp;
 done2:
-	mtx_unlock(&Giant);
 	return (error);
 }
 
@@ -425,7 +470,7 @@
 int
 kevent(struct thread *td, struct kevent_args *uap)
 {
-	struct kevent *kevp;
+	struct kevent kqkev[KQ_NEVENTS], *kevp;
 	struct kqueue *kq;
 	struct file *fp;
 	struct timespec ts;
@@ -440,22 +485,21 @@
 	if (uap->timeout != NULL) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
-			goto done_nogiant;
+			goto done;
 		uap->timeout = &ts;
 	}
-	mtx_lock(&Giant);
 
 	kq = fp->f_data;
 	nerrors = 0;
 
 	while (uap->nchanges > 0) {
 		n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
-		error = copyin(uap->changelist, kq->kq_kev,
+		error = copyin(uap->changelist, kqkev,
 		    n * sizeof(struct kevent));
 		if (error)
 			goto done;
 		for (i = 0; i < n; i++) {
-			kevp = &kq->kq_kev[i];
+			kevp = &kqkev[i];
 			kevp->flags &= ~EV_SYSFLAGS;
 			error = kqueue_register(kq, kevp, td);
 			if (error) {
@@ -484,8 +528,6 @@
 
 	error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td);
 done:
-	mtx_unlock(&Giant);
-done_nogiant:
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
@@ -528,7 +570,7 @@
 	struct filterops *fops;
 	struct file *fp = NULL;
 	struct knote *kn = NULL;
-	int s, error = 0;
+	int error, enqueued;
 
 	if (kev->filter < 0) {
 		if (kev->filter + EVFILT_SYSCOUNT < 0)
@@ -544,6 +586,12 @@
 		return (EINVAL);
 	}
 
+	enqueued = 0;
+top:
+	error = 0;
+	mtx_lock(&klist_mtx);
+	mtx_lock(&kq->kq_mtx);
+	mtx_lock(&knote_mtx);
 	FILEDESC_LOCK(fdp);
 	if (fops->f_isfd) {
 		/* validate descriptor */
@@ -575,7 +623,18 @@
 	}
 	FILEDESC_UNLOCK(fdp);
 
-	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
+	/*
+	 * We came from below: EV_ADD enqueued a knote immediately.
+	 */
+	if (enqueued) {
+		enqueued = 0;
+		if (kn == NULL) {
+			error = ESRCH;
+			goto done;
+		}
+		goto onceagain;
+	}
+	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
 		error = ENOENT;
 		goto done;
 	}
@@ -607,9 +666,9 @@
 			kev->data = 0;
 			kn->kn_kevent = *kev;
 
-			knote_attach(kn, fdp);
+			knote_attach(kq, kn, fdp);
 			if ((error = fops->f_attach(kn)) != 0) {
-				knote_drop(kn, td);
+				knote_drop(kq, kn, td);
 				goto done;
 			}
 		} else {
@@ -623,36 +682,46 @@
 			kn->kn_kevent.udata = kev->udata;
 		}
 
-		s = splhigh();
-		if (kn->kn_fop->f_event(kn, 0))
-			KNOTE_ACTIVATE(kn);
-		splx(s);
+		if (kn->kn_fop->f_event(kn, 0)) {
+			KNOTE_ACTIVATE(kn, enqueued);
+			if (enqueued) {
+				if (fp != NULL)
+					fdrop(fp, td);
+				mtx_unlock(&knote_mtx);
+				mtx_unlock(&kq->kq_mtx);
+				mtx_unlock(&klist_mtx);
+				KNOTE(&kq->kq_sel.si_note, 0);
+				goto top;
+			}
+		}
 
 	} else if (kev->flags & EV_DELETE) {
 		kn->kn_fop->f_detach(kn);
-		knote_drop(kn, td);
+		knote_drop(kq, kn, td);
 		goto done;
 	}
 
+onceagain:
 	if ((kev->flags & EV_DISABLE) &&
-	    ((kn->kn_status & KN_DISABLED) == 0)) {
-		s = splhigh();
+	    ((kn->kn_status & KN_DISABLED) == 0))
 		kn->kn_status |= KN_DISABLED;
-		splx(s);
-	}
-
 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
-		s = splhigh();
 		kn->kn_status &= ~KN_DISABLED;
 		if ((kn->kn_status & KN_ACTIVE) &&
-		    ((kn->kn_status & KN_QUEUED) == 0))
-			knote_enqueue(kn);
-		splx(s);
+		    ((kn->kn_status & KN_QUEUED) == 0)) {
+			knote_enqueue(kn->kn_kq, kn);
+			enqueued = 1;
+		}
 	}
 
 done:
 	if (fp != NULL)
 		fdrop(fp, td);
+	mtx_unlock(&knote_mtx);
+	mtx_unlock(&kq->kq_mtx);
+	mtx_unlock(&klist_mtx);
+	if (enqueued)
+		KNOTE(&kq->kq_sel.si_note, 0);
 	return (error);
 }
 
@@ -661,10 +730,11 @@
 	const struct timespec *tsp, struct thread *td)
 {
 	struct kqueue *kq;
-	struct kevent *kevp;
+	struct kevent kqkev[KQ_NEVENTS], *kevp;
 	struct timeval atv, rtv, ttv;
-	struct knote *kn, marker;
-	int s, count, timeout, nkev = 0, error = 0;
+	struct knote *kn, *ckn;
+	u_int gen;
+	int count, timeout, nkev = 0, error = 0;
 
 	FILE_LOCK_ASSERT(fp, MA_NOTOWNED);
 
@@ -705,16 +775,17 @@
 	}
 
 start:
-	kevp = kq->kq_kev;
-	s = splhigh();
+	kevp = kqkev;
+	mtx_lock(&kq->kq_mtx);
 	if (kq->kq_count == 0) {
 		if (timeout < 0) { 
 			error = EWOULDBLOCK;
+			mtx_unlock(&kq->kq_mtx);
 		} else {
 			kq->kq_state |= KQ_SLEEP;
-			error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
+			error = msleep(kq, &kq->kq_mtx, PSOCK | PCATCH | PDROP,
+			    "kqread", timeout);
 		}
-		splx(s);
 		if (error == 0)
 			goto retry;
 		/* don't restart after signals... */
@@ -725,63 +796,129 @@
 		goto done;
 	}
 
-	TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); 
-	while (count) {
-		kn = TAILQ_FIRST(&kq->kq_head);
-		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 
-		if (kn == &marker) {
-			splx(s);
+	for (kn = TAILQ_FIRST(&kq->kq_head); count != 0;
+	    kn = TAILQ_NEXT(kn, kn_tqe)) {
+		if (kn == NULL) {
+			mtx_unlock(&kq->kq_mtx);
 			if (count == maxevents)
 				goto retry;
 			goto done;
 		}
 		if (kn->kn_status & KN_DISABLED) {
 			kn->kn_status &= ~KN_QUEUED;
-			kq->kq_count--;
 			continue;
 		}
-		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
-		    kn->kn_fop->f_event(kn, 0) == 0) {
-			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-			kq->kq_count--;
-			continue;
+		if ((kn->kn_flags & EV_ONESHOT) == 0) {
+			mtx_lock(&knote_mtx);
+			if (kn->kn_fop->f_event(kn, 0) == 0) {
+				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+				mtx_unlock(&knote_mtx);
+				continue;
+			}
+			mtx_unlock(&knote_mtx);
 		}
-		*kevp = kn->kn_kevent;
-		kevp++;
-		nkev++;
 		if (kn->kn_flags & EV_ONESHOT) {
+			gen = kq->kq_dqgen;
+			mtx_unlock(&kq->kq_mtx);
+			mtx_lock(&klist_mtx);
+			mtx_lock(&kq->kq_mtx);
+			if (gen != kq->kq_dqgen ||
+			    !(kn->kn_status & KN_QUEUED)) {
+				mtx_unlock(&klist_mtx);
+				goto retry;
+			}
+			*kevp = kn->kn_kevent;
+			kevp++;
+			nkev++;
+			count--;
+			mtx_lock(&knote_mtx);
 			kn->kn_status &= ~KN_QUEUED;
-			kq->kq_count--;
-			splx(s);
 			kn->kn_fop->f_detach(kn);
-			knote_drop(kn, td);
-			s = splhigh();
-		} else if (kn->kn_flags & EV_CLEAR) {
-			kn->kn_data = 0;
-			kn->kn_fflags = 0;
-			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
-			kq->kq_count--;
+			knote_drop(kq, kn, td);
+			mtx_unlock(&knote_mtx);
+			mtx_unlock(&klist_mtx);
 		} else {
-			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 
+			*kevp = kn->kn_kevent;
+			kevp++;
+			nkev++;
+			count--;
+			if (kn->kn_flags & EV_CLEAR) {
+				mtx_lock(&knote_mtx);
+				kn->kn_data = 0;
+				kn->kn_fflags = 0;
+				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+				mtx_unlock(&knote_mtx);
+			}
 		}
-		count--;
 		if (nkev == KQ_NEVENTS) {
-			splx(s);
-			error = copyout(&kq->kq_kev, ulistp,
+			error = copyout(kqkev, ulistp,
 			    sizeof(struct kevent) * nkev);
 			ulistp += nkev;
 			nkev = 0;
-			kevp = kq->kq_kev;
-			s = splhigh();
+			kevp = kqkev;
 			if (error)
 				break;
 		}
 	}
-	TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); 
-	splx(s);
+	mtx_unlock(&kq->kq_mtx);
 done:
+	mtx_lock(&klist_mtx);
+	mtx_lock(&kq->kq_mtx);
+	mtx_lock(&knote_mtx);
+	for (kn = TAILQ_FIRST(&kq->kq_head); kn; kn = TAILQ_NEXT(kn, kn_tqe)) {
+		if (kn->kn_filter != EVFILT_PROC)
+			continue;
+		/*
+		 * This is a pretty crappy implementation of
+		 * kqueue_register() :-(
+		 */
+		while ((ckn = SLIST_FIRST(&kn_forklist(kn))) != NULL) {
+			struct knote *lkn;
+
+			SLIST_REMOVE_HEAD(&kn_forklist(kn), kn_link);
+			lkn = NULL;
+			FILEDESC_LOCK(kq->kq_fdp);
+			if (kq->kq_fdp->fd_knhashmask != 0) {
+				struct klist *list;
+			
+				list = &kq->kq_fdp->fd_knhash[
+				    KN_HASH((u_long)ckn->kn_id,
+				    kq->kq_fdp->fd_knhashmask)];
+				SLIST_FOREACH(lkn, list, kn_link)
+					if (ckn->kn_id == lkn->kn_id &&
+					    kq == lkn->kn_kq &&
+					    EVFILT_PROC == lkn->kn_filter)
+						break;
+			}
+			FILEDESC_UNLOCK(kq->kq_fdp);
+			if (lkn != NULL) {
+				lkn->kn_sfflags = ckn->kn_sfflags;
+				lkn->kn_sdata = ckn->kn_sdata;
+				lkn->kn_kevent.udata = ckn->kn_kevent.udata;
+				knote_free(ckn);
+			} else {
+				knote_attach(kq, ckn, kq->kq_fdp);
+				if ((error = ckn->kn_fop->f_attach(ckn)) != 0) {
+					knote_drop(kq, ckn, td);
+					for (kevp = kqkev; kevp < &kqkev[nkev];
+					    kevp++) {
+						if (kevp->ident == kn->kn_id &&
+						    kevp->filter ==
+						    EVFILT_PROC) {
+							kn->kn_fflags |=
+								NOTE_TRACKERR;
+							break;
+						}
+					}
+				}
+			}
+		}
+	}
+	mtx_unlock(&knote_mtx);
+	mtx_unlock(&kq->kq_mtx);
+	mtx_unlock(&klist_mtx);
 	if (nkev != 0)
-		error = copyout(&kq->kq_kev, ulistp,
+		error = copyout(kqkev, ulistp,
 		    sizeof(struct kevent) * nkev);
         td->td_retval[0] = maxevents - count;
 	return (error);
@@ -822,9 +959,9 @@
 {
 	struct kqueue *kq;
 	int revents = 0;
-	int s = splnet();
 
 	kq = fp->f_data;
+	mtx_lock(&kq->kq_mtx);
         if (events & (POLLIN | POLLRDNORM)) {
                 if (kq->kq_count) {
                         revents |= events & (POLLIN | POLLRDNORM);
@@ -833,7 +970,7 @@
 			kq->kq_state |= KQ_SEL;
 		}
 	}
-	splx(s);
+	mtx_unlock(&kq->kq_mtx);
 	return (revents);
 }
 
@@ -846,7 +983,9 @@
 
 	kq = fp->f_data;
 	bzero((void *)st, sizeof(*st));
+	mtx_lock(&kq->kq_mtx);
 	st->st_size = kq->kq_count;
+	mtx_unlock(&kq->kq_mtx);
 	st->st_blksize = sizeof(struct kevent);
 	st->st_mode = S_IFIFO;
 	return (0);
@@ -858,75 +997,64 @@
 {
 	struct kqueue *kq = fp->f_data;
 	struct filedesc *fdp = kq->kq_fdp;
-	struct knote **knp, *kn, *kn0;
+	struct knote *kn;
 	int i;
 
+restart:
+	mtx_lock(&klist_mtx);
+	mtx_lock(&kq->kq_mtx);
 	FILEDESC_LOCK(fdp);
 	for (i = 0; i < fdp->fd_knlistsize; i++) {
-		knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
-		kn = *knp;
-		while (kn != NULL) {
-			kn0 = SLIST_NEXT(kn, kn_link);
+		for (kn = SLIST_FIRST(&fdp->fd_knlist[i]); kn;) {
 			if (kq == kn->kn_kq) {
+				mtx_lock(&knote_mtx);
 				kn->kn_fop->f_detach(kn);
-				*knp = kn0;
+				mtx_unlock(&knote_mtx);
 				FILE_LOCK(kn->kn_fp);
 				FILEDESC_UNLOCK(fdp);
 				fdrop_locked(kn->kn_fp, td);
 				knote_free(kn);
-				FILEDESC_LOCK(fdp);
+				mtx_unlock(&kq->kq_mtx);
+				mtx_unlock(&klist_mtx);
+				goto restart;
 			} else {
-				knp = &SLIST_NEXT(kn, kn_link);
+				kn = SLIST_NEXT(kn, kn_link);
 			}
-			kn = kn0;
 		}
 	}
 	if (fdp->fd_knhashmask != 0) {
 		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
-			knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
-			kn = *knp;
-			while (kn != NULL) {
-				kn0 = SLIST_NEXT(kn, kn_link);
+			for (kn = SLIST_FIRST(&fdp->fd_knhash[i]); kn;) {
 				if (kq == kn->kn_kq) {
+					mtx_lock(&knote_mtx);
 					kn->kn_fop->f_detach(kn);
-					*knp = kn0;
+					mtx_unlock(&knote_mtx);
 		/* XXX non-fd release of kn->kn_ptr */
 					FILEDESC_UNLOCK(fdp);
 					knote_free(kn);
-					FILEDESC_LOCK(fdp);
+					mtx_unlock(&kq->kq_mtx);
+					mtx_unlock(&klist_mtx);
+					goto restart;
 				} else {
-					knp = &SLIST_NEXT(kn, kn_link);
+					kn = SLIST_NEXT(kn, kn_link);
 				}
-				kn = kn0;
 			}
 		}
 	}
 	FILEDESC_UNLOCK(fdp);
+	mtx_unlock(&klist_mtx);
 	if (kq->kq_state & KQ_SEL) {
 		kq->kq_state &= ~KQ_SEL;
 		selwakeuppri(&kq->kq_sel, PSOCK);
 	}
+	mtx_unlock(&kq->kq_mtx);
+	mtx_destroy(&kq->kq_mtx);
 	free(kq, M_KQUEUE);
 	fp->f_data = NULL;
 
 	return (0);
 }
 
-static void
-kqueue_wakeup(struct kqueue *kq)
-{
-
-	if (kq->kq_state & KQ_SLEEP) {
-		kq->kq_state &= ~KQ_SLEEP;
-		wakeup(kq);
-	}
-	if (kq->kq_state & KQ_SEL) {
-		kq->kq_state &= ~KQ_SEL;
-		selwakeuppri(&kq->kq_sel, PSOCK);
-	}
-	KNOTE(&kq->kq_sel.si_note, 0);
-}
-
 /*
  * walk down a list of knotes, activating them if their event has triggered.
  */
@@ -935,9 +1063,26 @@
 {
 	struct knote *kn;
 
-	SLIST_FOREACH(kn, list, kn_selnext)
-		if (kn->kn_fop->f_event(kn, hint))
-			KNOTE_ACTIVATE(kn);
+top:
+	mtx_lock(&klist_mtx);
+	SLIST_FOREACH(kn, list, kn_selnext) {
+		mtx_lock(&kn->kn_kq->kq_mtx);
+		mtx_lock(&knote_mtx);
+		if (kn->kn_fop->f_event(kn, hint)) {
+			int enqueued = 0;
+
+			KNOTE_ACTIVATE(kn, enqueued);
+			if (enqueued) {
+				mtx_unlock(&knote_mtx);
+				mtx_unlock(&kn->kn_kq->kq_mtx);
+				mtx_unlock(&klist_mtx);
+				goto top;
+			}
+		}
+		mtx_unlock(&knote_mtx);
+		mtx_unlock(&kn->kn_kq->kq_mtx);
+	}
+	mtx_unlock(&klist_mtx);
 }
 
 /*
@@ -947,11 +1092,19 @@
 knote_remove(struct thread *td, struct klist *list)
 {
 	struct knote *kn;
+	struct mtx *kqmtx;
 
+	mtx_lock(&klist_mtx);
 	while ((kn = SLIST_FIRST(list)) != NULL) {
+		kqmtx = &kn->kn_kq->kq_mtx;
+		mtx_lock(kqmtx);
+		mtx_lock(&knote_mtx);
 		kn->kn_fop->f_detach(kn);
-		knote_drop(kn, td);
+		knote_drop(kn->kn_kq, kn, td);
+		mtx_unlock(&knote_mtx);
+		mtx_unlock(kqmtx);
 	}
+	mtx_unlock(&klist_mtx);
 }
 
 /*
@@ -970,7 +1123,7 @@
 }
 
 static void
-knote_attach(struct knote *kn, struct filedesc *fdp)
+knote_attach(struct kqueue *kq, struct knote *kn, struct filedesc *fdp)
 {
 	struct klist *list, *tmp_knhash;
 	u_long tmp_knhashmask;
@@ -1026,12 +1179,8 @@
 	kn->kn_status = 0;
 }
 
-/*
- * should be called at spl == 0, since we don't want to hold spl
- * while calling fdrop and free.
- */
 static void
-knote_drop(struct knote *kn, struct thread *td)
+knote_drop(struct kqueue *kq, struct knote *kn, struct thread *td)
 {
         struct filedesc *fdp = td->td_proc->p_fd;
 	struct klist *list;
@@ -1047,7 +1196,7 @@
 
 	SLIST_REMOVE(list, kn, knote, kn_link);
 	if (kn->kn_status & KN_QUEUED)
-		knote_dequeue(kn);
+		knote_dequeue(kn->kn_kq, kn);
 	if (kn->kn_fop->f_isfd)
 		fdrop_locked(kn->kn_fp, td);
 	knote_free(kn);
@@ -1055,32 +1204,32 @@
 
 
 static void
-knote_enqueue(struct knote *kn)
+knote_enqueue(struct kqueue *kq, struct knote *kn)
 {
-	struct kqueue *kq = kn->kn_kq;
-	int s = splhigh();
-
 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
 
 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 
 	kn->kn_status |= KN_QUEUED;
 	kq->kq_count++;
-	splx(s);
-	kqueue_wakeup(kq);
+	if (kq->kq_state & KQ_SLEEP) {
+		kq->kq_state &= ~KQ_SLEEP;
+		wakeup(kq);
+	}
+	if (kq->kq_state & KQ_SEL) {
+		kq->kq_state &= ~KQ_SEL;
+		selwakeuppri(&kq->kq_sel, PSOCK);
+	}
 }
 
 static void
-knote_dequeue(struct knote *kn)
+knote_dequeue(struct kqueue *kq, struct knote *kn)
 {
-	struct kqueue *kq = kn->kn_kq;
-	int s = splhigh();
-
 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
 
+	kq->kq_dqgen++;
 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 
 	kn->kn_status &= ~KN_QUEUED;
 	kq->kq_count--;
-	splx(s);
 }
 
 static void
@@ -1088,6 +1237,8 @@
 {
 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	mtx_init(&klist_mtx, "kqueue note lists", NULL, MTX_DEF);
+	mtx_init(&knote_mtx, "kqueue notes", NULL, MTX_DEF);
 
 }
 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
Index: kern/kern_exec.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/kern_exec.c,v
retrieving revision 1.241
diff -u -r1.241 kern_exec.c
--- kern/kern_exec.c	1 Apr 2004 00:10:44 -0000	1.241
+++ kern/kern_exec.c	16 Apr 2004 03:07:48 -0000
@@ -622,7 +622,6 @@
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
-	KNOTE(&p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/*
@@ -646,6 +645,7 @@
 		newargs = NULL;
 	}
 	PROC_UNLOCK(p);
+	KNOTE(&p->p_klist, NOTE_EXEC);
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
Index: kern/kern_exit.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.229
diff -u -r1.229 kern_exit.c
--- kern/kern_exit.c	5 Apr 2004 21:03:34 -0000	1.229
+++ kern/kern_exit.c	15 Apr 2004 18:22:52 -0000
@@ -434,6 +434,14 @@
 	 * Save exit status and final rusage info, adding in child rusage
 	 * info and self times.
 	 */
+	KNOTE(&p->p_klist, NOTE_EXIT);
+	/*
+	 * Start to notify interested parties of our demise.
+	 * Just delete all entries in the p_klist. At this point we won't
+	 * report any more events, and there are nasty race conditions that
+	 * can beat us if we don't.
+	 */
+	klist_disappearing(&p->p_klist);
 	mtx_lock(&Giant);	
 	PROC_LOCK(p);
 	p->p_xstat = rv;
@@ -442,19 +450,7 @@
 	calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL);
 	mtx_unlock_spin(&sched_lock);
 	ruadd(p->p_ru, &p->p_stats->p_cru);
-
-	/*
-	 * Notify interested parties of our demise.
-	 */
-	KNOTE(&p->p_klist, NOTE_EXIT);
 	mtx_unlock(&Giant);	
-	/*
-	 * Just delete all entries in the p_klist. At this point we won't
-	 * report any more events, and there are nasty race conditions that
-	 * can beat us if we don't.
-	 */
-	while (SLIST_FIRST(&p->p_klist))
-		SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
 
 	/*
 	 * Notify parent that we're gone.  If parent has the PS_NOCLDWAIT
Index: kern/kern_fork.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.226
diff -u -r1.226 kern_fork.c
--- kern/kern_fork.c	5 Apr 2004 21:03:34 -0000	1.226
+++ kern/kern_fork.c	16 Apr 2004 03:08:38 -0000
@@ -715,15 +715,12 @@
 	/*
 	 * Now can be swapped.
 	 */
-	PROC_LOCK(p1);
-	_PRELE(p1);
-
+	PRELE(p1);
 	/*
 	 * Tell any interested parties about the new process.
 	 */
 	KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
 
-	PROC_UNLOCK(p1);
 
 	/*
 	 * Preserve synchronization semantics of vfork.  If waiting for
Index: kern/kern_sig.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.275
diff -u -r1.275 kern_sig.c
--- kern/kern_sig.c	5 Apr 2004 21:03:35 -0000	1.275
+++ kern/kern_sig.c	15 Apr 2004 18:22:52 -0000
@@ -2682,9 +2682,7 @@
 	kn->kn_ptr.p_proc = p;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 
-	PROC_LOCK(p);
-	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
-	PROC_UNLOCK(p);
+	klist_add(&p->p_klist, kn);
 
 	return (0);
 }
@@ -2694,9 +2692,7 @@
 {
 	struct proc *p = kn->kn_ptr.p_proc;
 
-	PROC_LOCK(p);
-	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
-	PROC_UNLOCK(p);
+	klist_remove(&p->p_klist, kn);
 }
 
 /*
Index: kern/sys_pipe.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/sys_pipe.c,v
retrieving revision 1.171
diff -u -r1.171 sys_pipe.c
--- kern/sys_pipe.c	27 Mar 2004 19:50:22 -0000	1.171
+++ kern/sys_pipe.c	15 Apr 2004 18:22:52 -0000
@@ -1502,7 +1502,7 @@
 		return (1);
 	}
 
-	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
+	klist_add(&cpipe->pipe_sel.si_note, kn);
 	PIPE_UNLOCK(cpipe);
 	return (0);
 }
@@ -1520,7 +1520,7 @@
 		}
 		cpipe = cpipe->pipe_peer;
 	}
-	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
+	klist_remove(&cpipe->pipe_sel.si_note, kn);
 	PIPE_UNLOCK(cpipe);
 }
 
Index: kern/tty.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/tty.c,v
retrieving revision 1.209
diff -u -r1.209 tty.c
--- kern/tty.c	21 Feb 2004 20:41:11 -0000	1.209
+++ kern/tty.c	15 Apr 2004 18:22:52 -0000
@@ -1203,7 +1203,7 @@
 	kn->kn_hook = (caddr_t)dev;
 
 	s = spltty();
-	SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+	klist_add(klist, kn);
 	splx(s);
 
 	return (0);
@@ -1215,7 +1215,7 @@
 	struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
 	int s = spltty();
 
-	SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext);
+	klist_remove(&tp->t_rsel.si_note, kn);
 	splx(s);
 }
 
@@ -1224,11 +1224,14 @@
 {
 	struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
 
+	mtx_lock(&Giant);
 	kn->kn_data = ttnread(tp);
 	if (ISSET(tp->t_state, TS_ZOMBIE)) {
 		kn->kn_flags |= EV_EOF;
+		mtx_unlock(&Giant);
 		return (1);
 	}
+	mtx_unlock(&Giant);
 	return (kn->kn_data > 0);
 }
 
@@ -1238,7 +1241,7 @@
 	struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
 	int s = spltty();
 
-	SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext);
+	klist_remove(&tp->t_wsel.si_note, kn);
 	splx(s);
 }
 
@@ -1246,12 +1249,15 @@
 filt_ttywrite(struct knote *kn, long hint)
 {
 	struct tty *tp = ((dev_t)kn->kn_hook)->si_tty;
+	int setting;
 
+	mtx_lock(&Giant);
 	kn->kn_data = tp->t_outq.c_cc;
-	if (ISSET(tp->t_state, TS_ZOMBIE))
-		return (1);
-	return (kn->kn_data <= tp->t_olowat &&
+	setting = ISSET(tp->t_state, TS_ZOMBIE) ||
+	    (kn->kn_data <= tp->t_olowat &&
 	    ISSET(tp->t_state, TS_CONNECTED));
+	mtx_unlock(&Giant);
+	return (setting);
 }
 
 /*
Index: kern/uipc_socket.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.169
diff -u -r1.169 uipc_socket.c
--- kern/uipc_socket.c	5 Apr 2004 21:03:36 -0000	1.169
+++ kern/uipc_socket.c	15 Apr 2004 18:22:52 -0000
@@ -1819,7 +1819,7 @@
 	}
 
 	s = splnet();
-	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
+	klist_add(&sb->sb_sel.si_note, kn);
 	sb->sb_flags |= SB_KNOTE;
 	splx(s);
 	return (0);
@@ -1831,7 +1831,7 @@
 	struct socket *so = kn->kn_fp->f_data;
 	int s = splnet();
 
-	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
+	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
 	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
 		so->so_rcv.sb_flags &= ~SB_KNOTE;
 	splx(s);
@@ -1864,7 +1864,7 @@
 	struct socket *so = kn->kn_fp->f_data;
 	int s = splnet();
 
-	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
+	klist_remove(&so->so_snd.sb_sel.si_note, kn);
 	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
 		so->so_snd.sb_flags &= ~SB_KNOTE;
 	splx(s);
Index: kern/vfs_aio.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/vfs_aio.c,v
retrieving revision 1.169
diff -u -r1.169 vfs_aio.c
--- kern/vfs_aio.c	14 Mar 2004 02:06:27 -0000	1.169
+++ kern/vfs_aio.c	16 Apr 2004 02:53:31 -0000
@@ -49,6 +49,7 @@
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
+#include <sys/eventvar.h>
 
 #include <posix4/posix4.h>
 #include <vm/vm.h>
@@ -2268,14 +2269,13 @@
 
 	/*
 	 * The aiocbe pointer must be validated before using it, so
-	 * registration is restricted to the kernel; the user cannot
-	 * set EV_FLAG1.
+	 * registration is restricted to the kernel.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_flags &= ~EV_FLAG1;
 
-	SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
+	klist_add(&aiocbe->klist, kn);
 
 	return (0);
 }
@@ -2286,7 +2286,7 @@
 {
 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 
-	SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
+	klist_remove(&aiocbe->klist, kn);
 }
 
 /* kqueue filter function */
Index: kern/vfs_subr.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.489
diff -u -r1.489 vfs_subr.c
--- kern/vfs_subr.c	5 Apr 2004 21:03:37 -0000	1.489
+++ kern/vfs_subr.c	16 Apr 2004 03:11:52 -0000
@@ -3225,8 +3225,8 @@
 	struct vnode *vp;
 {
 
-	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	VN_KNOTE(vp, NOTE_REVOKE);
+	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_events) {
 		vp->v_pollinfo->vpi_events = 0;
 		selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
Index: net/bpf.c
===================================================================
RCS file: /usr/ncvs/src/sys/net/bpf.c,v
retrieving revision 1.124
diff -u -r1.124 bpf.c
--- net/bpf.c	29 Feb 2004 15:32:33 -0000	1.124
+++ net/bpf.c	15 Apr 2004 18:37:16 -0000
@@ -529,7 +529,6 @@
 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
 
 	selwakeuppri(&d->bd_sel, PRINET);
-	KNOTE(&d->bd_sel.si_note, 0);
 }
 
 static void
@@ -537,14 +536,19 @@
 	void *arg;
 {
 	struct bpf_d *d = (struct bpf_d *)arg;
+	int donote = 0;
 
 	BPFD_LOCK(d);
 	if (d->bd_state == BPF_WAITING) {
 		d->bd_state = BPF_TIMED_OUT;
-		if (d->bd_slen != 0)
+		if (d->bd_slen != 0) {
 			bpf_wakeup(d);
+			donote = 1;
+		}
 	}
 	BPFD_UNLOCK(d);
+	if (donote)
+		KNOTE(&d->bd_sel.si_note, 0);
 }
 
 static	int
@@ -1093,7 +1097,7 @@
 	kn->kn_fop = &bpfread_filtops;
 	kn->kn_hook = d;
 	BPFD_LOCK(d);
-	SLIST_INSERT_HEAD(&d->bd_sel.si_note, kn, kn_selnext);
+	klist_add(&d->bd_sel.si_note, kn);
 	BPFD_UNLOCK(d);
 
 	return (0);
@@ -1106,7 +1110,7 @@
 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 
 	BPFD_LOCK(d);
-	SLIST_REMOVE(&d->bd_sel.si_note, kn, knote, kn_selnext);
+	klist_remove(&d->bd_sel.si_note, kn);
 	BPFD_UNLOCK(d);
 }
 
Index: net/if.c
===================================================================
RCS file: /usr/ncvs/src/sys/net/if.c,v
retrieving revision 1.185
diff -u -r1.185 if.c
--- net/if.c	13 Mar 2004 02:35:03 -0000	1.185
+++ net/if.c	16 Apr 2004 03:13:23 -0000
@@ -211,8 +211,8 @@
 
 	kn->kn_hook = (caddr_t)klist;
 
-	/* XXX locking? */
-	SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+	/* XXX klist locked */
+	klist_add(klist, kn);
 
 	return (0);
 }
@@ -224,7 +224,8 @@
 
 	if (kn->kn_status & KN_DETACHED)
 		return;
-	SLIST_REMOVE(klist, kn, knote, kn_selnext);
+	/* XXX klist locked */
+	klist_remove(klist, kn);
 }
 
 static int
@@ -606,13 +607,13 @@
 #ifdef MAC
 	mac_destroy_ifnet(ifp);
 #endif /* MAC */
-	KNOTE(&ifp->if_klist, NOTE_EXIT);
 	IFNET_WLOCK();
 	TAILQ_REMOVE(&ifnet, ifp, if_link);
 	IFNET_WUNLOCK();
 	mtx_destroy(&ifp->if_snd.ifq_mtx);
 	IF_AFDATA_DESTROY(ifp);
 	splx(s);
+	KNOTE(&ifp->if_klist, NOTE_EXIT);
 }
 
 /*
Index: sys/event.h
===================================================================
RCS file: /usr/ncvs/src/sys/sys/event.h,v
retrieving revision 1.22
diff -u -r1.22 event.h
--- sys/event.h	2 Feb 2003 19:39:51 -0000	1.22
+++ sys/event.h	16 Apr 2004 03:01:12 -0000
@@ -127,7 +127,10 @@
 MALLOC_DECLARE(M_KQUEUE);
 #endif
 
-#define KNOTE(list, hint)	if ((list) != NULL) knote(list, hint)
+#define KNOTE(list, hint)		do {				\
+	if ((list) != NULL)						\
+		knote(list, hint);					\
+} while (0)
 
 /*
  * Flag indicating hint is a signal.  Used by EVFILT_SIGNAL, and also
@@ -168,6 +171,7 @@
 #define kn_fflags	kn_kevent.fflags
 #define kn_data		kn_kevent.data
 #define kn_fp		kn_ptr.p_fp
+#define kn_forklist(kn)	(*(struct klist *)&(kn)->kn_hook)
 };
 
 struct thread;
@@ -180,6 +184,9 @@
 		    struct kevent *kev, struct thread *p);
 extern int	kqueue_add_filteropts(int filt, struct filterops *filtops);
 extern int	kqueue_del_filteropts(int filt);
+extern void	klist_add(struct klist *list, struct knote *note);
+extern void	klist_remove(struct klist *list, struct knote *note);
+extern void	klist_disappearing(struct klist *list);
 
 #else 	/* !_KERNEL */
 
Index: sys/eventvar.h
===================================================================
RCS file: /usr/ncvs/src/sys/sys/eventvar.h,v
retrieving revision 1.4
diff -u -r1.4 eventvar.h
--- sys/eventvar.h	18 Jul 2000 19:31:48 -0000	1.4
+++ sys/eventvar.h	16 Apr 2004 00:40:04 -0000
@@ -34,13 +34,14 @@
 
 struct kqueue {
 	TAILQ_HEAD(kqlist, knote) kq_head;	/* list of pending event */
+	u_int		kq_dqgen;		/* generation of dequeues */
 	int		kq_count;		/* number of pending events */
+	struct		mtx kq_mtx;
 	struct		selinfo kq_sel;	
 	struct		filedesc *kq_fdp;
 	int		kq_state;
 #define KQ_SEL		0x01
 #define KQ_SLEEP	0x02
-	struct		kevent kq_kev[KQ_NEVENTS];
 };
 
 #endif /* !_SYS_EVENTVAR_H_ */
Index: ufs/ufs/ufs_vnops.c
===================================================================
RCS file: /usr/ncvs/src/sys/ufs/ufs/ufs_vnops.c,v
retrieving revision 1.238
diff -u -r1.238 ufs_vnops.c
--- ufs/ufs/ufs_vnops.c	11 Mar 2004 18:50:33 -0000	1.238
+++ ufs/ufs/ufs_vnops.c	15 Apr 2004 18:38:47 -0000
@@ -2625,7 +2625,7 @@
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
-	SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext);
+	klist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 
 	return (0);
@@ -2638,8 +2638,7 @@
 
 	KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo"));
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
-	SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note,
-	    kn, knote, kn_selnext);
+	klist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 }
 


-- 
Brian Fundakowski Feldman                           \'[ FreeBSD ]''''''''''\
  <> green@FreeBSD.org                               \  The Power to Serve! \
 Opinions expressed are my own.                       \,,,,,,,,,,,,,,,,,,,,,,\




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200404160340.i3G3eTKi004092>