Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 17 Jan 2014 05:26:55 +0000 (UTC)
From:      Adrian Chadd <adrian@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r260806 - in head/sys: compat/freebsd32 kern sys
Message-ID:  <201401170526.s0H5QtFI097439@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: adrian
Date: Fri Jan 17 05:26:55 2014
New Revision: 260806
URL: http://svnweb.freebsd.org/changeset/base/260806

Log:
  Implement a kqueue notification path for sendfile.
  
  This fires off a kqueue note (of type sendfile) to the configured kqfd
  when the sendfile transaction has completed and the relevant memory
  backing the transaction is no longer in use by this transaction.
  This is analogous to SF_SYNC waiting for the mbufs to complete -
  except now you don't have to wait.
  
  SF_SYNC and SF_KQUEUE should work when used together, even if
  combining them doesn't necessarily make any practical sense.
  
  This is designed for use by applications which use backing cache/store
  files (e.g. Varnish) or POSIX shared memory (not sure anything is using
  it yet!) to know when a region of memory is free for re-use.  Note
  that the notification doesn't mark the region as free overall - only
  as no longer in use by this transaction.  The application developer
  still needs to track which ranges are in the process of being recycled
  and wait until all pending transactions on them are completed.
  
  TODO:
  
  * documentation, as always
  
  Sponsored by:	Netflix, Inc.

Modified:
  head/sys/compat/freebsd32/freebsd32_misc.c
  head/sys/kern/uipc_syscalls.c
  head/sys/sys/sf_base.h
  head/sys/sys/sf_sync.h

Modified: head/sys/compat/freebsd32/freebsd32_misc.c
==============================================================================
--- head/sys/compat/freebsd32/freebsd32_misc.c	Fri Jan 17 05:15:44 2014	(r260805)
+++ head/sys/compat/freebsd32/freebsd32_misc.c	Fri Jan 17 05:26:55 2014	(r260806)
@@ -1644,18 +1644,28 @@ struct sf_hdtr32 {
 	int trl_cnt;
 };
 
+struct sf_hdtr_kq32 {
+	int kq_fd;
+	uint32_t kq_flags;
+	uint32_t kq_udata;	/* 32-bit void ptr */
+	uint32_t kq_ident;	/* 32-bit uintptr_t */
+};
+
 static int
 freebsd32_do_sendfile(struct thread *td,
     struct freebsd32_sendfile_args *uap, int compat)
 {
 	struct sf_hdtr32 hdtr32;
 	struct sf_hdtr hdtr;
+	struct sf_hdtr_kq32 hdtr_kq32;
+	struct sf_hdtr_kq hdtr_kq;
 	struct uio *hdr_uio, *trl_uio;
 	struct iovec32 *iov32;
 	off_t offset;
 	int error;
 	off_t sbytes;
 	struct sendfile_sync *sfs;
+	int do_kqueue = 0;
 
 	offset = PAIR32TO64(off_t, uap->offset);
 	if (offset < 0)
@@ -1687,10 +1697,32 @@ freebsd32_do_sendfile(struct thread *td,
 			if (error)
 				goto out;
 		}
+
+		/*
+		 * If SF_KQUEUE is set, then we need to also copy in
+		 * the kqueue data after the normal hdtr set and set do_kqueue=1.
+		 */
+		if (uap->flags & SF_KQUEUE) {
+			error = copyin(((char *) uap->hdtr) + sizeof(hdtr32),
+			    &hdtr_kq32,
+			    sizeof(hdtr_kq32));
+			if (error != 0)
+				goto out;
+
+			/* 32->64 bit fields */
+			CP(hdtr_kq32, hdtr_kq, kq_fd);
+			CP(hdtr_kq32, hdtr_kq, kq_flags);
+			PTRIN_CP(hdtr_kq32, hdtr_kq, kq_udata);
+			CP(hdtr_kq32, hdtr_kq, kq_ident);
+			do_kqueue = 1;
+		}
 	}
 
+
+	/* Call sendfile */
+	/* XXX stack depth! */
 	error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
-	    offset, uap->nbytes, &sbytes, hdr_uio, trl_uio);
+	    offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, &hdtr_kq);
 
 	if (uap->sbytes != NULL)
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));

Modified: head/sys/kern/uipc_syscalls.c
==============================================================================
--- head/sys/kern/uipc_syscalls.c	Fri Jan 17 05:15:44 2014	(r260805)
+++ head/sys/kern/uipc_syscalls.c	Fri Jan 17 05:26:55 2014	(r260806)
@@ -123,6 +123,10 @@ static int getpeername1(struct thread *t
 
 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 
+static int	filt_sfsync_attach(struct knote *kn);
+static void	filt_sfsync_detach(struct knote *kn);
+static int	filt_sfsync(struct knote *kn, long hint);
+
 /*
  * sendfile(2)-related variables and associated sysctls
  */
@@ -132,8 +136,28 @@ static int sfreadahead = 1;
 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
     &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
 
+#ifdef	SFSYNC_DEBUG
+static int sf_sync_debug = 0;
+SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
+    &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
+#define	SFSYNC_DPRINTF(s, ...)				\
+		do {					\
+			if (sf_sync_debug)		\
+				printf((s), ##__VA_ARGS__); \
+		} while (0)
+#else
+#define	SFSYNC_DPRINTF(c, ...)
+#endif
+
 static uma_zone_t	zone_sfsync;
 
+static struct filterops sendfile_filtops = {
+	.f_isfd = 0,
+	.f_attach = filt_sfsync_attach,
+	.f_detach = filt_sfsync_detach,
+	.f_event = filt_sfsync,
+};
+
 static void
 sfstat_init(const void *unused)
 {
@@ -152,6 +176,7 @@ sf_sync_init(const void *unused)
 	    NULL, NULL,
 	    UMA_ALIGN_CACHE,
 	    0);
+	kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
 }
 SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);
 
@@ -1860,6 +1885,118 @@ getsockaddr(namp, uaddr, len)
 	return (error);
 }
 
+static int
+filt_sfsync_attach(struct knote *kn)
+{
+	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
+	struct knlist *knl = &sfs->klist;
+
+	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
+
+	/*
+	 * Validate that we actually received this via the kernel API.
+	 */
+	if ((kn->kn_flags & EV_FLAG1) == 0)
+		return (EPERM);
+
+	kn->kn_ptr.p_v = sfs;
+	kn->kn_flags &= ~EV_FLAG1;
+
+	knl->kl_lock(knl->kl_lockarg);
+	/*
+	 * If we're in the "freeing" state,
+	 * don't allow the add.  That way we don't
+	 * end up racing with some other thread that
+	 * is trying to finish some setup.
+	 */
+	if (sfs->state == SF_STATE_FREEING) {
+		knl->kl_unlock(knl->kl_lockarg);
+		return (EINVAL);
+	}
+	knlist_add(&sfs->klist, kn, 1);
+	knl->kl_unlock(knl->kl_lockarg);
+
+	return (0);
+}
+
+/*
+ * Called when a knote is being detached.
+ */
+static void
+filt_sfsync_detach(struct knote *kn)
+{
+	struct knlist *knl;
+	struct sendfile_sync *sfs;
+	int do_free = 0;
+
+	sfs = kn->kn_ptr.p_v;
+	knl = &sfs->klist;
+
+	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
+
+	knl->kl_lock(knl->kl_lockarg);
+	if (!knlist_empty(knl))
+		knlist_remove(knl, kn, 1);
+
+	/*
+	 * If the list is empty _AND_ the refcount is 0
+	 * _AND_ we've finished the setup phase and now
+	 * we're in the running phase, we can free the
+	 * underlying sendfile_sync.
+	 *
+	 * But we shouldn't do it before finishing the
+	 * underlying divorce from the knote.
+	 *
+	 * So, we have the sfsync lock held; transition
+	 * it to "freeing", then unlock, then free
+	 * normally.
+	 */
+	if (knlist_empty(knl)) {
+		if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
+			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
+			    "count==0, empty list: time to free!\n",
+			    __func__,
+			    (unsigned long long) curthread->td_tid,
+			    sfs);
+			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
+			do_free = 1;
+		}
+	}
+	knl->kl_unlock(knl->kl_lockarg);
+
+	/*
+	 * Only call free if we're the one who has transitioned things
+	 * to free.  Otherwise we could race with another thread that
+	 * is currently tearing things down.
+	 */
+	if (do_free == 1) {
+		SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
+		    __func__,
+		    (unsigned long long) curthread->td_tid,
+		    sfs,
+		    __FILE__,
+		    __LINE__);
+		sf_sync_free(sfs);
+	}
+}
+
+static int
+filt_sfsync(struct knote *kn, long hint)
+{
+	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
+	int ret;
+
+	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
+
+	/*
+	 * XXX add a lock assertion here!
+	 */
+	ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);
+
+	return (ret);
+}
+
+
 /*
  * Detach mapped page and release resources back to the system.
  */
@@ -1885,21 +2022,97 @@ sf_buf_mext(struct mbuf *mb, void *addr,
 		sfs = addr;
 		sf_sync_deref(sfs);
 	}
+	/*
+	 * sfs may be invalid at this point, don't use it!
+	 */
 	return (EXT_FREE_OK);
 }
 
+/*
+ * Called to remove a reference to a sf_sync object.
+ *
+ * This is generally done during the mbuf free path to signify
+ * that one of the mbufs in the transaction has been completed.
+ *
+ * If we're doing SF_SYNC and the refcount is zero then we'll wake
+ * up any waiters.
+ *
+ * If we're doing SF_KQUEUE and the refcount is zero then we'll
+ * fire off the knote.
+ */
 void
 sf_sync_deref(struct sendfile_sync *sfs)
 {
+	int do_free = 0;
 
 	if (sfs == NULL)
 		return;
 
 	mtx_lock(&sfs->mtx);
 	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
-	if (--sfs->count == 0)
-		cv_signal(&sfs->cv);
+	sfs->count --;
+
+	/*
+	 * Only fire off the wakeup / kqueue notification if
+	 * we are in the running state.
+	 */
+	if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
+		if (sfs->flags & SF_SYNC)
+			cv_signal(&sfs->cv);
+
+		if (sfs->flags & SF_KQUEUE) {
+			SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
+			    __func__,
+			    (unsigned long long) curthread->td_tid,
+			    sfs);
+			KNOTE_LOCKED(&sfs->klist, 1);
+		}
+
+		/*
+		 * If we're not waiting around for a sync,
+		 * check if the knote list is empty.
+		 * If it is, we transition to free.
+		 *
+		 * XXX I think it's about time I added some state
+		 * or flag that says whether we're supposed to be
+		 * waiting around until we've done a signal.
+		 *
+		 * XXX Ie, the reason that I don't free it here
+		 * is because the caller will free the last reference,
+		 * not us.  That should be codified in some flag
+		 * that indicates "self-free" rather than checking
+		 * for SF_SYNC all the time.
+		 */
+		if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
+			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
+			    "count==0, empty list: time to free!\n",
+			    __func__,
+			    (unsigned long long) curthread->td_tid,
+			    sfs);
+			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
+			do_free = 1;
+		}
+
+	}
 	mtx_unlock(&sfs->mtx);
+
+	/*
+	 * Attempt to do a free here.
+	 *
+	 * We do this outside of the lock because it may destroy the
+	 * lock in question as it frees things.  We can optimise this
+	 * later.
+	 *
+	 * XXX yes, we should make it a requirement to hold the
+	 * lock across sf_sync_free().
+	 */
+	if (do_free == 1) {
+		SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
+		    __func__,
+		    (unsigned long long) curthread->td_tid,
+		    sfs);
+		sf_sync_free(sfs);
+	}
 }
 
 /*
@@ -1917,6 +2130,10 @@ sf_sync_alloc(uint32_t flags)
 	mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 	cv_init(&sfs->cv, "sendfile");
 	sfs->flags = flags;
+	sfs->state = SF_STATE_SETUP;
+	knlist_init_mtx(&sfs->klist, &sfs->mtx);
+
+	SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);
 
 	return (sfs);
 }
@@ -1946,13 +2163,49 @@ sf_sync_syscall_wait(struct sendfile_syn
 	if (sfs == NULL)
 		return;
 
-	mtx_lock(&sfs->mtx);
+	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
+	    __func__,
+	    sfs));
+
+	/*
+	 * If we're not requested to wait during the syscall,
+	 * don't bother waiting.
+	 */
+	if ((sfs->flags & SF_SYNC) == 0)
+		goto out;
+
+	/*
+	 * This is a bit suboptimal and confusing, so bear with me.
+	 *
+	 * Ideally sf_sync_syscall_wait() will wait until
+	 * all pending mbuf transmit operations are done.
+	 * This means that when sendfile becomes async, it'll
+	 * run in the background and will transition from
+	 * RUNNING to COMPLETED when it's finished acquiring
+	 * new things to send.  Then, when the mbufs finish
+	 * sending, COMPLETED + sfs->count == 0 is enough to
+	 * know that no further work is being done.
+	 *
+	 * So, we will sleep on both RUNNING and COMPLETED.
+	 * It's up to the (in progress) async sendfile loop
+	 * to transition the sf_sync from RUNNING to
+	 * COMPLETED so the wakeup above will actually
+	 * do the cv_signal() call.
+	 */
+	if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
+		goto out;
+
 	if (sfs->count != 0)
 		cv_wait(&sfs->cv, &sfs->mtx);
 	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
-	mtx_unlock(&sfs->mtx);
+
+out:
+	return;
 }
 
+/*
+ * Free an sf_sync if it's appropriate to.
+ */
 void
 sf_sync_free(struct sendfile_sync *sfs)
 {
@@ -1960,18 +2213,158 @@ sf_sync_free(struct sendfile_sync *sfs)
 	if (sfs == NULL)
 		return;
 
+	SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
+	    "count=%d\n",
+	    __func__,
+	    (long long) curthread->td_tid,
+	    sfs,
+	    sfs->state,
+	    sfs->flags,
+	    sfs->count);
+
+	mtx_lock(&sfs->mtx);
+
 	/*
-	 * XXX we should ensure that nothing else has this
-	 * locked before freeing.
+	 * We keep the sf_sync around if the state is active,
+	 * we are doing kqueue notification and we have active
+	 * knotes.
+	 *
+	 * If the caller wants to free us right this second it
+	 * should transition this to the freeing state.
+	 *
+	 * So, complain loudly if they break this rule.
 	 */
-	mtx_lock(&sfs->mtx);
+	if (sfs->state != SF_STATE_FREEING) {
+		printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
+		    __func__,
+		    (unsigned long long) curthread->td_tid,
+		    sfs);
+		mtx_unlock(&sfs->mtx);
+		return;
+	}
+
 	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
 	cv_destroy(&sfs->cv);
+	/*
+	 * This doesn't call knlist_detach() on each knote; it just frees
+	 * the entire list.
+	 */
+	knlist_delete(&sfs->klist, curthread, 1);
 	mtx_destroy(&sfs->mtx);
+	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
+	    __func__,
+	    (unsigned long long) curthread->td_tid,
+	    sfs);
 	uma_zfree(zone_sfsync, sfs);
 }
 
 /*
+ * Setup a sf_sync to post a kqueue notification when things are complete.
+ */
+int
+sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
+{
+	struct kevent kev;
+	int error;
+
+	sfs->flags |= SF_KQUEUE;
+
+	/* Check the flags are valid */
+	if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
+		return (EINVAL);
+
+	SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
+	    __func__,
+	    sfs,
+	    sfkq->kq_fd,
+	    sfkq->kq_flags,
+	    (void *) sfkq->kq_ident,
+	    (void *) sfkq->kq_udata);
+
+	/* Setup and register a knote on the given kqfd. */
+	kev.ident = (uintptr_t) sfkq->kq_ident;
+	kev.filter = EVFILT_SENDFILE;
+	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
+	kev.data = (intptr_t) sfs;
+	kev.udata = sfkq->kq_udata;
+
+	error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
+	if (error != 0) {
+		SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
+	}
+	return (error);
+}
+
+void
+sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state,
+    int islocked)
+{
+	sendfile_sync_state_t old_state;
+
+	if (! islocked)
+		mtx_lock(&sfs->mtx);
+
+	/*
+	 * Update our current state.
+	 */
+	old_state = sfs->state;
+	sfs->state = state;
+	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n",
+	    __func__,
+	    (unsigned long long) curthread->td_tid,
+	    sfs,
+	    old_state,
+	    state);
+
+	/*
+	 * If we're transitioning from RUNNING to COMPLETED and the count is
+	 * zero, then post the knote.  The caller may have completed the
+	 * send before we updated the state to COMPLETED and we need to make
+	 * sure this is communicated.
+	 */
+	if (old_state == SF_STATE_RUNNING
+	    && state == SF_STATE_COMPLETED
+	    && sfs->count == 0
+	    && sfs->flags & SF_KQUEUE) {
+		SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n",
+		    __func__,
+		    (unsigned long long) curthread->td_tid,
+		    sfs);
+		KNOTE_LOCKED(&sfs->klist, 1);
+	}
+
+	if (! islocked)
+		mtx_unlock(&sfs->mtx);
+}
+
+/*
+ * Set the retval/errno for the given transaction.
+ *
+ * This will eventually/ideally be used when the KNOTE is fired off
+ * to signify the completion of this transaction.
+ *
+ * The sfsync lock should be held before entering this function.
+ */
+void
+sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
+{
+
+	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
+	    __func__,
+	    sfs));
+
+	SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
+	    __func__,
+	    (unsigned long long) curthread->td_tid,
+	    sfs,
+	    xerrno,
+	    (intmax_t) retval);
+
+	sfs->retval = retval;
+	sfs->xerrno = xerrno;
+}
+
+/*
  * sendfile(2)
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
@@ -1992,15 +2385,21 @@ sys_sendfile(struct thread *td, struct s
 int
 _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags,
     int compat, off_t offset, size_t nbytes, off_t *sbytes,
-    struct uio *hdr_uio, struct uio *trl_uio)
+    struct uio *hdr_uio,
+    struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq)
 {
 	cap_rights_t rights;
 	struct sendfile_sync *sfs = NULL;
 	struct file *fp;
 	int error;
+	int do_kqueue = 0;
+	int do_free = 0;
 
 	AUDIT_ARG_FD(src_fd);
 
+	if (hdtr_kq != NULL)
+		do_kqueue = 1;
+
 	/*
 	 * sendfile(2) can start at any offset within a file so we require
 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
@@ -2011,20 +2410,121 @@ _do_sendfile(struct thread *td, int src_
 	}
 
 	/*
+	 * If SF_KQUEUE is set but we haven't copied in anything for
+	 * kqueue data, error out.
+	 */
+	if (flags & SF_KQUEUE && do_kqueue == 0) {
+		SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__);
+		goto out;
+	}
+
+	/*
 	 * If we need to wait for completion, initialise the sfsync
 	 * state here.
 	 */
-	if (flags & SF_SYNC)
-		sfs = sf_sync_alloc(flags & SF_SYNC);
+	if (flags & (SF_SYNC | SF_KQUEUE))
+		sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE));
+
+	if (flags & SF_KQUEUE) {
+		error = sf_sync_kqueue_setup(sfs, hdtr_kq);
+		if (error) {
+			SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n",
+			    __func__,
+			    (unsigned long long) curthread->td_tid,
+			    sfs);
+			sf_sync_set_state(sfs, SF_STATE_FREEING, 0);
+			sf_sync_free(sfs);
+			goto out;
+		}
+	}
 
+	/*
+	 * Do the sendfile call.
+	 *
+	 * If this fails, it'll free the mbuf chain which will free up the
+	 * sendfile_sync references.
+	 */
 	error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset,
 	    nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td);
 
 	/*
-	 * If appropriate, do the wait and free here.
+	 * If the sendfile call succeeded, transition the sf_sync state
+	 * to RUNNING, then COMPLETED.
+	 *
+	 * If the sendfile call failed, then the sendfile call may have
+	 * actually sent some data first - so we check to see whether
+	 * any data was sent.  If some data was queued (ie, count > 0)
+	 * then we can't call free; we have to wait until the partial
+	 * transaction completes before we continue along.
+	 *
+	 * This has the side effect of firing off the knote
+	 * if the refcount has hit zero by the time we get here.
 	 */
 	if (sfs != NULL) {
+		mtx_lock(&sfs->mtx);
+		if (error == 0 || sfs->count > 0) {
+			/*
+			 * When it's time to do async sendfile, the transition
+			 * to RUNNING signifies that we're actually actively
+			 * adding and completing mbufs.  When the last disk
+			 * buffer is read (ie, when we're not doing any
+			 * further read IO and all subsequent stuff is mbuf
+			 * transmissions) we'll transition to COMPLETED
+			 * and when the final mbuf is freed, the completion
+			 * will be signaled.
+			 */
+			sf_sync_set_state(sfs, SF_STATE_RUNNING, 1);
+
+			/*
+			 * Set the retval before we signal completed.
+			 * If we do it the other way around then transitioning to
+			 * COMPLETED may post the knote before you set the return
+			 * status!
+			 *
+			 * XXX for now, errno is always 0, as we don't post
+			 * knotes if sendfile failed.  Maybe that'll change later.
+			 */
+			sf_sync_set_retval(sfs, *sbytes, error);
+
+			/*
+			 * And now transition to completed, which will kick off
+			 * the knote if required.
+			 */
+			sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1);
+		} else {
+			/*
+			 * Error isn't zero, sfs->count is zero, so we
+			 * won't have some other thing to wake things up.
+			 * Thus free.
+			 */
+			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
+			do_free = 1;
+		}
+
+		/*
+		 * Next - wait if appropriate.
+		 */
 		sf_sync_syscall_wait(sfs);
+
+		/*
+		 * If we're not doing kqueue notifications, we can
+		 * transition this immediately to the freeing state.
+		 */
+		if ((sfs->flags & SF_KQUEUE) == 0) {
+			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
+			do_free = 1;
+		}
+
+		mtx_unlock(&sfs->mtx);
+	}
+
+	/*
+	 * If do_free is set, free here.
+	 *
+	 * If we're doing no-kqueue notification and it's just sleep notification,
+	 * we also do free; it's the only chance we have.
+	 */
+	if (sfs != NULL && do_free == 1) {
 		sf_sync_free(sfs);
 	}
 
@@ -2036,16 +2536,20 @@ _do_sendfile(struct thread *td, int src_
 	fdrop(fp, td);
 
 out:
+	/* Return error */
 	return (error);
 }
 
+
 static int
 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
+	struct sf_hdtr_kq hdtr_kq;
 	struct uio *hdr_uio, *trl_uio;
 	int error;
 	off_t sbytes;
+	int do_kqueue = 0;
 
 	/*
 	 * File offset must be positive.  If it goes beyond EOF
@@ -2070,10 +2574,25 @@ do_sendfile(struct thread *td, struct se
 			if (error != 0)
 				goto out;
 		}
+
+		/*
+		 * If SF_KQUEUE is set, then we need to also copy in
+		 * the kqueue data after the normal hdtr set and set
+		 * do_kqueue=1.
+		 */
+		if (uap->flags & SF_KQUEUE) {
+			error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
+			    &hdtr_kq,
+			    sizeof(hdtr_kq));
+			if (error != 0)
+				goto out;
+			do_kqueue = 1;
+		}
 	}
 
+	/* Call sendfile */
 	error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
-	    uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio);
+	    uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, &hdtr_kq);
 
 	if (uap->sbytes != NULL) {
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));

Modified: head/sys/sys/sf_base.h
==============================================================================
--- head/sys/sys/sf_base.h	Fri Jan 17 05:15:44 2014	(r260805)
+++ head/sys/sys/sf_base.h	Fri Jan 17 05:26:55 2014	(r260806)
@@ -31,6 +31,7 @@
 
 extern	int _do_sendfile(struct thread *, int src_fd, int sock_fd, int flags,
 	    int compat, off_t offset, size_t nbytes, off_t *sbytes,
-	    struct uio *hdr_uio, struct uio *trl_uio);
+	    struct uio *hdr_uio, struct uio *trl_uio,
+	    struct sf_hdtr_kq *hdtr_kq);
 
 #endif	/* _SYS_SF_BASE_H_ */

Modified: head/sys/sys/sf_sync.h
==============================================================================
--- head/sys/sys/sf_sync.h	Fri Jan 17 05:15:44 2014	(r260805)
+++ head/sys/sys/sf_sync.h	Fri Jan 17 05:26:55 2014	(r260806)
@@ -29,17 +29,36 @@
 #ifndef _SYS_SF_SYNC_H_
 #define _SYS_SF_SYNC_H_
 
+typedef enum {
+	SF_STATE_NONE,
+	SF_STATE_SETUP,
+	SF_STATE_RUNNING,
+	SF_STATE_COMPLETED,
+	SF_STATE_FREEING
+} sendfile_sync_state_t;
+
 struct sendfile_sync {
-	uint32_t	flags;
 	struct mtx	mtx;
 	struct cv	cv;
-	unsigned	count;
+	struct knlist	klist;
+	uint32_t	flags;
+	uint32_t	count;
+	int32_t		xerrno;		/* Completion errno, if retval < 0 */
+	off_t		retval;		/* Completion retval (eg written bytes) */
+	sendfile_sync_state_t	state;
 };
 
+/* XXX pollution */
+struct sf_hdtr_kq;
+
 extern	struct sendfile_sync * sf_sync_alloc(uint32_t flags);
 extern	void sf_sync_syscall_wait(struct sendfile_sync *);
 extern	void sf_sync_free(struct sendfile_sync *);
+extern	void sf_sync_try_free(struct sendfile_sync *);
 extern	void sf_sync_ref(struct sendfile_sync *);
 extern	void sf_sync_deref(struct sendfile_sync *);
+extern	int sf_sync_kqueue_setup(struct sendfile_sync *, struct sf_hdtr_kq *);
+extern	void sf_sync_set_state(struct sendfile_sync *, sendfile_sync_state_t, int);
+extern	void sf_sync_set_retval(struct sendfile_sync *, off_t, int);
 
 #endif /* !_SYS_SF_BUF_H_ */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201401170526.s0H5QtFI097439>