Skip site navigation (1) | Skip section navigation (2)
Date:      Sun, 27 Dec 2020 10:57:39 GMT
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 7a202823aa54 - main - Expose eventfd in the native API/ABI using a new __specialfd syscall
Message-ID:  <202012271057.0BRAvdqK013912@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=7a202823aa54ba18c485bdbcf355269bcfee1ab9

commit 7a202823aa54ba18c485bdbcf355269bcfee1ab9
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2020-12-23 14:14:04 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2020-12-27 10:57:26 +0000

    Expose eventfd in the native API/ABI using a new __specialfd syscall
    
    eventfd is a Linux system call that produces special file descriptors
    for event notification. When porting Linux software, it is currently
    usually emulated by epoll-shim on top of kqueues.  Unfortunately, kqueues
    are not passable between processes.  And, as noted by the author of
    epoll-shim, even if they were, the library state would also have to be
    passed somehow.  This came up when debugging strange HW video decode
    failures in Firefox.  A native implementation would avoid these problems
    and help with porting Linux software.
    
    Since we now already have an eventfd implementation in the kernel (for
    the Linuxulator), it's pretty easy to expose it natively, which is what
    this patch does.
    
    Submitted by:   greg@unrelenting.technology
    Reviewed by:    markj (previous version)
    MFC after:      2 weeks
    Differential Revision:  https://reviews.freebsd.org/D26668
---
 sys/bsm/audit_kevents.h              |   1 +
 sys/compat/freebsd32/syscalls.master |   2 +
 sys/compat/linux/linux_event.c       | 368 ++++-------------------------------
 sys/conf/files                       |   1 +
 sys/kern/capabilities.conf           |   5 +
 sys/kern/kern_descrip.c              |   4 +-
 sys/kern/sys_eventfd.c               | 349 +++++++++++++++++++++++++++++++++
 sys/kern/sys_generic.c               |  63 ++++++
 sys/kern/syscalls.master             |   7 +
 sys/sys/eventfd.h                    |  54 +++++
 sys/sys/file.h                       |   2 +-
 sys/sys/specialfd.h                  |  42 ++++
 sys/sys/syscallsubr.h                |   1 +
 sys/sys/user.h                       |   5 +
 14 files changed, 574 insertions(+), 330 deletions(-)

diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index 150ecc1b49ac..5b37329078a1 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -659,6 +659,7 @@
 #define	AUE_SHMRENAME		43263	/* FreeBSD-specific. */
 #define	AUE_REALPATHAT		43264	/* FreeBSD-specific. */
 #define	AUE_CLOSERANGE		43265	/* FreeBSD-specific. */
+#define	AUE_SPECIALFD		43266	/* FreeBSD-specific. */
 
 /*
  * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index 1e3d26d727a1..f4339795781a 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1168,5 +1168,7 @@
 ; 576 is initialised by the krpc code, if present.
 576	AUE_NULL	NOSTD|NOPROTO	{ int rpctls_syscall(int op, \
 				    const char *path); }
+577	AUE_SPECIALFD	NOPROTO	{ int __specialfd(int type, const void *req, \
+				    size_t len); }
 
 ; vim: syntax=off
diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index c67d62d8aff0..b4b4be1f7b49 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -51,9 +51,11 @@ __FBSDID("$FreeBSD$");
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/selinfo.h>
+#include <sys/specialfd.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/timespec.h>
+#include <sys/eventfd.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
@@ -124,53 +126,11 @@ struct epoll_copyout_args {
 	int			error;
 };
 
-/* eventfd */
-typedef uint64_t	eventfd_t;
-
-static fo_rdwr_t	eventfd_read;
-static fo_rdwr_t	eventfd_write;
-static fo_ioctl_t	eventfd_ioctl;
-static fo_poll_t	eventfd_poll;
-static fo_kqfilter_t	eventfd_kqfilter;
-static fo_stat_t	eventfd_stat;
-static fo_close_t	eventfd_close;
-static fo_fill_kinfo_t	eventfd_fill_kinfo;
-
-static struct fileops eventfdops = {
-	.fo_read = eventfd_read,
-	.fo_write = eventfd_write,
-	.fo_truncate = invfo_truncate,
-	.fo_ioctl = eventfd_ioctl,
-	.fo_poll = eventfd_poll,
-	.fo_kqfilter = eventfd_kqfilter,
-	.fo_stat = eventfd_stat,
-	.fo_close = eventfd_close,
-	.fo_chmod = invfo_chmod,
-	.fo_chown = invfo_chown,
-	.fo_sendfile = invfo_sendfile,
-	.fo_fill_kinfo = eventfd_fill_kinfo,
-	.fo_flags = DFLAG_PASSABLE
-};
-
-static void	filt_eventfddetach(struct knote *kn);
-static int	filt_eventfdread(struct knote *kn, long hint);
-static int	filt_eventfdwrite(struct knote *kn, long hint);
-
-static struct filterops eventfd_rfiltops = {
-	.f_isfd = 1,
-	.f_detach = filt_eventfddetach,
-	.f_event = filt_eventfdread
-};
-static struct filterops eventfd_wfiltops = {
-	.f_isfd = 1,
-	.f_detach = filt_eventfddetach,
-	.f_event = filt_eventfdwrite
-};
-
 /* timerfd */
 typedef uint64_t	timerfd_t;
 
 static fo_rdwr_t	timerfd_read;
+static fo_ioctl_t	timerfd_ioctl;
 static fo_poll_t	timerfd_poll;
 static fo_kqfilter_t	timerfd_kqfilter;
 static fo_stat_t	timerfd_stat;
@@ -181,7 +141,7 @@ static struct fileops timerfdops = {
 	.fo_read = timerfd_read,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
-	.fo_ioctl = eventfd_ioctl,
+	.fo_ioctl = timerfd_ioctl,
 	.fo_poll = timerfd_poll,
 	.fo_kqfilter = timerfd_kqfilter,
 	.fo_stat = timerfd_stat,
@@ -202,13 +162,6 @@ static struct filterops timerfd_rfiltops = {
 	.f_event = filt_timerfdread
 };
 
-struct eventfd {
-	eventfd_t	efd_count;
-	uint32_t	efd_flags;
-	struct selinfo	efd_sel;
-	struct mtx	efd_lock;
-};
-
 struct timerfd {
 	clockid_t	tfd_clockid;
 	struct itimerspec tfd_time;
@@ -219,7 +172,6 @@ struct timerfd {
 	struct mtx	tfd_lock;
 };
 
-static int	eventfd_create(struct thread *td, uint32_t initval, int flags);
 static void	linux_timerfd_expire(void *);
 static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
 
@@ -691,294 +643,39 @@ epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
 	return (error1 == 0 ? 0 : error2);
 }
 
-static int
-eventfd_create(struct thread *td, uint32_t initval, int flags)
-{
-	struct filedesc *fdp;
-	struct eventfd *efd;
-	struct file *fp;
-	int fflags, fd, error;
-
-	fflags = 0;
-	if ((flags & LINUX_O_CLOEXEC) != 0)
-		fflags |= O_CLOEXEC;
-
-	fdp = td->td_proc->p_fd;
-	error = falloc(td, &fp, &fd, fflags);
-	if (error != 0)
-		return (error);
-
-	efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO);
-	efd->efd_flags = flags;
-	efd->efd_count = initval;
-	mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
-
-	knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
-
-	fflags = FREAD | FWRITE;
-	if ((flags & LINUX_O_NONBLOCK) != 0)
-		fflags |= FNONBLOCK;
-
-	finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops);
-	fdrop(fp, td);
-
-	td->td_retval[0] = fd;
-	return (error);
-}
-
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
 {
+	struct specialfd_eventfd ae;
 
-	return (eventfd_create(td, args->initval, 0));
+	bzero(&ae, sizeof(ae));
+	ae.initval = args->initval;
+	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
 }
 #endif
 
 int
 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
 {
+	struct specialfd_eventfd ae;
+	int flags;
 
-	if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0)
-		return (EINVAL);
-
-	return (eventfd_create(td, args->initval, args->flags));
-}
-
-static int
-eventfd_close(struct file *fp, struct thread *td)
-{
-	struct eventfd *efd;
-
-	efd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
-		return (EINVAL);
-
-	seldrain(&efd->efd_sel);
-	knlist_destroy(&efd->efd_sel.si_note);
-
-	fp->f_ops = &badfileops;
-	mtx_destroy(&efd->efd_lock);
-	free(efd, M_EPOLL);
-
-	return (0);
-}
-
-static int
-eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
-    int flags, struct thread *td)
-{
-	struct eventfd *efd;
-	eventfd_t count;
-	int error;
-
-	efd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
-		return (EINVAL);
-
-	if (uio->uio_resid < sizeof(eventfd_t))
-		return (EINVAL);
-
-	error = 0;
-	mtx_lock(&efd->efd_lock);
-retry:
-	if (efd->efd_count == 0) {
-		if ((fp->f_flag & FNONBLOCK) != 0) {
-			mtx_unlock(&efd->efd_lock);
-			return (EAGAIN);
-		}
-		error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0);
-		if (error == 0)
-			goto retry;
-	}
-	if (error == 0) {
-		if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) {
-			count = 1;
-			--efd->efd_count;
-		} else {
-			count = efd->efd_count;
-			efd->efd_count = 0;
-		}
-		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
-		selwakeup(&efd->efd_sel);
-		wakeup(&efd->efd_count);
-		mtx_unlock(&efd->efd_lock);
-		error = uiomove(&count, sizeof(eventfd_t), uio);
-	} else
-		mtx_unlock(&efd->efd_lock);
-
-	return (error);
-}
-
-static int
-eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
-     int flags, struct thread *td)
-{
-	struct eventfd *efd;
-	eventfd_t count;
-	int error;
-
-	efd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
-		return (EINVAL);
-
-	if (uio->uio_resid < sizeof(eventfd_t))
-		return (EINVAL);
-
-	error = uiomove(&count, sizeof(eventfd_t), uio);
-	if (error != 0)
-		return (error);
-	if (count == UINT64_MAX)
-		return (EINVAL);
-
-	mtx_lock(&efd->efd_lock);
-retry:
-	if (UINT64_MAX - efd->efd_count <= count) {
-		if ((fp->f_flag & FNONBLOCK) != 0) {
-			mtx_unlock(&efd->efd_lock);
-			/* Do not not return the number of bytes written */
-			uio->uio_resid += sizeof(eventfd_t);
-			return (EAGAIN);
-		}
-		error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
-		    PCATCH, "lefdwr", 0);
-		if (error == 0)
-			goto retry;
-	}
-	if (error == 0) {
-		efd->efd_count += count;
-		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
-		selwakeup(&efd->efd_sel);
-		wakeup(&efd->efd_count);
-	}
-	mtx_unlock(&efd->efd_lock);
-
-	return (error);
-}
-
-static int
-eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
-    struct thread *td)
-{
-	struct eventfd *efd;
-	int revents = 0;
-
-	efd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
-		return (POLLERR);
-
-	mtx_lock(&efd->efd_lock);
-	if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0)
-		revents |= events & (POLLIN|POLLRDNORM);
-	if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count)
-		revents |= events & (POLLOUT|POLLWRNORM);
-	if (revents == 0)
-		selrecord(td, &efd->efd_sel);
-	mtx_unlock(&efd->efd_lock);
-
-	return (revents);
-}
-
-static int
-eventfd_kqfilter(struct file *fp, struct knote *kn)
-{
-	struct eventfd *efd;
-
-	efd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
-		return (EINVAL);
-
-	mtx_lock(&efd->efd_lock);
-	switch (kn->kn_filter) {
-	case EVFILT_READ:
-		kn->kn_fop = &eventfd_rfiltops;
-		break;
-	case EVFILT_WRITE:
-		kn->kn_fop = &eventfd_wfiltops;
-		break;
-	default:
-		mtx_unlock(&efd->efd_lock);
-		return (EINVAL);
-	}
-
-	kn->kn_hook = efd;
-	knlist_add(&efd->efd_sel.si_note, kn, 1);
-	mtx_unlock(&efd->efd_lock);
-
-	return (0);
-}
-
-static void
-filt_eventfddetach(struct knote *kn)
-{
-	struct eventfd *efd = kn->kn_hook;
-
-	mtx_lock(&efd->efd_lock);
-	knlist_remove(&efd->efd_sel.si_note, kn, 1);
-	mtx_unlock(&efd->efd_lock);
-}
-
-static int
-filt_eventfdread(struct knote *kn, long hint)
-{
-	struct eventfd *efd = kn->kn_hook;
-	int ret;
-
-	mtx_assert(&efd->efd_lock, MA_OWNED);
-	ret = (efd->efd_count > 0);
-
-	return (ret);
-}
-
-static int
-filt_eventfdwrite(struct knote *kn, long hint)
-{
-	struct eventfd *efd = kn->kn_hook;
-	int ret;
-
-	mtx_assert(&efd->efd_lock, MA_OWNED);
-	ret = (UINT64_MAX - 1 > efd->efd_count);
-
-	return (ret);
-}
-
-static int
-eventfd_ioctl(struct file *fp, u_long cmd, void *data,
-    struct ucred *active_cred, struct thread *td)
-{
-
-	if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD &&
-	    fp->f_type != DTYPE_LINUXTFD))
+	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
+	    LINUX_EFD_SEMAPHORE)) != 0)
 		return (EINVAL);
-
-	switch (cmd)
-	{
-	case FIONBIO:
-		if ((*(int *)data))
-			atomic_set_int(&fp->f_flag, FNONBLOCK);
-		else
-			atomic_clear_int(&fp->f_flag, FNONBLOCK);
-	case FIOASYNC:
-		return (0);
-	default:
-		return (ENXIO);
-	}
-}
-
-static int
-eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
-    struct thread *td)
-{
-
-	return (ENXIO);
-}
-
-static int
-eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
-{
-
-	kif->kf_type = KF_TYPE_UNKNOWN;
-	return (0);
+	flags = 0;
+	if ((args->flags & LINUX_O_CLOEXEC) != 0)
+		flags |= EFD_CLOEXEC;
+	if ((args->flags & LINUX_O_NONBLOCK) != 0)
+		flags |= EFD_NONBLOCK;
+	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
+		flags |= EFD_SEMAPHORE;
+
+	bzero(&ae, sizeof(ae));
+	ae.flags = flags;
+	ae.initval = args->initval;
+	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
 }
 
 int
@@ -1154,6 +851,23 @@ filt_timerfdread(struct knote *kn, long hint)
 	return (tfd->tfd_count > 0);
 }
 
+static int
+timerfd_ioctl(struct file *fp, u_long cmd, void *data,
+    struct ucred *active_cred, struct thread *td)
+{
+
+	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
+		return (EINVAL);
+
+	switch (cmd) {
+	case FIONBIO:
+	case FIOASYNC:
+		return (0);
+	}
+
+	return (ENOTTY);
+}
+
 static int
 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
     struct thread *td)
diff --git a/sys/conf/files b/sys/conf/files
index 8e30ae1eded1..0258fca24836 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3926,6 +3926,7 @@ kern/subr_unit.c		standard
 kern/subr_vmem.c		standard
 kern/subr_witness.c		optional witness
 kern/sys_capability.c		standard
+kern/sys_eventfd.c		standard
 kern/sys_generic.c		standard
 kern/sys_getrandom.c		standard
 kern/sys_pipe.c			standard
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
index 09d09515c816..3d552255d823 100644
--- a/sys/kern/capabilities.conf
+++ b/sys/kern/capabilities.conf
@@ -55,6 +55,11 @@ __mac_get_proc
 __mac_set_fd
 __mac_set_proc
 
+##
+## Allow creating special file descriptors like eventfd(2).
+##
+__specialfd
+
 ##
 ## Allow sysctl(2) as we scope internal to the call; this is a global
 ## namespace, but there are several critical sysctls required for almost
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 3c9664c69da6..a510ad90a618 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -4609,8 +4609,8 @@ file_type_to_name(short type)
 		return ("dev");
 	case DTYPE_PROCDESC:
 		return ("proc");
-	case DTYPE_LINUXEFD:
-		return ("levent");
+	case DTYPE_EVENTFD:
+		return ("eventfd");
 	case DTYPE_LINUXTFD:
 		return ("ltimer");
 	default:
diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c
new file mode 100644
index 000000000000..3fdb4afc7850
--- /dev/null
+++ b/sys/kern/sys_eventfd.c
@@ -0,0 +1,349 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2007 Roman Divacky
+ * Copyright (c) 2014 Dmitry Chagin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/event.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/selinfo.h>
+#include <sys/eventfd.h>
+
+#include <security/audit/audit.h>
+
+_Static_assert(EFD_CLOEXEC == O_CLOEXEC, "Mismatched EFD_CLOEXEC");
+_Static_assert(EFD_NONBLOCK == O_NONBLOCK, "Mismatched EFD_NONBLOCK");
+
+MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures");
+
+static fo_rdwr_t	eventfd_read;
+static fo_rdwr_t	eventfd_write;
+static fo_ioctl_t	eventfd_ioctl;
+static fo_poll_t	eventfd_poll;
+static fo_kqfilter_t	eventfd_kqfilter;
+static fo_stat_t	eventfd_stat;
+static fo_close_t	eventfd_close;
+static fo_fill_kinfo_t	eventfd_fill_kinfo;
+
+static struct fileops eventfdops = {
+	.fo_read = eventfd_read,
+	.fo_write = eventfd_write,
+	.fo_truncate = invfo_truncate,
+	.fo_ioctl = eventfd_ioctl,
+	.fo_poll = eventfd_poll,
+	.fo_kqfilter = eventfd_kqfilter,
+	.fo_stat = eventfd_stat,
+	.fo_close = eventfd_close,
+	.fo_chmod = invfo_chmod,
+	.fo_chown = invfo_chown,
+	.fo_sendfile = invfo_sendfile,
+	.fo_fill_kinfo = eventfd_fill_kinfo,
+	.fo_flags = DFLAG_PASSABLE
+};
+
+static void	filt_eventfddetach(struct knote *kn);
+static int	filt_eventfdread(struct knote *kn, long hint);
+static int	filt_eventfdwrite(struct knote *kn, long hint);
+
+static struct filterops eventfd_rfiltops = {
+	.f_isfd = 1,
+	.f_detach = filt_eventfddetach,
+	.f_event = filt_eventfdread
+};
+
+static struct filterops eventfd_wfiltops = {
+	.f_isfd = 1,
+	.f_detach = filt_eventfddetach,
+	.f_event = filt_eventfdwrite
+};
+
+struct eventfd {
+	eventfd_t	efd_count;
+	uint32_t	efd_flags;
+	struct selinfo	efd_sel;
+	struct mtx	efd_lock;
+};
+
+int
+eventfd_create_file(struct thread *td, struct file *fp, uint32_t initval,
+    int flags)
+{
+	struct eventfd *efd;
+	int fflags;
+
+	AUDIT_ARG_FFLAGS(flags);
+	AUDIT_ARG_VALUE(initval);
+
+	efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO);
+	efd->efd_flags = flags;
+	efd->efd_count = initval;
+	mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
+	knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
+
+	fflags = FREAD | FWRITE;
+	if ((flags & EFD_NONBLOCK) != 0)
+		fflags |= FNONBLOCK;
+	finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops);
+
+	return (0);
+}
+
+static int
+eventfd_close(struct file *fp, struct thread *td)
+{
+	struct eventfd *efd;
+
+	efd = fp->f_data;
+	seldrain(&efd->efd_sel);
+	knlist_destroy(&efd->efd_sel.si_note);
+	mtx_destroy(&efd->efd_lock);
+	free(efd, M_EVENTFD);
+	return (0);
+}
+
+static int
+eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	struct eventfd *efd;
+	eventfd_t count;
+	int error;
+
+	if (uio->uio_resid < sizeof(eventfd_t))
+		return (EINVAL);
+
+	error = 0;
+	efd = fp->f_data;
+	mtx_lock(&efd->efd_lock);
+	while (error == 0 && efd->efd_count == 0) {
+		if ((fp->f_flag & FNONBLOCK) != 0) {
+			mtx_unlock(&efd->efd_lock);
+			return (EAGAIN);
+		}
+		error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH,
+		    "efdrd", 0);
+	}
+	if (error == 0) {
+		MPASS(efd->efd_count > 0);
+		if ((efd->efd_flags & EFD_SEMAPHORE) != 0) {
+			count = 1;
+			--efd->efd_count;
+		} else {
+			count = efd->efd_count;
+			efd->efd_count = 0;
+		}
+		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
+		selwakeup(&efd->efd_sel);
+		wakeup(&efd->efd_count);
+		mtx_unlock(&efd->efd_lock);
+		error = uiomove(&count, sizeof(eventfd_t), uio);
+	} else
+		mtx_unlock(&efd->efd_lock);
+
+	return (error);
+}
+
+static int
+eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	struct eventfd *efd;
+	eventfd_t count;
+	int error;
+
+	if (uio->uio_resid < sizeof(eventfd_t))
+		return (EINVAL);
+
+	error = uiomove(&count, sizeof(eventfd_t), uio);
+	if (error != 0)
+		return (error);
+	if (count == UINT64_MAX)
+		return (EINVAL);
+
+	efd = fp->f_data;
+	mtx_lock(&efd->efd_lock);
+retry:
+	if (UINT64_MAX - efd->efd_count <= count) {
+		if ((fp->f_flag & FNONBLOCK) != 0) {
+			mtx_unlock(&efd->efd_lock);
+			/* Do not return the number of bytes written */
+			uio->uio_resid += sizeof(eventfd_t);
+			return (EAGAIN);
+		}
+		error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
+		    PCATCH, "efdwr", 0);
+		if (error == 0)
+			goto retry;
+	}
+	if (error == 0) {
+		MPASS(UINT64_MAX - efd->efd_count > count);
+		efd->efd_count += count;
+		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
+		selwakeup(&efd->efd_sel);
+		wakeup(&efd->efd_count);
+	}
+	mtx_unlock(&efd->efd_lock);
+
+	return (error);
+}
+
+static int
+eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
+    struct thread *td)
+{
+	struct eventfd *efd;
+	int revents;
+
+	efd = fp->f_data;
+	revents = 0;
+	mtx_lock(&efd->efd_lock);
+	if ((events & (POLLIN | POLLRDNORM)) != 0 && efd->efd_count > 0)
+		revents |= events & (POLLIN | POLLRDNORM);
+	if ((events & (POLLOUT | POLLWRNORM)) != 0 && UINT64_MAX - 1 >
+	    efd->efd_count)
+		revents |= events & (POLLOUT | POLLWRNORM);
+	if (revents == 0)
+		selrecord(td, &efd->efd_sel);
+	mtx_unlock(&efd->efd_lock);
+
+	return (revents);
+}
+
+static int
+eventfd_kqfilter(struct file *fp, struct knote *kn)
+{
+	struct eventfd *efd = fp->f_data;
+
+	mtx_lock(&efd->efd_lock);
+	switch (kn->kn_filter) {
+	case EVFILT_READ:
+		kn->kn_fop = &eventfd_rfiltops;
+		break;
+	case EVFILT_WRITE:
+		kn->kn_fop = &eventfd_wfiltops;
+		break;
+	default:
+		mtx_unlock(&efd->efd_lock);
+		return (EINVAL);
+	}
+
+	kn->kn_hook = efd;
+	knlist_add(&efd->efd_sel.si_note, kn, 1);
+	mtx_unlock(&efd->efd_lock);
+
+	return (0);
+}
+
+static void
+filt_eventfddetach(struct knote *kn)
+{
+	struct eventfd *efd = kn->kn_hook;
+
+	mtx_lock(&efd->efd_lock);
+	knlist_remove(&efd->efd_sel.si_note, kn, 1);
+	mtx_unlock(&efd->efd_lock);
+}
+
+static int
+filt_eventfdread(struct knote *kn, long hint)
+{
+	struct eventfd *efd = kn->kn_hook;
+	int ret;
+
+	mtx_assert(&efd->efd_lock, MA_OWNED);
+	kn->kn_data = (int64_t)efd->efd_count;
+	ret = efd->efd_count > 0;
+
+	return (ret);
+}
+
+static int
+filt_eventfdwrite(struct knote *kn, long hint)
+{
+	struct eventfd *efd = kn->kn_hook;
+	int ret;
+
+	mtx_assert(&efd->efd_lock, MA_OWNED);
+	kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count);
+	ret = UINT64_MAX - 1 > efd->efd_count;
+
+	return (ret);
+}
+
+static int
+eventfd_ioctl(struct file *fp, u_long cmd, void *data,
+    struct ucred *active_cred, struct thread *td)
+{
+	switch (cmd) {
+	case FIONBIO:
+	case FIOASYNC:
+		return (0);
+	}
+
+	return (ENOTTY);
+}
+
+static int
+eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+    struct thread *td)
+{
+	bzero((void *)st, sizeof *st);
+	st->st_mode = S_IFIFO;
+	return (0);
+}
+
+static int
+eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
+{
+	struct eventfd *efd = fp->f_data;
+
+	kif->kf_type = KF_TYPE_EVENTFD;
+	mtx_lock(&efd->efd_lock);
+	kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count;
+	kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags;
+	mtx_unlock(&efd->efd_lock);
+	return (0);
+}
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index fa9436d1f9f9..a055f4a9b597 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
+#include <sys/eventfd.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
@@ -63,6 +64,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/resourcevar.h>
 #include <sys/selinfo.h>
 #include <sys/sleepqueue.h>
+#include <sys/specialfd.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
@@ -859,6 +861,67 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
 	return (error);
 }
 
+int
+kern_specialfd(struct thread *td, int type, void *arg)
+{
+	struct file *fp;
+	struct specialfd_eventfd *ae;
+	int error, fd, fflags;
+
+	fflags = 0;
+	error = falloc_noinstall(td, &fp);
+	if (error != 0)
+		return (error);
+
+	switch (type) {
+	case SPECIALFD_EVENTFD:
+		ae = arg;
+		if ((ae->flags & EFD_CLOEXEC) != 0)
+			fflags |= O_CLOEXEC;
+		error = eventfd_create_file(td, fp, ae->initval, ae->flags);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	if (error == 0)
+		error = finstall(td, fp, &fd, fflags, NULL);
+	fdrop(fp, td);
+	if (error == 0)
+		td->td_retval[0] = fd;
+	return (error);
+}
+
+int
+sys___specialfd(struct thread *td, struct __specialfd_args *args)
+{
+	struct specialfd_eventfd ae;
+	int error;
+
+	switch (args->type) {
+	case SPECIALFD_EVENTFD:
+		if (args->len != sizeof(struct specialfd_eventfd)) {
+			error = EINVAL;
+			break;
+		}
+		error = copyin(args->req, &ae, sizeof(ae));
+		if (error != 0)
+			break;
+		if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK |
+		    EFD_SEMAPHORE)) != 0) {
+			error = EINVAL;
+			break;
+		}
+		error = kern_specialfd(td, args->type, &ae);
+		break;
*** 184 LINES SKIPPED ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202012271057.0BRAvdqK013912>