Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 25 May 2015 10:42:06 +0800
From:      Julian Elischer <julian@freebsd.org>
To:        Dmitry Chagin <dchagin@FreeBSD.org>, src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   Re: svn commit: r283441 - in head/sys: amd64/linux amd64/linux32 compat/linux conf i386/linux modules/linux modules/linux64
Message-ID:  <55628BFE.5070306@freebsd.org>
In-Reply-To: <201505241641.t4OGfeAX094242@svn.freebsd.org>
References:  <201505241641.t4OGfeAX094242@svn.freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help
On 5/25/15 12:41 AM, Dmitry Chagin wrote:
> Author: dchagin
> Date: Sun May 24 16:41:39 2015
> New Revision: 283441
> URL: https://svnweb.freebsd.org/changeset/base/283441
>
> Log:
>    Implement epoll family system calls. This is a tiny wrapper
>    around kqueue() to implement epoll subset of functionality.
>    The kqueue user data are 32bit on i386 which is not enough for
>    epoll user data, so we keep user data in the proc emuldata.
Have you considered making the in-kernel representation just have more 
room?
>    
>    Initial patch developed by rdivacky@ in 2007, then extended
>    by Yuri Victorovich @ r255672 and finished by me
>    in collaboration with mjg@ and jillies@.
>    
>    Differential Revision:	https://reviews.freebsd.org/D1092
>
> Added:
>    head/sys/compat/linux/linux_event.c   (contents, props changed)
>    head/sys/compat/linux/linux_event.h   (contents, props changed)
> Modified:
>    head/sys/amd64/linux/linux_dummy.c
>    head/sys/amd64/linux/syscalls.master
>    head/sys/amd64/linux32/linux32_dummy.c
>    head/sys/amd64/linux32/syscalls.master
>    head/sys/compat/linux/linux_emul.c
>    head/sys/compat/linux/linux_emul.h
>    head/sys/compat/linux/linux_util.c
>    head/sys/compat/linux/linux_util.h
>    head/sys/conf/files.amd64
>    head/sys/conf/files.i386
>    head/sys/conf/files.pc98
>    head/sys/i386/linux/linux_dummy.c
>    head/sys/i386/linux/syscalls.master
>    head/sys/modules/linux/Makefile
>    head/sys/modules/linux64/Makefile
>
> Modified: head/sys/amd64/linux/linux_dummy.c
> ==============================================================================
> --- head/sys/amd64/linux/linux_dummy.c	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/amd64/linux/linux_dummy.c	Sun May 24 16:41:39 2015	(r283441)
> @@ -69,13 +69,10 @@ DUMMY(tuxcall);
>   DUMMY(security);
>   DUMMY(set_thread_area);
>   DUMMY(lookup_dcookie);
> -DUMMY(epoll_create);
>   DUMMY(epoll_ctl_old);
>   DUMMY(epoll_wait_old);
>   DUMMY(remap_file_pages);
>   DUMMY(semtimedop);
> -DUMMY(epoll_ctl);
> -DUMMY(epoll_wait);
>   DUMMY(mbind);
>   DUMMY(get_mempolicy);
>   DUMMY(set_mempolicy);
> @@ -112,7 +109,6 @@ DUMMY(timerfd_settime);
>   DUMMY(timerfd_gettime);
>   DUMMY(signalfd4);
>   DUMMY(eventfd2);
> -DUMMY(epoll_create1);
>   DUMMY(inotify_init1);
>   DUMMY(preadv);
>   DUMMY(pwritev);
>
> Modified: head/sys/amd64/linux/syscalls.master
> ==============================================================================
> --- head/sys/amd64/linux/syscalls.master	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/amd64/linux/syscalls.master	Sun May 24 16:41:39 2015	(r283441)
> @@ -373,7 +373,7 @@
>   210	AUE_NULL	UNIMPL	linux_io_cancel
>   211	AUE_NULL	UNIMPL	linux_get_thread_area
>   212	AUE_NULL	STD	{ int linux_lookup_dcookie(void); }
> -213	AUE_NULL	STD	{ int linux_epoll_create(void); }
> +213	AUE_NULL	STD	{ int linux_epoll_create(l_int size); }
>   214	AUE_NULL	STD	{ int linux_epoll_ctl_old(void); }
>   215	AUE_NULL	STD	{ int linux_epoll_wait_old(void); }
>   216	AUE_NULL	STD	{ int linux_remap_file_pages(void); }
> @@ -397,8 +397,10 @@
>   230	AUE_NULL	STD	{ int linux_clock_nanosleep(clockid_t which, int flags, \
>   				    struct l_timespec *rqtp, struct l_timespec *rmtp); }
>   231	AUE_EXIT	STD	{ int linux_exit_group(int error_code); }
> -232	AUE_NULL	STD	{ int linux_epoll_wait(void); }
> -233	AUE_NULL	STD	{ int linux_epoll_ctl(void); }
> +232	AUE_NULL	STD	{ int linux_epoll_wait(l_int epfd, struct epoll_event *events, \
> +					l_int maxevents, l_int timeout); }
> +233	AUE_NULL	STD	{ int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
> +					struct epoll_event *event); }
>   234	AUE_NULL	STD	{ int linux_tgkill(int tgid, int pid, int sig); }
>   235	AUE_UTIMES	STD	{ int linux_utimes(char *fname, \
>   				    struct l_timeval *tptr); }
> @@ -466,7 +468,8 @@
>   278	AUE_NULL	STD	{ int linux_vmsplice(void); }
>   279	AUE_NULL	STD	{ int linux_move_pages(void); }
>   280	AUE_NULL	STD	{ int linux_utimensat(void); }
> -281	AUE_NULL	STD	{ int linux_epoll_pwait(void); }
> +281     AUE_NULL        STD     { int linux_epoll_pwait(l_int epfd, struct epoll_event *events, \
> +                                        l_int maxevents, l_int timeout, l_sigset_t *mask); }
>   282	AUE_NULL	STD	{ int linux_signalfd(void); }
>   283	AUE_NULL	STD	{ int linux_timerfd(void); }
>   284	AUE_NULL	STD	{ int linux_eventfd(void); }
> @@ -477,7 +480,7 @@
>   				    l_uintptr_t namelen, int flags); }
>   289	AUE_NULL	STD	{ int linux_signalfd4(void); }
>   290	AUE_NULL	STD	{ int linux_eventfd2(void); }
> -291	AUE_NULL	STD	{ int linux_epoll_create1(void); }
> +291	AUE_NULL	STD	{ int linux_epoll_create1(l_int flags); }
>   292	AUE_NULL	STD	{ int linux_dup3(l_int oldfd,		\
>   				    l_int newfd, l_int flags); }
>   293	AUE_NULL	STD	{ int linux_pipe2(l_int *pipefds, l_int flags); }
>
> Modified: head/sys/amd64/linux32/linux32_dummy.c
> ==============================================================================
> --- head/sys/amd64/linux32/linux32_dummy.c	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/amd64/linux32/linux32_dummy.c	Sun May 24 16:41:39 2015	(r283441)
> @@ -68,9 +68,6 @@ DUMMY(pivot_root);
>   DUMMY(mincore);
>   DUMMY(ptrace);
>   DUMMY(lookup_dcookie);
> -DUMMY(epoll_create);
> -DUMMY(epoll_ctl);
> -DUMMY(epoll_wait);
>   DUMMY(remap_file_pages);
>   DUMMY(fstatfs64);
>   DUMMY(mbind);
> @@ -120,7 +117,6 @@ DUMMY(timerfd_gettime);
>   /* linux 2.6.27: */
>   DUMMY(signalfd4);
>   DUMMY(eventfd2);
> -DUMMY(epoll_create1);
>   DUMMY(inotify_init1);
>   /* linux 2.6.30: */
>   DUMMY(preadv);
>
> Modified: head/sys/amd64/linux32/syscalls.master
> ==============================================================================
> --- head/sys/amd64/linux32/syscalls.master	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/amd64/linux32/syscalls.master	Sun May 24 16:41:39 2015	(r283441)
> @@ -430,9 +430,11 @@
>   251	AUE_NULL	UNIMPL
>   252	AUE_EXIT	STD	{ int linux_exit_group(int error_code); }
>   253	AUE_NULL	STD	{ int linux_lookup_dcookie(void); }
> -254	AUE_NULL	STD	{ int linux_epoll_create(void); }
> -255	AUE_NULL	STD	{ int linux_epoll_ctl(void); }
> -256	AUE_NULL	STD	{ int linux_epoll_wait(void); }
> +254	AUE_NULL	STD	{ int linux_epoll_create(l_int size); }
> +255	AUE_NULL	STD	{ int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
> +					struct epoll_event *event); }
> +256	AUE_NULL	STD	{ int linux_epoll_wait(l_int epfd, struct epoll_event *events, \
> +					l_int maxevents, l_int timeout); }
>   257	AUE_NULL	STD	{ int linux_remap_file_pages(void); }
>   258	AUE_NULL	STD	{ int linux_set_tid_address(int *tidptr); }
>   259	AUE_NULL	STD	{ int linux_timer_create(clockid_t clock_id, \
> @@ -527,7 +529,8 @@
>   317	AUE_NULL	STD	{ int linux_move_pages(void); }
>   ; linux 2.6.19:
>   318	AUE_NULL	STD	{ int linux_getcpu(void); }
> -319	AUE_NULL	STD	{ int linux_epoll_pwait(void); }
> +319     AUE_NULL        STD     { int linux_epoll_pwait(l_int epfd, struct epoll_event *events, \
> +                                        l_int maxevents, l_int timeout, l_osigset_t *mask); }
>   ; linux 2.6.22:
>   320	AUE_NULL	STD	{ int linux_utimensat(void); }
>   321	AUE_NULL	STD	{ int linux_signalfd(void); }
> @@ -541,7 +544,7 @@
>   ; linux 2.6.27:
>   327	AUE_NULL	STD	{ int linux_signalfd4(void); }
>   328	AUE_NULL	STD	{ int linux_eventfd2(void); }
> -329	AUE_NULL	STD	{ int linux_epoll_create1(void); }
> +329	AUE_NULL	STD	{ int linux_epoll_create1(l_int flags); }
>   330	AUE_NULL	STD	{ int linux_dup3(l_int oldfd,		\
>   					l_int newfd, l_int flags); }
>   331	AUE_NULL	STD	{ int linux_pipe2(l_int *pipefds, l_int flags); }
>
> Modified: head/sys/compat/linux/linux_emul.c
> ==============================================================================
> --- head/sys/compat/linux/linux_emul.c	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/compat/linux/linux_emul.c	Sun May 24 16:41:39 2015	(r283441)
> @@ -42,8 +42,6 @@ __FBSDID("$FreeBSD$");
>   #include <sys/proc.h>
>   #include <sys/syscallsubr.h>
>   #include <sys/sysent.h>
> -#include <sys/sysproto.h>
> -#include <sys/unistd.h>
>   
>   #include <compat/linux/linux_emul.h>
>   #include <compat/linux/linux_misc.h>
> @@ -86,6 +84,7 @@ linux_proc_init(struct thread *td, struc
>   {
>   	struct linux_emuldata *em;
>   	struct linux_pemuldata *pem;
> +	struct epoll_emuldata *emd;
>   
>   	if (newtd != NULL) {
>   		/* non-exec call */
> @@ -93,8 +92,13 @@ linux_proc_init(struct thread *td, struc
>   		em->pdeath_signal = 0;
>   		em->robust_futexes = NULL;
>   		if (flags & LINUX_CLONE_THREAD) {
> +			LINUX_CTR1(proc_init, "thread newtd(%d)",
> +			    newtd->td_tid);
> +
>   			em->em_tid = newtd->td_tid;
>   		} else {
> +			LINUX_CTR1(proc_init, "fork newtd(%d)",
> +			    newtd->td_proc->p_pid);
>   
>   			em->em_tid = newtd->td_proc->p_pid;
>   
> @@ -105,12 +109,24 @@ linux_proc_init(struct thread *td, struc
>   		newtd->td_emuldata = em;
>   	} else {
>   		/* exec */
> +		LINUX_CTR1(proc_init, "exec newtd(%d)",
> +		    td->td_proc->p_pid);
>   
>   		/* lookup the old one */
>   		em = em_find(td);
>   		KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n"));
>   
>   		em->em_tid = td->td_proc->p_pid;
> +
> +		 /* epoll should be destroyed in case of exec. */
> +		pem = pem_find(td->td_proc);
> +		KASSERT(pem != NULL, ("proc_exit: proc emuldata not found.\n"));
> +
> +		if (pem->epoll != NULL) {
> +			emd = pem->epoll;
> +			pem->epoll = NULL;
> +			free(emd, M_EPOLL);
> +		}
>   	}
>   
>   	em->child_clear_tid = NULL;
> @@ -121,6 +137,7 @@ void
>   linux_proc_exit(void *arg __unused, struct proc *p)
>   {
>   	struct linux_pemuldata *pem;
> +	struct epoll_emuldata *emd;
>   	struct thread *td = curthread;
>   
>   	if (__predict_false(SV_CURPROC_ABI() != SV_ABI_LINUX))
> @@ -133,6 +150,12 @@ linux_proc_exit(void *arg __unused, stru
>   
>   	p->p_emuldata = NULL;
>   
> +	if (pem->epoll != NULL) {
> +		emd = pem->epoll;
> +		pem->epoll = NULL;
> +		free(emd, M_EPOLL);
> +	}
> +
>   	sx_destroy(&pem->pem_sx);
>   	free(pem, M_LINUX);
>   }
> @@ -141,6 +164,7 @@ int
>   linux_common_execve(struct thread *td, struct image_args *eargs)
>   {
>   	struct linux_pemuldata *pem;
> +	struct epoll_emuldata *emd;
>   	struct linux_emuldata *em;
>   	struct proc *p;
>   	int error;
> @@ -180,6 +204,12 @@ linux_common_execve(struct thread *td, s
>   		p->p_emuldata = NULL;
>   		PROC_UNLOCK(p);
>   
> +		if (pem->epoll != NULL) {
> +			emd = pem->epoll;
> +			pem->epoll = NULL;
> +			free(emd, M_EPOLL);
> +		}
> +
>   		free(em, M_TEMP);
>   		free(pem, M_LINUX);
>   	}
> @@ -197,6 +227,7 @@ linux_proc_exec(void *arg __unused, stru
>   	 */
>   	if (__predict_false((imgp->sysent->sv_flags & SV_ABI_MASK) ==
>   	    SV_ABI_LINUX)) {
> +
>   		if (SV_PROC_ABI(p) == SV_ABI_LINUX)
>   			linux_proc_init(td, NULL, 0);
>   		else
>
> Modified: head/sys/compat/linux/linux_emul.h
> ==============================================================================
> --- head/sys/compat/linux/linux_emul.h	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/compat/linux/linux_emul.h	Sun May 24 16:41:39 2015	(r283441)
> @@ -60,9 +60,12 @@ int	linux_common_execve(struct thread *,
>   /* process emuldata flags */
>   #define	LINUX_XDEPR_REQUEUEOP	0x00000001	/* uses deprecated
>   						   futex REQUEUE op*/
> +#define	LINUX_XUNSUP_EPOLL	0x00000002	/* unsupported epoll events */
> +
>   struct linux_pemuldata {
>   	uint32_t	flags;		/* process emuldata flags */
>   	struct sx	pem_sx;		/* lock for this struct */
> +	void		*epoll;		/* epoll data */
>   };
>   
>   #define	LINUX_PEM_XLOCK(p)	sx_xlock(&(p)->pem_sx)
>
> Added: head/sys/compat/linux/linux_event.c
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sys/compat/linux/linux_event.c	Sun May 24 16:41:39 2015	(r283441)
> @@ -0,0 +1,500 @@
> +/*-
> + * Copyright (c) 2007 Roman Divacky
> + * Copyright (c) 2014 Dmitry Chagin
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +#include <sys/cdefs.h>
> +__FBSDID("$FreeBSD$");
> +
> +#include "opt_compat.h"
> +
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/imgact.h>
> +#include <sys/kernel.h>
> +#include <sys/limits.h>
> +#include <sys/lock.h>
> +#include <sys/mutex.h>
> +#include <sys/capability.h>
> +#include <sys/types.h>
> +#include <sys/file.h>
> +#include <sys/filedesc.h>
> +#include <sys/errno.h>
> +#include <sys/event.h>
> +#include <sys/proc.h>
> +#include <sys/sx.h>
> +#include <sys/syscallsubr.h>
> +#include <sys/timespec.h>
> +
> +#ifdef COMPAT_LINUX32
> +#include <machine/../linux32/linux.h>
> +#include <machine/../linux32/linux32_proto.h>
> +#else
> +#include <machine/../linux/linux.h>
> +#include <machine/../linux/linux_proto.h>
> +#endif
> +
> +#include <compat/linux/linux_emul.h>
> +#include <compat/linux/linux_event.h>
> +#include <compat/linux/linux_file.h>
> +#include <compat/linux/linux_util.h>
> +
> +/*
> + * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
> + * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
> + * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
> + * data verbatim. Therefore we allocate a 64-bit memory block to pass
> + * user supplied data for every file descriptor.
> + */
> +
> +typedef uint64_t	epoll_udata_t;
> +
> +struct epoll_emuldata {
> +	uint32_t	fdc;		/* epoll udata max index */
> +	epoll_udata_t	udata[1];	/* epoll user data vector */
> +};
> +
> +#define	EPOLL_DEF_SZ		16
> +#define	EPOLL_SIZE(fdn)			\
> +	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))
> +
> +struct epoll_event {
> +	uint32_t	events;
> +	epoll_udata_t	data;
> +}
> +#if defined(__amd64__)
> +__attribute__((packed))
> +#endif
> +;
> +
> +#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
> +
> +static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
> +static int	epoll_to_kevent(struct thread *td, struct file *epfp,
> +		    int fd, struct epoll_event *l_event, int *kev_flags,
> +		    struct kevent *kevent, int *nkevents);
> +static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
> +static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
> +static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
> +static int	epoll_delete_event(struct thread *td, struct file *epfp,
> +		    int fd, int filter);
> +static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
> +		    int fd);
> +
> +struct epoll_copyin_args {
> +	struct kevent	*changelist;
> +};
> +
> +struct epoll_copyout_args {
> +	struct epoll_event	*leventlist;
> +	struct proc		*p;
> +	uint32_t		count;
> +	int			error;
> +};
> +
> +
> +static void
> +epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
> +{
> +	struct linux_pemuldata *pem;
> +	struct epoll_emuldata *emd;
> +	struct proc *p;
> +
> +	p = td->td_proc;
> +
> +	pem = pem_find(p);
> +	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
> +
> +	LINUX_PEM_XLOCK(pem);
> +	if (pem->epoll == NULL) {
> +		emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
> +		emd->fdc = fd;
> +		pem->epoll = emd;
> +	} else {
> +		emd = pem->epoll;
> +		if (fd > emd->fdc) {
> +			emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
> +			emd->fdc = fd;
> +			pem->epoll = emd;
> +		}
> +	}
> +	emd->udata[fd] = udata;
> +	LINUX_PEM_XUNLOCK(pem);
> +}
> +
> +static int
> +epoll_create_common(struct thread *td, int flags)
> +{
> +	int error;
> +
> +	error = kern_kqueue(td, flags);
> +	if (error)
> +		return (error);
> +
> +	epoll_fd_install(td, EPOLL_DEF_SZ, 0);
> +
> +	return (0);
> +}
> +
> +int
> +linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
> +{
> +
> +	/*
> +	 * args->size is unused. Linux just tests it
> +	 * and then forgets it as well.
> +	 */
> +	if (args->size <= 0)
> +		return (EINVAL);
> +
> +	return (epoll_create_common(td, 0));
> +}
> +
> +int
> +linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
> +{
> +	int flags;
> +
> +	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
> +		return (EINVAL);
> +
> +	flags = 0;
> +	if ((args->flags & LINUX_O_CLOEXEC) != 0)
> +		flags |= O_CLOEXEC;
> +
> +	return (epoll_create_common(td, flags));
> +}
> +
> +/* Structure converting function from epoll to kevent. */
> +static int
> +epoll_to_kevent(struct thread *td, struct file *epfp,
> +    int fd, struct epoll_event *l_event, int *kev_flags,
> +    struct kevent *kevent, int *nkevents)
> +{
> +	uint32_t levents = l_event->events;
> +	struct linux_pemuldata *pem;
> +	struct proc *p;
> +
> +	/* flags related to how event is registered */
> +	if ((levents & LINUX_EPOLLONESHOT) != 0)
> +		*kev_flags |= EV_ONESHOT;
> +	if ((levents & LINUX_EPOLLET) != 0)
> +		*kev_flags |= EV_CLEAR;
> +
> +	/* flags related to what event is registered */
> +	if ((levents & LINUX_EPOLL_EVRD) != 0) {
> +		EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0);
> +		++(*nkevents);
> +	}
> +	if ((levents & LINUX_EPOLL_EVWR) != 0) {
> +		EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0);
> +		++(*nkevents);
> +	}
> +
> +	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
> +		p = td->td_proc;
> +
> +		pem = pem_find(p);
> +		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
> +		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));
> +
> +		LINUX_PEM_XLOCK(pem);
> +		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
> +			pem->flags |= LINUX_XUNSUP_EPOLL;
> +			LINUX_PEM_XUNLOCK(pem);
> +			linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
> +			    levents);
> +		} else
> +			LINUX_PEM_XUNLOCK(pem);
> +		return (EINVAL);
> +	}
> +
> +	return (0);
> +}
> +
> +/*
> + * Structure converting function from kevent to epoll. In a case
> + * this is called on error in registration we store the error in
> + * event->data and pick it up later in linux_epoll_ctl().
> + */
> +static void
> +kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
> +{
> +
> +	if ((kevent->flags & EV_ERROR) != 0)
> +		return;
> +
> +	switch (kevent->filter) {
> +	case EVFILT_READ:
> +		l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
> +	break;
> +	case EVFILT_WRITE:
> +		l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
> +	break;
> +	}
> +}
> +
> +/*
> + * Copyout callback used by kevent. This converts kevent
> + * events to epoll events and copies them back to the
> + * userspace. This is also called on error on registering
> + * of the filter.
> + */
> +static int
> +epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
> +{
> +	struct epoll_copyout_args *args;
> +	struct linux_pemuldata *pem;
> +	struct epoll_emuldata *emd;
> +	struct epoll_event *eep;
> +	int error, fd, i;
> +
> +	args = (struct epoll_copyout_args*) arg;
> +	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
> +
> +	pem = pem_find(args->p);
> +	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
> +	LINUX_PEM_SLOCK(pem);
> +	emd = pem->epoll;
> +	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));
> +
> +	for (i = 0; i < count; i++) {
> +		kevent_to_epoll(&kevp[i], &eep[i]);
> +
> +		fd = kevp[i].ident;
> +		KASSERT(fd <= emd->fdc, ("epoll user data vector"
> +						    " is too small.\n"));
> +		eep[i].data = emd->udata[fd];
> +	}
> +	LINUX_PEM_SUNLOCK(pem);
> +
> +	error = copyout(eep, args->leventlist, count * sizeof(*eep));
> +	if (error == 0) {
> +		args->leventlist += count;
> +		args->count += count;
> +	} else if (args->error == 0)
> +		args->error = error;
> +
> +	free(eep, M_EPOLL);
> +	return (error);
> +}
> +
> +/*
> + * Copyin callback used by kevent. This copies already
> + * converted filters from kernel memory to the kevent
> + * internal kernel memory. Hence the memcpy instead of
> + * copyin.
> + */
> +static int
> +epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
> +{
> +	struct epoll_copyin_args *args;
> +
> +	args = (struct epoll_copyin_args*) arg;
> +	
> +	memcpy(kevp, args->changelist, count * sizeof(*kevp));
> +	args->changelist += count;
> +
> +	return (0);
> +}
> +
> +/*
> + * Load epoll filter, convert it to kevent filter
> + * and load it into kevent subsystem.
> + */
> +int
> +linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
> +{
> +	struct file *epfp, *fp;
> +	struct epoll_copyin_args ciargs;
> +	struct kevent kev[2];
> +	struct kevent_copyops k_ops = { &ciargs,
> +					NULL,
> +					epoll_kev_copyin};
> +	struct epoll_event le;
> +	cap_rights_t rights;
> +	int kev_flags;
> +	int nchanges = 0;
> +	int error;
> +
> +	if (args->op != LINUX_EPOLL_CTL_DEL) {
> +		error = copyin(args->event, &le, sizeof(le));
> +		if (error != 0)
> +			return (error);
> +	}
> +
> +	error = fget(td, args->epfd,
> +	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
> +	if (error != 0)
> +		return (error);
> +	if (epfp->f_type != DTYPE_KQUEUE)
> +		goto leave1;
> +
> +	 /* Protect user data vector from incorrectly supplied fd. */
> +	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
> +	if (error != 0)
> +		goto leave1;
> +
> +	/* Linux disallows spying on itself */
> +	if (epfp == fp) {
> +		error = EINVAL;
> +		goto leave0;
> +	}
> +
> +	ciargs.changelist = kev;
> +
> +	switch (args->op) {
> +	case LINUX_EPOLL_CTL_MOD:
> +		/*
> +		 * We don't memorize which events were set for this FD
> +		 * on this level, so just delete all we could have set:
> +		 * EVFILT_READ and EVFILT_WRITE, ignoring any errors
> +		 */
> +		error = epoll_delete_all_events(td, epfp, args->fd);
> +		if (error)
> +			goto leave0;
> +		/* FALLTHROUGH */
> +
> +	case LINUX_EPOLL_CTL_ADD:
> +			kev_flags = EV_ADD | EV_ENABLE;
> +		break;
> +
> +	case LINUX_EPOLL_CTL_DEL:
> +		/* CTL_DEL means unregister this fd with this epoll */
> +		error = epoll_delete_all_events(td, epfp, args->fd);
> +		goto leave0;
> +
> +	default:
> +		error = EINVAL;
> +		goto leave0;
> +	}
> +
> +	error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags,
> +	    kev, &nchanges);
> +	if (error)
> +		goto leave0;
> +
> +	epoll_fd_install(td, args->fd, le.data);
> +
> +	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
> +
> +leave0:
> +	fdrop(fp, td);
> +
> +leave1:
> +	fdrop(epfp, td);
> +	return (error);
> +}
> +
> +/*
> + * Wait for a filter to be triggered on the epoll file descriptor.
> + */
> +int
> +linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
> +{
> +	struct file *epfp;
> +	struct timespec ts, *tsp;
> +	cap_rights_t rights;
> +	struct epoll_copyout_args coargs;
> +	struct kevent_copyops k_ops = { &coargs,
> +					epoll_kev_copyout,
> +					NULL};
> +	int error;
> +
> +	if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS)
> +		return (EINVAL);
> +
> +	error = fget(td, args->epfd,
> +	    cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
> +	if (error != 0)
> +		return (error);
> +
> +	coargs.leventlist = args->events;
> +	coargs.p = td->td_proc;
> +	coargs.count = 0;
> +	coargs.error = 0;
> +
> +	if (args->timeout != -1) {
> +		if (args->timeout < 0) {
> +			error = EINVAL;
> +			goto leave;
> +		}
> +		/* Convert from milliseconds to timespec. */
> +		ts.tv_sec = args->timeout / 1000;
> +		ts.tv_nsec = (args->timeout % 1000) * 1000000;
> +		tsp = &ts;
> +	} else {
> +		tsp = NULL;
> +	}
> +
> +	error = kern_kevent_fp(td, epfp, 0, args->maxevents, &k_ops, tsp);
> +	if (error == 0 && coargs.error != 0)
> +		error = coargs.error;
> +
> +	/*
> +	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
> +	 * Maybe we should translate that but I don't think it matters at all.
> +	 */
> +	if (error == 0)
> +		td->td_retval[0] = coargs.count;
> +leave:
> +	fdrop(epfp, td);
> +	return (error);
> +}
> +
> +static int
> +epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter)
> +{
> +	struct epoll_copyin_args ciargs;
> +	struct kevent kev;
> +	struct kevent_copyops k_ops = { &ciargs,
> +					NULL,
> +					epoll_kev_copyin};
> +	int error;
> +
> +	ciargs.changelist = &kev;
> +	EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
> +
> +	error = kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL);
> +
> +	/*
> +	 * here we ignore ENOENT, because we don't keep track of events here
> +	 */
> +	if (error == ENOENT)
> +		error = 0;
> +	return (error);
> +}
> +
> +static int
> +epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
> +{
> +	int error1, error2;
> +
> +	error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ);
> +	error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE);
> +
> +	/* report any errors we got */
> +	return (error1 == 0 ? error2 : error1);
> +}
>
> Added: head/sys/compat/linux/linux_event.h
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sys/compat/linux/linux_event.h	Sun May 24 16:41:39 2015	(r283441)
> @@ -0,0 +1,58 @@
> +/*-
> + * Copyright (c) 2007 Roman Divacky
> + * Copyright (c) 2014 Dmitry Chagin
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * $FreeBSD$
> + */
> +
> +#ifndef _LINUX_EVENT_H_
> +#define	_LINUX_EVENT_H_
> +
> +#define	LINUX_EPOLLIN		0x001
> +#define	LINUX_EPOLLPRI		0x002
> +#define	LINUX_EPOLLOUT		0x004
> +#define	LINUX_EPOLLRDNORM	0x040
> +#define	LINUX_EPOLLRDBAND	0x080
> +#define	LINUX_EPOLLWRNORM	0x100
> +#define	LINUX_EPOLLWRBAND	0x200
> +#define	LINUX_EPOLLMSG		0x400
> +#define	LINUX_EPOLLERR		0x008
> +#define	LINUX_EPOLLHUP		0x010
> +#define	LINUX_EPOLLRDHUP	0x2000
> +#define	LINUX_EPOLLWAKEUP	1u<<29
> +#define	LINUX_EPOLLONESHOT	1u<<30
> +#define	LINUX_EPOLLET		1u<<31
> +
> +#define	LINUX_EPOLL_EVRD	(LINUX_EPOLLIN|LINUX_EPOLLRDNORM	\
> +		|LINUX_EPOLLHUP|LINUX_EPOLLPRI)
> +#define	LINUX_EPOLL_EVWR	(LINUX_EPOLLOUT|LINUX_EPOLLWRNORM)
> +#define	LINUX_EPOLL_EVSUP	(LINUX_EPOLLET|LINUX_EPOLLONESHOT	\
> +		|LINUX_EPOLL_EVRD|LINUX_EPOLL_EVWR)
> +
> +#define	LINUX_EPOLL_CTL_ADD	1
> +#define	LINUX_EPOLL_CTL_DEL	2
> +#define	LINUX_EPOLL_CTL_MOD	3
> +
> +#endif	/* !_LINUX_EVENT_H_ */
>
> Modified: head/sys/compat/linux/linux_util.c
> ==============================================================================
> --- head/sys/compat/linux/linux_util.c	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/compat/linux/linux_util.c	Sun May 24 16:41:39 2015	(r283441)
> @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$");
>   #include <compat/linux/linux_util.h>
>   
>   MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
> +MALLOC_DEFINE(M_EPOLL, "lepoll", "Linux events structures");
>   MALLOC_DEFINE(M_FUTEX, "futex", "Linux futexes");
>   MALLOC_DEFINE(M_FUTEX_WP, "futex wp", "Linux futex waiting proc");
>   
>
> Modified: head/sys/compat/linux/linux_util.h
> ==============================================================================
> --- head/sys/compat/linux/linux_util.h	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/compat/linux/linux_util.h	Sun May 24 16:41:39 2015	(r283441)
> @@ -45,6 +45,7 @@
>   #include <sys/uio.h>
>   
>   MALLOC_DECLARE(M_LINUX);
> +MALLOC_DECLARE(M_EPOLL);
>   MALLOC_DECLARE(M_FUTEX);
>   MALLOC_DECLARE(M_FUTEX_WP);
>   
>
> Modified: head/sys/conf/files.amd64
> ==============================================================================
> --- head/sys/conf/files.amd64	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/conf/files.amd64	Sun May 24 16:41:39 2015	(r283441)
> @@ -509,6 +509,7 @@ compat/linux/linux_uid16.c	optional	comp
>   compat/linux/linux_util.c	optional	compat_linux32
>   compat/linux/linux_vdso.c	optional	compat_linux32
>   compat/linux/linux_common.c	optional	compat_linux32
> +compat/linux/linux_event.c	optional	compat_linux32
>   dev/amr/amr_linux.c		optional	compat_linux32 amr
>   dev/mfi/mfi_linux.c		optional	compat_linux32 mfi
>   #
>
> Modified: head/sys/conf/files.i386
> ==============================================================================
> --- head/sys/conf/files.i386	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/conf/files.i386	Sun May 24 16:41:39 2015	(r283441)
> @@ -81,6 +81,7 @@ hptrr_lib.o			optional	hptrr			\
>   cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S	optional zfs compile-with "${ZFS_S}"
>   compat/linprocfs/linprocfs.c	optional linprocfs
>   compat/linsysfs/linsysfs.c	optional linsysfs
> +compat/linux/linux_event.c	optional compat_linux
>   compat/linux/linux_emul.c	optional compat_linux
>   compat/linux/linux_file.c	optional compat_linux
>   compat/linux/linux_fork.c	optional compat_linux
>
> Modified: head/sys/conf/files.pc98
> ==============================================================================
> --- head/sys/conf/files.pc98	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/conf/files.pc98	Sun May 24 16:41:39 2015	(r283441)
> @@ -41,6 +41,7 @@ ukbdmap.h			optional	ukbd_dflt_keymap	\
>   cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S	optional zfs compile-with "${ZFS_S}"
>   compat/linprocfs/linprocfs.c	optional linprocfs
>   compat/linsysfs/linsysfs.c	optional linsysfs
> +compat/linux/linux_event.c	optional compat_linux
>   compat/linux/linux_emul.c	optional compat_linux
>   compat/linux/linux_file.c	optional compat_linux
>   compat/linux/linux_fork.c	optional compat_linux
>
> Modified: head/sys/i386/linux/linux_dummy.c
> ==============================================================================
> --- head/sys/i386/linux/linux_dummy.c	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/i386/linux/linux_dummy.c	Sun May 24 16:41:39 2015	(r283441)
> @@ -70,9 +70,6 @@ DUMMY(setfsgid);
>   DUMMY(pivot_root);
>   DUMMY(mincore);
>   DUMMY(lookup_dcookie);
> -DUMMY(epoll_create);
> -DUMMY(epoll_ctl);
> -DUMMY(epoll_wait);
>   DUMMY(remap_file_pages);
>   DUMMY(fstatfs64);
>   DUMMY(mbind);
> @@ -116,7 +113,6 @@ DUMMY(timerfd_gettime);
>   /* linux 2.6.27: */
>   DUMMY(signalfd4);
>   DUMMY(eventfd2);
> -DUMMY(epoll_create1);
>   DUMMY(inotify_init1);
>   /* linux 2.6.30: */
>   DUMMY(preadv);
>
> Modified: head/sys/i386/linux/syscalls.master
> ==============================================================================
> --- head/sys/i386/linux/syscalls.master	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/i386/linux/syscalls.master	Sun May 24 16:41:39 2015	(r283441)
> @@ -432,9 +432,11 @@
>   251	AUE_NULL	UNIMPL
>   252	AUE_EXIT	STD	{ int linux_exit_group(int error_code); }
>   253	AUE_NULL	STD	{ int linux_lookup_dcookie(void); }
> -254	AUE_NULL	STD	{ int linux_epoll_create(void); }
> -255	AUE_NULL	STD	{ int linux_epoll_ctl(void); }
> -256	AUE_NULL	STD	{ int linux_epoll_wait(void); }
> +254	AUE_NULL	STD	{ int linux_epoll_create(l_int size); }
> +255	AUE_NULL	STD	{ int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
> +					struct epoll_event *event); }
> +256	AUE_NULL	STD	{ int linux_epoll_wait(l_int epfd, struct epoll_event *events, \
> +					l_int maxevents, l_int timeout); }
>   257	AUE_NULL	STD	{ int linux_remap_file_pages(void); }
>   258	AUE_NULL	STD	{ int linux_set_tid_address(int *tidptr); }
>   259	AUE_NULL	STD	{ int linux_timer_create(clockid_t clock_id, \
> @@ -535,7 +537,8 @@
>   317	AUE_NULL	STD	{ int linux_move_pages(void); }
>   ; linux 2.6.19:
>   318	AUE_NULL	STD	{ int linux_getcpu(void); }
> -319	AUE_NULL	STD	{ int linux_epoll_pwait(void); }
> +319	AUE_NULL	STD	{ int linux_epoll_pwait(l_int epfd, struct epoll_event *events, \
> +					l_int maxevents, l_int timeout, l_osigset_t *mask); }
>   ; linux 2.6.22:
>   320	AUE_NULL	STD	{ int linux_utimensat(void); }
>   321	AUE_NULL	STD	{ int linux_signalfd(void); }
> @@ -549,7 +552,7 @@
>   ; linux 2.6.27:
>   327	AUE_NULL	STD	{ int linux_signalfd4(void); }
>   328	AUE_NULL	STD	{ int linux_eventfd2(void); }
> -329	AUE_NULL	STD	{ int linux_epoll_create1(void); }
> +329	AUE_NULL	STD	{ int linux_epoll_create1(l_int flags); }
>   330	AUE_NULL	STD	{ int linux_dup3(l_int oldfd,		\
>   					l_int newfd, l_int flags); }
>   331	AUE_NULL	STD	{ int linux_pipe2(l_int *pipefds, l_int flags); }
>
> Modified: head/sys/modules/linux/Makefile
> ==============================================================================
> --- head/sys/modules/linux/Makefile	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/modules/linux/Makefile	Sun May 24 16:41:39 2015	(r283441)
> @@ -10,7 +10,7 @@ CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINU
>   VDSO=	linux${SFX}_vdso
>   
>   KMOD=	linux
> -SRCS=	linux_fork.c linux${SFX}_dummy.c linux_file.c \
> +SRCS=	linux_fork.c linux${SFX}_dummy.c linux_file.c linux_event.c \
>   	linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
>   	linux${SFX}_machdep.c linux_misc.c linux_signal.c \
>   	linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \
>
> Modified: head/sys/modules/linux64/Makefile
> ==============================================================================
> --- head/sys/modules/linux64/Makefile	Sun May 24 16:36:29 2015	(r283440)
> +++ head/sys/modules/linux64/Makefile	Sun May 24 16:41:39 2015	(r283441)
> @@ -5,7 +5,7 @@
>   VDSO=	linux_vdso
>   
>   KMOD=	linux64
> -SRCS=	linux_fork.c linux_dummy.c linux_file.c \
> +SRCS=	linux_fork.c linux_dummy.c linux_file.c linux_event.c \
>   	linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
>   	linux_machdep.c linux_misc.c linux_signal.c \
>   	linux_socket.c linux_stats.c linux_sysctl.c linux_sysent.c \
>
>




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?55628BFE.5070306>