Skip site navigation (1) Skip section navigation (2)
Date:      Mon, 29 Jun 2020 03:09:14 +0000 (UTC)
From:      Kyle Evans <kevans@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r362769 - in head/sys: amd64/linux amd64/linux32 arm64/linux compat/linux i386/linux
Message-ID:  <202006290309.05T39ETZ044859@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kevans
Date: Mon Jun 29 03:09:14 2020
New Revision: 362769
URL: https://svnweb.freebsd.org/changeset/base/362769

Log:
  linuxolator: implement memfd_create syscall
  
  This effectively mirrors our libc implementation, but with minor fudging --
  name needs to be copied in from userspace, so we just copy it straight into
  the correct position in the stack-allocated memfd_name rather than
  allocating memory that needs to be cleaned up.
  
  The sealing-related fcntl(2) commands, F_GET_SEALS and F_ADD_SEALS, have
  also been implemented now that we support them.
  
  Note that this implementation is still not quite at feature parity w.r.t.
  the actual Linux version; some caveats, from my foggy memory:
  
  - Need to implement SHM_GROW_ON_WRITE, default for memfd (in progress)
  - LTP wants the memfd name exposed to fdescfs
  - Linux allows open() of an fdescfs fd with O_TRUNC to truncate after dup.
    (?)
  
  Interested parties can install and run LTP from ports (devel/linux-ltp) to
  confirm any fixes.
  
  PR:		240874
  Reviewed by:	kib, trasz
  Differential Revision:	https://reviews.freebsd.org/D21845

Modified:
  head/sys/amd64/linux/linux_dummy.c
  head/sys/amd64/linux32/linux32_dummy.c
  head/sys/arm64/linux/linux_dummy.c
  head/sys/compat/linux/linux.c
  head/sys/compat/linux/linux.h
  head/sys/compat/linux/linux_file.c
  head/sys/compat/linux/linux_file.h
  head/sys/i386/linux/linux_dummy.c

Modified: head/sys/amd64/linux/linux_dummy.c
==============================================================================
--- head/sys/amd64/linux/linux_dummy.c	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/amd64/linux/linux_dummy.c	Mon Jun 29 03:09:14 2020	(r362769)
@@ -138,7 +138,6 @@ DUMMY(sched_getattr);
 /* Linux 3.15: */
 DUMMY(kexec_file_load);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);

Modified: head/sys/amd64/linux32/linux32_dummy.c
==============================================================================
--- head/sys/amd64/linux32/linux32_dummy.c	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/amd64/linux32/linux32_dummy.c	Mon Jun 29 03:09:14 2020	(r362769)
@@ -133,7 +133,6 @@ DUMMY(finit_module);
 DUMMY(sched_setattr);
 DUMMY(sched_getattr);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);

Modified: head/sys/arm64/linux/linux_dummy.c
==============================================================================
--- head/sys/arm64/linux/linux_dummy.c	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/arm64/linux/linux_dummy.c	Mon Jun 29 03:09:14 2020	(r362769)
@@ -127,7 +127,6 @@ DUMMY(finit_module);
 DUMMY(sched_setattr);
 DUMMY(sched_getattr);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);

Modified: head/sys/compat/linux/linux.c
==============================================================================
--- head/sys/compat/linux/linux.c	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/compat/linux/linux.c	Mon Jun 29 03:09:14 2020	(r362769)
@@ -551,3 +551,79 @@ linux_dev_shm_destroy(void)
 
 	destroy_dev(dev_shm_cdev);
 }
+
+int
+bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value)
+{
+	int bsd_mask, bsd_value, linux_mask, linux_value;
+	int linux_ret;
+	size_t i;
+	bool applied;
+
+	applied = false;
+	linux_ret = 0;
+	for (i = 0; i < mapcnt; ++i) {
+		bsd_mask = bitmap[i].bsd_mask;
+		bsd_value = bitmap[i].bsd_value;
+		if (bsd_mask == 0)
+			bsd_mask = bsd_value;
+
+		linux_mask = bitmap[i].linux_mask;
+		linux_value = bitmap[i].linux_value;
+		if (linux_mask == 0)
+			linux_mask = linux_value;
+
+		/*
+		 * If a mask larger than just the value is set, we explicitly
+		 * want to make sure that only this bit we mapped within that
+		 * mask is set.
+		 */
+		if ((value & bsd_mask) == bsd_value) {
+			linux_ret = (linux_ret & ~linux_mask) | linux_value;
+			applied = true;
+		}
+	}
+
+	if (!applied)
+		return (no_value);
+	return (linux_ret);
+}
+
+int
+linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value)
+{
+	int bsd_mask, bsd_value, linux_mask, linux_value;
+	int bsd_ret;
+	size_t i;
+	bool applied;
+
+	applied = false;
+	bsd_ret = 0;
+	for (i = 0; i < mapcnt; ++i) {
+		bsd_mask = bitmap[i].bsd_mask;
+		bsd_value = bitmap[i].bsd_value;
+		if (bsd_mask == 0)
+			bsd_mask = bsd_value;
+
+		linux_mask = bitmap[i].linux_mask;
+		linux_value = bitmap[i].linux_value;
+		if (linux_mask == 0)
+			linux_mask = linux_value;
+
+		/*
+		 * If a mask larger than just the value is set, we explicitly
+		 * want to make sure that only this bit we mapped within that
+		 * mask is set.
+		 */
+		if ((value & linux_mask) == linux_value) {
+			bsd_ret = (bsd_ret & ~bsd_mask) | bsd_value;
+			applied = true;
+		}
+	}
+
+	if (!applied)
+		return (no_value);
+	return (bsd_ret);
+}

Modified: head/sys/compat/linux/linux.h
==============================================================================
--- head/sys/compat/linux/linux.h	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/compat/linux/linux.h	Mon Jun 29 03:09:14 2020	(r362769)
@@ -148,4 +148,49 @@ extern struct mtx futex_mtx;
 void linux_dev_shm_create(void);
 void linux_dev_shm_destroy(void);
 
+/*
+ * mask=0 is not sensible for this application, so it will be taken to mean
+ * a mask equivalent to the value.  Otherwise, (word & mask) == value maps to
+ * (word & ~mask) | value in a bitfield for the platform we're converting to.
+ */
+struct bsd_to_linux_bitmap {
+	int	bsd_mask;
+	int	bsd_value;
+	int	linux_mask;
+	int	linux_value;
+};
+
+int bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value);
+int linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value);
+
+#define	bsd_to_linux_bits(_val, _bmap, _noval) \
+    bsd_to_linux_bits_((_val), (_bmap), nitems((_bmap)), (_noval))
+
+/*
+ * These functions are used for simplification of BSD <-> Linux bit conversions.
+ * Given `value`, a bit field, these functions will walk the given bitmap table
+ * and set the appropriate bits for the target platform.  If any bits were
+ * successfully converted, then the return value is the equivalent of value
+ * represented with the bit values appropriate for the target platform.
+ * Otherwise, the value supplied as `no_value` is returned.
+ */
+#define	linux_to_bsd_bits(_val, _bmap, _noval) \
+    linux_to_bsd_bits_((_val), (_bmap), nitems((_bmap)), (_noval))
+
+/*
+ * Easy mapping helpers.  BITMAP_EASY_LINUX represents a single bit to be
+ * translated, and the FreeBSD and Linux values are supplied.  BITMAP_1t1_LINUX
+ * is the extreme version of this, where not only is it a single bit, but the
+ * name of the macro used to represent the Linux version of a bit literally has
+ * LINUX_ prepended to the normal name.
+ */
+#define	BITMAP_EASY_LINUX(_name, _linux_name)	\
+	{					\
+		.bsd_value = (_name),		\
+		.linux_value = (_linux_name),	\
+	}
+#define	BITMAP_1t1_LINUX(_name)	BITMAP_EASY_LINUX(_name, LINUX_##_name)
+
 #endif /* _LINUX_MI_H_ */

Modified: head/sys/compat/linux/linux_file.c
==============================================================================
--- head/sys/compat/linux/linux_file.c	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/compat/linux/linux_file.c	Mon Jun 29 03:09:14 2020	(r362769)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/filedesc.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
@@ -68,6 +69,37 @@ __FBSDID("$FreeBSD$");
 static int	linux_common_open(struct thread *, int, char *, int, int);
 static int	linux_getdents_error(struct thread *, int, int);
 
+static struct bsd_to_linux_bitmap seal_bitmap[] = {
+	BITMAP_1t1_LINUX(F_SEAL_SEAL),
+	BITMAP_1t1_LINUX(F_SEAL_SHRINK),
+	BITMAP_1t1_LINUX(F_SEAL_GROW),
+	BITMAP_1t1_LINUX(F_SEAL_WRITE),
+};
+
+#define	MFD_HUGETLB_ENTRY(_size)					\
+	{								\
+		.bsd_value = MFD_HUGE_##_size,				\
+		.linux_value = LINUX_HUGETLB_FLAG_ENCODE_##_size	\
+	}
+static struct bsd_to_linux_bitmap mfd_bitmap[] = {
+	BITMAP_1t1_LINUX(MFD_CLOEXEC),
+	BITMAP_1t1_LINUX(MFD_ALLOW_SEALING),
+	BITMAP_1t1_LINUX(MFD_HUGETLB),
+	MFD_HUGETLB_ENTRY(64KB),
+	MFD_HUGETLB_ENTRY(512KB),
+	MFD_HUGETLB_ENTRY(1MB),
+	MFD_HUGETLB_ENTRY(2MB),
+	MFD_HUGETLB_ENTRY(8MB),
+	MFD_HUGETLB_ENTRY(16MB),
+	MFD_HUGETLB_ENTRY(32MB),
+	MFD_HUGETLB_ENTRY(256MB),
+	MFD_HUGETLB_ENTRY(512MB),
+	MFD_HUGETLB_ENTRY(1GB),
+	MFD_HUGETLB_ENTRY(2GB),
+	MFD_HUGETLB_ENTRY(16GB),
+};
+#undef MFD_HUGETLB_ENTRY
+
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_creat(struct thread *td, struct linux_creat_args *args)
@@ -1371,6 +1403,21 @@ fcntl_common(struct thread *td, struct linux_fcntl_arg
 
 	case LINUX_F_DUPFD_CLOEXEC:
 		return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg));
+	/*
+	 * Our F_SEAL_* values match the Linux ones for maximum compatibility,
+	 * so we only need to account for different fcntl(2) command values.
+	 */
+	case LINUX_F_GET_SEALS:
+		error = kern_fcntl(td, args->fd, F_GET_SEALS, 0);
+		if (error != 0)
+			return (error);
+		td->td_retval[0] = bsd_to_linux_bits(td->td_retval[0],
+		    seal_bitmap, 0);
+		return (0);
+
+	case LINUX_F_ADD_SEALS:
+		return (kern_fcntl(td, args->fd, F_ADD_SEALS,
+		    linux_to_bsd_bits(args->arg, seal_bitmap, 0)));
 	default:
 		linux_msg(td, "unsupported fcntl cmd %d\n", args->cmd);
 		return (EINVAL);
@@ -1676,3 +1723,46 @@ linux_copy_file_range(struct thread *td, struct linux_
 	return (error);
 }
 
+#define	LINUX_MEMFD_PREFIX	"memfd:"
+
+int
+linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args)
+{
+	char memfd_name[LINUX_NAME_MAX + 1];
+	int error, flags, shmflags, oflags;
+
+	/*
+	 * This is our clever trick to avoid a heap allocation to copy in the
+	 * uname.  We don't really need to go this far out of our way, but it
+	 * does keep the rest of this function fairly clean, as it doesn't have
+	 * to worry about cleanup on the way out.
+	 */
+	error = copyinstr(args->uname_ptr,
+	    memfd_name + sizeof(LINUX_MEMFD_PREFIX) - 1,
+	    LINUX_NAME_MAX - sizeof(LINUX_MEMFD_PREFIX) - 1, NULL);
+	if (error != 0) {
+		if (error == ENAMETOOLONG)
+			error = EINVAL;
+		return (error);
+	}
+
+	memcpy(memfd_name, LINUX_MEMFD_PREFIX, sizeof(LINUX_MEMFD_PREFIX) - 1);
+	flags = linux_to_bsd_bits(args->flags, mfd_bitmap, 0);
+	if ((flags & ~(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB |
+	    MFD_HUGE_MASK)) != 0)
+		return (EINVAL);
+	/* Size specified but no HUGETLB. */
+	if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0)
+		return (EINVAL);
+	/* We don't actually support HUGETLB. */
+	if ((flags & MFD_HUGETLB) != 0)
+		return (ENOSYS);
+	oflags = O_RDWR;
+	shmflags = 0;
+	if ((flags & MFD_CLOEXEC) != 0)
+		oflags |= O_CLOEXEC;
+	if ((flags & MFD_ALLOW_SEALING) != 0)
+		shmflags |= SHM_ALLOW_SEALING;
+	return (kern_shm_open2(td, SHM_ANON, oflags, 0, shmflags, NULL,
+	    memfd_name));
+}

Modified: head/sys/compat/linux/linux_file.h
==============================================================================
--- head/sys/compat/linux/linux_file.h	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/compat/linux/linux_file.h	Mon Jun 29 03:09:14 2020	(r362769)
@@ -118,6 +118,9 @@
 #define	LINUX_F_SETPIPE_SZ	(LINUX_F_SPECIFIC_BASE + 7)
 #define	LINUX_F_GETPIPE_SZ	(LINUX_F_SPECIFIC_BASE + 8)
 
+#define	LINUX_F_ADD_SEALS	(LINUX_F_SPECIFIC_BASE + 9)
+#define	LINUX_F_GET_SEALS	(LINUX_F_SPECIFIC_BASE + 10)
+
 #define	LINUX_F_GETLKP		36
 #define	LINUX_F_SETLKP		37
 #define	LINUX_F_SETLKPW		38
@@ -145,5 +148,30 @@
 #define	LINUX_SYNC_FILE_RANGE_WAIT_BEFORE	1
 #define	LINUX_SYNC_FILE_RANGE_WRITE		2
 #define	LINUX_SYNC_FILE_RANGE_WAIT_AFTER	4
+
+#define	LINUX_F_SEAL_SEAL	0x0001
+#define	LINUX_F_SEAL_SHRINK	0x0002
+#define	LINUX_F_SEAL_GROW	0x0004
+#define	LINUX_F_SEAL_WRITE	0x0008
+
+#define	LINUX_MFD_CLOEXEC	0x0001
+#define	LINUX_MFD_ALLOW_SEALING	0x0002
+#define	LINUX_MFD_HUGETLB	0x0004
+
+#define	LINUX_HUGETLB_FLAG_ENCODE_SHIFT	26
+#define	LINUX_HUGETLB_FLAG_ENCODE_MASK	0x3f
+
+#define LINUX_HUGETLB_FLAG_ENCODE_64KB	(16 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_512KB	(19 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_1MB	(20 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_2MB	(21 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_8MB	(23 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_16MB	(24 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_32MB	(25 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_256MB	(28 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_512MB	(29 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_1GB	(30 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_2GB	(31 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define LINUX_HUGETLB_FLAG_ENCODE_16GB	(34U << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
 
 #endif	/* !_LINUX_FILE_H_ */

Modified: head/sys/i386/linux/linux_dummy.c
==============================================================================
--- head/sys/i386/linux/linux_dummy.c	Mon Jun 29 02:32:07 2020	(r362768)
+++ head/sys/i386/linux/linux_dummy.c	Mon Jun 29 03:09:14 2020	(r362769)
@@ -129,7 +129,6 @@ DUMMY(finit_module);
 DUMMY(sched_setattr);
 DUMMY(sched_getattr);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202006290309.05T39ETZ044859>