Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 13 Jan 2003 20:11:42 -0800 (PST)
From:      Matthew Dillon <dillon@apollo.backplane.com>
To:        "Alan L. Cox" <alc@imimic.com>
Cc:        Peter Wemm <peter@wemm.org>, arch@FreeBSD.ORG
Subject:   getsysfd() patch #1 (Re: Virtual memory question)
Message-ID:  <200301140411.h0E4BgpN078032@apollo.backplane.com>
References:  <20030114002831.1C8C12A89E@canning.wemm.org> <3E2381F8.85BB90A0@imimic.com>

next in thread | previous in thread | raw e-mail | index | archive | help
    This is a first-attempt workup of getsysfd().  See?  I told ya it was
    trivial!

    This isn't everything.  If we really want to do this right we need to
    create a filesystem inode type to represent a memory rendezvous,
    similar to how we represent a FIFO or SOCKET rendezvous.  If we do that
    then we can support all shm_open() situations using this new call.

    I have only done a small amount of testing and have not double-checked
    that I handle the reference counts properly.  I also had to reorganize
    mmap() quite a bit (in fact, it looks like someone did a bunch of
    rewriting in the mmap()/vm_mmap() code and we really need to rewrite
    the layering).

    Here is a test program.  The patch is below this program.  This should be
    considered a 'test' patch for the moment, my heart isn't set on the
    interface.  e.g. perhaps we want to add additional arguments to make it
    more useful/generic.

						-Matt

#include <sys/types.h>
#include <sys/sysfd.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Exercise getsysfd(SYSFD_MEMORY): map the same memory descriptor once
 * MAP_SHARED and once MAP_PRIVATE, then check across fork() that a write
 * made by the child through the shared mapping is visible to the parent
 * while its write through the private mapping is not.
 *
 * The last line printed should be "ORIGCONTENTS 1 3".  Returns 0 on
 * success and 1 if any of the system calls fails.
 */
int
main(int ac, char **av)
{
    const size_t len = 1024*1024;
    char *ptr1;
    char *ptr2;
    pid_t pid;
    int fd;

    (void)ac;
    (void)av;

    /* Clear errno first so a successful call does not show a stale error. */
    errno = 0;
    fd = getsysfd(SYSFD_MEMORY, len);
    printf("fd = %d %d %s\n", fd, errno, strerror(errno));
    if (fd < 0)
        return(1);

    errno = 0;
    ptr1 = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    printf("mmap: %p (%s)\n", (void *)ptr1, strerror(errno));
    errno = 0;
    ptr2 = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
    printf("mmap: %p (%s)\n", (void *)ptr2, strerror(errno));

    /* Never touch a mapping that was not established. */
    if (ptr1 == MAP_FAILED || ptr2 == MAP_FAILED)
        return(1);

    /*
     * Close the descriptor before using the memory: the mappings are
     * expected to remain usable on their own.
     */
    close(fd);
    ptr1[0] = 1;
    ptr1[len-1] = 2;

    /* Flush so the child does not inherit and re-emit buffered output. */
    fflush(stdout);
    pid = fork();
    if (pid < 0) {
        perror("fork");
        return(1);
    }
    if (pid == 0) {
        printf("CONTENTS %d %d\n", ptr2[0], ptr2[len-1]);
        ptr2[0] = 2;		/* modify private mapping */
        ptr1[len-1] = 3;	/* modify original */
        fflush(stdout);
        _exit(0);		/* do not fall through into the parent's check */
    }

    /* Wait for the child rather than sleeping so the check cannot race. */
    if (waitpid(pid, NULL, 0) < 0) {
        perror("waitpid");
        return(1);
    }

    /* SHOULD BE 1 3 */
    printf("ORIGCONTENTS %d %d\n", ptr1[0], ptr1[len-1]);
    return(0);
}





Index: conf/files
===================================================================
RCS file: /home/ncvs/src/sys/conf/files,v
retrieving revision 1.744
diff -u -r1.744 files
--- conf/files	8 Jan 2003 23:36:59 -0000	1.744
+++ conf/files	14 Jan 2003 02:30:47 -0000
@@ -1055,6 +1055,7 @@
 kern/subr_xxx.c		standard
 kern/sys_generic.c	standard
 kern/sys_pipe.c		standard
+kern/sys_sysfd.c	standard
 kern/sys_process.c	standard
 kern/sys_socket.c	standard
 kern/syscalls.c		optional witness
Index: kern/init_sysent.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/init_sysent.c,v
retrieving revision 1.146
diff -u -r1.146 init_sysent.c
--- kern/init_sysent.c	8 Jan 2003 04:57:52 -0000	1.146
+++ kern/init_sysent.c	14 Jan 2003 01:58:05 -0000
@@ -2,7 +2,7 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/init_sysent.c,v 1.146 2003/01/08 04:57:52 davidxu Exp $
+ * $FreeBSD$
  * created from FreeBSD: src/sys/kern/syscalls.master,v 1.140 2003/01/04 11:41:12 davidxu Exp 
  */
 
@@ -457,4 +457,5 @@
 	{ SYF_MPSAFE | AS(__acl_set_link_args), (sy_call_t *)__acl_set_link },	/* 426 = __acl_set_link */
 	{ SYF_MPSAFE | AS(__acl_delete_link_args), (sy_call_t *)__acl_delete_link },	/* 427 = __acl_delete_link */
 	{ SYF_MPSAFE | AS(__acl_aclcheck_link_args), (sy_call_t *)__acl_aclcheck_link },	/* 428 = __acl_aclcheck_link */
+	{ SYF_MPSAFE | AS(getsysfd_args), (sy_call_t *)getsysfd },	/* 429 = getsysfd */
 };
Index: kern/sys_sysfd.c
===================================================================
RCS file: kern/sys_sysfd.c
diff -N kern/sys_sysfd.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ kern/sys_sysfd.c	14 Jan 2003 03:47:53 -0000
@@ -0,0 +1,208 @@
+/*
+ * KERN/SYS_SYSFD.C
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mac.h>
+#include <sys/mutex.h>
+#include <sys/ttycom.h>
+#include <sys/stat.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/selinfo.h>
+#include <sys/signalvar.h>
+#include <sys/sysproto.h>
+#include <sys/pipe.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/event.h>
+#include <sys/sysfd.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+/* File operation methods for DTYPE_MEMFD descriptors.  NOTE(review): the
+ * NULL entry below appears to be the kqfilter slot -- confirm kevent(2) on
+ * a memory descriptor cannot call through it. */
+static fo_rdwr_t	memfd_read;
+static fo_rdwr_t	memfd_write;
+static fo_ioctl_t	memfd_ioctl;
+static fo_poll_t	memfd_poll;
+static fo_stat_t	memfd_stat;
+static fo_close_t	memfd_close;
+
+static struct fileops memfdops = {
+	memfd_read, memfd_write, memfd_ioctl, memfd_poll, NULL,
+	memfd_stat, memfd_close
+};
+
+/*
+ * The getsysfd() system call.  getsysfd(int type, off_t size)
+ *
+ *	SYSFD_MEMORY	- Return a descriptor which can be mmap()'d,
+ *			  representing anonymous, shareable swap-backed
+ *			  memory of 'size' bytes, rounded up to whole pages.
+ *
+ * Fails with EINVAL if size is negative or type is not recognized.
+ */
+int
+getsysfd(struct thread *td, struct getsysfd_args *uap)
+{
+	int error;
+	int fd;
+	vm_pindex_t npages;
+	vm_object_t object;
+	struct file *fp;
+	struct filedesc *fdp;
+
+	/* Validate the size and convert it to a page count. */
+	if (uap->size < 0)
+		return(EINVAL);
+	npages = round_page(uap->size) >> PAGE_SHIFT;
+
+	/*
+	 * Allocate a new descriptor.  the descriptor will be returned with a
+	 * reference associated with fd_ofiles[fd].
+	 *
+	 * XXX falloc() really should return with two references on the desc,
+	 * not one, so it can't be ripped out from under us.
+	 */
+	error = falloc(td, &fp, &fd);
+	if (error)
+		return(error);
+	fhold(fp);
+
+	switch(uap->type) {
+	case SYSFD_MEMORY:
+		/* Allocate outside FILE_LOCK in case the allocator sleeps. */
+		object = vm_object_allocate(OBJT_DEFAULT, npages);
+		FILE_LOCK(fp);
+		fp->f_flag = FREAD | FWRITE;
+		fp->f_type = DTYPE_MEMFD;
+		fp->f_data = object;
+		fp->f_ops = &memfdops;
+		FILE_UNLOCK(fp);
+		if (object == NULL)
+			error = ENOMEM;
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	if (error) {
+		/* Back the half-built descriptor out of the fd table. */
+		fdp = td->td_proc->p_fd;
+		FILEDESC_LOCK(fdp);
+		if (fdp->fd_ofiles[fd] == fp) {
+			fdp->fd_ofiles[fd] = NULL;
+			fdp->fd_ofileflags[fd] = 0;
+			fdrop(fp, td);	/* drop ofiles[] array reference */
+			if (fd < fdp->fd_freefile)
+				fdp->fd_freefile = fd;
+		}
+		FILEDESC_UNLOCK(fdp);
+	} else {
+		td->td_retval[0] = fd;
+	}
+	fdrop(fp, td);	/* drop our reference */
+	return(error);
+}
+
+/*
+ * memfd_read: read method for memory descriptors.  read(2) is not
+ * supported; the request is refused with EOPNOTSUPP.
+ */
+/* ARGSUSED */
+static int
+memfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	return(EOPNOTSUPP);
+}
+
+/*
+ * memfd_write: write method for memory descriptors.  write(2) is not
+ * supported; the request is refused with EOPNOTSUPP.
+ */
+static int
+memfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	return(EOPNOTSUPP);
+}
+
+/*
+ * memfd_ioctl: ioctl method for memory descriptors.  No ioctls are
+ * implemented; every request is refused with EINVAL and cmd/data are
+ * not examined.
+ *
+ * NOTE(review): ENOTTY is the usual errno for an unknown ioctl -- confirm.
+ */
+static int
+memfd_ioctl(struct file *fp, u_long cmd, void *data,
+    struct ucred *active_cred, struct thread *td)
+{
+	return(EINVAL);
+}
+
+/*
+ * memfd_poll: poll method for memory descriptors.  Always returns 0.
+ */
+static int
+memfd_poll(struct file *fp, int events, struct ucred *active_cred,
+    struct thread *td)
+{
+	return(0);
+}
+
+/*
+ * memfd_stat: stat method for memory descriptors.
+ *
+ * Not implemented yet: *ub is left untouched and EOPNOTSUPP is
+ * returned.  No file state is read here, so there is nothing to
+ * lock.
+ */
+static int
+memfd_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
+    struct thread *td)
+{
+	return(EOPNOTSUPP);
+}
+
+/* memfd_close: close method; releases the backing VM object, if any. */
+/* ARGSUSED */
+static int
+memfd_close(struct file *fp, struct thread *td)
+{
+	vm_object_t obj;
+
+	FILE_LOCK(fp);
+	obj = fp->f_data;
+	fp->f_data = NULL;
+	FILE_UNLOCK(fp);
+
+	/* The object reference is dropped with Giant held. */
+	mtx_lock(&Giant);
+	if (obj != NULL)
+		vm_object_deallocate(obj);
+	mtx_unlock(&Giant);
+	return(0);
+}
+
Index: kern/syscalls.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/syscalls.c,v
retrieving revision 1.132
diff -u -r1.132 syscalls.c
--- kern/syscalls.c	8 Jan 2003 04:57:52 -0000	1.132
+++ kern/syscalls.c	14 Jan 2003 01:58:05 -0000
@@ -2,7 +2,7 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/syscalls.c,v 1.132 2003/01/08 04:57:52 davidxu Exp $
+ * $FreeBSD$
  * created from FreeBSD: src/sys/kern/syscalls.master,v 1.140 2003/01/04 11:41:12 davidxu Exp 
  */
 
@@ -436,4 +436,5 @@
 	"__acl_set_link",			/* 426 = __acl_set_link */
 	"__acl_delete_link",			/* 427 = __acl_delete_link */
 	"__acl_aclcheck_link",			/* 428 = __acl_aclcheck_link */
+	"getsysfd",			/* 429 = getsysfd */
 };
Index: kern/syscalls.master
===================================================================
RCS file: /home/ncvs/src/sys/kern/syscalls.master,v
retrieving revision 1.140
diff -u -r1.140 syscalls.master
--- kern/syscalls.master	4 Jan 2003 11:41:12 -0000	1.140
+++ kern/syscalls.master	14 Jan 2003 01:58:03 -0000
@@ -621,6 +621,7 @@
 			    acl_type_t type); }
 428	MSTD	BSD	{ int __acl_aclcheck_link(const char *path, \
 			    acl_type_t type, struct acl *aclp); }
+429	MSTD	BSD	{ int getsysfd(int type, off_t size); }
 
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/ia64/ia32/syscalls.master  (take a best guess)
Index: sys/file.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/file.h,v
retrieving revision 1.59
diff -u -r1.59 file.h
--- sys/file.h	13 Jan 2003 00:28:55 -0000	1.59
+++ sys/file.h	14 Jan 2003 02:04:13 -0000
@@ -62,6 +62,7 @@
 #define	DTYPE_FIFO	4	/* fifo (named pipe) */
 #define	DTYPE_KQUEUE	5	/* event queue */
 #define	DTYPE_CRYPTO	6	/* crypto */
+#define DTYPE_MEMFD	7	/* memory descriptor */
 
 #ifdef _KERNEL
 
Index: sys/syscall.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/syscall.h,v
retrieving revision 1.130
diff -u -r1.130 syscall.h
--- sys/syscall.h	8 Jan 2003 04:57:52 -0000	1.130
+++ sys/syscall.h	14 Jan 2003 01:58:05 -0000
@@ -2,7 +2,7 @@
  * System call numbers.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/sys/syscall.h,v 1.130 2003/01/08 04:57:52 davidxu Exp $
+ * $FreeBSD$
  * created from FreeBSD: src/sys/kern/syscalls.master,v 1.140 2003/01/04 11:41:12 davidxu Exp 
  */
 
@@ -334,4 +334,5 @@
 #define	SYS___acl_set_link	426
 #define	SYS___acl_delete_link	427
 #define	SYS___acl_aclcheck_link	428
-#define	SYS_MAXSYSCALL	429
+#define	SYS_getsysfd	429
+#define	SYS_MAXSYSCALL	430
Index: sys/syscall.mk
===================================================================
RCS file: /home/ncvs/src/sys/sys/syscall.mk,v
retrieving revision 1.85
diff -u -r1.85 syscall.mk
--- sys/syscall.mk	8 Jan 2003 04:57:52 -0000	1.85
+++ sys/syscall.mk	14 Jan 2003 01:58:05 -0000
@@ -1,6 +1,6 @@
 # FreeBSD system call names.
 # DO NOT EDIT-- this file is automatically generated.
-# $FreeBSD: src/sys/sys/syscall.mk,v 1.85 2003/01/08 04:57:52 davidxu Exp $
+# $FreeBSD$
 # created from FreeBSD: src/sys/kern/syscalls.master,v 1.140 2003/01/04 11:41:12 davidxu Exp 
 MIASM =  \
 	syscall.o \
@@ -279,4 +279,5 @@
 	__acl_get_link.o \
 	__acl_set_link.o \
 	__acl_delete_link.o \
-	__acl_aclcheck_link.o
+	__acl_aclcheck_link.o \
+	getsysfd.o
Index: sys/sysfd.h
===================================================================
RCS file: sys/sysfd.h
diff -N sys/sysfd.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ sys/sysfd.h	14 Jan 2003 04:06:19 -0000
@@ -0,0 +1,21 @@
+/*
+ * $FreeBSD$
+ */
+
+#ifndef	_SYS_SYSFD_H_
+#define	_SYS_SYSFD_H_
+
+#define SYSFD_MEMORY		1	/* mmap()able anonymous memory */
+#ifdef NOTYET
+#define SYSFD_TIMER_SECS	2
+#define SYSFD_TIMER_TENS	3
+#define SYSFD_TIMER_MICRO	4
+#define SYSFD_TIMER_SYS		5
+#define SYSFD_TIMER_REAL	6
+#define SYSFD_TIMER_VIRT	7
+#endif
+
+#ifndef _KERNEL		/* kernel prototype comes from <sys/sysproto.h> */
+extern int getsysfd(int type, off_t size);
+#endif
+#endif	/* _SYS_SYSFD_H_ */
Index: sys/sysproto.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/sysproto.h,v
retrieving revision 1.123
diff -u -r1.123 sysproto.h
--- sys/sysproto.h	8 Jan 2003 04:57:53 -0000	1.123
+++ sys/sysproto.h	14 Jan 2003 01:58:05 -0000
@@ -2,7 +2,7 @@
  * System call prototypes.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/sys/sysproto.h,v 1.123 2003/01/08 04:57:53 davidxu Exp $
+ * $FreeBSD$
  * created from FreeBSD: src/sys/kern/syscalls.master,v 1.140 2003/01/04 11:41:12 davidxu Exp 
  */
 
@@ -1223,6 +1223,10 @@
 	char type_l_[PADL_(acl_type_t)]; acl_type_t type; char type_r_[PADR_(acl_type_t)];
 	char aclp_l_[PADL_(struct acl *)]; struct acl * aclp; char aclp_r_[PADR_(struct acl *)];
 };
+struct getsysfd_args {
+	char type_l_[PADL_(int)]; int type; char type_r_[PADR_(int)];
+	char size_l_[PADL_(off_t)]; off_t size; char size_r_[PADR_(off_t)];
+};
 int	nosys(struct thread *, struct nosys_args *);
 void	sys_exit(struct thread *, struct sys_exit_args *);
 int	fork(struct thread *, struct fork_args *);
@@ -1499,6 +1503,7 @@
 int	__acl_set_link(struct thread *, struct __acl_set_link_args *);
 int	__acl_delete_link(struct thread *, struct __acl_delete_link_args *);
 int	__acl_aclcheck_link(struct thread *, struct __acl_aclcheck_link_args *);
+int	getsysfd(struct thread *, struct getsysfd_args *);
 
 #ifdef COMPAT_43
 
Index: vm/vm_extern.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_extern.h,v
retrieving revision 1.59
diff -u -r1.59 vm_extern.h
--- vm/vm_extern.h	24 Jul 2002 19:47:56 -0000	1.59
+++ vm/vm_extern.h	14 Jan 2003 03:12:06 -0000
@@ -80,6 +80,7 @@
 void vm_forkproc(struct thread *, struct proc *, struct thread *, int);
 void vm_waitproc(struct proc *);
 int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, void *, vm_ooffset_t);
+int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, vm_object_t, vm_ooffset_t);
 vm_offset_t vm_page_alloc_contig(vm_offset_t, vm_offset_t, vm_offset_t, vm_offset_t);
 void vm_set_page_size(void);
 struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t);
Index: vm/vm_mmap.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_mmap.c,v
retrieving revision 1.155
diff -u -r1.155 vm_mmap.c
--- vm/vm_mmap.c	13 Jan 2003 00:28:55 -0000	1.155
+++ vm/vm_mmap.c	14 Jan 2003 03:55:15 -0000
@@ -201,7 +201,7 @@
 	struct thread *td;
 	struct mmap_args *uap;
 {
-	struct file *fp = NULL;
+	struct file *fp = NULL;		/* stays NULL when no fd is looked up */
 	struct vnode *vp;
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
@@ -264,49 +264,101 @@
 			return (EINVAL);
 		if (addr + size < addr)
 			return (EINVAL);
-	}
-	/*
-	 * XXX for non-fixed mappings where no hint is provided or
-	 * the hint would fall in the potential heap space,
-	 * place it after the end of the largest possible heap.
-	 *
-	 * There should really be a pmap call to determine a reasonable
-	 * location.
-	 */
-	else if (addr == 0 ||
+	} else if (addr == 0 ||
 	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
-	     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
+	     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) {
+		/*
+		 * XXX for non-fixed mappings where no hint is provided or
+		 * the hint would fall in the potential heap space,
+		 * place it after the end of the largest possible heap.
+		 *
+		 * There should really be a pmap call to determine a reasonable
+		 * location.
+		 */
 		addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
+	}
 
 	mtx_lock(&Giant);	/* syscall marked mp-safe but isn't */
+
+	/*
+	 * Do not allow more then a certain number of vm_map_entry structures
+	 * per process.  Scale with the number of rforks sharing the map
+	 * to make the limit reasonable for threads.
+	 */
+	if (max_proc_mmap && 
+	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
+		error = ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * Extract the file descriptor (if not an anonymous mmap)
+	 */
 	if (flags & MAP_ANON) {
 		/*
 		 * Mapping blank space is trivial.
 		 */
-		handle = NULL;
 		maxprot = VM_PROT_ALL;
 		pos = 0;
 	} else {
 		/*
-		 * Mapping file, get fp for validation. Obtain vnode and make
-		 * sure it is of appropriate type.
-		 * don't let the descriptor disappear on us if we block
+		 * Mapping a file descriptor.  Reference the fp so it does
+		 * not go away on us.
 		 */
 		if ((error = fget(td, uap->fd, &fp)) != 0)
 			goto done;
-		if (fp->f_type != DTYPE_VNODE) {
-			error = EINVAL;
-			goto done;
-		}
 
 		/*
-		 * POSIX shared-memory objects are defined to have
-		 * kernel persistence, and are not defined to support
-		 * read(2)/write(2) -- or even open(2).  Thus, we can
-		 * use MAP_ASYNC to trade on-disk coherence for speed.
-		 * The shm_open(3) library routine turns on the FPOSIXSHM
-		 * flag to request this behavior.
+		 * Ensure that file and memory protections are
+		 * compatible.  Note that we only worry about
+		 * writability if mapping is shared; in this case,
+		 * current and max prot are dictated by the open file.
+		 * XXX use the vnode instead?  Problem is: what
+		 * credentials do we use for determination? What if
+		 * proc does a setuid?
 		 */
+		maxprot = VM_PROT_EXECUTE;	/* ??? */
+		if (fp->f_flag & FREAD) {
+			maxprot |= VM_PROT_READ;
+		} else if (prot & PROT_READ) {
+			error = EACCES;
+			goto done;
+		}
+	}
+
+	/*
+	 * Handle MEMFD descriptors.  These reference the VM object directly.
+	 */
+	if (fp && fp->f_type == DTYPE_MEMFD && fp->f_data) {
+		mtx_unlock(&Giant);
+		obj = fp->f_data;
+		vm_object_reference(obj);
+		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
+			    maxprot, flags, obj, pos);
+		if (error == 0)
+			td->td_retval[0] = (register_t) (addr + pageoff);
+		mtx_lock(&Giant);
+		vm_object_deallocate(obj);
+		goto done2;
+	}
+
+	/*
+	 * Otherwise it must be an anonymous mapping or a VNODE
+	 */ 
+	if (fp != NULL && fp->f_type != DTYPE_VNODE) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * POSIX shared-memory objects are defined to have
+	 * kernel persistence, and are not defined to support
+	 * read(2)/write(2) -- or even open(2).  Thus, we can
+	 * use MAP_ASYNC to trade on-disk coherence for speed.
+	 * The shm_open(3) library routine turns on the FPOSIXSHM
+	 * flag to request this behavior.
+	 */
+	if (fp) {
 		if (fp->f_flag & FPOSIXSHM)
 			flags |= MAP_NOSYNC;
 		vp = fp->f_data;
@@ -363,22 +415,7 @@
 				error = EINVAL;
 				goto done;
 			}
-			/*
-			 * Ensure that file and memory protections are
-			 * compatible.  Note that we only worry about
-			 * writability if mapping is shared; in this case,
-			 * current and max prot are dictated by the open file.
-			 * XXX use the vnode instead?  Problem is: what
-			 * credentials do we use for determination? What if
-			 * proc does a setuid?
-			 */
-			maxprot = VM_PROT_EXECUTE;	/* ??? */
-			if (fp->f_flag & FREAD) {
-				maxprot |= VM_PROT_READ;
-			} else if (prot & PROT_READ) {
-				error = EACCES;
-				goto done;
-			}
+
 			/*
 			 * If we are sharing potential changes (either via
 			 * MAP_SHARED or via the implicit sharing of character
@@ -414,17 +451,8 @@
 
 			handle = (void *)vp;
 		}
-	}
-
-	/*
-	 * Do not allow more then a certain number of vm_map_entry structures
-	 * per process.  Scale with the number of rforks sharing the map
-	 * to make the limit reasonable for threads.
-	 */
-	if (max_proc_mmap && 
-	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
-		error = ENOMEM;
-		goto done;
+	} else {
+		handle = NULL;
 	}
 
 	mtx_unlock(&Giant);
@@ -444,10 +472,10 @@
 done:
 	if (vp)
 		vput(vp);
+done2:
 	mtx_unlock(&Giant);
 	if (fp)
 		fdrop(fp, td);
-
 	return (error);
 }
 
@@ -1272,3 +1300,102 @@
 		return (EINVAL);
 	}
 }
+
+/*
+ * vm_mmap_object()
+ *
+ * MPSAFE
+ *
+ * Internal version of mmap that directly operates on a VM object.
+ * Currently used by mmap.  On success *addr holds the mapped address
+ * and a new reference on the object has been taken for the mapping.
+ */
+int
+vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+	vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff)
+{
+	boolean_t fitit;
+	int rv = KERN_SUCCESS;
+	int docow;
+	struct thread *td = curthread;
+
+	if (size == 0)
+		return (0);
+
+	size = round_page(size);
+
+	if (td->td_proc->p_vmspace->vm_map.size + size >
+	    td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
+		return(ENOMEM);
+	}
+
+	/*
+	 * Only page aligned object offsets are supported; non-aligned
+	 * offsets will cause pmap inconsistencies.
+	 */
+	if (foff & PAGE_MASK)
+		return (EINVAL);
+
+	if ((flags & MAP_FIXED) == 0) {
+		fitit = TRUE;
+		*addr = round_page(*addr);
+	} else {
+		if (*addr != trunc_page(*addr))
+			return (EINVAL);
+		fitit = FALSE;
+		(void) vm_map_remove(map, *addr, *addr + size);
+	}
+
+	docow = MAP_PREFAULT_PARTIAL;
+
+	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
+		docow |= MAP_COPY_ON_WRITE;
+	if (flags & MAP_NOCORE)
+		docow |= MAP_DISABLE_COREDUMP;
+
+#if defined(VM_PROT_READ_IS_EXEC)
+	if (prot & VM_PROT_READ)
+		prot |= VM_PROT_EXECUTE;
+
+	if (maxprot & VM_PROT_READ)
+		maxprot |= VM_PROT_EXECUTE;
+#endif
+
+	if (fitit)
+		*addr = pmap_addr_hint(object, *addr, size);
+
+	if (flags & MAP_STACK) {
+		/* vm_map_stack() is not handed the object: take no ref. */
+		rv = vm_map_stack (map, *addr, size, prot, maxprot, docow);
+	} else {
+		vm_object_reference(object);
+		rv = vm_map_find(map, object, foff, addr, size, fitit,
+				 prot, maxprot, docow);
+		if (rv != KERN_SUCCESS)
+			vm_object_deallocate(object);
+	}
+
+	/*
+	 * Shared memory is also shared with children.  If that cannot be
+	 * arranged, undo the mapping and report the failure via rv.
+	 */
+	if (rv == KERN_SUCCESS && (flags & MAP_SHARED)) {
+		rv = vm_map_inherit(map, *addr, *addr + size,
+			    VM_INHERIT_SHARE);
+		if (rv != KERN_SUCCESS)
+			(void)vm_map_remove(map, *addr, *addr + size);
+	}
+
+	switch(rv) {
+	case KERN_SUCCESS:
+		return(0);
+	case KERN_INVALID_ADDRESS:
+	case KERN_NO_SPACE:
+		return (ENOMEM);
+	case KERN_PROTECTION_FAILURE:
+		return (EACCES);
+	default:
+		return (EINVAL);
+	}
+}
+

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200301140411.h0E4BgpN078032>