Date:      Thu, 18 Jun 2015 06:00:17 +0000 (UTC)
From:      Neel Natu <neel@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r284539 - in head: lib/libvmmapi share/examples/bhyve sys/amd64/include sys/amd64/vmm sys/amd64/vmm/amd sys/amd64/vmm/intel sys/amd64/vmm/io usr.sbin/bhyve usr.sbin/bhyvectl usr.sbin/bh...
Message-ID:  <201506180600.t5I60Hh5094232@svn.freebsd.org>

Author: neel
Date: Thu Jun 18 06:00:17 2015
New Revision: 284539
URL: https://svnweb.freebsd.org/changeset/base/284539

Log:
  Restructure memory allocation in bhyve to support "devmem".
  
  devmem is used to represent MMIO devices, like a boot ROM or a VESA framebuffer,
  where doing a trap-and-emulate for every access is impractical. devmem is a
  hybrid of system memory (sysmem) and emulated device models.
  
  devmem is mapped into the guest address space via nested page tables, just
  like sysmem. However, the address range where devmem is mapped may be changed
  by the guest at runtime (e.g. by reprogramming a PCI BAR). Also, devmem is
  usually mapped RO or RW, as opposed to the RWX mappings used for sysmem.
  
  Each devmem segment is named (e.g. "bootrom"), and this name is used to
  create a device node for the devmem segment (e.g. /dev/vmm/testvm.bootrom).
  The device node supports mmap(2), which decouples the host mapping of
  devmem from its mapping in the guest address space (which can change).
  
  Reviewed by:	tychon
  Discussed with:	grehan
  Differential Revision:	https://reviews.freebsd.org/D2762
  MFC after:	4 weeks
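
  To illustrate the new userspace API, here is a minimal, hypothetical sketch
  of a devmem consumer built on the libvmmapi calls added by this commit
  (vm_create_devmem() and vm_mmap_memseg()). The helper name, the guest
  physical placement and the use of PROT_READ are illustrative assumptions,
  not code from this change:

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <string.h>
	#include <vmmapi.h>

	/*
	 * Create a read-only "bootrom" devmem segment, copy a ROM image into
	 * the host mapping and map it just below 4GB in the guest.
	 */
	static int
	bootrom_example(struct vmctx *ctx, const void *image, size_t len)
	{
		vm_paddr_t gpa;
		char *ptr;

		/* Host-side mapping backed by /dev/vmm/<vmname>.bootrom */
		ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", len);
		if (ptr == MAP_FAILED)
			return (-1);

		memcpy(ptr, image, len);

		/* Guest-side mapping; assumes 'len' is page-aligned. */
		gpa = ((vm_paddr_t)1 << 32) - len;	/* illustrative placement */
		return (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, len, PROT_READ));
	}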

Added:
  head/usr.sbin/bhyve/bootrom.c   (contents, props changed)
  head/usr.sbin/bhyve/bootrom.h   (contents, props changed)
Modified:
  head/lib/libvmmapi/vmmapi.c
  head/lib/libvmmapi/vmmapi.h
  head/share/examples/bhyve/vmrun.sh
  head/sys/amd64/include/vmm.h
  head/sys/amd64/include/vmm_dev.h
  head/sys/amd64/vmm/amd/svm.c
  head/sys/amd64/vmm/intel/vmx.c
  head/sys/amd64/vmm/io/ppt.c
  head/sys/amd64/vmm/vmm.c
  head/sys/amd64/vmm/vmm_dev.c
  head/sys/amd64/vmm/vmm_instruction_emul.c
  head/sys/amd64/vmm/vmm_mem.c
  head/sys/amd64/vmm/vmm_mem.h
  head/usr.sbin/bhyve/Makefile
  head/usr.sbin/bhyve/bhyve.8
  head/usr.sbin/bhyve/bhyverun.c
  head/usr.sbin/bhyve/pci_lpc.c
  head/usr.sbin/bhyve/pci_lpc.h
  head/usr.sbin/bhyve/pci_passthru.c
  head/usr.sbin/bhyvectl/Makefile
  head/usr.sbin/bhyvectl/bhyvectl.c
  head/usr.sbin/bhyveload/bhyveload.8
  head/usr.sbin/bhyveload/bhyveload.c

Modified: head/lib/libvmmapi/vmmapi.c
==============================================================================
--- head/lib/libvmmapi/vmmapi.c	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/lib/libvmmapi/vmmapi.c	Thu Jun 18 06:00:17 2015	(r284539)
@@ -58,15 +58,23 @@ __FBSDID("$FreeBSD$");
 #define	MB	(1024 * 1024UL)
 #define	GB	(1024 * 1024 * 1024UL)
 
+/*
+ * Size of the guard region before and after the virtual address space
+ * mapping the guest physical memory. This must be a multiple of the
+ * superpage size for performance reasons.
+ */
+#define	VM_MMAP_GUARD_SIZE	(4 * MB)
+
+#define	PROT_RW		(PROT_READ | PROT_WRITE)
+#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)
+
 struct vmctx {
 	int	fd;
 	uint32_t lowmem_limit;
-	enum vm_mmap_style vms;
 	int	memflags;
 	size_t	lowmem;
-	char	*lowmem_addr;
 	size_t	highmem;
-	char	*highmem_addr;
+	char	*baseaddr;
 	char	*name;
 };
 
@@ -157,22 +165,6 @@ vm_parse_memsize(const char *optarg, siz
 	return (error);
 }
 
-int
-vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
-		  int *wired)
-{
-	int error;
-	struct vm_memory_segment seg;
-
-	bzero(&seg, sizeof(seg));
-	seg.gpa = gpa;
-	error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
-	*ret_len = seg.len;
-	if (wired != NULL)
-		*wired = seg.wired;
-	return (error);
-}
-
 uint32_t
 vm_get_lowmem_limit(struct vmctx *ctx)
 {
@@ -194,39 +186,184 @@ vm_set_memflags(struct vmctx *ctx, int f
 	ctx->memflags = flags;
 }
 
+int
+vm_get_memflags(struct vmctx *ctx)
+{
+
+	return (ctx->memflags);
+}
+
+/*
+ * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
+ */
+int
+vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
+    size_t len, int prot)
+{
+	struct vm_memmap memmap;
+	int error, flags;
+
+	memmap.gpa = gpa;
+	memmap.segid = segid;
+	memmap.segoff = off;
+	memmap.len = len;
+	memmap.prot = prot;
+	memmap.flags = 0;
+
+	if (ctx->memflags & VM_MEM_F_WIRED)
+		memmap.flags |= VM_MEMMAP_F_WIRED;
+
+	/*
+	 * If this mapping already exists then don't create it again. This
+	 * is the common case for SYSMEM mappings created by bhyveload(8).
+	 */
+	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
+	if (error == 0 && gpa == memmap.gpa) {
+		if (segid != memmap.segid || off != memmap.segoff ||
+		    prot != memmap.prot || flags != memmap.flags) {
+			errno = EEXIST;
+			return (-1);
+		} else {
+			return (0);
+		}
+	}
+
+	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
+	return (error);
+}
+
+int
+vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
+    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
+{
+	struct vm_memmap memmap;
+	int error;
+
+	bzero(&memmap, sizeof(struct vm_memmap));
+	memmap.gpa = *gpa;
+	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
+	if (error == 0) {
+		*gpa = memmap.gpa;
+		*segid = memmap.segid;
+		*segoff = memmap.segoff;
+		*len = memmap.len;
+		*prot = memmap.prot;
+		*flags = memmap.flags;
+	}
+	return (error);
+}
+
+/*
+ * Return 0 if the segments are identical and non-zero otherwise.
+ *
+ * This is slightly complicated by the fact that only device memory segments
+ * are named.
+ */
 static int
-setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr)
+cmpseg(size_t len, const char *str, size_t len2, const char *str2)
 {
-	int error, mmap_flags;
-	struct vm_memory_segment seg;
+
+	if (len == len2) {
+		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
+			return (0);
+	}
+	return (-1);
+}
+
+static int
+vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
+{
+	struct vm_memseg memseg;
+	size_t n;
+	int error;
 
 	/*
-	 * Create and optionally map 'len' bytes of memory at guest
-	 * physical address 'gpa'
+	 * If the memory segment has already been created then just return.
+	 * This is the usual case for the SYSMEM segment created by userspace
+	 * loaders like bhyveload(8).
 	 */
-	bzero(&seg, sizeof(seg));
-	seg.gpa = gpa;
-	seg.len = len;
-	error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
-	if (error == 0 && addr != NULL) {
-		mmap_flags = MAP_SHARED;
-		if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
-			mmap_flags |= MAP_NOCORE;
-		*addr = mmap(NULL, len, PROT_READ | PROT_WRITE, mmap_flags,
-		    ctx->fd, gpa);
+	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
+	    sizeof(memseg.name));
+	if (error)
+		return (error);
+
+	if (memseg.len != 0) {
+		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
+			errno = EINVAL;
+			return (-1);
+		} else {
+			return (0);
+		}
+	}
+
+	bzero(&memseg, sizeof(struct vm_memseg));
+	memseg.segid = segid;
+	memseg.len = len;
+	if (name != NULL) {
+		n = strlcpy(memseg.name, name, sizeof(memseg.name));
+		if (n >= sizeof(memseg.name)) {
+			errno = ENAMETOOLONG;
+			return (-1);
+		}
+	}
+
+	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
+	return (error);
+}
+
+int
+vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
+    size_t bufsize)
+{
+	struct vm_memseg memseg;
+	size_t n;
+	int error;
+
+	memseg.segid = segid;
+	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
+	if (error == 0) {
+		*lenp = memseg.len;
+		n = strlcpy(namebuf, memseg.name, bufsize);
+		if (n >= bufsize) {
+			errno = ENAMETOOLONG;
+			error = -1;
+		}
 	}
 	return (error);
 }
 
+static int
+setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
+{
+	char *ptr;
+	int error, flags;
+
+	/* Map 'len' bytes starting at 'gpa' in the guest address space */
+	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
+	if (error)
+		return (error);
+
+	flags = MAP_SHARED | MAP_FIXED;
+	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+		flags |= MAP_NOCORE;
+
+	/* mmap into the process address space on the host */
+	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
+	if (ptr == MAP_FAILED)
+		return (-1);
+
+	return (0);
+}
+
 int
 vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
 {
-	char **addr;
-	int error;
+	size_t objsize, len;
+	vm_paddr_t gpa;
+	char *baseaddr, *ptr;
+	int error, flags;
 
-	/* XXX VM_MMAP_SPARSE not implemented yet */
-	assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL);
-	ctx->vms = vms;
+	assert(vms == VM_MMAP_ALL);
 
 	/*
 	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then
@@ -234,46 +371,63 @@ vm_setup_memory(struct vmctx *ctx, size_
 	 */
 	if (memsize > ctx->lowmem_limit) {
 		ctx->lowmem = ctx->lowmem_limit;
-		ctx->highmem = memsize - ctx->lowmem;
+		ctx->highmem = memsize - ctx->lowmem_limit;
+		objsize = 4*GB + ctx->highmem;
 	} else {
 		ctx->lowmem = memsize;
 		ctx->highmem = 0;
+		objsize = ctx->lowmem;
 	}
 
-	if (ctx->lowmem > 0) {
-		addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL;
-		error = setup_memory_segment(ctx, 0, ctx->lowmem, addr);
+	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
+	if (error)
+		return (error);
+
+	/*
+	 * Stake out a contiguous region covering the guest physical memory
+	 * and the adjoining guard regions.
+	 */
+	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
+	flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+	ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0);
+	if (ptr == MAP_FAILED)
+		return (-1);
+
+	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
+	if (ctx->highmem > 0) {
+		gpa = 4*GB;
+		len = ctx->highmem;
+		error = setup_memory_segment(ctx, gpa, len, baseaddr);
 		if (error)
 			return (error);
 	}
 
-	if (ctx->highmem > 0) {
-		addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL;
-		error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr);
+	if (ctx->lowmem > 0) {
+		gpa = 0;
+		len = ctx->lowmem;
+		error = setup_memory_segment(ctx, gpa, len, baseaddr);
 		if (error)
 			return (error);
 	}
 
+	ctx->baseaddr = baseaddr;
+
 	return (0);
 }
 
 void *
 vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
 {
+	vm_paddr_t start, end, mapend;
 
-	/* XXX VM_MMAP_SPARSE not implemented yet */
-	assert(ctx->vms == VM_MMAP_ALL);
+	start = gaddr;
+	end = gaddr + len;
+	mapend = ctx->highmem ? 4*GB + ctx->highmem : ctx->lowmem;
 
-	if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem)
-		return ((void *)(ctx->lowmem_addr + gaddr));
-
-	if (gaddr >= 4*GB) {
-		gaddr -= 4*GB;
-		if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem)
-			return ((void *)(ctx->highmem_addr + gaddr));
-	}
-
-	return (NULL);
+	if (start <= end && end <= mapend)
+		return (ctx->baseaddr + start);
+	else
+		return (NULL);
 }
 
 size_t
@@ -290,6 +444,56 @@ vm_get_highmem_size(struct vmctx *ctx)
 	return (ctx->highmem);
 }
 
+void *
+vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
+{
+	char pathname[MAXPATHLEN];
+	size_t len2;
+	char *base, *ptr;
+	int fd, error, flags;
+
+	fd = -1;
+	ptr = MAP_FAILED;
+	if (name == NULL || strlen(name) == 0) {
+		errno = EINVAL;
+		goto done;
+	}
+
+	error = vm_alloc_memseg(ctx, segid, len, name);
+	if (error)
+		goto done;
+
+	strlcpy(pathname, "/dev/vmm/", sizeof(pathname));
+	strlcat(pathname, ctx->name, sizeof(pathname));
+	strlcat(pathname, ".", sizeof(pathname));
+	strlcat(pathname, name, sizeof(pathname));
+
+	fd = open(pathname, O_RDWR);
+	if (fd < 0)
+		goto done;
+
+	/*
+	 * Stake out a contiguous region covering the device memory and the
+	 * adjoining guard regions.
+	 */
+	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
+	flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+	base = mmap(NULL, len2, PROT_NONE, flags, -1, 0);
+	if (base == MAP_FAILED)
+		goto done;
+
+	flags = MAP_SHARED | MAP_FIXED;
+	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+		flags |= MAP_NOCORE;
+
+	/* mmap the devmem region in the host address space */
+	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
+done:
+	if (fd >= 0)
+		close(fd);
+	return (ptr);
+}
+
 int
 vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 	    uint64_t base, uint32_t limit, uint32_t access)

Modified: head/lib/libvmmapi/vmmapi.h
==============================================================================
--- head/lib/libvmmapi/vmmapi.h	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/lib/libvmmapi/vmmapi.h	Thu Jun 18 06:00:17 2015	(r284539)
@@ -36,7 +36,7 @@
  * API version for out-of-tree consumers like grub-bhyve for making compile
  * time decisions.
  */
-#define	VMMAPI_VERSION	0101	/* 2 digit major followed by 2 digit minor */
+#define	VMMAPI_VERSION	0102	/* 2 digit major followed by 2 digit minor */
 
 struct iovec;
 struct vmctx;
@@ -52,14 +52,59 @@ enum vm_mmap_style {
 	VM_MMAP_SPARSE,		/* mappings created on-demand */
 };
 
+/*
+ * 'flags' value passed to 'vm_set_memflags()'.
+ */
 #define	VM_MEM_F_INCORE	0x01	/* include guest memory in core file */
+#define	VM_MEM_F_WIRED	0x02	/* guest memory is wired */
+
+/*
+ * Identifiers for memory segments:
+ * - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
+ * - the remaining identifiers can be used to create devmem segments.
+ */
+enum {
+	VM_SYSMEM,
+	VM_BOOTROM,
+	VM_FRAMEBUFFER,
+};
+
+/*
+ * Get the length and name of the memory segment identified by 'segid'.
+ * Note that system memory segments are identified with a nul name.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+int	vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
+	    size_t namesiz);
+
+/*
+ * Iterate over the guest address space. This function finds an address range
+ * that starts at an address >= *gpa.
+ *
+ * Returns 0 if the next address range was found and non-zero otherwise.
+ */
+int	vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
+	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
+/*
+ * Create a device memory segment identified by 'segid'.
+ *
+ * Returns a pointer to the memory segment on success and MAP_FAILED otherwise.
+ */
+void	*vm_create_devmem(struct vmctx *ctx, int segid, const char *name,
+	    size_t len);
+
+/*
+ * Map the memory segment identified by 'segid' into the guest address space
+ * at [gpa,gpa+len) with protection 'prot'.
+ */
+int	vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
+	    vm_ooffset_t segoff, size_t len, int prot);
 
 int	vm_create(const char *name);
 struct vmctx *vm_open(const char *name);
 void	vm_destroy(struct vmctx *ctx);
 int	vm_parse_memsize(const char *optarg, size_t *memsize);
-int	vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
-			  int *wired);
 int	vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
 void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
 int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
@@ -68,6 +113,7 @@ int	vm_gla2gpa(struct vmctx *, int vcpui
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
 void	vm_set_memflags(struct vmctx *ctx, int flags);
+int	vm_get_memflags(struct vmctx *ctx);
 size_t	vm_get_lowmem_size(struct vmctx *ctx);
 size_t	vm_get_highmem_size(struct vmctx *ctx);
 int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
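
A hedged sketch of how a management tool might walk the guest address space
using the vm_mmap_getnext() declared above; the "advance past the returned
mapping" step is an assumption based on the iteration semantics described in
the header comment, not code taken from this commit:

	#include <sys/types.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <vmmapi.h>

	static void
	dump_guest_memmap(struct vmctx *ctx)
	{
		vm_paddr_t gpa;
		vm_ooffset_t segoff;
		size_t len;
		int error, flags, prot, segid;

		gpa = 0;
		for (;;) {
			/* Find the mapping that starts at an address >= gpa */
			error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff,
			    &len, &prot, &flags);
			if (error)
				break;
			printf("gpa %#jx segid %d segoff %#jx len %#zx "
			    "prot %#x flags %#x\n", (uintmax_t)gpa, segid,
			    (uintmax_t)segoff, len, prot, flags);
			gpa += len;	/* probe past this mapping */
		}
	}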

Modified: head/share/examples/bhyve/vmrun.sh
==============================================================================
--- head/share/examples/bhyve/vmrun.sh	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/share/examples/bhyve/vmrun.sh	Thu Jun 18 06:00:17 2015	(r284539)
@@ -87,15 +87,15 @@ console=${DEFAULT_CONSOLE}
 cpus=${DEFAULT_CPUS}
 tap_total=0
 disk_total=0
-apic_opt=""
 gdbport=0
 loader_opt=""
+bhyverun_opt="-H -A -P"
 pass_total=0
 
 while getopts ac:C:d:e:g:hH:iI:m:p:t: c ; do
 	case $c in
 	a)
-		apic_opt="-a"
+		bhyverun_opt="${bhyverun_opt} -a"
 		;;
 	c)
 		cpus=${OPTARG}
@@ -163,6 +163,12 @@ if [ -n "${host_base}" ]; then
 	loader_opt="${loader_opt} -h ${host_base}"
 fi
 
+# If PCI passthru devices are configured then guest memory must be wired
+if [ ${pass_total} -gt 0 ]; then
+	loader_opt="${loader_opt} -S"
+	bhyverun_opt="${bhyverun_opt} -S"
+fi
+
 make_and_check_diskdev()
 {
     local virtio_diskdev="$1"
@@ -263,7 +269,7 @@ while [ 1 ]; do
 	    i=$(($i + 1))
         done
 
-	${FBSDRUN} -c ${cpus} -m ${memsize} ${apic_opt} -A -H -P	\
+	${FBSDRUN} -c ${cpus} -m ${memsize} ${bhyverun_opt}		\
 		-g ${gdbport}						\
 		-s 0:0,hostbridge					\
 		-s 1:0,lpc						\

Modified: head/sys/amd64/include/vmm.h
==============================================================================
--- head/sys/amd64/include/vmm.h	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/sys/amd64/include/vmm.h	Thu Jun 18 06:00:17 2015	(r284539)
@@ -108,7 +108,6 @@ enum x2apic_state {
 
 struct vm;
 struct vm_exception;
-struct vm_memory_segment;
 struct seg_desc;
 struct vm_exit;
 struct vm_run;
@@ -175,17 +174,33 @@ int vm_create(const char *name, struct v
 void vm_destroy(struct vm *vm);
 int vm_reinit(struct vm *vm);
 const char *vm_name(struct vm *vm);
-int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+
+/*
+ * APIs that modify the guest memory map require all vcpus to be frozen.
+ */
+int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
+    size_t len, int prot, int flags);
+int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
+void vm_free_memseg(struct vm *vm, int ident);
 int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
-void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
-		  void **cookie);
+int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
+
+/*
+ * APIs that inspect the guest memory map require only a *single* vcpu to
+ * be frozen. This acts like a read lock on the guest memory map since any
+ * modification requires *all* vcpus to be frozen.
+ */
+int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
+    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
+int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
+    struct vm_object **objptr);
+void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len,
+    int prot, void **cookie);
 void vm_gpa_release(void *cookie);
-int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
-	      struct vm_memory_segment *seg);
-int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
-		  vm_offset_t *offset, struct vm_object **object);
-boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
+bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa);
+
 int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
 int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
 int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
@@ -302,8 +317,6 @@ vcpu_should_yield(struct vm *vm, int vcp
 void *vcpu_stats(struct vm *vm, int vcpu);
 void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
 struct vmspace *vm_get_vmspace(struct vm *vm);
-int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
-int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
 struct vatpic *vm_atpic(struct vm *vm);
 struct vatpit *vm_atpit(struct vm *vm);
 struct vpmtmr *vm_pmtmr(struct vm *vm);
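
The comments above describe the locking contract for the guest memory map:
modifications require all vcpus to be frozen, while inspection requires only
a single frozen (or running) vcpu, which acts as a read lock. A hypothetical
kernel-side sketch of the "freeze all vcpus" step follows; the helper name is
an assumption, and the sketch merely resembles the pattern the vmm device
driver uses before servicing memory-map ioctls:

	/*
	 * Illustrative helper (not part of this commit): freeze every vcpu
	 * before calling APIs such as vm_alloc_memseg() or vm_mmap_memseg()
	 * that modify the guest memory map.
	 */
	static int
	freeze_all_vcpus(struct vm *vm)
	{
		int error, vcpu;

		error = 0;
		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
			error = vcpu_set_state(vm, vcpu, VCPU_FROZEN, true);
			if (error)
				break;
		}
		if (error != 0) {
			/* Unwind: return already-frozen vcpus to idle. */
			while (--vcpu >= 0)
				vcpu_set_state(vm, vcpu, VCPU_IDLE, false);
		}
		return (error);
	}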

Modified: head/sys/amd64/include/vmm_dev.h
==============================================================================
--- head/sys/amd64/include/vmm_dev.h	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/sys/amd64/include/vmm_dev.h	Thu Jun 18 06:00:17 2015	(r284539)
@@ -34,10 +34,22 @@ void	vmmdev_init(void);
 int	vmmdev_cleanup(void);
 #endif
 
-struct vm_memory_segment {
-	vm_paddr_t	gpa;	/* in */
+struct vm_memmap {
+	vm_paddr_t	gpa;
+	int		segid;		/* memory segment */
+	vm_ooffset_t	segoff;		/* offset into memory segment */
+	size_t		len;		/* mmap length */
+	int		prot;		/* RWX */
+	int		flags;
+};
+#define	VM_MEMMAP_F_WIRED	0x01
+#define	VM_MEMMAP_F_IOMMU	0x02
+
+#define	VM_MEMSEG_NAME(m)	((m)->name[0] != '\0' ? (m)->name : NULL)
+struct vm_memseg {
+	int		segid;
 	size_t		len;
-	int		wired;
+	char		name[SPECNAMELEN + 1];
 };
 
 struct vm_register {
@@ -214,10 +226,14 @@ enum {
 	IOCNUM_REINIT = 5,
 
 	/* memory apis */
-	IOCNUM_MAP_MEMORY = 10,
-	IOCNUM_GET_MEMORY_SEG = 11,
+	IOCNUM_MAP_MEMORY = 10,			/* deprecated */
+	IOCNUM_GET_MEMORY_SEG = 11,		/* deprecated */
 	IOCNUM_GET_GPA_PMAP = 12,
 	IOCNUM_GLA2GPA = 13,
+	IOCNUM_ALLOC_MEMSEG = 14,
+	IOCNUM_GET_MEMSEG = 15,
+	IOCNUM_MMAP_MEMSEG = 16,
+	IOCNUM_MMAP_GETNEXT = 17,
 
 	/* register/state accessors */
 	IOCNUM_SET_REGISTER = 20,
@@ -278,10 +294,14 @@ enum {
 	_IOW('v', IOCNUM_SUSPEND, struct vm_suspend)
 #define	VM_REINIT	\
 	_IO('v', IOCNUM_REINIT)
-#define	VM_MAP_MEMORY	\
-	_IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
-#define	VM_GET_MEMORY_SEG \
-	_IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define	VM_ALLOC_MEMSEG	\
+	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg)
+#define	VM_GET_MEMSEG	\
+	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg)
+#define	VM_MMAP_MEMSEG	\
+	_IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap)
+#define	VM_MMAP_GETNEXT	\
+	_IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap)
 #define	VM_SET_REGISTER \
 	_IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
 #define	VM_GET_REGISTER \

Modified: head/sys/amd64/vmm/amd/svm.c
==============================================================================
--- head/sys/amd64/vmm/amd/svm.c	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/sys/amd64/vmm/amd/svm.c	Thu Jun 18 06:00:17 2015	(r284539)
@@ -1477,7 +1477,7 @@ svm_vmexit(struct svm_softc *svm_sc, int
 			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
 			    "reserved bits set: info1(%#lx) info2(%#lx)",
 			    info1, info2);
-		} else if (vm_mem_allocated(svm_sc->vm, info2)) {
+		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->u.paging.gpa = info2;
 			vmexit->u.paging.fault_type = npf_fault_type(info1);

Modified: head/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- head/sys/amd64/vmm/intel/vmx.c	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/sys/amd64/vmm/intel/vmx.c	Thu Jun 18 06:00:17 2015	(r284539)
@@ -2425,7 +2425,7 @@ vmx_exit_process(struct vmx *vmx, int vc
 		 * this must be an instruction that accesses MMIO space.
 		 */
 		gpa = vmcs_gpa();
-		if (vm_mem_allocated(vmx->vm, gpa) ||
+		if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
 		    apic_access_fault(vmx, vcpu, gpa)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->inst_length = 0;

Modified: head/sys/amd64/vmm/io/ppt.c
==============================================================================
--- head/sys/amd64/vmm/io/ppt.c	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/sys/amd64/vmm/io/ppt.c	Thu Jun 18 06:00:17 2015	(r284539)
@@ -76,11 +76,17 @@ struct pptintr_arg {				/* pptintr(pptin
 	uint64_t	msg_data;
 };
 
+struct pptseg {
+	vm_paddr_t	gpa;
+	size_t		len;
+	int		wired;
+};
+
 struct pptdev {
 	device_t	dev;
 	struct vm	*vm;			/* owner of this device */
 	TAILQ_ENTRY(pptdev)	next;
-	struct vm_memory_segment mmio[MAX_MMIOSEGS];
+	struct pptseg mmio[MAX_MMIOSEGS];
 	struct {
 		int	num_msgs;		/* guest state */
 
@@ -207,14 +213,14 @@ static void
 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
 {
 	int i;
-	struct vm_memory_segment *seg;
+	struct pptseg *seg;
 
 	for (i = 0; i < MAX_MMIOSEGS; i++) {
 		seg = &ppt->mmio[i];
 		if (seg->len == 0)
 			continue;
 		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
-		bzero(seg, sizeof(struct vm_memory_segment));
+		bzero(seg, sizeof(struct pptseg));
 	}
 }
 
@@ -324,7 +330,7 @@ ppt_is_mmio(struct vm *vm, vm_paddr_t gp
 {
 	int i;
 	struct pptdev *ppt;
-	struct vm_memory_segment *seg;
+	struct pptseg *seg;
 
 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
 		if (ppt->vm != vm)
@@ -410,7 +416,7 @@ ppt_map_mmio(struct vm *vm, int bus, int
 	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 	int i, error;
-	struct vm_memory_segment *seg;
+	struct pptseg *seg;
 	struct pptdev *ppt;
 
 	ppt = ppt_find(bus, slot, func);

Modified: head/sys/amd64/vmm/vmm.c
==============================================================================
--- head/sys/amd64/vmm/vmm.c	Thu Jun 18 05:58:15 2015	(r284538)
+++ head/sys/amd64/vmm/vmm.c	Thu Jun 18 06:00:17 2015	(r284539)
@@ -119,12 +119,21 @@ struct vcpu {
 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
 
 struct mem_seg {
+	size_t	len;
+	bool	sysmem;
+	struct vm_object *object;
+};
+#define	VM_MAX_MEMSEGS	2
+
+struct mem_map {
 	vm_paddr_t	gpa;
 	size_t		len;
-	boolean_t	wired;
-	vm_object_t	object;
+	vm_ooffset_t	segoff;
+	int		segid;
+	int		prot;
+	int		flags;
 };
-#define	VM_MAX_MEMORY_SEGMENTS	2
+#define	VM_MAX_MEMMAPS	4
 
 /*
  * Initialization:
@@ -150,8 +159,8 @@ struct vm {
 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
-	int		num_mem_segs;		/* (o) guest memory segments */
-	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
+	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
+	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
@@ -222,6 +231,8 @@ TUNABLE_INT("hw.vmm.force_iommu", &vmm_f
 SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0,
     "Force use of I/O MMU even if no passthrough devices were found.");
 
+static void vm_free_memmap(struct vm *vm, int ident);
+static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
 
 #ifdef KTR
@@ -442,7 +453,6 @@ vm_create(const char *name, struct vm **
 
 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	strcpy(vm->name, name);
-	vm->num_mem_segs = 0;
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 
@@ -453,18 +463,9 @@ vm_create(const char *name, struct vm **
 }
 
 static void
-vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
-{
-
-	if (seg->object != NULL)
-		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
-
-	bzero(seg, sizeof(*seg));
-}
-
-static void
 vm_cleanup(struct vm *vm, bool destroy)
 {
+	struct mem_map *mm;
 	int i;
 
 	ppt_unassign_all(vm);
@@ -487,11 +488,23 @@ vm_cleanup(struct vm *vm, bool destroy)
 
 	VMCLEANUP(vm->cookie);
 
-	if (destroy) {
-		for (i = 0; i < vm->num_mem_segs; i++)
-			vm_free_mem_seg(vm, &vm->mem_segs[i]);
+	/*
+	 * System memory is removed from the guest address space only when
+	 * the VM is destroyed. This is because the mapping remains the same
+	 * across VM reset.
+	 *
+	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
+	 * so those mappings are removed on a VM reset.
+	 */
+	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
+		mm = &vm->mem_maps[i];
+		if (destroy || !sysmem_mapping(vm, mm))
+			vm_free_memmap(vm, i);
+	}
 
-		vm->num_mem_segs = 0;
+	if (destroy) {
+		for (i = 0; i < VM_MAX_MEMSEGS; i++)
+			vm_free_memseg(vm, i);
 
 		VMSPACE_FREE(vm->vmspace);
 		vm->vmspace = NULL;
@@ -549,146 +562,243 @@ vm_unmap_mmio(struct vm *vm, vm_paddr_t 
 	return (0);
 }
 
-boolean_t
-vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
+/*
+ * Return 'true' if 'gpa' is allocated in the guest address space.
+ *
+ * This function is called in the context of a running vcpu which acts as
+ * an implicit lock on 'vm->mem_maps[]'.
+ */
+bool
+vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 {
+	struct mem_map *mm;
 	int i;
-	vm_paddr_t gpabase, gpalimit;
 
-	for (i = 0; i < vm->num_mem_segs; i++) {
-		gpabase = vm->mem_segs[i].gpa;
-		gpalimit = gpabase + vm->mem_segs[i].len;
-		if (gpa >= gpabase && gpa < gpalimit)
-			return (TRUE);		/* 'gpa' is regular memory */
+#ifdef INVARIANTS
+	int hostcpu, state;
+	state = vcpu_get_state(vm, vcpuid, &hostcpu);
+	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
+	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
+#endif
+
+	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
+		mm = &vm->mem_maps[i];
+		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
+			return (true);		/* 'gpa' is sysmem or devmem */
 	}
 
 	if (ppt_is_mmio(vm, gpa))
-		return (TRUE);			/* 'gpa' is pci passthru mmio */
+		return (true);			/* 'gpa' is pci passthru mmio */
 
-	return (FALSE);
+	return (false);
 }
 
 int
-vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 {
-	int available, allocated;
 	struct mem_seg *seg;
-	vm_object_t object;
-	vm_paddr_t g;
+	vm_object_t obj;
 
-	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
-	
-	available = allocated = 0;
-	g = gpa;
-	while (g < gpa + len) {
-		if (vm_mem_allocated(vm, g))
-			allocated++;
-		else
-			available++;
 
-		g += PAGE_SIZE;
-	}
-
-	/*
-	 * If there are some allocated and some available pages in the address
-	 * range then it is an error.
-	 */
-	if (allocated && available)
+	if (len == 0 || (len & PAGE_MASK))
 		return (EINVAL);
 
-	/*
-	 * If the entire address range being requested has already been
-	 * allocated then there isn't anything more to do.
-	 */
-	if (allocated && available == 0)
-		return (0);
-
-	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
-		return (E2BIG);
-
-	seg = &vm->mem_segs[vm->num_mem_segs];
+	seg = &vm->mem_segs[ident];
+	if (seg->object != NULL) {
+		if (seg->len == len && seg->sysmem == sysmem)
+			return (EEXIST);
+		else
+			return (EINVAL);
+	}
 
-	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
+	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
+	if (obj == NULL)
 		return (ENOMEM);
 
-	seg->gpa = gpa;
 	seg->len = len;
-	seg->object = object;
-	seg->wired = FALSE;
+	seg->object = obj;
+	seg->sysmem = sysmem;
+	return (0);
+}
 
-	vm->num_mem_segs++;
+int
+vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
+    vm_object_t *objptr)
+{
+	struct mem_seg *seg;
+
+	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
+		return (EINVAL);
 
+	seg = &vm->mem_segs[ident];
+	if (len)
+		*len = seg->len;
+	if (sysmem)
+		*sysmem = seg->sysmem;
+	if (objptr)
+		*objptr = seg->object;
 	return (0);
 }
 
-static vm_paddr_t
-vm_maxmem(struct vm *vm)
+void
+vm_free_memseg(struct vm *vm, int ident)
 {
-	int i;
-	vm_paddr_t gpa, maxmem;
+	struct mem_seg *seg;
 
-	maxmem = 0;
-	for (i = 0; i < vm->num_mem_segs; i++) {
-		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
-		if (gpa > maxmem)
-			maxmem = gpa;
+	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
+	    ("%s: invalid memseg ident %d", __func__, ident));
+
+	seg = &vm->mem_segs[ident];
+	if (seg->object != NULL) {
+		vm_object_deallocate(seg->object);
+		bzero(seg, sizeof(struct mem_seg));
 	}
-	return (maxmem);
 }
 
-static void
-vm_gpa_unwire(struct vm *vm)
+int
+vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
+    size_t len, int prot, int flags)
 {
-	int i, rv;
 	struct mem_seg *seg;
+	struct mem_map *m, *map;
+	vm_ooffset_t last;
+	int i, error;
 
-	for (i = 0; i < vm->num_mem_segs; i++) {
-		seg = &vm->mem_segs[i];
-		if (!seg->wired)
-			continue;
+	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
+		return (EINVAL);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


