Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 1 Feb 2016 14:56:11 +0000 (UTC)
From:      Peter Grehan <grehan@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject:   svn commit: r295124 - in stable/10: lib/libvmmapi share/examples/bhyve sys/amd64/include sys/amd64/vmm sys/amd64/vmm/amd sys/amd64/vmm/intel sys/amd64/vmm/io sys/sys usr.sbin/bhyve usr.sbin/bhyvect...
Message-ID:  <201602011456.u11EuBDE016423@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: grehan
Date: Mon Feb  1 14:56:11 2016
New Revision: 295124
URL: https://svnweb.freebsd.org/changeset/base/295124

Log:
  MFC r284539, r284630, r284688, r284877, r285217, r285218,
      r286837, r286838, r288470, r288522, r288524, r288826,
      r289001
  
  Pull in bhyve bug fixes and changes to allow UEFI booting.
  This provides Windows support.
  
  Tested on Intel and AMD with:
    - Arch Linux i386+amd64 (kernel 4.3.3)
    - Ubuntu 15.10 server 64-bit
    - FreeBSD-CURRENT/amd64 20160127 snap
    - FreeBSD 10.2 i386+amd64
    - OpenBSD 5.8 i386+amd64
    - SmartOS latest
    - Windows 10 build 1511'
  
  Huge thanks to Yamagi Burmeister who submitted the patch
  and did the majority of the testing.
  
  r284539 - bootrom mem allocation support
  r284630 - Add SO_REUSEADDR when starting debug port
  r284688 - Fix a regression in "movs" emulation
  r284877 - verify_gla() non-zero segment base fix
  r285217 - Always assert DCD and DSR in the uart
  r285218 - devmem nodes moved to /dev/vmm.io/
  r286837 - Add define for SATA Check-Power-Mode
  r286838 - Add simple (no-op) SATA cmd emulations
  r288470 - Increase virtio-blk indirect descs
  r288522 - Firmware guest query interface
  r288524 - Fix post-test typo
  r288826 - Clean up SATA unimplemented cmd msg
  r289001 - Add -l option to specify userboot path
  
  Submitted by:	Yamagi Burmeister
  Approved by:	re (kib)

Added:
  stable/10/usr.sbin/bhyve/bootrom.c
     - copied unchanged from r284539, head/usr.sbin/bhyve/bootrom.c
  stable/10/usr.sbin/bhyve/bootrom.h
     - copied unchanged from r284539, head/usr.sbin/bhyve/bootrom.h
  stable/10/usr.sbin/bhyve/fwctl.c
     - copied, changed from r288522, head/usr.sbin/bhyve/fwctl.c
  stable/10/usr.sbin/bhyve/fwctl.h
     - copied unchanged from r288522, head/usr.sbin/bhyve/fwctl.h
Modified:
  stable/10/lib/libvmmapi/vmmapi.c
  stable/10/lib/libvmmapi/vmmapi.h
  stable/10/share/examples/bhyve/vmrun.sh
  stable/10/sys/amd64/include/vmm.h
  stable/10/sys/amd64/include/vmm_dev.h
  stable/10/sys/amd64/vmm/amd/svm.c
  stable/10/sys/amd64/vmm/intel/vmx.c
  stable/10/sys/amd64/vmm/io/ppt.c
  stable/10/sys/amd64/vmm/vmm.c
  stable/10/sys/amd64/vmm/vmm_dev.c
  stable/10/sys/amd64/vmm/vmm_instruction_emul.c
  stable/10/sys/amd64/vmm/vmm_mem.c
  stable/10/sys/amd64/vmm/vmm_mem.h
  stable/10/sys/sys/ata.h
  stable/10/usr.sbin/bhyve/Makefile
  stable/10/usr.sbin/bhyve/bhyve.8
  stable/10/usr.sbin/bhyve/bhyverun.c
  stable/10/usr.sbin/bhyve/dbgport.c
  stable/10/usr.sbin/bhyve/pci_ahci.c
  stable/10/usr.sbin/bhyve/pci_lpc.c
  stable/10/usr.sbin/bhyve/pci_lpc.h
  stable/10/usr.sbin/bhyve/pci_passthru.c
  stable/10/usr.sbin/bhyve/pci_virtio_net.c
  stable/10/usr.sbin/bhyve/uart_emul.c
  stable/10/usr.sbin/bhyvectl/bhyvectl.c
  stable/10/usr.sbin/bhyveload/bhyveload.8
  stable/10/usr.sbin/bhyveload/bhyveload.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/lib/libvmmapi/vmmapi.c
==============================================================================
--- stable/10/lib/libvmmapi/vmmapi.c	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/lib/libvmmapi/vmmapi.c	Mon Feb  1 14:56:11 2016	(r295124)
@@ -58,15 +58,23 @@ __FBSDID("$FreeBSD$");
 #define	MB	(1024 * 1024UL)
 #define	GB	(1024 * 1024 * 1024UL)
 
+/*
+ * Size of the guard region before and after the virtual address space
+ * mapping the guest physical memory. This must be a multiple of the
+ * superpage size for performance reasons.
+ */
+#define	VM_MMAP_GUARD_SIZE	(4 * MB)
+
+#define	PROT_RW		(PROT_READ | PROT_WRITE)
+#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)
+
 struct vmctx {
 	int	fd;
 	uint32_t lowmem_limit;
-	enum vm_mmap_style vms;
 	int	memflags;
 	size_t	lowmem;
-	char	*lowmem_addr;
 	size_t	highmem;
-	char	*highmem_addr;
+	char	*baseaddr;
 	char	*name;
 };
 
@@ -157,22 +165,6 @@ vm_parse_memsize(const char *optarg, siz
 	return (error);
 }
 
-int
-vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
-		  int *wired)
-{
-	int error;
-	struct vm_memory_segment seg;
-
-	bzero(&seg, sizeof(seg));
-	seg.gpa = gpa;
-	error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
-	*ret_len = seg.len;
-	if (wired != NULL)
-		*wired = seg.wired;
-	return (error);
-}
-
 uint32_t
 vm_get_lowmem_limit(struct vmctx *ctx)
 {
@@ -194,39 +186,184 @@ vm_set_memflags(struct vmctx *ctx, int f
 	ctx->memflags = flags;
 }
 
+int
+vm_get_memflags(struct vmctx *ctx)
+{
+
+	return (ctx->memflags);
+}
+
+/*
+ * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
+ */
+int
+vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
+    size_t len, int prot)
+{
+	struct vm_memmap memmap;
+	int error, flags;
+
+	memmap.gpa = gpa;
+	memmap.segid = segid;
+	memmap.segoff = off;
+	memmap.len = len;
+	memmap.prot = prot;
+	memmap.flags = 0;
+
+	if (ctx->memflags & VM_MEM_F_WIRED)
+		memmap.flags |= VM_MEMMAP_F_WIRED;
+
+	/*
+	 * If this mapping already exists then don't create it again. This
+	 * is the common case for SYSMEM mappings created by bhyveload(8).
+	 */
+	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
+	if (error == 0 && gpa == memmap.gpa) {
+		if (segid != memmap.segid || off != memmap.segoff ||
+		    prot != memmap.prot || flags != memmap.flags) {
+			errno = EEXIST;
+			return (-1);
+		} else {
+			return (0);
+		}
+	}
+
+	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
+	return (error);
+}
+
+int
+vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
+    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
+{
+	struct vm_memmap memmap;
+	int error;
+
+	bzero(&memmap, sizeof(struct vm_memmap));
+	memmap.gpa = *gpa;
+	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
+	if (error == 0) {
+		*gpa = memmap.gpa;
+		*segid = memmap.segid;
+		*segoff = memmap.segoff;
+		*len = memmap.len;
+		*prot = memmap.prot;
+		*flags = memmap.flags;
+	}
+	return (error);
+}
+
+/*
+ * Return 0 if the segments are identical and non-zero otherwise.
+ *
+ * This is slightly complicated by the fact that only device memory segments
+ * are named.
+ */
 static int
-setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr)
+cmpseg(size_t len, const char *str, size_t len2, const char *str2)
 {
-	int error, mmap_flags;
-	struct vm_memory_segment seg;
+
+	if (len == len2) {
+		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
+			return (0);
+	}
+	return (-1);
+}
+
+static int
+vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
+{
+	struct vm_memseg memseg;
+	size_t n;
+	int error;
 
 	/*
-	 * Create and optionally map 'len' bytes of memory at guest
-	 * physical address 'gpa'
+	 * If the memory segment has already been created then just return.
+	 * This is the usual case for the SYSMEM segment created by userspace
+	 * loaders like bhyveload(8).
 	 */
-	bzero(&seg, sizeof(seg));
-	seg.gpa = gpa;
-	seg.len = len;
-	error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
-	if (error == 0 && addr != NULL) {
-		mmap_flags = MAP_SHARED;
-		if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
-			mmap_flags |= MAP_NOCORE;
-		*addr = mmap(NULL, len, PROT_READ | PROT_WRITE, mmap_flags,
-		    ctx->fd, gpa);
+	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
+	    sizeof(memseg.name));
+	if (error)
+		return (error);
+
+	if (memseg.len != 0) {
+		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
+			errno = EINVAL;
+			return (-1);
+		} else {
+			return (0);
+		}
+	}
+
+	bzero(&memseg, sizeof(struct vm_memseg));
+	memseg.segid = segid;
+	memseg.len = len;
+	if (name != NULL) {
+		n = strlcpy(memseg.name, name, sizeof(memseg.name));
+		if (n >= sizeof(memseg.name)) {
+			errno = ENAMETOOLONG;
+			return (-1);
+		}
+	}
+
+	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
+	return (error);
+}
+
+int
+vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
+    size_t bufsize)
+{
+	struct vm_memseg memseg;
+	size_t n;
+	int error;
+
+	memseg.segid = segid;
+	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
+	if (error == 0) {
+		*lenp = memseg.len;
+		n = strlcpy(namebuf, memseg.name, bufsize);
+		if (n >= bufsize) {
+			errno = ENAMETOOLONG;
+			error = -1;
+		}
 	}
 	return (error);
 }
 
+static int
+setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
+{
+	char *ptr;
+	int error, flags;
+
+	/* Map 'len' bytes starting at 'gpa' in the guest address space */
+	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
+	if (error)
+		return (error);
+
+	flags = MAP_SHARED | MAP_FIXED;
+	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+		flags |= MAP_NOCORE;
+
+	/* mmap into the process address space on the host */
+	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
+	if (ptr == MAP_FAILED)
+		return (-1);
+
+	return (0);
+}
+
 int
 vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
 {
-	char **addr;
-	int error;
+	size_t objsize, len;
+	vm_paddr_t gpa;
+	char *baseaddr, *ptr;
+	int error, flags;
 
-	/* XXX VM_MMAP_SPARSE not implemented yet */
-	assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL);
-	ctx->vms = vms;
+	assert(vms == VM_MMAP_ALL);
 
 	/*
 	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then
@@ -234,43 +371,69 @@ vm_setup_memory(struct vmctx *ctx, size_
 	 */
 	if (memsize > ctx->lowmem_limit) {
 		ctx->lowmem = ctx->lowmem_limit;
-		ctx->highmem = memsize - ctx->lowmem;
+		ctx->highmem = memsize - ctx->lowmem_limit;
+		objsize = 4*GB + ctx->highmem;
 	} else {
 		ctx->lowmem = memsize;
 		ctx->highmem = 0;
+		objsize = ctx->lowmem;
 	}
 
-	if (ctx->lowmem > 0) {
-		addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL;
-		error = setup_memory_segment(ctx, 0, ctx->lowmem, addr);
+	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
+	if (error)
+		return (error);
+
+	/*
+	 * Stake out a contiguous region covering the guest physical memory
+	 * and the adjoining guard regions.
+	 */
+	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
+	flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+	ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0);
+	if (ptr == MAP_FAILED)
+		return (-1);
+
+	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
+	if (ctx->highmem > 0) {
+		gpa = 4*GB;
+		len = ctx->highmem;
+		error = setup_memory_segment(ctx, gpa, len, baseaddr);
 		if (error)
 			return (error);
 	}
 
-	if (ctx->highmem > 0) {
-		addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL;
-		error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr);
+	if (ctx->lowmem > 0) {
+		gpa = 0;
+		len = ctx->lowmem;
+		error = setup_memory_segment(ctx, gpa, len, baseaddr);
 		if (error)
 			return (error);
 	}
 
+	ctx->baseaddr = baseaddr;
+
 	return (0);
 }
 
+/*
+ * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
+ * the lowmem or highmem regions.
+ *
+ * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
+ * The instruction emulation code depends on this behavior.
+ */
 void *
 vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
 {
 
-	/* XXX VM_MMAP_SPARSE not implemented yet */
-	assert(ctx->vms == VM_MMAP_ALL);
-
-	if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem)
-		return ((void *)(ctx->lowmem_addr + gaddr));
+	if (ctx->lowmem > 0) {
+		if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem)
+			return (ctx->baseaddr + gaddr);
+	}
 
-	if (gaddr >= 4*GB) {
-		gaddr -= 4*GB;
-		if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem)
-			return ((void *)(ctx->highmem_addr + gaddr));
+	if (ctx->highmem > 0) {
+		if (gaddr >= 4*GB && gaddr + len <= 4*GB + ctx->highmem)
+			return (ctx->baseaddr + gaddr);
 	}
 
 	return (NULL);
@@ -290,6 +453,56 @@ vm_get_highmem_size(struct vmctx *ctx)
 	return (ctx->highmem);
 }
 
+void *
+vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
+{
+	char pathname[MAXPATHLEN];
+	size_t len2;
+	char *base, *ptr;
+	int fd, error, flags;
+
+	fd = -1;
+	ptr = MAP_FAILED;
+	if (name == NULL || strlen(name) == 0) {
+		errno = EINVAL;
+		goto done;
+	}
+
+	error = vm_alloc_memseg(ctx, segid, len, name);
+	if (error)
+		goto done;
+
+	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
+	strlcat(pathname, ctx->name, sizeof(pathname));
+	strlcat(pathname, ".", sizeof(pathname));
+	strlcat(pathname, name, sizeof(pathname));
+
+	fd = open(pathname, O_RDWR);
+	if (fd < 0)
+		goto done;
+
+	/*
+	 * Stake out a contiguous region covering the device memory and the
+	 * adjoining guard regions.
+	 */
+	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
+	flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+	base = mmap(NULL, len2, PROT_NONE, flags, -1, 0);
+	if (base == MAP_FAILED)
+		goto done;
+
+	flags = MAP_SHARED | MAP_FIXED;
+	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+		flags |= MAP_NOCORE;
+
+	/* mmap the devmem region in the host address space */
+	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
+done:
+	if (fd >= 0)
+		close(fd);
+	return (ptr);
+}
+
 int
 vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
 	    uint64_t base, uint32_t limit, uint32_t access)

Modified: stable/10/lib/libvmmapi/vmmapi.h
==============================================================================
--- stable/10/lib/libvmmapi/vmmapi.h	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/lib/libvmmapi/vmmapi.h	Mon Feb  1 14:56:11 2016	(r295124)
@@ -36,7 +36,7 @@
  * API version for out-of-tree consumers like grub-bhyve for making compile
  * time decisions.
  */
-#define	VMMAPI_VERSION	0101	/* 2 digit major followed by 2 digit minor */
+#define	VMMAPI_VERSION	0102	/* 2 digit major followed by 2 digit minor */
 
 struct iovec;
 struct vmctx;
@@ -52,14 +52,59 @@ enum vm_mmap_style {
 	VM_MMAP_SPARSE,		/* mappings created on-demand */
 };
 
+/*
+ * 'flags' value passed to 'vm_set_memflags()'.
+ */
 #define	VM_MEM_F_INCORE	0x01	/* include guest memory in core file */
+#define	VM_MEM_F_WIRED	0x02	/* guest memory is wired */
+
+/*
+ * Identifiers for memory segments:
+ * - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
+ * - the remaining identifiers can be used to create devmem segments.
+ */
+enum {
+	VM_SYSMEM,
+	VM_BOOTROM,
+	VM_FRAMEBUFFER,
+};
+
+/*
+ * Get the length and name of the memory segment identified by 'segid'.
+ * Note that system memory segments are identified with a nul name.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+int	vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
+	    size_t namesiz);
+
+/*
+ * Iterate over the guest address space. This function finds an address range
+ * that starts at an address >= *gpa.
+ *
+ * Returns 0 if the next address range was found and non-zero otherwise.
+ */
+int	vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
+	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
+/*
+ * Create a device memory segment identified by 'segid'.
+ *
+ * Returns a pointer to the memory segment on success and MAP_FAILED otherwise.
+ */
+void	*vm_create_devmem(struct vmctx *ctx, int segid, const char *name,
+	    size_t len);
+
+/*
+ * Map the memory segment identified by 'segid' into the guest address space
+ * at [gpa,gpa+len) with protection 'prot'.
+ */
+int	vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
+	    vm_ooffset_t segoff, size_t len, int prot);
 
 int	vm_create(const char *name);
 struct vmctx *vm_open(const char *name);
 void	vm_destroy(struct vmctx *ctx);
 int	vm_parse_memsize(const char *optarg, size_t *memsize);
-int	vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
-			  int *wired);
 int	vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
 void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
 int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
@@ -68,6 +113,7 @@ int	vm_gla2gpa(struct vmctx *, int vcpui
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
 void	vm_set_memflags(struct vmctx *ctx, int flags);
+int	vm_get_memflags(struct vmctx *ctx);
 size_t	vm_get_lowmem_size(struct vmctx *ctx);
 size_t	vm_get_highmem_size(struct vmctx *ctx);
 int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,

Modified: stable/10/share/examples/bhyve/vmrun.sh
==============================================================================
--- stable/10/share/examples/bhyve/vmrun.sh	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/share/examples/bhyve/vmrun.sh	Mon Feb  1 14:56:11 2016	(r295124)
@@ -48,8 +48,8 @@ usage() {
 
 	echo "Usage: vmrun.sh [-ahi] [-c <CPUs>] [-C <console>] [-d <disk file>]"
 	echo "                [-e <name=value>] [-g <gdbport> ] [-H <directory>]"
-	echo "                [-I <location of installation iso>] [-m <memsize>]"
-	echo "                [-t <tapdev>] <vmname>"
+	echo "                [-I <location of installation iso>] [-l <loader>]"
+	echo "                [-m <memsize>] [-t <tapdev>] <vmname>"
 	echo ""
 	echo "       -h: display this help message"
 	echo "       -a: force memory mapped local APIC access"
@@ -61,6 +61,7 @@ usage() {
 	echo "       -H: host filesystem to export to the loader"
 	echo "       -i: force boot of the Installation CDROM image"
 	echo "       -I: Installation CDROM image location (default is ${DEFAULT_ISOFILE})"
+	echo "       -l: the OS loader to use (default is /boot/userboot.so)"
 	echo "       -m: memory size (default is ${DEFAULT_MEMSIZE})"
 	echo "       -p: pass-through a host PCI device at bus/slot/func (e.g. 10/0/0)"
 	echo "       -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)"
@@ -87,15 +88,15 @@ console=${DEFAULT_CONSOLE}
 cpus=${DEFAULT_CPUS}
 tap_total=0
 disk_total=0
-apic_opt=""
 gdbport=0
 loader_opt=""
+bhyverun_opt="-H -A -P"
 pass_total=0
 
-while getopts ac:C:d:e:g:hH:iI:m:p:t: c ; do
+while getopts ac:C:d:e:g:hH:iI:l:m:p:t: c ; do
 	case $c in
 	a)
-		apic_opt="-a"
+		bhyverun_opt="${bhyverun_opt} -a"
 		;;
 	c)
 		cpus=${OPTARG}
@@ -125,6 +126,9 @@ while getopts ac:C:d:e:g:hH:iI:m:p:t: c 
 	I)
 		isofile=${OPTARG}
 		;;
+	l)
+		loader_opt="${loader_opt} -l ${OPTARG}"
+		;;
 	m)
 		memsize=${OPTARG}
 		;;
@@ -163,6 +167,12 @@ if [ -n "${host_base}" ]; then
 	loader_opt="${loader_opt} -h ${host_base}"
 fi
 
+# If PCI passthru devices are configured then guest memory must be wired
+if [ ${pass_total} -gt 0 ]; then
+	loader_opt="${loader_opt} -S"
+	bhyverun_opt="${bhyverun_opt} -S"
+fi
+
 make_and_check_diskdev()
 {
     local virtio_diskdev="$1"
@@ -263,7 +273,7 @@ while [ 1 ]; do
 	    i=$(($i + 1))
         done
 
-	${FBSDRUN} -c ${cpus} -m ${memsize} ${apic_opt} -A -H -P	\
+	${FBSDRUN} -c ${cpus} -m ${memsize} ${bhyverun_opt}		\
 		-g ${gdbport}						\
 		-s 0:0,hostbridge					\
 		-s 1:0,lpc						\

Modified: stable/10/sys/amd64/include/vmm.h
==============================================================================
--- stable/10/sys/amd64/include/vmm.h	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/sys/amd64/include/vmm.h	Mon Feb  1 14:56:11 2016	(r295124)
@@ -108,7 +108,6 @@ enum x2apic_state {
 
 struct vm;
 struct vm_exception;
-struct vm_memory_segment;
 struct seg_desc;
 struct vm_exit;
 struct vm_run;
@@ -175,17 +174,33 @@ int vm_create(const char *name, struct v
 void vm_destroy(struct vm *vm);
 int vm_reinit(struct vm *vm);
 const char *vm_name(struct vm *vm);
-int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+
+/*
+ * APIs that modify the guest memory map require all vcpus to be frozen.
+ */
+int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
+    size_t len, int prot, int flags);
+int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
+void vm_free_memseg(struct vm *vm, int ident);
 int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
-void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
-		  void **cookie);
+int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
+
+/*
+ * APIs that inspect the guest memory map require only a *single* vcpu to
+ * be frozen. This acts like a read lock on the guest memory map since any
+ * modification requires *all* vcpus to be frozen.
+ */
+int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
+    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
+int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
+    struct vm_object **objptr);
+void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len,
+    int prot, void **cookie);
 void vm_gpa_release(void *cookie);
-int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
-	      struct vm_memory_segment *seg);
-int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
-		  vm_offset_t *offset, struct vm_object **object);
-boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
+bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa);
+
 int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
 int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
 int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
@@ -302,8 +317,6 @@ vcpu_should_yield(struct vm *vm, int vcp
 void *vcpu_stats(struct vm *vm, int vcpu);
 void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
 struct vmspace *vm_get_vmspace(struct vm *vm);
-int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
-int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
 struct vatpic *vm_atpic(struct vm *vm);
 struct vatpit *vm_atpit(struct vm *vm);
 struct vpmtmr *vm_pmtmr(struct vm *vm);

Modified: stable/10/sys/amd64/include/vmm_dev.h
==============================================================================
--- stable/10/sys/amd64/include/vmm_dev.h	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/sys/amd64/include/vmm_dev.h	Mon Feb  1 14:56:11 2016	(r295124)
@@ -34,10 +34,22 @@ void	vmmdev_init(void);
 int	vmmdev_cleanup(void);
 #endif
 
-struct vm_memory_segment {
-	vm_paddr_t	gpa;	/* in */
+struct vm_memmap {
+	vm_paddr_t	gpa;
+	int		segid;		/* memory segment */
+	vm_ooffset_t	segoff;		/* offset into memory segment */
+	size_t		len;		/* mmap length */
+	int		prot;		/* RWX */
+	int		flags;
+};
+#define	VM_MEMMAP_F_WIRED	0x01
+#define	VM_MEMMAP_F_IOMMU	0x02
+
+#define	VM_MEMSEG_NAME(m)	((m)->name[0] != '\0' ? (m)->name : NULL)
+struct vm_memseg {
+	int		segid;
 	size_t		len;
-	int		wired;
+	char		name[SPECNAMELEN + 1];
 };
 
 struct vm_register {
@@ -214,10 +226,14 @@ enum {
 	IOCNUM_REINIT = 5,
 
 	/* memory apis */
-	IOCNUM_MAP_MEMORY = 10,
-	IOCNUM_GET_MEMORY_SEG = 11,
+	IOCNUM_MAP_MEMORY = 10,			/* deprecated */
+	IOCNUM_GET_MEMORY_SEG = 11,		/* deprecated */
 	IOCNUM_GET_GPA_PMAP = 12,
 	IOCNUM_GLA2GPA = 13,
+	IOCNUM_ALLOC_MEMSEG = 14,
+	IOCNUM_GET_MEMSEG = 15,
+	IOCNUM_MMAP_MEMSEG = 16,
+	IOCNUM_MMAP_GETNEXT = 17,
 
 	/* register/state accessors */
 	IOCNUM_SET_REGISTER = 20,
@@ -278,10 +294,14 @@ enum {
 	_IOW('v', IOCNUM_SUSPEND, struct vm_suspend)
 #define	VM_REINIT	\
 	_IO('v', IOCNUM_REINIT)
-#define	VM_MAP_MEMORY	\
-	_IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
-#define	VM_GET_MEMORY_SEG \
-	_IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define	VM_ALLOC_MEMSEG	\
+	_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg)
+#define	VM_GET_MEMSEG	\
+	_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg)
+#define	VM_MMAP_MEMSEG	\
+	_IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap)
+#define	VM_MMAP_GETNEXT	\
+	_IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap)
 #define	VM_SET_REGISTER \
 	_IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
 #define	VM_GET_REGISTER \

Modified: stable/10/sys/amd64/vmm/amd/svm.c
==============================================================================
--- stable/10/sys/amd64/vmm/amd/svm.c	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/sys/amd64/vmm/amd/svm.c	Mon Feb  1 14:56:11 2016	(r295124)
@@ -1477,7 +1477,7 @@ svm_vmexit(struct svm_softc *svm_sc, int
 			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
 			    "reserved bits set: info1(%#lx) info2(%#lx)",
 			    info1, info2);
-		} else if (vm_mem_allocated(svm_sc->vm, info2)) {
+		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->u.paging.gpa = info2;
 			vmexit->u.paging.fault_type = npf_fault_type(info1);

Modified: stable/10/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- stable/10/sys/amd64/vmm/intel/vmx.c	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/sys/amd64/vmm/intel/vmx.c	Mon Feb  1 14:56:11 2016	(r295124)
@@ -2426,7 +2426,7 @@ vmx_exit_process(struct vmx *vmx, int vc
 		 * this must be an instruction that accesses MMIO space.
 		 */
 		gpa = vmcs_gpa();
-		if (vm_mem_allocated(vmx->vm, gpa) ||
+		if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
 		    apic_access_fault(vmx, vcpu, gpa)) {
 			vmexit->exitcode = VM_EXITCODE_PAGING;
 			vmexit->inst_length = 0;

Modified: stable/10/sys/amd64/vmm/io/ppt.c
==============================================================================
--- stable/10/sys/amd64/vmm/io/ppt.c	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/sys/amd64/vmm/io/ppt.c	Mon Feb  1 14:56:11 2016	(r295124)
@@ -76,11 +76,17 @@ struct pptintr_arg {				/* pptintr(pptin
 	uint64_t	msg_data;
 };
 
+struct pptseg {
+	vm_paddr_t	gpa;
+	size_t		len;
+	int		wired;
+};
+
 struct pptdev {
 	device_t	dev;
 	struct vm	*vm;			/* owner of this device */
 	TAILQ_ENTRY(pptdev)	next;
-	struct vm_memory_segment mmio[MAX_MMIOSEGS];
+	struct pptseg mmio[MAX_MMIOSEGS];
 	struct {
 		int	num_msgs;		/* guest state */
 
@@ -207,14 +213,14 @@ static void
 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
 {
 	int i;
-	struct vm_memory_segment *seg;
+	struct pptseg *seg;
 
 	for (i = 0; i < MAX_MMIOSEGS; i++) {
 		seg = &ppt->mmio[i];
 		if (seg->len == 0)
 			continue;
 		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
-		bzero(seg, sizeof(struct vm_memory_segment));
+		bzero(seg, sizeof(struct pptseg));
 	}
 }
 
@@ -324,7 +330,7 @@ ppt_is_mmio(struct vm *vm, vm_paddr_t gp
 {
 	int i;
 	struct pptdev *ppt;
-	struct vm_memory_segment *seg;
+	struct pptseg *seg;
 
 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
 		if (ppt->vm != vm)
@@ -410,7 +416,7 @@ ppt_map_mmio(struct vm *vm, int bus, int
 	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
 	int i, error;
-	struct vm_memory_segment *seg;
+	struct pptseg *seg;
 	struct pptdev *ppt;
 
 	ppt = ppt_find(bus, slot, func);

Modified: stable/10/sys/amd64/vmm/vmm.c
==============================================================================
--- stable/10/sys/amd64/vmm/vmm.c	Mon Feb  1 14:28:58 2016	(r295123)
+++ stable/10/sys/amd64/vmm/vmm.c	Mon Feb  1 14:56:11 2016	(r295124)
@@ -120,12 +120,21 @@ struct vcpu {
 #define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
 
 struct mem_seg {
+	size_t	len;
+	bool	sysmem;
+	struct vm_object *object;
+};
+#define	VM_MAX_MEMSEGS	2
+
+struct mem_map {
 	vm_paddr_t	gpa;
 	size_t		len;
-	boolean_t	wired;
-	vm_object_t	object;
+	vm_ooffset_t	segoff;
+	int		segid;
+	int		prot;
+	int		flags;
 };
-#define	VM_MAX_MEMORY_SEGMENTS	2
+#define	VM_MAX_MEMMAPS	4
 
 /*
  * Initialization:
@@ -151,8 +160,8 @@ struct vm {
 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
-	int		num_mem_segs;		/* (o) guest memory segments */
-	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
+	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
+	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
@@ -224,6 +233,8 @@ TUNABLE_INT("hw.vmm.force_iommu", &vmm_f
 SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0,
     "Force use of I/O MMU even if no passthrough devices were found.");
 
+static void vm_free_memmap(struct vm *vm, int ident);
+static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
 
 #ifdef KTR
@@ -444,7 +455,6 @@ vm_create(const char *name, struct vm **
 
 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
 	strcpy(vm->name, name);
-	vm->num_mem_segs = 0;
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 
@@ -455,18 +465,9 @@ vm_create(const char *name, struct vm **
 }
 
 static void
-vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
-{
-
-	if (seg->object != NULL)
-		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
-
-	bzero(seg, sizeof(*seg));
-}
-
-static void
 vm_cleanup(struct vm *vm, bool destroy)
 {
+	struct mem_map *mm;
 	int i;
 
 	ppt_unassign_all(vm);
@@ -489,11 +490,23 @@ vm_cleanup(struct vm *vm, bool destroy)
 
 	VMCLEANUP(vm->cookie);
 
-	if (destroy) {
-		for (i = 0; i < vm->num_mem_segs; i++)
-			vm_free_mem_seg(vm, &vm->mem_segs[i]);
+	/*
+	 * System memory is removed from the guest address space only when
+	 * the VM is destroyed. This is because the mapping remains the same
+	 * across VM reset.
+	 *
+	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
+	 * so those mappings are removed on a VM reset.
+	 */
+	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
+		mm = &vm->mem_maps[i];
+		if (destroy || !sysmem_mapping(vm, mm))
+			vm_free_memmap(vm, i);
+	}
 
-		vm->num_mem_segs = 0;
+	if (destroy) {
+		for (i = 0; i < VM_MAX_MEMSEGS; i++)
+			vm_free_memseg(vm, i);
 
 		VMSPACE_FREE(vm->vmspace);
 		vm->vmspace = NULL;
@@ -551,146 +564,243 @@ vm_unmap_mmio(struct vm *vm, vm_paddr_t 
 	return (0);
 }
 
-boolean_t
-vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
+/*
+ * Return 'true' if 'gpa' is allocated in the guest address space.
+ *
+ * This function is called in the context of a running vcpu which acts as
+ * an implicit lock on 'vm->mem_maps[]'.
+ */
+bool
+vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
 {
+	struct mem_map *mm;
 	int i;
-	vm_paddr_t gpabase, gpalimit;
 
-	for (i = 0; i < vm->num_mem_segs; i++) {
-		gpabase = vm->mem_segs[i].gpa;
-		gpalimit = gpabase + vm->mem_segs[i].len;
-		if (gpa >= gpabase && gpa < gpalimit)
-			return (TRUE);		/* 'gpa' is regular memory */
+#ifdef INVARIANTS
+	int hostcpu, state;
+	state = vcpu_get_state(vm, vcpuid, &hostcpu);
+	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
+	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
+#endif
+
+	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
+		mm = &vm->mem_maps[i];
+		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
+			return (true);		/* 'gpa' is sysmem or devmem */
 	}
 
 	if (ppt_is_mmio(vm, gpa))
-		return (TRUE);			/* 'gpa' is pci passthru mmio */
+		return (true);			/* 'gpa' is pci passthru mmio */
 
-	return (FALSE);
+	return (false);
 }
 
 int
-vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 {
-	int available, allocated;
 	struct mem_seg *seg;
-	vm_object_t object;
-	vm_paddr_t g;
+	vm_object_t obj;
 
-	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
-	
-	available = allocated = 0;
-	g = gpa;
-	while (g < gpa + len) {
-		if (vm_mem_allocated(vm, g))
-			allocated++;
-		else
-			available++;
 
-		g += PAGE_SIZE;
-	}
-
-	/*
-	 * If there are some allocated and some available pages in the address
-	 * range then it is an error.
-	 */
-	if (allocated && available)
+	if (len == 0 || (len & PAGE_MASK))
 		return (EINVAL);
 
-	/*
-	 * If the entire address range being requested has already been
-	 * allocated then there isn't anything more to do.
-	 */
-	if (allocated && available == 0)
-		return (0);
-
-	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
-		return (E2BIG);
-
-	seg = &vm->mem_segs[vm->num_mem_segs];
+	seg = &vm->mem_segs[ident];
+	if (seg->object != NULL) {
+		if (seg->len == len && seg->sysmem == sysmem)
+			return (EEXIST);
+		else
+			return (EINVAL);
+	}
 
-	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
+	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
+	if (obj == NULL)
 		return (ENOMEM);
 
-	seg->gpa = gpa;
 	seg->len = len;
-	seg->object = object;
-	seg->wired = FALSE;
+	seg->object = obj;
+	seg->sysmem = sysmem;
+	return (0);
+}
 
-	vm->num_mem_segs++;
+int
+vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
+    vm_object_t *objptr)
+{
+	struct mem_seg *seg;
+
+	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
+		return (EINVAL);
 
+	seg = &vm->mem_segs[ident];
+	if (len)
+		*len = seg->len;
+	if (sysmem)
+		*sysmem = seg->sysmem;
+	if (objptr)
+		*objptr = seg->object;
 	return (0);
 }
 
-static vm_paddr_t
-vm_maxmem(struct vm *vm)
+void
+vm_free_memseg(struct vm *vm, int ident)
 {
-	int i;
-	vm_paddr_t gpa, maxmem;
+	struct mem_seg *seg;
 
-	maxmem = 0;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201602011456.u11EuBDE016423>