Date: Tue, 19 Aug 2014 01:20:25 +0000 (UTC)
From: Peter Grehan <grehan@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject: svn commit: r270159 - in stable/10: lib/libvmmapi sys/amd64/amd64 sys/amd64/include sys/amd64/vmm sys/amd64/vmm/intel sys/amd64/vmm/io sys/x86/include usr.sbin/bhyve usr.sbin/bhyvectl usr.sbin/bhyv...
Message-ID: <201408190120.s7J1KP93011521@svn.freebsd.org>
Author: grehan
Date: Tue Aug 19 01:20:24 2014
New Revision: 270159
URL: http://svnweb.freebsd.org/changeset/base/270159

Log:
  MFC r267921, r267934, r267949, r267959, r267966, r268202, r268276,
      r268427, r268428, r268521, r268638, r268639, r268701, r268777,
      r268889, r268922, r269008, r269042, r269043, r269080, r269094,
      r269108, r269109, r269281, r269317, r269700, r269896, r269962,
      r269989.

  Catch bhyve up to CURRENT.

  Lightly tested with FreeBSD i386/amd64, Linux i386/amd64, and
  OpenBSD/amd64. Still resolving an issue with OpenBSD/i386.

  Many thanks to jhb@ for all the hard work on the prior MFCs!

  r267921 - support the "mov r/m8, imm8" instruction
  r267934 - document options
  r267949 - set DMI vers/date to fixed values
  r267959 - doc: sort cmd flags
  r267966 - EPT misconf post-mortem info
  r268202 - use correct flag for event index
  r268276 - 64-bit virtio capability api
  r268427 - invalidate guest TLB when cr3 is updated, needed for TSS
  r268428 - identify vcpu's operating mode
  r268521 - use correct offset in guest logical-to-linear translation
  r268638 - chs value
  r268639 - chs fake values
  r268701 - instr emul operand/address size override prefix support
  r268777 - emulation for legacy x86 task switching
  r268889 - nested exception support
  r268922 - fix INVARIANTS build
  r269008 - emulate instructions found in the OpenBSD/i386 5.5 kernel
  r269042 - fix fault injection
  r269043 - Reduce VMEXIT_RESTARTs in task_switch.c
  r269080 - fix issues in PUSH emulation
  r269094 - simplify return values from the inout handlers
  r269108 - don't return -1 from the push emulation handler
  r269109 - avoid permanent sleep in vm_handle_hlt()
  r269281 - list VT-x features in base kernel dmesg
  r269317 - Mark AHCI fatal errors as not completed
  r269700 - Support PCI extended config space in bhyve
  r269896 - Minor cleanup
  r269962 - use max guest memory when creating IOMMU domain
  r269989 - fix interrupt mode names

Added:
  stable/10/usr.sbin/bhyve/task_switch.c
     - copied, changed from r268777, head/usr.sbin/bhyve/task_switch.c
Modified:
  stable/10/lib/libvmmapi/vmmapi.c
  stable/10/lib/libvmmapi/vmmapi.h
  stable/10/sys/amd64/amd64/identcpu.c
  stable/10/sys/amd64/include/vmm.h
  stable/10/sys/amd64/include/vmm_dev.h
  stable/10/sys/amd64/include/vmm_instruction_emul.h
  stable/10/sys/amd64/vmm/intel/vmcs.c
  stable/10/sys/amd64/vmm/intel/vmcs.h
  stable/10/sys/amd64/vmm/intel/vmx.c
  stable/10/sys/amd64/vmm/intel/vmx_msr.c
  stable/10/sys/amd64/vmm/intel/vmx_msr.h
  stable/10/sys/amd64/vmm/intel/vtd.c
  stable/10/sys/amd64/vmm/io/vatpic.c
  stable/10/sys/amd64/vmm/vmm.c
  stable/10/sys/amd64/vmm/vmm_dev.c
  stable/10/sys/amd64/vmm/vmm_instruction_emul.c
  stable/10/sys/x86/include/specialreg.h
  stable/10/usr.sbin/bhyve/Makefile
  stable/10/usr.sbin/bhyve/acpi.c
  stable/10/usr.sbin/bhyve/atkbdc.c
  stable/10/usr.sbin/bhyve/bhyve.8
  stable/10/usr.sbin/bhyve/bhyverun.c
  stable/10/usr.sbin/bhyve/bhyverun.h
  stable/10/usr.sbin/bhyve/block_if.c
  stable/10/usr.sbin/bhyve/block_if.h
  stable/10/usr.sbin/bhyve/inout.c
  stable/10/usr.sbin/bhyve/inout.h
  stable/10/usr.sbin/bhyve/mem.c
  stable/10/usr.sbin/bhyve/mem.h
  stable/10/usr.sbin/bhyve/pci_ahci.c
  stable/10/usr.sbin/bhyve/pci_emul.c
  stable/10/usr.sbin/bhyve/pci_emul.h
  stable/10/usr.sbin/bhyve/pci_irq.c
  stable/10/usr.sbin/bhyve/pm.c
  stable/10/usr.sbin/bhyve/smbiostbl.c
  stable/10/usr.sbin/bhyve/virtio.c
  stable/10/usr.sbin/bhyve/virtio.h
  stable/10/usr.sbin/bhyvectl/bhyvectl.c
  stable/10/usr.sbin/bhyveload/bhyveload.8
  stable/10/usr.sbin/bhyveload/bhyveload.c
Directory Properties:
  stable/10/   (props changed)
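Among the userland-visible additions in the diff below are vm_get_intinfo() and vm_set_intinfo(), thin libvmmapi wrappers around the new VM_GET_INTINFO/VM_SET_INTINFO ioctls from the nested-exception work (r268889). A minimal, hypothetical sketch of how a tool might call them — the VM name "testvm", the bare error handling, and linking with -lvmmapi are illustrative assumptions, not part of this commit:

    #include <sys/types.h>

    #include <machine/vmm.h>

    #include <stdio.h>
    #include <vmmapi.h>

    int
    main(void)
    {
            struct vmctx *ctx;
            uint64_t info1, info2;
            const int vcpu = 0;

            /* Attach to an already-created VM; creation/teardown elided. */
            ctx = vm_open("testvm");
            if (ctx == NULL)
                    return (1);

            /* Read the event, if any, left pending by the last VM-exit. */
            if (vm_get_intinfo(ctx, vcpu, &info1, &info2) == 0)
                    printf("intinfo: %#lx / %#lx\n", info1, info2);

            /* Writing 0 tells vmm.ko the exit-time event has been handled. */
            return (vm_set_intinfo(ctx, vcpu, 0));
    }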
Modified: stable/10/lib/libvmmapi/vmmapi.c
==============================================================================
--- stable/10/lib/libvmmapi/vmmapi.c	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/lib/libvmmapi/vmmapi.c	Tue Aug 19 01:20:24 2014	(r270159)
@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/_iovec.h>
 #include <sys/cpuset.h>
 
+#include <x86/segments.h>
 #include <machine/specialreg.h>
 #include <machine/param.h>
 
@@ -327,6 +328,16 @@ vm_get_desc(struct vmctx *ctx, int vcpu,
 }
 
 int
+vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc)
+{
+	int error;
+
+	error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
+	    &seg_desc->access);
+	return (error);
+}
+
+int
 vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
 {
 	int error;
@@ -988,7 +999,7 @@ gla2gpa(struct vmctx *ctx, int vcpu, str
 #endif
 
 int
-vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
     uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
 {
 	uint64_t gpa;
@@ -1106,3 +1117,32 @@ vm_activate_cpu(struct vmctx *ctx, int v
 	error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
 	return (error);
 }
+
+int
+vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2)
+{
+	struct vm_intinfo vmii;
+	int error;
+
+	bzero(&vmii, sizeof(struct vm_intinfo));
+	vmii.vcpuid = vcpu;
+	error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii);
+	if (error == 0) {
+		*info1 = vmii.info1;
+		*info2 = vmii.info2;
+	}
+	return (error);
+}
+
+int
+vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
+{
+	struct vm_intinfo vmii;
+	int error;
+
+	bzero(&vmii, sizeof(struct vm_intinfo));
+	vmii.vcpuid = vcpu;
+	vmii.info1 = info1;
+	error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
+	return (error);
+}

Modified: stable/10/lib/libvmmapi/vmmapi.h
==============================================================================
--- stable/10/lib/libvmmapi/vmmapi.h	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/lib/libvmmapi/vmmapi.h	Tue Aug 19 01:20:24 2014	(r270159)
@@ -66,6 +66,8 @@ int	vm_set_desc(struct vmctx *ctx, int v
 	    uint64_t base, uint32_t limit, uint32_t access);
 int	vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
 	    uint64_t *base, uint32_t *limit, uint32_t *access);
+int	vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
+	    struct seg_desc *seg_desc);
 int	vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
 int	vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
 int	vm_run(struct vmctx *ctx, int vcpu, uint64_t rip,
@@ -104,6 +106,9 @@ int	vm_setup_pptdev_msix(struct vmctx *c
 	    int func, int idx, uint64_t addr, uint64_t msg,
 	    uint32_t vector_control);
 
+int	vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
+int	vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);
+
 /*
  * Return a pointer to the statistics buffer. Note that this is not MT-safe.
  */
@@ -121,7 +126,7 @@ int	vm_get_hpet_capabilities(struct vmct
  * The 'iovcnt' should be big enough to accomodate all GPA segments.
  * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
  */
-int	vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+int	vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
 	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
 void	vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
 	    void *host_dst, size_t len);

Modified: stable/10/sys/amd64/amd64/identcpu.c
==============================================================================
--- stable/10/sys/amd64/amd64/identcpu.c	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/sys/amd64/amd64/identcpu.c	Tue Aug 19 01:20:24 2014	(r270159)
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/specialreg.h>
 #include <machine/md_var.h>
 
+#include <amd64/vmm/intel/vmx_controls.h>
 #include <x86/isa/icu.h>
 
 /* XXX - should be in header file: */
@@ -73,6 +74,7 @@ static u_int find_cpu_vendor_id(void);
 static void print_AMD_info(void);
 static void print_AMD_assoc(int i);
 static void print_via_padlock_info(void);
+static void print_vmx_info(void);
 
 int	cpu_class;
 char machine[] = "amd64";
@@ -428,6 +430,9 @@ printcpuinfo(void)
 		if (via_feature_rng != 0 || via_feature_xcrypt != 0)
 			print_via_padlock_info();
 
+		if (cpu_feature2 & CPUID2_VMX)
+			print_vmx_info();
+
 		if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_AMD)
 			cpu_feature &= ~CPUID_HTT;
 
@@ -722,3 +727,197 @@ print_via_padlock_info(void)
 	    "\015RSA"		/* PMM */
 	    );
 }
+
+static uint32_t
+vmx_settable(uint64_t basic, int msr, int true_msr)
+{
+	uint64_t val;
+
+	if (basic & (1UL << 55))
+		val = rdmsr(true_msr);
+	else
+		val = rdmsr(msr);
+
+	/* Just report the controls that can be set to 1. */
+	return (val >> 32);
+}
+
+static void
+print_vmx_info(void)
+{
+	uint64_t basic, msr;
+	uint32_t entry, exit, mask, pin, proc, proc2;
+	int comma;
+
+	printf("\n VT-x: ");
+	msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+	if (!(msr & IA32_FEATURE_CONTROL_VMX_EN))
+		printf("(disabled in BIOS) ");
+	basic = rdmsr(MSR_VMX_BASIC);
+	pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS,
+	    MSR_VMX_TRUE_PINBASED_CTLS);
+	proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS,
+	    MSR_VMX_TRUE_PROCBASED_CTLS);
+	if (proc & PROCBASED_SECONDARY_CONTROLS)
+		proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2,
+		    MSR_VMX_PROCBASED_CTLS2);
+	else
+		proc2 = 0;
+	exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS);
+	entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS);
+
+	if (!bootverbose) {
+		comma = 0;
+		if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT &&
+		    entry & VM_ENTRY_LOAD_PAT) {
+			printf("%sPAT", comma ? "," : "");
+			comma = 1;
+		}
+		if (proc & PROCBASED_HLT_EXITING) {
+			printf("%sHLT", comma ? "," : "");
+			comma = 1;
+		}
+		if (proc & PROCBASED_MTF) {
+			printf("%sMTF", comma ? "," : "");
+			comma = 1;
+		}
+		if (proc & PROCBASED_PAUSE_EXITING) {
+			printf("%sPAUSE", comma ? "," : "");
+			comma = 1;
+		}
+		if (proc2 & PROCBASED2_ENABLE_EPT) {
+			printf("%sEPT", comma ? "," : "");
+			comma = 1;
+		}
+		if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) {
+			printf("%sUG", comma ? "," : "");
+			comma = 1;
+		}
+		if (proc2 & PROCBASED2_ENABLE_VPID) {
+			printf("%sVPID", comma ? "," : "");
+			comma = 1;
+		}
+		if (proc & PROCBASED_USE_TPR_SHADOW &&
+		    proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES &&
+		    proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE &&
+		    proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION &&
+		    proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) {
+			printf("%sVID", comma ? "," : "");
+			comma = 1;
+			if (pin & PINBASED_POSTED_INTERRUPT)
+				printf(",PostIntr");
+		}
+		return;
+	}
+
+	mask = basic >> 32;
+	printf("Basic Features=0x%b", mask,
+	    "\020"
+	    "\02132PA"		/* 32-bit physical addresses */
+	    "\022SMM"		/* SMM dual-monitor */
+	    "\027INS/OUTS"	/* VM-exit info for INS and OUTS */
+	    "\030TRUE"		/* TRUE_CTLS MSRs */
+	    );
+	printf("\n Pin-Based Controls=0x%b", pin,
+	    "\020"
+	    "\001ExtINT"	/* External-interrupt exiting */
+	    "\004NMI"		/* NMI exiting */
+	    "\006VNMI"		/* Virtual NMIs */
+	    "\007PreTmr"	/* Activate VMX-preemption timer */
+	    "\010PostIntr"	/* Process posted interrupts */
+	    );
+	printf("\n Primary Processor Controls=0x%b", proc,
+	    "\020"
+	    "\003INTWIN"	/* Interrupt-window exiting */
+	    "\004TSCOff"	/* Use TSC offsetting */
+	    "\010HLT"		/* HLT exiting */
+	    "\012INVLPG"	/* INVLPG exiting */
+	    "\013MWAIT"		/* MWAIT exiting */
+	    "\014RDPMC"		/* RDPMC exiting */
+	    "\015RDTSC"		/* RDTSC exiting */
+	    "\020CR3-LD"	/* CR3-load exiting */
+	    "\021CR3-ST"	/* CR3-store exiting */
+	    "\024CR8-LD"	/* CR8-load exiting */
+	    "\025CR8-ST"	/* CR8-store exiting */
+	    "\026TPR"		/* Use TPR shadow */
+	    "\027NMIWIN"	/* NMI-window exiting */
+	    "\030MOV-DR"	/* MOV-DR exiting */
+	    "\031IO"		/* Unconditional I/O exiting */
+	    "\032IOmap"		/* Use I/O bitmaps */
+	    "\034MTF"		/* Monitor trap flag */
+	    "\035MSRmap"	/* Use MSR bitmaps */
+	    "\036MONITOR"	/* MONITOR exiting */
+	    "\037PAUSE"		/* PAUSE exiting */
+	    );
+	if (proc & PROCBASED_SECONDARY_CONTROLS)
+		printf("\n Secondary Processor Controls=0x%b", proc2,
+		    "\020"
+		    "\001APIC"		/* Virtualize APIC accesses */
+		    "\002EPT"		/* Enable EPT */
+		    "\003DT"		/* Descriptor-table exiting */
+		    "\004RDTSCP"	/* Enable RDTSCP */
+		    "\005x2APIC"	/* Virtualize x2APIC mode */
+		    "\006VPID"		/* Enable VPID */
+		    "\007WBINVD"	/* WBINVD exiting */
+		    "\010UG"		/* Unrestricted guest */
+		    "\011APIC-reg"	/* APIC-register virtualization */
+		    "\012VID"		/* Virtual-interrupt delivery */
+		    "\013PAUSE-loop"	/* PAUSE-loop exiting */
+		    "\014RDRAND"	/* RDRAND exiting */
+		    "\015INVPCID"	/* Enable INVPCID */
+		    "\016VMFUNC"	/* Enable VM functions */
+		    "\017VMCS"		/* VMCS shadowing */
+		    "\020EPT#VE"	/* EPT-violation #VE */
+		    "\021XSAVES"	/* Enable XSAVES/XRSTORS */
+		    );
+	printf("\n Exit Controls=0x%b", mask,
+	    "\020"
+	    "\003DR"		/* Save debug controls */
+				/* Ignore Host address-space size */
+	    "\015PERF"		/* Load MSR_PERF_GLOBAL_CTRL */
+	    "\020AckInt"	/* Acknowledge interrupt on exit */
+	    "\023PAT-SV"	/* Save MSR_PAT */
+	    "\024PAT-LD"	/* Load MSR_PAT */
+	    "\025EFER-SV"	/* Save MSR_EFER */
+	    "\026EFER-LD"	/* Load MSR_EFER */
+	    "\027PTMR-SV"	/* Save VMX-preemption timer value */
+	    );
+	printf("\n Entry Controls=0x%b", mask,
+	    "\020"
+	    "\003DR"		/* Save debug controls */
+				/* Ignore IA-32e mode guest */
+				/* Ignore Entry to SMM */
+				/* Ignore Deactivate dual-monitor treatment */
+	    "\016PERF"		/* Load MSR_PERF_GLOBAL_CTRL */
+	    "\017PAT"		/* Load MSR_PAT */
+	    "\020EFER"		/* Load MSR_EFER */
+	    );
+	if (proc & PROCBASED_SECONDARY_CONTROLS &&
+	    (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) {
+		msr = rdmsr(MSR_VMX_EPT_VPID_CAP);
+		mask = msr;
+		printf("\n EPT Features=0x%b", mask,
+		    "\020"
+		    "\001XO"		/* Execute-only translations */
+		    "\007PW4"		/* Page-walk length of 4 */
+		    "\011UC"		/* EPT paging-structure mem can be UC */
+		    "\017WB"		/* EPT paging-structure mem can be WB */
+		    "\0212M"		/* EPT PDE can map a 2-Mbyte page */
+		    "\0221G"		/* EPT PDPTE can map a 1-Gbyte page */
+		    "\025INVEPT"	/* INVEPT is supported */
+		    "\026AD"		/* Accessed and dirty flags for EPT */
+		    "\032single"	/* INVEPT single-context type */
+		    "\033all"		/* INVEPT all-context type */
+		    );
+		mask = msr >> 32;
+		printf("\n VPID Features=0x%b", mask,
		    "\020"
+		    "\001INVVPID"	/* INVVPID is supported */
+		    "\011individual"	/* INVVPID individual-address type */
+		    "\012single"	/* INVVPID single-context type */
+		    "\013all"		/* INVVPID all-context type */
+				/* INVVPID single-context-retaining-globals type */
+		    "\014single-globals"
+		    );
+	}
+}

Modified: stable/10/sys/amd64/include/vmm.h
==============================================================================
--- stable/10/sys/amd64/include/vmm.h	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/sys/amd64/include/vmm.h	Tue Aug 19 01:20:24 2014	(r270159)
@@ -29,11 +29,14 @@
 #ifndef _VMM_H_
 #define	_VMM_H_
 
+#include <x86/segments.h>
+
 enum vm_suspend_how {
 	VM_SUSPEND_NONE,
 	VM_SUSPEND_RESET,
 	VM_SUSPEND_POWEROFF,
 	VM_SUSPEND_HALT,
+	VM_SUSPEND_TRIPLEFAULT,
 	VM_SUSPEND_LAST
 };
 
@@ -75,6 +78,10 @@ enum vm_reg_name {
 	VM_REG_GUEST_GDTR,
 	VM_REG_GUEST_EFER,
 	VM_REG_GUEST_CR2,
+	VM_REG_GUEST_PDPTE0,
+	VM_REG_GUEST_PDPTE1,
+	VM_REG_GUEST_PDPTE2,
+	VM_REG_GUEST_PDPTE3,
 	VM_REG_LAST
 };
 
@@ -84,6 +91,16 @@ enum x2apic_state {
 	X2APIC_STATE_LAST
 };
 
+#define	VM_INTINFO_VECTOR(info)	((info) & 0xff)
+#define	VM_INTINFO_DEL_ERRCODE	0x800
+#define	VM_INTINFO_RSVD		0x7ffff000
+#define	VM_INTINFO_VALID	0x80000000
+#define	VM_INTINFO_TYPE		0x700
+#define	VM_INTINFO_HWINTR	(0 << 8)
+#define	VM_INTINFO_NMI		(2 << 8)
+#define	VM_INTINFO_HWEXCEPTION	(3 << 8)
+#define	VM_INTINFO_SWINTR	(4 << 8)
+
 #ifdef _KERNEL
 
 #define	VM_MAX_NAMELEN	32
@@ -99,6 +116,7 @@ struct vioapic;
 struct vlapic;
 struct vmspace;
 struct vm_object;
+struct vm_guest_paging;
 struct pmap;
 
 typedef int	(*vmm_init_func_t)(int ipinum);
@@ -252,6 +270,14 @@ vcpu_is_running(struct vm *vm, int vcpu,
 	return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
 }
 
+#ifdef _SYS_PROC_H_
+static int __inline
+vcpu_should_yield(struct vm *vm, int vcpu)
+{
+	return (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED));
+}
+#endif
+
 void *vcpu_stats(struct vm *vm, int vcpu);
 void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
 struct vmspace *vm_get_vmspace(struct vm *vm);
@@ -274,21 +300,63 @@ struct vatpit *vm_atpit(struct vm *vm);
 int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
 
 /*
- * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
- * exception is pending and also updates 'vme'. The pending exception is
- * cleared when this function returns.
+ * This function is called after a VM-exit that occurred during exception or
+ * interrupt delivery through the IDT. The format of 'intinfo' is described
+ * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
 *
- * This function should only be called in the context of the thread that is
- * executing this vcpu.
+ * If a VM-exit handler completes the event delivery successfully then it
+ * should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
+ * if the task switch emulation is triggered via a task gate then it should
+ * call this function with 'intinfo=0' to indicate that the external event
+ * is not pending anymore.
+ *
+ * Return value is 0 on success and non-zero on failure.
 */
-int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
+int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
 
-void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
-void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
-void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);
+/*
+ * This function is called before every VM-entry to retrieve a pending
+ * event that should be injected into the guest. This function combines
+ * nested events into a double or triple fault.
+ *
+ * Returns 0 if there are no events that need to be injected into the guest
+ * and non-zero otherwise.
+ */
+int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
+
+int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
 
 enum vm_reg_name vm_segment_name(int seg_encoding);
 
+struct vm_copyinfo {
+	uint64_t	gpa;
+	size_t		len;
+	void		*hva;
+	void		*cookie;
+};
+
+/*
+ * Set up 'copyinfo[]' to copy to/from guest linear address space starting
+ * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
+ * a copyin or PROT_WRITE for a copyout.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ *
+ * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
+ * the return value is 0. The 'copyinfo[]' resources should be freed by calling
+ * 'vm_copy_teardown()' after the copy is done.
+ */
+int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+    int num_copyinfo);
+void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+    int num_copyinfo);
+void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+    void *kaddr, size_t len);
+void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+    struct vm_copyinfo *copyinfo, size_t len);
 #endif	/* KERNEL */
 
 #define	VM_MAXCPU	16	/* maximum virtual cpus */
@@ -322,13 +390,16 @@ struct seg_desc {
 	uint32_t	limit;
 	uint32_t	access;
 };
-#define	SEG_DESC_TYPE(desc)		((desc)->access & 0x001f)
-#define	SEG_DESC_PRESENT(desc)		((desc)->access & 0x0080)
-#define	SEG_DESC_DEF32(desc)		((desc)->access & 0x4000)
-#define	SEG_DESC_GRANULARITY(desc)	((desc)->access & 0x8000)
-#define	SEG_DESC_UNUSABLE(desc)		((desc)->access & 0x10000)
+#define	SEG_DESC_TYPE(access)		((access) & 0x001f)
+#define	SEG_DESC_DPL(access)		(((access) >> 5) & 0x3)
+#define	SEG_DESC_PRESENT(access)	(((access) & 0x0080) ? 1 : 0)
+#define	SEG_DESC_DEF32(access)		(((access) & 0x4000) ? 1 : 0)
+#define	SEG_DESC_GRANULARITY(access)	(((access) & 0x8000) ? 1 : 0)
+#define	SEG_DESC_UNUSABLE(access)	(((access) & 0x10000) ? 1 : 0)
 
 enum vm_cpu_mode {
+	CPU_MODE_REAL,
+	CPU_MODE_PROTECTED,
 	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */
 	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */
 };
@@ -364,11 +435,14 @@ struct vie {
 	uint8_t		num_valid;		/* size of the instruction */
 	uint8_t		num_processed;
 
+	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
 	uint8_t		rex_w:1,		/* REX prefix */
 			rex_r:1,
 			rex_x:1,
 			rex_b:1,
-			rex_present:1;
+			rex_present:1,
+			opsize_override:1,	/* Operand size override */
+			addrsize_override:1;	/* Address size override */
 
 	uint8_t		mod:2,			/* ModRM byte */
 			reg:4,
@@ -410,6 +484,7 @@ enum vm_exitcode {
 	VM_EXITCODE_IOAPIC_EOI,
 	VM_EXITCODE_SUSPENDED,
 	VM_EXITCODE_INOUT_STR,
+	VM_EXITCODE_TASK_SWITCH,
 	VM_EXITCODE_MAX
 };
 
@@ -434,6 +509,22 @@ struct vm_inout_str {
 	struct seg_desc seg_desc;
 };
 
+enum task_switch_reason {
+	TSR_CALL,
+	TSR_IRET,
+	TSR_JMP,
+	TSR_IDT_GATE,	/* task gate in IDT */
+};
+
+struct vm_task_switch {
+	uint16_t	tsssel;		/* new TSS selector */
+	int		ext;		/* task switch due to external event */
+	uint32_t	errcode;
+	int		errcode_valid;	/* push 'errcode' on the new stack */
+	enum task_switch_reason reason;
+	struct vm_guest_paging paging;
+};
+
 struct vm_exit {
 	enum vm_exitcode	exitcode;
 	int			inst_length;	/* 0 means unknown */
@@ -448,6 +539,7 @@ struct vm_exit {
 		struct {
 			uint64_t	gpa;
 			uint64_t	gla;
+			int		cs_d;		/* CS.D */
 			struct vm_guest_paging paging;
 			struct vie	vie;
 		} inst_emul;
@@ -487,7 +579,38 @@ struct vm_exit {
 		struct {
 			enum vm_suspend_how how;
 		} suspended;
+		struct vm_task_switch task_switch;
 	} u;
 };
 
+/* APIs to inject faults into the guest */
+void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
+    int errcode);
+
+static void __inline
+vm_inject_ud(void *vm, int vcpuid)
+{
+	vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
+}
+
+static void __inline
+vm_inject_gp(void *vm, int vcpuid)
+{
+	vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
+}
+
+static void __inline
+vm_inject_ac(void *vm, int vcpuid, int errcode)
+{
+	vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
+}
+
+static void __inline
+vm_inject_ss(void *vm, int vcpuid, int errcode)
+{
+	vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
+}
+
+void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+
 #endif	/* _VMM_H_ */

Modified: stable/10/sys/amd64/include/vmm_dev.h
==============================================================================
--- stable/10/sys/amd64/include/vmm_dev.h	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/sys/amd64/include/vmm_dev.h	Tue Aug 19 01:20:24 2014	(r270159)
@@ -189,6 +189,12 @@ struct vm_cpuset {
 #define	VM_ACTIVE_CPUS		0
 #define	VM_SUSPENDED_CPUS	1
 
+struct vm_intinfo {
+	int		vcpuid;
+	uint64_t	info1;
+	uint64_t	info2;
+};
+
 enum {
 	/* general routines */
 	IOCNUM_ABIVERS = 0,
@@ -211,6 +217,8 @@ enum {
 	IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
 
 	/* interrupt injection */
+	IOCNUM_GET_INTINFO = 28,
+	IOCNUM_SET_INTINFO = 29,
 	IOCNUM_INJECT_EXCEPTION = 30,
 	IOCNUM_LAPIC_IRQ = 31,
 	IOCNUM_INJECT_NMI = 32,
@@ -324,4 +332,8 @@ enum {
 	_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
 #define	VM_GET_CPUS \
 	_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define	VM_SET_INTINFO	\
+	_IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
+#define	VM_GET_INTINFO	\
+	_IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
 #endif

Modified: stable/10/sys/amd64/include/vmm_instruction_emul.h
==============================================================================
--- stable/10/sys/amd64/include/vmm_instruction_emul.h	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/sys/amd64/include/vmm_instruction_emul.h	Tue Aug 19 01:20:24 2014	(r270159)
@@ -52,8 +52,8 @@ typedef int (*mem_region_write_t)(void *
 * s
 */
 int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
-    mem_region_read_t mrr, mem_region_write_t mrw,
-    void *mrarg);
+    struct vm_guest_paging *paging, mem_region_read_t mrr,
+    mem_region_write_t mrw, void *mrarg);
 
 int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
     uint64_t val, int size);
@@ -108,7 +108,7 @@ void vie_init(struct vie *vie);
 */
 #define	VIE_INVALID_GLA		(1UL << 63)	/* a non-canonical address */
 int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
-    enum vm_cpu_mode cpu_mode, struct vie *vie);
+    enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
 
 #endif	/* _KERNEL */
 
 #endif	/* _VMM_INSTRUCTION_EMUL_H_ */

Modified: stable/10/sys/amd64/vmm/intel/vmcs.c
==============================================================================
--- stable/10/sys/amd64/vmm/intel/vmcs.c	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/sys/amd64/vmm/intel/vmcs.c	Tue Aug 19 01:20:24 2014	(r270159)
@@ -103,6 +103,14 @@ vmcs_field_encoding(int ident)
 		return (VMCS_GUEST_LDTR_SELECTOR);
 	case VM_REG_GUEST_EFER:
 		return (VMCS_GUEST_IA32_EFER);
+	case VM_REG_GUEST_PDPTE0:
+		return (VMCS_GUEST_PDPTE0);
+	case VM_REG_GUEST_PDPTE1:
+		return (VMCS_GUEST_PDPTE1);
+	case VM_REG_GUEST_PDPTE2:
+		return (VMCS_GUEST_PDPTE2);
+	case VM_REG_GUEST_PDPTE3:
+		return (VMCS_GUEST_PDPTE3);
 	default:
 		return (-1);
 	}

Modified: stable/10/sys/amd64/vmm/intel/vmcs.h
==============================================================================
--- stable/10/sys/amd64/vmm/intel/vmcs.h	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/sys/amd64/vmm/intel/vmcs.h	Tue Aug 19 01:20:24 2014	(r270159)
@@ -346,6 +346,9 @@ vmcs_write(uint32_t encoding, uint64_t v
 #define	VMCS_INTR_T_HWINTR	(0 << 8)
 #define	VMCS_INTR_T_NMI		(2 << 8)
 #define	VMCS_INTR_T_HWEXCEPTION	(3 << 8)
+#define	VMCS_INTR_T_SWINTR	(4 << 8)
+#define	VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
+#define	VMCS_INTR_T_SWEXCEPTION	(6 << 8)
 #define	VMCS_INTR_DEL_ERRCODE	(1 << 11)
 
 /*

Modified: stable/10/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- stable/10/sys/amd64/vmm/intel/vmx.c	Mon Aug 18 23:45:40 2014	(r270158)
+++ stable/10/sys/amd64/vmm/intel/vmx.c	Tue Aug 19 01:20:24 2014	(r270159)
@@ -149,8 +149,6 @@ SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_
 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
     &cr4_zeros_mask, 0, NULL);
 
-static int vmx_no_patmsr;
-
 static int vmx_initialized;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
     &vmx_initialized, 0, "Intel VMX initialized");
@@ -158,18 +156,38 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initia
 /*
 * Optional capabilities
 */
+static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
+
+static int vmx_patmsr;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, patmsr, CTLFLAG_RD, &vmx_patmsr, 0,
+    "PAT MSR saved and restored in VCMS");
+
 static int cap_halt_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
+    "HLT triggers a VM-exit");
+
 static int cap_pause_exit;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
+    0, "PAUSE triggers a VM-exit");
+
 static int cap_unrestricted_guest;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
+    &cap_unrestricted_guest, 0, "Unrestricted guests");
+
 static int cap_monitor_trap;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
+    &cap_monitor_trap, 0, "Monitor trap flag");
+
 static int cap_invpcid;
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
+    0, "Guests are allowed to use INVPCID");
 
 static int virtual_interrupt_delivery;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
 
 static int posted_interrupts;
-SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
+SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
    &posted_interrupts, 0, "APICv posted interrupt support");
 
 static int pirvec;
@@ -618,6 +636,7 @@ vmx_init(int ipinum)
 	}
 
 	/* Check support for VM-exit controls */
+	vmx_patmsr = 1;
 	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
 	    VM_EXIT_CTLS_ONE_SETTING,
 	    VM_EXIT_CTLS_ZERO_SETTING,
@@ -637,12 +656,12 @@ vmx_init(int ipinum)
 			if (bootverbose)
 				printf("vmm: PAT MSR access not supported\n");
 			guest_msr_valid(MSR_PAT);
-			vmx_no_patmsr = 1;
+			vmx_patmsr = 0;
 		}
 	}
 
 	/* Check support for VM-entry controls */
-	if (!vmx_no_patmsr) {
+	if (vmx_patmsr) {
 		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
 		    MSR_VMX_TRUE_ENTRY_CTLS,
 		    VM_ENTRY_CTLS_ONE_SETTING,
@@ -918,7 +937,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 	 * MSR_PAT save/restore support, leave access disabled so accesses
 	 * will be trapped.
 	 */
-	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
+	if (vmx_patmsr && guest_msr_rw(vmx, MSR_PAT))
 		panic("vmx_vminit: error setting guest pat msr access");
 
 	vpid_alloc(vpid, VM_MAXCPU);
@@ -974,7 +993,7 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 		vmx->cap[i].proc_ctls = procbased_ctls;
 		vmx->cap[i].proc_ctls2 = procbased_ctls2;
 
-		vmx->state[i].lastcpu = -1;
+		vmx->state[i].lastcpu = NOCPU;
 		vmx->state[i].vpid = vpid[i];
 
 		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
@@ -1047,27 +1066,37 @@ vmx_astpending_trace(struct vmx *vmx, in
 }
 
 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
+static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
 
-static void
-vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+/*
+ * Invalidate guest mappings identified by its vpid from the TLB.
+ */
+static __inline void
+vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
 {
 	struct vmxstate *vmxstate;
 	struct invvpid_desc invvpid_desc;
 
 	vmxstate = &vmx->state[vcpu];
-	if (vmxstate->lastcpu == curcpu)
+	if (vmxstate->vpid == 0)
 		return;
 
-	vmxstate->lastcpu = curcpu;
-
-	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+	if (!running) {
+		/*
+		 * Set the 'lastcpu' to an invalid host cpu.
+		 *
+		 * This will invalidate TLB entries tagged with the vcpu's
+		 * vpid the next time it runs via vmx_set_pcpu_defaults().
+		 */
+		vmxstate->lastcpu = NOCPU;
+		return;
+	}
 
-	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
-	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
-	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+	KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
+	    "critical section", __func__, vcpu));
 
 	/*
-	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+	 * Invalidate all mappings tagged with 'vpid'
 	 *
 	 * We do this because this vcpu was executing on a different host
 	 * cpu when it last ran. We do not track whether it invalidated
@@ -1081,25 +1110,43 @@ vmx_set_pcpu_defaults(struct vmx *vmx, i
 	 * Note also that this will invalidate mappings tagged with 'vpid'
 	 * for "all" EP4TAs.
 	 */
-	if (vmxstate->vpid != 0) {
-		if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
-			invvpid_desc._res1 = 0;
-			invvpid_desc._res2 = 0;
-			invvpid_desc.vpid = vmxstate->vpid;
-			invvpid_desc.linear_addr = 0;
-			invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
-		} else {
-			/*
-			 * The invvpid can be skipped if an invept is going to
-			 * be performed before entering the guest. The invept
-			 * will invalidate combined mappings tagged with
-			 * 'vmx->eptp' for all vpids.
-			 */
-			vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
-		}
+	if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
+		invvpid_desc._res1 = 0;
+		invvpid_desc._res2 = 0;
+		invvpid_desc.vpid = vmxstate->vpid;
+		invvpid_desc.linear_addr = 0;
+		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
+	} else {
+		/*
+		 * The invvpid can be skipped if an invept is going to
+		 * be performed before entering the guest. The invept
+		 * will invalidate combined mappings tagged with
+		 * 'vmx->eptp' for all vpids.
+		 */
+		vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
 	}
 }
 
+static void
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
+{
+	struct vmxstate *vmxstate;
+
+	vmxstate = &vmx->state[vcpu];
+	if (vmxstate->lastcpu == curcpu)
+		return;
+
+	vmxstate->lastcpu = curcpu;
+
+	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+	vmx_invvpid(vmx, vcpu, pmap, 1);
+}
+
 /*
 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 */
@@ -1183,24 +1230,32 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu
 static void
 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
 {
-	struct vm_exception exc;
 	int vector, need_nmi_exiting, extint_pending;
-	uint64_t rflags;
+	uint64_t rflags, entryinfo;
 	uint32_t gi, info;
 
-	if (vm_exception_pending(vmx->vm, vcpu, &exc)) {
-		KASSERT(exc.vector >= 0 && exc.vector < 32,
-		    ("%s: invalid exception vector %d", __func__, exc.vector));
+	if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
+		KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+		    "intinfo is not valid: %#lx", __func__, entryinfo));
 
 		info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 		KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
-		    "pending exception %d: %#x", __func__, exc.vector, info));
+		    "pending exception: %#lx/%#x", __func__, entryinfo, info));
 
-		info = exc.vector | VMCS_INTR_T_HWEXCEPTION | VMCS_INTR_VALID;
-		if (exc.error_code_valid) {
-			info |= VMCS_INTR_DEL_ERRCODE;
-			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, exc.error_code);
+		info = entryinfo;
+		vector = info & 0xff;
+		if (vector == IDT_BP || vector == IDT_OF) {
+			/*
+			 * VT-x requires #BP and #OF to be injected as software
+			 * exceptions.
+			 */
+			info &= ~VMCS_INTR_T_MASK;
+			info |= VMCS_INTR_T_SWEXCEPTION;
 		}
+
+		if (info & VMCS_INTR_DEL_ERRCODE)
+			vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
+
 		vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 	}
@@ -1379,6 +1434,16 @@ vmx_clear_nmi_blocking(struct vmx *vmx,
 	vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 }
 
+static void
+vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+	uint32_t gi;
+
+	gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+	KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
+	    ("NMI blocking is not in effect %#x", gi));
+}
+
 static int
 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 {
@@ -1659,11 +1724,19 @@ vmx_cpl(void)
 static enum vm_cpu_mode
 vmx_cpu_mode(void)
 {
+	uint32_t csar;
 
-	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA)
-		return (CPU_MODE_64BIT);
-	else
-		return (CPU_MODE_COMPATIBILITY);
+	if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
+		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+		if (csar & 0x2000)
+			return (CPU_MODE_64BIT);	/* CS.L = 1 */
+		else
+			return (CPU_MODE_COMPATIBILITY);
+	} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
+		return (CPU_MODE_PROTECTED);
+	} else {
+		return (CPU_MODE_REAL);
+	}
 }
 
 static enum vm_paging_mode
@@ -1757,10 +1830,25 @@ vmx_paging_info(struct vm_guest_paging *
 static void
 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 {
+	struct vm_guest_paging *paging;
+	uint32_t csar;
+
+	paging = &vmexit->u.inst_emul.paging;
+
 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 	vmexit->u.inst_emul.gpa = gpa;
 	vmexit->u.inst_emul.gla = gla;
-	vmx_paging_info(&vmexit->u.inst_emul.paging);
+	vmx_paging_info(paging);
+	switch (paging->cpu_mode) {
+	case CPU_MODE_PROTECTED:
+	case CPU_MODE_COMPATIBILITY:
+		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
+		break;
+	default:
+		vmexit->u.inst_emul.cs_d = 0;
+		break;
+	}
 }
@@ -1969,6 +2057,26 @@ vmx_handle_apic_access(struct vmx *vmx,
 	return (UNHANDLED);
 }
 
+static enum task_switch_reason
+vmx_task_switch_reason(uint64_t qual)
+{
+	int reason;
+
+	reason = (qual >> 30) & 0x3;
+	switch (reason) {
+	case 0:
+		return (TSR_CALL);
+	case 1:
+		return (TSR_IRET);
+	case 2:
+		return (TSR_JMP);
+	case 3:
+		return (TSR_IDT_GATE);
+	default:
+		panic("%s: invalid reason %d", __func__, reason);
+	}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
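The vm_copy_setup() contract spelled out in the vmm.h hunk above (0 on success, 1 when a fault was already injected into the guest, -1 otherwise) implies a fixed map/copy/teardown pattern for kernel callers. A minimal sketch of that pattern — read_guest_tss(), MIN_TSS_SIZE, and the surrounding context are hypothetical, not code from this commit:

    #include <sys/param.h>
    #include <sys/mman.h>

    #include <machine/vmm.h>

    /* Illustrative only: a 32-bit TSS is at least 104 bytes long. */
    #define	MIN_TSS_SIZE	104

    static int
    read_guest_tss(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
        uint64_t tss_gla, void *buf)
    {
            struct vm_copyinfo copyinfo[2];	/* 104 bytes span at most 2 pages */
            int error;

            /* PROT_READ selects the copyin direction, per the header comment. */
            error = vm_copy_setup(vm, vcpuid, paging, tss_gla, MIN_TSS_SIZE,
                PROT_READ, copyinfo, nitems(copyinfo));
            if (error == 0) {
                    /* All GPA segments are mapped: copy, then free resources. */
                    vm_copyin(vm, vcpuid, copyinfo, buf, MIN_TSS_SIZE);
                    vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
            }
            /* error == 1: a fault was injected; error == -1: hard failure. */
            return (error);
    }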