Date: Thu, 16 Aug 2018 09:02:02 +0000 (UTC)
From: Roger Pau Monné <royger@FreeBSD.org>
To: ports-committers@freebsd.org, svn-ports-all@freebsd.org, svn-ports-head@freebsd.org
Subject: svn commit: r477316 - in head: emulators/xen-kernel411 emulators/xen-kernel411/files sysutils/xen-tools411 sysutils/xen-tools411/files
Message-ID: <201808160902.w7G922kJ047574@repo.freebsd.org>
Author: royger (src committer)
Date: Thu Aug 16 09:02:02 2018
New Revision: 477316
URL: https://svnweb.freebsd.org/changeset/ports/477316

Log:
  xen411: apply fixes for XSA-269, XSA-272 and XSA-273

Added:
  head/emulators/xen-kernel411/files/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0003-x86-spec-ctrl-command-line-handling-adjustments.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0006-allow-cpu_down-to-be-called-earlier.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0011-x86-possibly-bring-up-all-CPUs-even-if-not-all-are-s.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0012-x86-command-line-option-to-avoid-use-of-secondary-hy.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0013-x86-vmx-Don-t-clobber-dr6-while-debugging-state-is-l.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0014-x86-xstate-Use-a-guests-CPUID-policy-rather-than-all.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0015-x86-xstate-Make-errors-in-xstate-calculations-more-o.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0016-x86-hvm-Disallow-unknown-MSR_EFER-bits.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0017-x86-spec-ctrl-Fix-the-parsing-of-xpti-on-fixed-Intel.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0018-x86-spec-ctrl-Yet-more-fixes-for-xpti-parsing.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0019-x86-vmx-Fix-handing-of-MSR_DEBUGCTL-on-VMExit.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0020-x86-vmx-Defer-vmx_vmcs_exit-as-long-as-possible-in-c.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0021-x86-vmx-API-improvements-for-MSR-load-save-infrastru.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0022-x86-vmx-Internal-cleanup-for-MSR-load-save-infrastru.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0023-x86-vmx-Factor-locate_msr_entry-out-of-vmx_find_msr-.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0024-x86-vmx-Support-remote-access-to-the-MSR-lists.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0025-x86-vmx-Improvements-to-LBR-MSR-handling.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0026-x86-vmx-Pass-an-MSR-value-into-vmx_msr_add.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0027-x86-vmx-Support-load-only-guest-MSR-list-entries.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0028-VMX-fix-vmx_-find-del-_msr-build.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0029-ARM-disable-grant-table-v2.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0030-x86-vtx-Fix-the-checking-for-unknown-invalid-MSR_DEB.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0032-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0033-x86-spec-ctrl-Introduce-an-option-to-control-L1TF-mi.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0034-x86-shadow-Infrastructure-to-force-a-PV-guest-into-s.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0035-x86-mm-Plumbing-to-allow-any-PTE-update-to-fail-with.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0036-x86-pv-Force-a-guest-into-shadow-mode-when-it-writes.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0037-x86-spec-ctrl-CPUID-MSR-definitions-for-L1D_FLUSH.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0038-x86-msr-Virtualise-MSR_FLUSH_CMD-for-guests.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0039-x86-spec-ctrl-Introduce-an-option-to-control-L1D_FLU.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0040-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitiga.patch   (contents, props changed)
  head/emulators/xen-kernel411/files/0042-x86-write-to-correct-variable-in-parse_pv_l1tf.patch   (contents, props changed)
  head/sysutils/xen-tools411/files/0031-tools-oxenstored-Make-evaluation-order-explicit.patch   (contents, props changed)
  head/sysutils/xen-tools411/files/0041-xl.conf-Add-global-affinity-masks.patch   (contents, props changed)
Modified:
  head/emulators/xen-kernel411/Makefile
  head/sysutils/xen-tools411/Makefile

Modified: head/emulators/xen-kernel411/Makefile
==============================================================================
--- head/emulators/xen-kernel411/Makefile	Thu Aug 16 08:56:17 2018	(r477315)
+++ head/emulators/xen-kernel411/Makefile	Thu Aug 16 09:02:02 2018	(r477316)
@@ -2,7 +2,7 @@
 PORTNAME=	xen
 PORTVERSION=	4.11.0
-PORTREVISION=	0
+PORTREVISION=	1
 CATEGORIES=	emulators
 MASTER_SITES=	http://downloads.xenproject.org/release/xen/${PORTVERSION}/
 PKGNAMESUFFIX=	-kernel411
@@ -47,6 +47,49 @@ EXTRA_PATCHES+=	${FILESDIR}/0001-x86-replace-usage-in-
 	${FILESDIR}/0002-x86-efi-split-compiler-vs-linker-support.patch:-p1
 # Fix PVH Dom0 build with shadow paging
 EXTRA_PATCHES+=	${FILESDIR}/0001-x86-pvh-change-the-order-of-the-iommu-initialization.patch:-p1
+# XSA-269 (MSR_DEBUGCTL handling) and XSA-273 (L1TF)
+# Note that due to the high value of patches needed to fix L1TF the package is
+# brought up to the state of the staging-4.11 branch. This can be removed when
+# 4.11.1 is released.
+EXTRA_PATCHES+=	${FILESDIR}/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch:-p1 \
+		${FILESDIR}/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch:-p1 \
+		${FILESDIR}/0003-x86-spec-ctrl-command-line-handling-adjustments.patch:-p1 \
+		${FILESDIR}/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch:-p1 \
+		${FILESDIR}/0006-allow-cpu_down-to-be-called-earlier.patch:-p1 \
+		${FILESDIR}/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch:-p1 \
+		${FILESDIR}/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch:-p1 \
+		${FILESDIR}/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch:-p1 \
+		${FILESDIR}/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch:-p1 \
+		${FILESDIR}/0011-x86-possibly-bring-up-all-CPUs-even-if-not-all-are-s.patch:-p1 \
+		${FILESDIR}/0012-x86-command-line-option-to-avoid-use-of-secondary-hy.patch:-p1 \
+		${FILESDIR}/0013-x86-vmx-Don-t-clobber-dr6-while-debugging-state-is-l.patch:-p1 \
+		${FILESDIR}/0014-x86-xstate-Use-a-guests-CPUID-policy-rather-than-all.patch:-p1 \
+		${FILESDIR}/0015-x86-xstate-Make-errors-in-xstate-calculations-more-o.patch:-p1 \
+		${FILESDIR}/0016-x86-hvm-Disallow-unknown-MSR_EFER-bits.patch:-p1 \
+		${FILESDIR}/0017-x86-spec-ctrl-Fix-the-parsing-of-xpti-on-fixed-Intel.patch:-p1 \
+		${FILESDIR}/0018-x86-spec-ctrl-Yet-more-fixes-for-xpti-parsing.patch:-p1 \
+		${FILESDIR}/0019-x86-vmx-Fix-handing-of-MSR_DEBUGCTL-on-VMExit.patch:-p1 \
+		${FILESDIR}/0020-x86-vmx-Defer-vmx_vmcs_exit-as-long-as-possible-in-c.patch:-p1 \
+		${FILESDIR}/0021-x86-vmx-API-improvements-for-MSR-load-save-infrastru.patch:-p1 \
+		${FILESDIR}/0022-x86-vmx-Internal-cleanup-for-MSR-load-save-infrastru.patch:-p1 \
+		${FILESDIR}/0023-x86-vmx-Factor-locate_msr_entry-out-of-vmx_find_msr-.patch:-p1 \
+		${FILESDIR}/0024-x86-vmx-Support-remote-access-to-the-MSR-lists.patch:-p1 \
+		${FILESDIR}/0025-x86-vmx-Improvements-to-LBR-MSR-handling.patch:-p1 \
+		${FILESDIR}/0026-x86-vmx-Pass-an-MSR-value-into-vmx_msr_add.patch:-p1 \
+		${FILESDIR}/0027-x86-vmx-Support-load-only-guest-MSR-list-entries.patch:-p1 \
+		${FILESDIR}/0028-VMX-fix-vmx_-find-del-_msr-build.patch:-p1 \
+		${FILESDIR}/0029-ARM-disable-grant-table-v2.patch:-p1 \
+		${FILESDIR}/0030-x86-vtx-Fix-the-checking-for-unknown-invalid-MSR_DEB.patch:-p1 \
+		${FILESDIR}/0032-x86-spec-ctrl-Calculate-safe-PTE-addresses-for-L1TF-.patch:-p1 \
+		${FILESDIR}/0033-x86-spec-ctrl-Introduce-an-option-to-control-L1TF-mi.patch:-p1 \
+		${FILESDIR}/0034-x86-shadow-Infrastructure-to-force-a-PV-guest-into-s.patch:-p1 \
+		${FILESDIR}/0035-x86-mm-Plumbing-to-allow-any-PTE-update-to-fail-with.patch:-p1 \
+		${FILESDIR}/0036-x86-pv-Force-a-guest-into-shadow-mode-when-it-writes.patch:-p1 \
+		${FILESDIR}/0037-x86-spec-ctrl-CPUID-MSR-definitions-for-L1D_FLUSH.patch:-p1 \
+		${FILESDIR}/0038-x86-msr-Virtualise-MSR_FLUSH_CMD-for-guests.patch:-p1 \
+		${FILESDIR}/0039-x86-spec-ctrl-Introduce-an-option-to-control-L1D_FLU.patch:-p1 \
+		${FILESDIR}/0040-x86-Make-spec-ctrl-no-a-global-disable-of-all-mitiga.patch:-p1 \
+		${FILESDIR}/0042-x86-write-to-correct-variable-in-parse_pv_l1tf.patch:-p1
 .include <bsd.port.options.mk>

Added: head/emulators/xen-kernel411/files/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/emulators/xen-kernel411/files/0001-xen-Port-the-array_index_nospec-infrastructure-from-.patch	Thu Aug 16 09:02:02 2018	(r477316)
@@ -0,0
+1,213 @@ +From e932371d6ae0f69b89abb2dce725483c75356de2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Mon, 30 Jul 2018 11:17:27 +0200 +Subject: [PATCH 01/42] xen: Port the array_index_nospec() infrastructure from + Linux + +This is as the infrastructure appeared in Linux 4.17, adapted slightly for +Xen. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Signed-off-by: Julien Grall <julien.grall@arm.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +master commit: 2ddfae51d8b1d7b8cd33a4f6ad4d16d27cb869ae +master date: 2018-07-06 16:49:57 +0100 +--- + xen/include/asm-arm/arm32/system.h | 18 ++++++++ + xen/include/asm-arm/arm64/system.h | 22 ++++++++++ + xen/include/asm-x86/system.h | 24 ++++++++++ + xen/include/xen/compiler.h | 3 ++ + xen/include/xen/nospec.h | 70 ++++++++++++++++++++++++++++++ + 5 files changed, 137 insertions(+) + create mode 100644 xen/include/xen/nospec.h + +diff --git a/xen/include/asm-arm/arm32/system.h b/xen/include/asm-arm/arm32/system.h +index c617b40438..ab57abfbc5 100644 +--- a/xen/include/asm-arm/arm32/system.h ++++ b/xen/include/asm-arm/arm32/system.h +@@ -48,6 +48,24 @@ static inline int local_fiq_is_enabled(void) + return !(flags & PSR_FIQ_MASK); + } + ++#define CSDB ".inst 0xe320f014" ++ ++static inline unsigned long array_index_mask_nospec(unsigned long idx, ++ unsigned long sz) ++{ ++ unsigned long mask; ++ ++ asm volatile( "cmp %1, %2\n" ++ "sbc %0, %1, %1\n" ++ CSDB ++ : "=r" (mask) ++ : "r" (idx), "Ir" (sz) ++ : "cc" ); ++ ++ return mask; ++} ++#define array_index_mask_nospec array_index_mask_nospec ++ + #endif + /* + * Local variables: +diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h +index 2e2ee212a1..2e36573ac6 100644 +--- a/xen/include/asm-arm/arm64/system.h ++++ b/xen/include/asm-arm/arm64/system.h +@@ -58,6 +58,28 @@ static inline int local_fiq_is_enabled(void) + return !(flags & PSR_FIQ_MASK); + } + ++#define csdb() asm volatile ( "hint #20" : : : "memory" ) ++ ++/* ++ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz ++ * and 0 otherwise. ++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long idx, ++ unsigned long sz) ++{ ++ unsigned long mask; ++ ++ asm volatile ( "cmp %1, %2\n" ++ "sbc %0, xzr, xzr\n" ++ : "=r" (mask) ++ : "r" (idx), "Ir" (sz) ++ : "cc" ); ++ csdb(); ++ ++ return mask; ++} ++#define array_index_mask_nospec array_index_mask_nospec ++ + #endif + /* + * Local variables: +diff --git a/xen/include/asm-x86/system.h b/xen/include/asm-x86/system.h +index 43fb6fe489..483cd20afd 100644 +--- a/xen/include/asm-x86/system.h ++++ b/xen/include/asm-x86/system.h +@@ -221,6 +221,30 @@ static always_inline unsigned long __xadd( + #define set_mb(var, value) do { xchg(&var, value); } while (0) + #define set_wmb(var, value) do { var = value; smp_wmb(); } while (0) + ++/** ++ * array_index_mask_nospec() - generate a mask that is ~0UL when the ++ * bounds check succeeds and 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * Returns: ++ * 0 - (index < size) ++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ unsigned long mask; ++ ++ asm volatile ( "cmp %[size], %[index]; sbb %[mask], %[mask];" ++ : [mask] "=r" (mask) ++ : [size] "g" (size), [index] "r" (index) ); ++ ++ return mask; ++} ++ ++/* Override default implementation in nospec.h. 
*/ ++#define array_index_mask_nospec array_index_mask_nospec ++ + #define local_irq_disable() asm volatile ( "cli" : : : "memory" ) + #define local_irq_enable() asm volatile ( "sti" : : : "memory" ) + +diff --git a/xen/include/xen/compiler.h b/xen/include/xen/compiler.h +index 533a8ea0f3..a7e05681c9 100644 +--- a/xen/include/xen/compiler.h ++++ b/xen/include/xen/compiler.h +@@ -81,6 +81,9 @@ + #pragma GCC visibility push(hidden) + #endif + ++/* Make the optimizer believe the variable can be manipulated arbitrarily. */ ++#define OPTIMIZER_HIDE_VAR(var) __asm__ ( "" : "+g" (var) ) ++ + /* This macro obfuscates arithmetic on a variable address so that gcc + shouldn't recognize the original var, and make assumptions about it */ + /* +diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h +new file mode 100644 +index 0000000000..48793996e8 +--- /dev/null ++++ b/xen/include/xen/nospec.h +@@ -0,0 +1,70 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Copyright(c) 2018 Linus Torvalds. All rights reserved. */ ++/* Copyright(c) 2018 Alexei Starovoitov. All rights reserved. */ ++/* Copyright(c) 2018 Intel Corporation. All rights reserved. */ ++/* Copyright(c) 2018 Citrix Systems R&D Ltd. All rights reserved. */ ++ ++#ifndef XEN_NOSPEC_H ++#define XEN_NOSPEC_H ++ ++#include <asm/system.h> ++ ++/** ++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * When @index is out of bounds (@index >= @size), the sign bit will be ++ * set. Extend the sign bit to all bits and invert, giving a result of ++ * zero for an out of bounds index, or ~0 if within bounds [0, @size). ++ */ ++#ifndef array_index_mask_nospec ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ /* ++ * Always calculate and emit the mask even if the compiler ++ * thinks the mask is not needed. The compiler does not take ++ * into account the value of @index under speculation. ++ */ ++ OPTIMIZER_HIDE_VAR(index); ++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); ++} ++#endif ++ ++/* ++ * array_index_nospec - sanitize an array index after a bounds check ++ * ++ * For a code sequence like: ++ * ++ * if (index < size) { ++ * index = array_index_nospec(index, size); ++ * val = array[index]; ++ * } ++ * ++ * ...if the CPU speculates past the bounds check then ++ * array_index_nospec() will clamp the index within the range of [0, ++ * size). 
++ */ ++#define array_index_nospec(index, size) \ ++({ \ ++ typeof(index) _i = (index); \ ++ typeof(size) _s = (size); \ ++ unsigned long _mask = array_index_mask_nospec(_i, _s); \ ++ \ ++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ ++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ ++ \ ++ (typeof(_i)) (_i & _mask); \ ++}) ++ ++#endif /* XEN_NOSPEC_H */ ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0002-x86-correctly-set-nonlazy_xstate_used-when-loading-f.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,51 @@ +From da33530ab393dcc04d3e35424956277669b8d8ce Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 30 Jul 2018 11:18:54 +0200 +Subject: [PATCH 02/42] x86: correctly set nonlazy_xstate_used when loading + full state + +In this case, just like xcr0_accum, nonlazy_xstate_used should always be +set to the intended new value, rather than possibly leaving the flag set +from a prior state load. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Wei Liu <wei.liu2@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: f46bf0e101ca63118b9db2616e8f51e972d7f563 +master date: 2018-07-09 10:51:02 +0200 +--- + xen/arch/x86/domctl.c | 3 +-- + xen/arch/x86/hvm/hvm.c | 3 +-- + 2 files changed, 2 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c +index 8fbbf3aeb3..b04388d663 100644 +--- a/xen/arch/x86/domctl.c ++++ b/xen/arch/x86/domctl.c +@@ -1187,8 +1187,7 @@ long arch_do_domctl( + vcpu_pause(v); + v->arch.xcr0 = _xcr0; + v->arch.xcr0_accum = _xcr0_accum; +- if ( _xcr0_accum & XSTATE_NONLAZY ) +- v->arch.nonlazy_xstate_used = 1; ++ v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY; + compress_xsave_states(v, _xsave_area, + evc->size - PV_XSAVE_HDR_SIZE); + vcpu_unpause(v); +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index c23983cdff..279cb88e45 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -1324,8 +1324,7 @@ static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) + + v->arch.xcr0 = ctxt->xcr0; + v->arch.xcr0_accum = ctxt->xcr0_accum; +- if ( ctxt->xcr0_accum & XSTATE_NONLAZY ) +- v->arch.nonlazy_xstate_used = 1; ++ v->arch.nonlazy_xstate_used = ctxt->xcr0_accum & XSTATE_NONLAZY; + compress_xsave_states(v, &ctxt->save_area, + size - offsetof(struct hvm_hw_cpu_xsave, save_area)); + +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0003-x86-spec-ctrl-command-line-handling-adjustments.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0003-x86-spec-ctrl-command-line-handling-adjustments.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,45 @@ +From 4bdeedbd611c59f07878eb22955f655a81452835 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 30 Jul 2018 11:19:41 +0200 +Subject: [PATCH 03/42] x86/spec-ctrl: command line handling adjustments + +For one, "no-xen" should not imply "no-eager-fpu", as "eager FPU" mode +is to guard guests, not Xen itself, which is also expressed so by 
+print_details(). + +And then opt_ssbd, despite being off by default, should also be cleared +by the "no" and "no-xen" sub-options. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: ac3f9a72141a48d40fabfff561d5a7dc0e1b810d +master date: 2018-07-10 12:22:31 +0200 +--- + xen/arch/x86/spec_ctrl.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 08e6784c4c..73dc7170c7 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -124,6 +124,8 @@ static int __init parse_spec_ctrl(const char *s) + opt_msr_sc_pv = false; + opt_msr_sc_hvm = false; + ++ opt_eager_fpu = 0; ++ + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +@@ -131,7 +133,7 @@ static int __init parse_spec_ctrl(const char *s) + opt_thunk = THUNK_JMP; + opt_ibrs = 0; + opt_ibpb = false; +- opt_eager_fpu = 0; ++ opt_ssbd = false; + } + else if ( val > 0 ) + rc = -EINVAL; +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0005-mm-page_alloc-correct-first_dirty-calculations-durin.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,66 @@ +From ac35e050b64a565fe234dd42e8dac163e946e58d Mon Sep 17 00:00:00 2001 +From: Sergey Dyasli <sergey.dyasli@citrix.com> +Date: Mon, 30 Jul 2018 11:21:28 +0200 +Subject: [PATCH 05/42] mm/page_alloc: correct first_dirty calculations during + block merging + +Currently it's possible to hit an assertion in alloc_heap_pages(): + +Assertion 'first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub)' failed at page_alloc.c:988 + +This can happen because a piece of logic to calculate first_dirty +during block merging in free_heap_pages() is missing for the following +scenario: + +1. Current block's first_dirty equals to INVALID_DIRTY_IDX +2. Successor block is free but its first_dirty != INVALID_DIRTY_IDX +3. The successor is merged into current block +4. Current block's first_dirty still equals to INVALID_DIRTY_IDX + +This will trigger the assertion during allocation of such block in +alloc_heap_pages() because there will be pages with PGC_need_scrub +bit set despite the claim of first_dirty that the block is scrubbed. + +Add the missing piece of logic and slightly update the comment for +the predecessor case to better capture the code's intent. + +Fixes 1a37f33ea613 ("mm: Place unscrubbed pages at the end of pagelist") + +Signed-off-by: Sergey Dyasli <sergey.dyasli@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> +master commit: 1e2df9608857b5355f2ec3b1a34b87a2007dcd16 +master date: 2018-07-12 10:45:11 +0200 +--- + xen/common/page_alloc.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c +index 20ee1e4897..02aeed7c47 100644 +--- a/xen/common/page_alloc.c ++++ b/xen/common/page_alloc.c +@@ -1426,7 +1426,7 @@ static void free_heap_pages( + + page_list_del(predecessor, &heap(node, zone, order)); + +- /* Keep predecessor's first_dirty if it is already set. */ ++ /* Update predecessor's first_dirty if necessary. 
*/ + if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX && + pg->u.free.first_dirty != INVALID_DIRTY_IDX ) + predecessor->u.free.first_dirty = (1U << order) + +@@ -1447,6 +1447,12 @@ static void free_heap_pages( + + check_and_stop_scrub(successor); + ++ /* Update pg's first_dirty if necessary. */ ++ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX && ++ successor->u.free.first_dirty != INVALID_DIRTY_IDX ) ++ pg->u.free.first_dirty = (1U << order) + ++ successor->u.free.first_dirty; ++ + page_list_del(successor, &heap(node, zone, order)); + } + +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0006-allow-cpu_down-to-be-called-earlier.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0006-allow-cpu_down-to-be-called-earlier.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,58 @@ +From a44cf0c8728e08858638170a057675ca5479fdc7 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 30 Jul 2018 11:22:06 +0200 +Subject: [PATCH 06/42] allow cpu_down() to be called earlier + +The function's use of the stop-machine logic has so far prevented its +use ahead of the processing of the "ordinary" initcalls. Since at this +early time we're in a controlled environment anyway, there's no need for +such a heavy tool. Additionally this ought to have less of a performance +impact especially on large systems, compared to the alternative of +making stop-machine functionality available earlier. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Wei Liu <wei.liu2@citrix.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 5894c0a2da66243a89088d309c7e1ea212ab28d6 +master date: 2018-07-16 15:15:12 +0200 +--- + xen/common/cpu.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/xen/common/cpu.c b/xen/common/cpu.c +index 6350f150bd..653a56b840 100644 +--- a/xen/common/cpu.c ++++ b/xen/common/cpu.c +@@ -67,12 +67,17 @@ void __init register_cpu_notifier(struct notifier_block *nb) + spin_unlock(&cpu_add_remove_lock); + } + +-static int take_cpu_down(void *unused) ++static void _take_cpu_down(void *unused) + { + void *hcpu = (void *)(long)smp_processor_id(); + int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL); + BUG_ON(notifier_rc != NOTIFY_DONE); + __cpu_disable(); ++} ++ ++static int take_cpu_down(void *arg) ++{ ++ _take_cpu_down(arg); + return 0; + } + +@@ -98,7 +103,9 @@ int cpu_down(unsigned int cpu) + goto fail; + } + +- if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 ) ++ if ( unlikely(system_state < SYS_STATE_active) ) ++ on_selected_cpus(cpumask_of(cpu), _take_cpu_down, NULL, true); ++ else if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 ) + goto fail; + + __cpu_die(cpu); +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0007-x86-svm-Fixes-and-cleanup-to-svm_inject_event.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,109 @@ +From b53e0defcea1400c03f83d1d5cc30a3b237c8cfe Mon Sep 17 00:00:00 2001 +From: Andrew Cooper <andrew.cooper3@citrix.com> +Date: Mon, 30 Jul 2018 11:22:42 +0200 +Subject: [PATCH 07/42] x86/svm Fixes and cleanup to svm_inject_event() + + * State adjustments (and 
debug tracing) for #DB/#BP/#PF should not be done + for `int $n` instructions. Updates to %cr2 occur even if the exception + combines to #DF. + * Don't opencode DR_STEP when updating %dr6. + * Simplify the logic for calling svm_emul_swint_injection() as in the common + case, every condition needs checking. + * Fix comments which have become stale as code has moved between components. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> +master commit: 8dab867c81ede455009028a9a88edc4ff3b9da88 +master date: 2018-07-17 10:12:40 +0100 +--- + xen/arch/x86/hvm/svm/svm.c | 41 ++++++++++++++++---------------------- + 1 file changed, 17 insertions(+), 24 deletions(-) + +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index 165500e3f2..b964c59dad 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -1432,24 +1432,18 @@ static void svm_inject_event(const struct x86_event *event) + * Xen must emulate enough of the event injection to be sure that a + * further fault shouldn't occur during delivery. This covers the fact + * that hardware doesn't perform DPL checking on injection. +- * +- * Also, it accounts for proper positioning of %rip for an event with trap +- * semantics (where %rip should point after the instruction) which suffers +- * a fault during injection (at which point %rip should point at the +- * instruction). + */ + if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION || +- (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT || +- event->type == X86_EVENTTYPE_SW_EXCEPTION)) ) ++ (!cpu_has_svm_nrips && (event->type >= X86_EVENTTYPE_SW_INTERRUPT)) ) + svm_emul_swint_injection(&_event); + +- switch ( _event.vector ) ++ switch ( _event.vector | -(_event.type == X86_EVENTTYPE_SW_INTERRUPT) ) + { + case TRAP_debug: + if ( regs->eflags & X86_EFLAGS_TF ) + { + __restore_debug_registers(vmcb, curr); +- vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000); ++ vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | DR_STEP); + } + /* fall through */ + case TRAP_int3: +@@ -1459,6 +1453,13 @@ static void svm_inject_event(const struct x86_event *event) + domain_pause_for_debugger(); + return; + } ++ break; ++ ++ case TRAP_page_fault: ++ ASSERT(_event.type == X86_EVENTTYPE_HW_EXCEPTION); ++ curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2; ++ vmcb_set_cr2(vmcb, _event.cr2); ++ break; + } + + if ( unlikely(eventinj.fields.v) && +@@ -1481,13 +1482,9 @@ static void svm_inject_event(const struct x86_event *event) + * icebp, software events with trap semantics need emulating, so %rip in + * the trap frame points after the instruction. + * +- * The x86 emulator (if requested by the x86_swint_emulate_* choice) will +- * have performed checks such as presence/dpl/etc and believes that the +- * event injection will succeed without faulting. +- * +- * The x86 emulator will always provide fault semantics for software +- * events, with _trap.insn_len set appropriately. If the injection +- * requires emulation, move %rip forwards at this point. ++ * svm_emul_swint_injection() has already confirmed that events with trap ++ * semantics won't fault on injection. Position %rip/NextRIP suitably, ++ * and restrict the event type to what hardware will tolerate. 
+ */ + switch ( _event.type ) + { +@@ -1544,16 +1541,12 @@ static void svm_inject_event(const struct x86_event *event) + eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode); + vmcb->eventinj = eventinj; + +- if ( _event.vector == TRAP_page_fault ) +- { +- curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2; +- vmcb_set_cr2(vmcb, _event.cr2); +- HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2)); +- } ++ if ( _event.vector == TRAP_page_fault && ++ _event.type == X86_EVENTTYPE_HW_EXCEPTION ) ++ HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, ++ TRC_PAR_LONG(_event.cr2)); + else +- { + HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code); +- } + } + + static int svm_event_pending(struct vcpu *v) +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0008-cpupools-fix-state-when-downing-a-CPU-failed.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,55 @@ +From 0a2016ca2fabfe674c311dcfd8e15fec0ba3f7b6 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 30 Jul 2018 11:23:22 +0200 +Subject: [PATCH 08/42] cpupools: fix state when downing a CPU failed + +While I've run into the issue with further patches in place which no +longer guarantee the per-CPU area to start out as all zeros, the +CPU_DOWN_FAILED processing looks to have the same issue: By not zapping +the per-CPU cpupool pointer, cpupool_cpu_add()'s (indirect) invocation +of schedule_cpu_switch() will trigger the "c != old_pool" assertion +there. + +Clearing the field during CPU_DOWN_PREPARE is too early (afaict this +should not happen before cpu_disable_scheduler()). Clearing it in +CPU_DEAD and CPU_DOWN_FAILED would be an option, but would take the same +piece of code twice. Since the field's value shouldn't matter while the +CPU is offline, simply clear it (implicitly) for CPU_ONLINE and +CPU_DOWN_FAILED, but only for other than the suspend/resume case (which +gets specially handled in cpupool_cpu_remove()). + +By adjusting the conditional in cpupool_cpu_add() CPU_DOWN_FAILED +handling in the suspend case should now also be handled better. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +master commit: cb1ae9a27819cea0c5008773c68a7be6f37eb0e5 +master date: 2018-07-19 09:41:55 +0200 +--- + xen/common/cpupool.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c +index 999839444e..1e8edcbd57 100644 +--- a/xen/common/cpupool.c ++++ b/xen/common/cpupool.c +@@ -490,7 +490,7 @@ static int cpupool_cpu_add(unsigned int cpu) + cpumask_clear_cpu(cpu, &cpupool_locked_cpus); + cpumask_set_cpu(cpu, &cpupool_free_cpus); + +- if ( system_state == SYS_STATE_resume ) ++ if ( system_state == SYS_STATE_suspend || system_state == SYS_STATE_resume ) + { + struct cpupool **c; + +@@ -522,6 +522,7 @@ static int cpupool_cpu_add(unsigned int cpu) + * (or unplugging would have failed) and that is the default behavior + * anyway. 
+ */ ++ per_cpu(cpupool, cpu) = NULL; + ret = cpupool_assign_cpu_locked(cpupool0, cpu); + } + out: +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0009-x86-AMD-distinguish-compute-units-from-hyper-threads.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,121 @@ +From bd51a6424202a5f1cd13dee6614bcb69ecbd2458 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 30 Jul 2018 11:24:01 +0200 +Subject: [PATCH 09/42] x86/AMD: distinguish compute units from hyper-threads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Fam17 replaces CUs by HTs, which we should reflect accordingly, even if +the difference is not very big. The most relevant change (requiring some +code restructuring) is that the topoext feature no longer means there is +a valid CU ID. + +Take the opportunity and convert wrongly plain int variables in +set_cpu_sibling_map() to unsigned int. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Brian Woods <brian.woods@amd.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 9429b07a0af7f92a5f25e4068e11db881e157495 +master date: 2018-07-19 09:42:42 +0200 +--- + xen/arch/x86/cpu/amd.c | 16 +++++++++++----- + xen/arch/x86/smpboot.c | 32 ++++++++++++++++++++------------ + 2 files changed, 31 insertions(+), 17 deletions(-) + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index 458a3fe60c..76078b55b2 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -505,17 +505,23 @@ static void amd_get_topology(struct cpuinfo_x86 *c) + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); +- c->compute_unit_id = ebx & 0xFF; + c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1; ++ ++ if (c->x86 < 0x17) ++ c->compute_unit_id = ebx & 0xFF; ++ else { ++ c->cpu_core_id = ebx & 0xFF; ++ c->x86_max_cores /= c->x86_num_siblings; ++ } + } + + if (opt_cpu_info) + printk("CPU %d(%d) -> Processor %d, %s %d\n", + cpu, c->x86_max_cores, c->phys_proc_id, +- cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" : +- "Core", +- cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id : +- c->cpu_core_id); ++ c->compute_unit_id != INVALID_CUID ? "Compute Unit" ++ : "Core", ++ c->compute_unit_id != INVALID_CUID ? 
c->compute_unit_id ++ : c->cpu_core_id); + } + + static void early_init_amd(struct cpuinfo_x86 *c) +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index d4478e6132..78ba73578a 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -234,33 +234,41 @@ static void link_thread_siblings(int cpu1, int cpu2) + cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1)); + } + +-static void set_cpu_sibling_map(int cpu) ++static void set_cpu_sibling_map(unsigned int cpu) + { +- int i; ++ unsigned int i; + struct cpuinfo_x86 *c = cpu_data; + + cpumask_set_cpu(cpu, &cpu_sibling_setup_map); + + cpumask_set_cpu(cpu, socket_cpumask[cpu_to_socket(cpu)]); ++ cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu)); ++ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); + + if ( c[cpu].x86_num_siblings > 1 ) + { + for_each_cpu ( i, &cpu_sibling_setup_map ) + { +- if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) { +- if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && +- (c[cpu].compute_unit_id == c[i].compute_unit_id) ) ++ if ( cpu == i || c[cpu].phys_proc_id != c[i].phys_proc_id ) ++ continue; ++ if ( c[cpu].compute_unit_id != INVALID_CUID && ++ c[i].compute_unit_id != INVALID_CUID ) ++ { ++ if ( c[cpu].compute_unit_id == c[i].compute_unit_id ) ++ link_thread_siblings(cpu, i); ++ } ++ else if ( c[cpu].cpu_core_id != XEN_INVALID_CORE_ID && ++ c[i].cpu_core_id != XEN_INVALID_CORE_ID ) ++ { ++ if ( c[cpu].cpu_core_id == c[i].cpu_core_id ) + link_thread_siblings(cpu, i); +- } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && +- (c[cpu].cpu_core_id == c[i].cpu_core_id) ) { +- link_thread_siblings(cpu, i); + } ++ else ++ printk(XENLOG_WARNING ++ "CPU%u: unclear relationship with CPU%u\n", ++ cpu, i); + } + } +- else +- { +- cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); +- } + + if ( c[cpu].x86_max_cores == 1 ) + { +-- +2.18.0 + Added: head/emulators/xen-kernel411/files/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/emulators/xen-kernel411/files/0010-x86-distinguish-CPU-offlining-from-CPU-removal.patch Thu Aug 16 09:02:02 2018 (r477316) @@ -0,0 +1,423 @@ +From 5908b4866b682d9189c36eddf7c898fd95b27ec1 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Mon, 30 Jul 2018 11:24:53 +0200 +Subject: [PATCH 10/42] x86: distinguish CPU offlining from CPU removal + +In order to be able to service #MC on offlined CPUs, the GDT, IDT, +stack, and per-CPU data (which includes the TSS) need to be kept +allocated. They should only be freed upon CPU removal (which we +currently don't support, so some code is becoming effectively dead for +the moment). + +Note that for now park_offline_cpus doesn't get set to true anywhere - +this is going to be the subject of a subsequent patch. 
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Wei Liu <wei.liu2@citrix.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 2e6c8f182c9c50129b1c7a620242861e6ad6a9fb +master date: 2018-07-19 13:43:33 +0100 +--- + xen/arch/x86/cpu/mcheck/mce.c | 15 ++++++-- + xen/arch/x86/domain.c | 9 +++-- + xen/arch/x86/genapic/x2apic.c | 9 +++-- + xen/arch/x86/percpu.c | 9 +++-- + xen/arch/x86/smpboot.c | 71 ++++++++++++++++++++++------------- + xen/include/asm-x86/smp.h | 2 + + xen/include/xen/cpu.h | 2 + + xen/include/xen/cpumask.h | 23 ++++++++++++ + xen/include/xen/mm.h | 8 ++++ + xen/include/xen/xmalloc.h | 6 +++ + 10 files changed, 115 insertions(+), 39 deletions(-) + +diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c +index a8c287d124..32273d9208 100644 +--- a/xen/arch/x86/cpu/mcheck/mce.c ++++ b/xen/arch/x86/cpu/mcheck/mce.c +@@ -692,12 +692,15 @@ static void cpu_bank_free(unsigned int cpu) + + mcabanks_free(poll); + mcabanks_free(clr); ++ ++ per_cpu(poll_bankmask, cpu) = NULL; ++ per_cpu(mce_clear_banks, cpu) = NULL; + } + + static int cpu_bank_alloc(unsigned int cpu) + { +- struct mca_banks *poll = mcabanks_alloc(); +- struct mca_banks *clr = mcabanks_alloc(); ++ struct mca_banks *poll = per_cpu(poll_bankmask, cpu) ?: mcabanks_alloc(); ++ struct mca_banks *clr = per_cpu(mce_clear_banks, cpu) ?: mcabanks_alloc(); + + if ( !poll || !clr ) + { +@@ -725,7 +728,13 @@ static int cpu_callback( + + case CPU_UP_CANCELED: + case CPU_DEAD: +- cpu_bank_free(cpu); ++ if ( !park_offline_cpus ) ++ cpu_bank_free(cpu); ++ break; ++ ++ case CPU_REMOVE: ++ if ( park_offline_cpus ) ++ cpu_bank_free(cpu); + break; + } + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 9850a782ec..c39cf2c6e5 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -107,10 +107,11 @@ static void play_dead(void) + local_irq_disable(); + + /* +- * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible, +- * as they may be freed at any time. In this case, heap corruption or +- * #PF can occur (when heap debugging is enabled). For example, even +- * printk() can involve tasklet scheduling, which touches per-cpu vars. ++ * NOTE: After cpu_exit_clear, per-cpu variables may no longer accessible, ++ * as they may be freed at any time if offline CPUs don't get parked. In ++ * this case, heap corruption or #PF can occur (when heap debugging is ++ * enabled). For example, even printk() can involve tasklet scheduling, ++ * which touches per-cpu vars. + * + * Consider very carefully when adding code to *dead_idle. Most hypervisor + * subsystems are unsafe to call. 
+diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c +index 4779b0d0d5..d997806272 100644 +--- a/xen/arch/x86/genapic/x2apic.c ++++ b/xen/arch/x86/genapic/x2apic.c +@@ -201,18 +201,21 @@ static int update_clusterinfo( + if ( !cluster_cpus_spare ) + cluster_cpus_spare = xzalloc(cpumask_t); + if ( !cluster_cpus_spare || +- !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) ++ !cond_alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) + err = -ENOMEM; + break; + case CPU_UP_CANCELED: + case CPU_DEAD: ++ case CPU_REMOVE: ++ if ( park_offline_cpus == (action != CPU_REMOVE) ) ++ break; + if ( per_cpu(cluster_cpus, cpu) ) + { + cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu)); + if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) ) +- xfree(per_cpu(cluster_cpus, cpu)); ++ XFREE(per_cpu(cluster_cpus, cpu)); + } +- free_cpumask_var(per_cpu(scratch_mask, cpu)); ++ FREE_CPUMASK_VAR(per_cpu(scratch_mask, cpu)); + break; + } + +diff --git a/xen/arch/x86/percpu.c b/xen/arch/x86/percpu.c +index c9997b7937..8be4ebddf4 100644 +--- a/xen/arch/x86/percpu.c ++++ b/xen/arch/x86/percpu.c +@@ -28,7 +28,7 @@ static int init_percpu_area(unsigned int cpu) + char *p; + + if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA ) +- return -EBUSY; ++ return 0; + + if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL ) + return -ENOMEM; +@@ -76,9 +76,12 @@ static int cpu_percpu_callback( + break; + case CPU_UP_CANCELED: + case CPU_DEAD: +- free_percpu_area(cpu); ++ if ( !park_offline_cpus ) ++ free_percpu_area(cpu); + break; +- default: ++ case CPU_REMOVE: ++ if ( park_offline_cpus ) ++ free_percpu_area(cpu); + break; + } + +diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c +index 78ba73578a..7e76cc3d68 100644 +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -63,6 +63,8 @@ static cpumask_t scratch_cpu0mask; + cpumask_t cpu_online_map __read_mostly; + EXPORT_SYMBOL(cpu_online_map); + ++bool __read_mostly park_offline_cpus; ++ + unsigned int __read_mostly nr_sockets; + cpumask_t **__read_mostly socket_cpumask; + static cpumask_t *secondary_socket_cpumask; +@@ -895,7 +897,14 @@ static void cleanup_cpu_root_pgt(unsigned int cpu) + } + } + +-static void cpu_smpboot_free(unsigned int cpu) ++/* ++ * The 'remove' boolean controls whether a CPU is just getting offlined (and ++ * parked), or outright removed / offlined without parking. Parked CPUs need ++ * things like their stack, GDT, IDT, TSS, and per-CPU data still available. ++ * A few other items, in particular CPU masks, are also retained, as it's ++ * difficult to prove that they're entirely unreferenced from parked CPUs. ++ */ ++static void cpu_smpboot_free(unsigned int cpu, bool remove) + { + unsigned int order, socket = cpu_to_socket(cpu); + struct cpuinfo_x86 *c = cpu_data; +@@ -906,15 +915,19 @@ static void cpu_smpboot_free(unsigned int cpu) + socket_cpumask[socket] = NULL; + } + +- c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID; +- c[cpu].cpu_core_id = XEN_INVALID_CORE_ID; +- c[cpu].compute_unit_id = INVALID_CUID; + cpumask_clear_cpu(cpu, &cpu_sibling_setup_map); + +- free_cpumask_var(per_cpu(cpu_sibling_mask, cpu)); +- free_cpumask_var(per_cpu(cpu_core_mask, cpu)); +- if ( per_cpu(scratch_cpumask, cpu) != &scratch_cpu0mask ) +- free_cpumask_var(per_cpu(scratch_cpumask, cpu)); ++ if ( remove ) ++ { ++ c[cpu].phys_proc_id = XEN_INVALID_SOCKET_ID; ++ c[cpu].cpu_core_id = XEN_INVALID_CORE_ID; ++ c[cpu].compute_unit_id = INVALID_CUID; ++ *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
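For readers who stop at the truncated diff above: the core Spectre-v1 primitive imported by patch 0001 is the array_index_nospec()/array_index_mask_nospec() clamp. Below is a minimal standalone C sketch of how the pattern is meant to be used at a call site; it is not an actual Xen caller. TABLE_SIZE, msr_table and read_entry are made-up names, and the mask function is a copy of the generic C fallback quoted in the xen/include/xen/nospec.h hunk above (the real code additionally hides the index from the optimizer and provides per-architecture asm variants).

    #include <stdint.h>

    #define TABLE_SIZE 16                     /* hypothetical bound */

    static uint64_t msr_table[TABLE_SIZE];    /* hypothetical data */

    /* Generic C fallback from the patch: ~0UL when 0 <= idx < sz, 0 otherwise. */
    static inline unsigned long array_index_mask_nospec(unsigned long idx,
                                                        unsigned long sz)
    {
        return ~(long)(idx | (sz - 1UL - idx)) >> (sizeof(long) * 8 - 1);
    }

    uint64_t read_entry(unsigned int idx)
    {
        if ( idx >= TABLE_SIZE )              /* architectural bounds check */
            return 0;

        /*
         * Clamp the index so that, even if the CPU speculates past the
         * bounds check, the load stays within [0, TABLE_SIZE).
         */
        idx &= array_index_mask_nospec(idx, TABLE_SIZE);

        return msr_table[idx];
    }

The point of masking, as opposed to inserting a speculation barrier, is that the clamp is branch-free and cheap: it leaves the index unchanged architecturally, and forces it to 0 if the CPU speculates past the bounds check, so the subsequent load can never reach attacker-chosen out-of-bounds memory.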