Date:      Wed, 22 Apr 2015 15:55:21 +0300
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        arch@freebsd.org, amd64@freebsd.org
Subject:   Cx MWAIT
Message-ID:  <20150422125521.GQ2390@kib.kiev.ua>

Below is a patch to start using mwait instead of the 'legacy' port read
to enter the higher Cx states when idle.  This is Intel's recommended
way of entering Cx, using hints provided through the vendor-specific
fixed function hardware (FFH) GAS encoding; see the "Intel(R) Processor
Vendor-Specific ACPI Interface Specification", revision 007.  The patch
was written after I became interested in why my Haswell desktop test
box does not report any C-states besides C1.  That turned out to be due
to a combination of a BIOS misconfiguration and the FreeBSD code
lacking mwait support.

Also supported is an enhanced C1 entry sequence, "I/O then halt", which
coordinates C1 entry with the PCH.  Uses of the "sti; hlt" sequence
were consolidated into calls to acpi_cpu_c1().
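
The C1 sequence amounts to a dummy read of the BIOS-provided port
followed by the usual halt.  A sketch of what the idle path in the
patch does (names simplified; cf. acpi_cpu_idle() below):

    /* "I/O then halt": the port read signals C1 entry to the PCH. */
    if (cx->p_lvlx != NULL)
	(void)bus_read_1(cx->p_lvlx, 0);
    __asm __volatile("sti; hlt");	/* i.e. acpi_cpu_c1() */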

Intel hardware automatically derives the per-core and per-package
states by aggregating the thread-local C-states; this is what is
advertised as "hardware-coordinated" C-state entry.  It is
theoretically possible for an OS to have to handle software-coordinated
package C-state entry, but I am not aware of any real processors that
need this mode.  Intel is hw-coordinated, and it seems that AMD does
not advertise an mwait sequence for C-states at all.
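
Note that the break-on-interrupt behaviour which makes mwait usable in
the idle loop is itself optional.  It is advertised in CPUID leaf 5,
and checking those ECX bits is all cpu_mwait_usable() below does:

    /* CPUID.05H:ECX bits, as defined in machine/specialreg.h. */
    #define CPUID5_MON_MWAIT_EXT	0x0001	/* extensions enumerated */
    #define CPUID5_MWAIT_INTRBREAK	0x0002	/* wake on masked intr */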

I know that BIOS _CST tables are believed to be buggy.  In particular,
for Linux, Intel wrote a driver that hard-codes per-model tables with
the encoding of supported C-states, latencies and cache/bus-mastering
behaviour.  I agree with avg that we cannot support this approach.

I tried to keep dev/acpica/acpi_cpu.c as MI as possible.  At least, all
mwait-specific code is put under #ifdef for x86.  The
acpi_PkgFFH_IntelCpu() helper that parses the Intel FFH GAS is MI, but
only usable on x86; I believe this is fine.  Note that ACPI is
currently only used on x86 (we lost ia64), but it might be used on arm
shortly.
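
To see the effect on a given box, the existing sysctls suffice; the
output below is illustrative only:

    # sysctl dev.cpu.0.cx_supported
    dev.cpu.0.cx_supported: C1/1/1 C2/2/41 C3/3/87
    # sysctl hw.acpi.cpu.cx_lowest=C3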

diff --git a/sys/amd64/acpica/acpi_machdep.c b/sys/amd64/acpica/acpi_machdep.c
index 049b51bb4e..8f88a00 100644
--- a/sys/amd64/acpica/acpi_machdep.c
+++ b/sys/amd64/acpica/acpi_machdep.c
@@ -87,13 +87,6 @@ acpi_machdep_quirks(int *quirks)
 	return (0);
 }
 
-void
-acpi_cpu_c1()
-{
-
-	__asm __volatile("sti; hlt");
-}
-
 /*
  * Support for mapping ACPI tables during early boot.  Currently this
  * uses the crashdump map to map each table.  However, the crashdump
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index 9083421..0813e5f 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -91,6 +91,7 @@ struct	dumperinfo;
 void	*alloc_fpusave(int flags);
 void	amd64_syscall(struct thread *td, int traced);
 void	busdma_swi(void);
+bool	cpu_mwait_usable(void);
 void	cpu_probe_amdc1e(void);
 void	cpu_setregs(void);
 void	doreti_iret(void) __asm(__STRING(doreti_iret));
diff --git a/sys/dev/acpica/acpi_cpu.c b/sys/dev/acpica/acpi_cpu.c
index 8df2782..3fb21a6 100644
--- a/sys/dev/acpica/acpi_cpu.c
+++ b/sys/dev/acpica/acpi_cpu.c
@@ -47,6 +47,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/bus.h>
 #if defined(__amd64__) || defined(__i386__)
 #include <machine/clock.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
 #endif
 #include <sys/rman.h>
 
@@ -70,6 +72,10 @@ struct acpi_cx {
     uint32_t		 power;		/* Power consumed (mW). */
     int			 res_type;	/* Resource type for p_lvlx. */
     int			 res_rid;	/* Resource ID for p_lvlx. */
+    bool		 do_mwait;
+    uint32_t		 mwait_hint;
+    bool		 mwait_hw_coord;
+    bool		 mwait_bm_avoidance;
 };
 #define MAX_CX_STATES	 8
 
@@ -128,6 +134,12 @@ struct acpi_cpu_device {
 #define PIIX4_STOP_BREAK_MASK	(PIIX4_BRLD_EN_IRQ0 | PIIX4_BRLD_EN_IRQ | PIIX4_BRLD_EN_IRQ8)
 #define PIIX4_PCNTRL_BST_EN	(1<<10)
 
+#define	CST_FFH_VENDOR_INTEL	1
+#define	CST_FFH_INTEL_CL_C1IO	1
+#define	CST_FFH_INTEL_CL_MWAIT	2
+#define	CST_FFH_MWAIT_HW_COORD	0x0001
+#define	CST_FFH_MWAIT_BM_AVOID	0x0002
+
 /* Allow users to ignore processor orders in MADT. */
 static int cpu_unordered;
 SYSCTL_INT(_debug_acpi, OID_AUTO, cpu_unordered, CTLFLAG_RDTUN,
@@ -348,7 +360,17 @@ acpi_cpu_attach(device_t dev)
      * so advertise this ourselves.  Note this is not the same as independent
      * SMP control where each CPU can have different settings.
      */
-    sc->cpu_features = ACPI_CAP_SMP_SAME | ACPI_CAP_SMP_SAME_C3;
+    sc->cpu_features = ACPI_CAP_SMP_SAME | ACPI_CAP_SMP_SAME_C3 |
+      ACPI_CAP_C1_IO_HALT;
+
+#if defined(__i386__) || defined(__amd64__)
+    /*
+     * Ask for MWAIT modes if interrupts work reasonably with MWAIT.
+     */
+    if (cpu_mwait_usable())
+	sc->cpu_features |= ACPI_CAP_SMP_C1_NATIVE | ACPI_CAP_SMP_C3_NATIVE;
+#endif
+
     if (devclass_get_drivers(acpi_cpu_devclass, &drivers, &drv_count) == 0) {
 	for (i = 0; i < drv_count; i++) {
 	    if (ACPI_GET_FEATURES(drivers[i], &features) == 0)
@@ -720,6 +742,27 @@ acpi_cpu_generic_cx_probe(struct acpi_cpu_softc *sc)
     }
 }
 
+static void
+acpi_cpu_cx_cst_mwait(struct acpi_cx *cx_ptr, uint64_t address, int accsize)
+{
+
+	cx_ptr->do_mwait = true;
+	cx_ptr->mwait_hint = address & 0xffffffff;
+	cx_ptr->mwait_hw_coord = (accsize & CST_FFH_MWAIT_HW_COORD) != 0;
+	cx_ptr->mwait_bm_avoidance = (accsize & CST_FFH_MWAIT_BM_AVOID) != 0;
+}
+
+static void
+acpi_cpu_cx_cst_free_plvlx(device_t cpu_dev, struct acpi_cx *cx_ptr)
+{
+
+	if (cx_ptr->p_lvlx == NULL)
+		return;
+	bus_release_resource(cpu_dev, cx_ptr->res_type, cx_ptr->res_rid,
+	    cx_ptr->p_lvlx);
+	cx_ptr->p_lvlx = NULL;
+}
+
 /*
  * Parse a _CST package and set up its Cx states.  Since the _CST object
  * can change dynamically, our notify handler may call this function
@@ -734,7 +777,8 @@ acpi_cpu_cx_cst(struct acpi_cpu_softc *sc)
     ACPI_OBJECT	*top;
     ACPI_OBJECT	*pkg;
     uint32_t	 count;
-    int		 i;
+    uint64_t	 address;
+    int		 i, vendor, class, accsize;
 
     ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__);
 
@@ -790,6 +834,30 @@ acpi_cpu_cx_cst(struct acpi_cpu_softc *sc)
 	/* Validate the state to see if we should use it. */
 	switch (cx_ptr->type) {
 	case ACPI_STATE_C1:
+	    acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr);
+#if defined(__i386__) || defined(__amd64__)
+	    if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address,
+	      &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL) {
+		if (class == CST_FFH_INTEL_CL_C1IO) {
+		    /* C1 I/O then Halt */
+		    cx_ptr->res_rid = sc->cpu_cx_count;
+		    bus_set_resource(sc->cpu_dev, SYS_RES_IOPORT,
+		      cx_ptr->res_rid, address, 1);
+		    cx_ptr->p_lvlx = bus_alloc_resource_any(sc->cpu_dev,
+		      SYS_RES_IOPORT, &cx_ptr->res_rid, RF_ACTIVE |
+		      RF_SHAREABLE);
+		    if (cx_ptr->p_lvlx == NULL) {
+			bus_delete_resource(sc->cpu_dev, SYS_RES_IOPORT,
+			  cx_ptr->res_rid);
+			device_printf(sc->cpu_dev,
+			  "C1 I/O failed to allocate port %d, "
+			  "degrading to C1 Halt", (int)address);
+		    }
+		} else if (class == CST_FFH_INTEL_CL_MWAIT) {
+		    acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize);
+		}
+	    }
+#endif
 	    if (sc->cpu_cx_states[0].type == ACPI_STATE_C0) {
 		/* This is the first C1 state.  Use the reserved slot. */
 		sc->cpu_cx_states[0] = *cx_ptr;
@@ -818,23 +886,34 @@ acpi_cpu_cx_cst(struct acpi_cpu_softc *sc)
 	}
 
 	/* Free up any previous register. */
-	if (cx_ptr->p_lvlx != NULL) {
-	    bus_release_resource(sc->cpu_dev, cx_ptr->res_type, cx_ptr->res_rid,
-	        cx_ptr->p_lvlx);
-	    cx_ptr->p_lvlx = NULL;
-	}
+	acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr);
 
 	/* Allocate the control register for C2 or C3. */
-	cx_ptr->res_rid = sc->cpu_cx_count;
-	acpi_PkgGas(sc->cpu_dev, pkg, 0, &cx_ptr->res_type, &cx_ptr->res_rid,
-	    &cx_ptr->p_lvlx, RF_SHAREABLE);
-	if (cx_ptr->p_lvlx) {
+#if defined(__i386__) || defined(__amd64__)
+	if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address,
+	  &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL &&
+	  class == CST_FFH_INTEL_CL_MWAIT) {
+	    /* Native C State Instruction use (mwait) */
+	    acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize);
 	    ACPI_DEBUG_PRINT((ACPI_DB_INFO,
-			     "acpi_cpu%d: Got C%d - %d latency\n",
-			     device_get_unit(sc->cpu_dev), cx_ptr->type,
-			     cx_ptr->trans_lat));
+	      "acpi_cpu%d: Got C%d/mwait - %d latency\n",
+	      device_get_unit(sc->cpu_dev), cx_ptr->type, cx_ptr->trans_lat));
 	    cx_ptr++;
 	    sc->cpu_cx_count++;
+	} else
+#endif
+	{
+	    cx_ptr->res_rid = sc->cpu_cx_count;
+	    acpi_PkgGas(sc->cpu_dev, pkg, 0, &cx_ptr->res_type,
+		&cx_ptr->res_rid, &cx_ptr->p_lvlx, RF_SHAREABLE);
+	    if (cx_ptr->p_lvlx) {
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+		     "acpi_cpu%d: Got C%d - %d latency\n",
+		     device_get_unit(sc->cpu_dev), cx_ptr->type,
+		     cx_ptr->trans_lat));
+		cx_ptr++;
+		sc->cpu_cx_count++;
+	    }
 	}
     }
     AcpiOsFree(buf.Pointer);
@@ -1043,7 +1122,14 @@ acpi_cpu_idle(sbintime_t sbt)
      */
     if (cx_next->type == ACPI_STATE_C1) {
 	cputicks = cpu_ticks();
-	acpi_cpu_c1();
+	if (cx_next->p_lvlx != NULL) {
+	    /* C1 I/O then Halt */
+	    CPU_GET_REG(cx_next->p_lvlx, 1);
+	}
+	if (cx_next->do_mwait)
+	    acpi_cpu_idle_mwait(cx_next->mwait_hint);
+	else
+	    acpi_cpu_c1();
 	end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate();
 	if (curthread->td_critnest == 0)
 		end_time = min(end_time, 500000 / hz);
@@ -1055,7 +1141,7 @@ acpi_cpu_idle(sbintime_t sbt)
      * For C3, disable bus master arbitration and enable bus master wake
      * if BM control is available, otherwise flush the CPU cache.
      */
-    if (cx_next->type == ACPI_STATE_C3) {
+    if (cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) {
 	if ((cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) {
 	    AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 1);
 	    AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 1);
@@ -1076,7 +1162,10 @@ acpi_cpu_idle(sbintime_t sbt)
 	start_time = 0;
 	cputicks = cpu_ticks();
     }
-    CPU_GET_REG(cx_next->p_lvlx, 1);
+    if (cx_next->do_mwait)
+	acpi_cpu_idle_mwait(cx_next->mwait_hint);
+    else
+	CPU_GET_REG(cx_next->p_lvlx, 1);
 
     /*
      * Read the end time twice.  Since it may take an arbitrary time
@@ -1092,8 +1181,8 @@ acpi_cpu_idle(sbintime_t sbt)
 	end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate();
 
     /* Enable bus master arbitration and disable bus master wakeup. */
-    if (cx_next->type == ACPI_STATE_C3 &&
-	(cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) {
+    if ((cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) &&
+      (cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) {
 	AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 0);
 	AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 0);
     }
diff --git a/sys/dev/acpica/acpi_package.c b/sys/dev/acpica/acpi_package.c
index e38fea5..c1070cb 100644
--- a/sys/dev/acpica/acpi_package.c
+++ b/sys/dev/acpica/acpi_package.c
@@ -120,6 +120,28 @@ acpi_PkgGas(device_t dev, ACPI_OBJECT *res, int idx, int *type, int *rid,
     return (acpi_bus_alloc_gas(dev, type, rid, &gas, dst, flags));
 }
 
+int
+acpi_PkgFFH_IntelCpu(ACPI_OBJECT *res, int idx, int *vendor, int *class,
+    uint64_t *address, int *accsize)
+{
+    ACPI_GENERIC_ADDRESS gas;
+    ACPI_OBJECT *obj;
+
+    obj = &res->Package.Elements[idx];
+    if (obj == NULL || obj->Type != ACPI_TYPE_BUFFER ||
+	obj->Buffer.Length < sizeof(ACPI_GENERIC_ADDRESS) + 3)
+	return (EINVAL);
+
+    memcpy(&gas, obj->Buffer.Pointer + 3, sizeof(gas));
+    if (gas.SpaceId != ACPI_ADR_SPACE_FIXED_HARDWARE)
+	return (ERESTART);
+    *vendor = gas.BitWidth;
+    *class = gas.BitOffset;
+    *address = gas.Address;
+    *accsize = gas.AccessWidth;
+    return (0);
+}
+
 ACPI_HANDLE
 acpi_GetReference(ACPI_HANDLE scope, ACPI_OBJECT *obj)
 {
diff --git a/sys/dev/acpica/acpivar.h b/sys/dev/acpica/acpivar.h
index 2e2b96d..cbd4bd9 100644
--- a/sys/dev/acpica/acpivar.h
+++ b/sys/dev/acpica/acpivar.h
@@ -467,6 +467,8 @@ int		acpi_PkgInt32(ACPI_OBJECT *res, int idx, uint32_t *dst);
 int		acpi_PkgStr(ACPI_OBJECT *res, int idx, void *dst, size_t size);
 int		acpi_PkgGas(device_t dev, ACPI_OBJECT *res, int idx, int *type,
 		    int *rid, struct resource **dst, u_int flags);
+int		acpi_PkgFFH_IntelCpu(ACPI_OBJECT *res, int idx, int *vendor,
+		    int *class, uint64_t *address, int *accsize);
 ACPI_HANDLE	acpi_GetReference(ACPI_HANDLE scope, ACPI_OBJECT *obj);
 
 /*
diff --git a/sys/i386/acpica/acpi_machdep.c b/sys/i386/acpica/acpi_machdep.c
index 049354b..4c79691 100644
--- a/sys/i386/acpica/acpi_machdep.c
+++ b/sys/i386/acpica/acpi_machdep.c
@@ -106,13 +106,6 @@ acpi_machdep_quirks(int *quirks)
 	return (0);
 }
 
-void
-acpi_cpu_c1()
-{
-
-	__asm __volatile("sti; hlt");
-}
-
 /*
  * Support for mapping ACPI tables during early boot.  This abuses the
  * crashdump map because the kernel cannot allocate KVA in
diff --git a/sys/i386/include/md_var.h b/sys/i386/include/md_var.h
index bffdd57..b5bd35e 100644
--- a/sys/i386/include/md_var.h
+++ b/sys/i386/include/md_var.h
@@ -97,6 +97,7 @@ struct	dumperinfo;
 void	*alloc_fpusave(int flags);
 void	bcopyb(const void *from, void *to, size_t len);
 void	busdma_swi(void);
+bool	cpu_mwait_usable(void);
 void	cpu_probe_amdc1e(void);
 void	cpu_setregs(void);
 void	cpu_switch_load_gs(void) __asm(__STRING(cpu_switch_load_gs));
diff --git a/sys/x86/include/acpica_machdep.h b/sys/x86/include/acpica_machdep.h
index 46080c0..136285c 100644
--- a/sys/x86/include/acpica_machdep.h
+++ b/sys/x86/include/acpica_machdep.h
@@ -74,6 +74,7 @@ enum intr_polarity;
 
 void	acpi_SetDefaultIntrModel(int model);
 void	acpi_cpu_c1(void);
+void	acpi_cpu_idle_mwait(uint32_t mwait_hint);
 void	*acpi_map_table(vm_paddr_t pa, const char *sig);
 void	acpi_unmap_table(void *table);
 vm_paddr_t acpi_find_table(const char *sig);
diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index 846a123..d1d49f4 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$");
 #ifdef SMP
 #include <machine/smp.h>
 #endif
+#include <x86/acpica_machdep.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
@@ -130,6 +131,27 @@ cpu_flush_dcache(void *ptr, size_t len)
 	/* Not applicable */
 }
 
+void
+acpi_cpu_c1(void)
+{
+
+	__asm __volatile("sti; hlt");
+}
+
+void
+acpi_cpu_idle_mwait(uint32_t mwait_hint)
+{
+	int *state;
+
+	state = (int *)PCPU_PTR(monitorbuf);
+	/*
+	 * XXXKIB.  Software coordination mode should be supported,
+	 * but all Intel CPUs provide hardware coordination.
+	 */
+	cpu_monitor(state, 0, 0);
+	cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
+}
+
 /* Get current clock frequency for the given cpu id. */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
@@ -232,6 +254,15 @@ cpu_halt(void)
 
 #endif
 
+bool
+cpu_mwait_usable(void)
+{
+
+	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
+	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
+	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
+}
+
 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
 static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
@@ -258,7 +289,7 @@ cpu_idle_acpi(sbintime_t sbt)
 	else if (cpu_idle_hook)
 		cpu_idle_hook(sbt);
 	else
-		__asm __volatile("sti; hlt");
+		acpi_cpu_c1();
 	*state = STATE_RUNNING;
 }
 #endif /* !PC98 */
@@ -292,7 +323,7 @@ cpu_idle_hlt(sbintime_t sbt)
 	if (sched_runnable())
 		enable_intr();
 	else
-		__asm __volatile("sti; hlt");
+		acpi_cpu_c1();
 	*state = STATE_RUNNING;
 }
 #endif


