Date: Wed, 14 Jul 2010 21:10:14 +0000 (UTC) From: John Baldwin <jhb@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org Subject: svn commit: r210079 - in stable/8/sys: amd64/amd64 amd64/include i386/i386 i386/include Message-ID: <201007142110.o6ELAEYi081085@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: jhb Date: Wed Jul 14 21:10:14 2010 New Revision: 210079 URL: http://svn.freebsd.org/changeset/base/210079 Log: MFC 208507,208556,208621: Add support for corrected machine check interrupts. CMCI is a new local APIC interrupt that fires when a threshold of corrected machine check events is reached. CMCI also includes a count of events when reporting corrected errors in the bank's status register. Note that individual banks may or may not support CMCI. If they do, each bank includes its own threshold register that determines when the interrupt fires. Currently the code uses a very simple strategy where it doubles the threshold on each interrupt until it succeeds in throttling the interrupt to occur only once a minute (this interval can be tuned via sysctl). The threshold is also adjusted on each hourly poll which will lower the threshold once events stop occurring. Modified: stable/8/sys/amd64/amd64/apic_vector.S stable/8/sys/amd64/amd64/local_apic.c stable/8/sys/amd64/amd64/machdep.c stable/8/sys/amd64/amd64/mca.c stable/8/sys/amd64/include/apicreg.h stable/8/sys/amd64/include/apicvar.h stable/8/sys/amd64/include/mca.h stable/8/sys/amd64/include/pcpu.h stable/8/sys/amd64/include/specialreg.h stable/8/sys/i386/i386/apic_vector.s stable/8/sys/i386/i386/local_apic.c stable/8/sys/i386/i386/machdep.c stable/8/sys/i386/i386/mca.c stable/8/sys/i386/include/apicreg.h stable/8/sys/i386/include/apicvar.h stable/8/sys/i386/include/mca.h stable/8/sys/i386/include/pcpu.h stable/8/sys/i386/include/specialreg.h Directory Properties: stable/8/sys/ (props changed) stable/8/sys/amd64/include/xen/ (props changed) stable/8/sys/cddl/contrib/opensolaris/ (props changed) stable/8/sys/contrib/dev/acpica/ (props changed) stable/8/sys/contrib/pf/ (props changed) stable/8/sys/dev/xen/xenpci/ (props changed) Modified: stable/8/sys/amd64/amd64/apic_vector.S ============================================================================== --- stable/8/sys/amd64/amd64/apic_vector.S Wed Jul 14 
20:55:45 2010 (r210078) +++ stable/8/sys/amd64/amd64/apic_vector.S Wed Jul 14 21:10:14 2010 (r210079) @@ -105,6 +105,18 @@ IDTVEC(timerint) jmp doreti /* + * Local APIC CMCI handler. + */ + .text + SUPERALIGN_TEXT +IDTVEC(cmcint) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + call lapic_handle_cmc + MEXITCOUNT + jmp doreti + +/* * Local APIC error interrupt handler. */ .text Modified: stable/8/sys/amd64/amd64/local_apic.c ============================================================================== --- stable/8/sys/amd64/amd64/local_apic.c Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/amd64/local_apic.c Wed Jul 14 21:10:14 2010 (r210079) @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include <machine/frame.h> #include <machine/intr_machdep.h> #include <machine/apicvar.h> +#include <machine/mca.h> #include <machine/md_var.h> #include <machine/smp.h> #include <machine/specialreg.h> @@ -123,6 +124,7 @@ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ + { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; static inthand_t *ioint_handlers[] = { @@ -227,6 +229,9 @@ lapic_init(vm_paddr_t addr) setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_SYSIGT, SEL_KPL, 0); /* XXX: Thermal interrupt */ + + /* Local APIC CMCI. 
*/ + setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_SYSIGT, SEL_KPL, 0); } /* @@ -252,7 +257,7 @@ lapic_create(u_int apic_id, int boot_cpu */ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; - for (i = 0; i < LVT_MAX; i++) { + for (i = 0; i <= LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } @@ -282,6 +287,7 @@ lapic_dump(const char* str) printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, lapic->lvt_pcint); + printf(" cmci: 0x%08x\n", lapic->lvt_cmci); } void @@ -333,6 +339,10 @@ lapic_setup(int boot) /* XXX: Thermal LVT */ + /* Program the CMCI LVT entry if present. */ + if (maxlvt >= LVT_CMCI) + lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci); + intr_restore(eflags); } @@ -857,6 +867,34 @@ lapic_timer_enable_intr(void) } void +lapic_handle_cmc(void) +{ + + lapic_eoi(); + cmc_intr(); +} + +/* + * Called from the mca_init() to activate the CMC interrupt if this CPU is + * responsible for monitoring any MC banks for CMC events. Since mca_init() + * is called prior to lapic_setup() during boot, this just needs to unmask + * this CPU's LVT_CMCI entry. 
+ */ +void +lapic_enable_cmc(void) +{ + u_int apic_id; + + apic_id = PCPU_GET(apic_id); + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0; + lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1; + if (bootverbose) + printf("lapic%u: CMCI unmasked\n", apic_id); +} + +void lapic_handle_error(void) { u_int32_t esr; Modified: stable/8/sys/amd64/amd64/machdep.c ============================================================================== --- stable/8/sys/amd64/amd64/machdep.c Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/amd64/machdep.c Wed Jul 14 21:10:14 2010 (r210079) @@ -283,7 +283,6 @@ cpu_startup(dummy) vm_pager_bufferinit(); cpu_setregs(); - mca_init(); } /* Modified: stable/8/sys/amd64/amd64/mca.c ============================================================================== --- stable/8/sys/amd64/amd64/mca.c Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/amd64/mca.c Wed Jul 14 21:10:14 2010 (r210079) @@ -33,6 +33,8 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/taskqueue.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> #include <machine/cputypes.h> #include <machine/mca.h> #include <machine/md_var.h> #include <machine/specialreg.h> +/* Modes for mca_scan() */ +enum scan_mode { + POLLED, + MCE, + CMCI, +}; + +/* + * State maintained for each monitored MCx bank to control the + * corrected machine check interrupt threshold. + */ +struct cmc_state { + int max_threshold; + int last_intr; +}; + struct mca_internal { struct mca_record rec; int logged; @@ -79,19 +99,22 @@ static struct callout mca_timer; static int mca_ticks = 3600; /* Check hourly by default. 
*/ static struct task mca_task; static struct mtx mca_lock; +static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */ +static int cmc_banks; +static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ static int -sysctl_mca_ticks(SYSCTL_HANDLER_ARGS) +sysctl_positive_int(SYSCTL_HANDLER_ARGS) { int error, value; - value = mca_ticks; + value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value <= 0) return (EINVAL); - mca_ticks = value; + *(int *)arg1 = value; return (0); } @@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record } /* + * Update the interrupt threshold for a CMCI. The strategy is to use + * a low trigger that interrupts as soon as the first event occurs. + * However, if a steady stream of events arrive, the threshold is + * increased until the interrupts are throttled to once every + * cmc_throttle seconds or the periodic scan. If a periodic scan + * finds that the threshold is too high, it is lowered. + */ +static void +cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) +{ + struct cmc_state *cc; + uint64_t ctl; + u_int delta; + int count, limit; + + /* Fetch the current limit for this bank. */ + cc = &cmc_state[PCPU_GET(cpuid)][bank]; + ctl = rdmsr(MSR_MC_CTL2(bank)); + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + delta = (u_int)(ticks - cc->last_intr); + + /* + * If an interrupt was received less than cmc_throttle seconds + * since the previous interrupt and the count from the current + * event is greater than or equal to the current threshold, + * double the threshold up to the max. 
+ */ + if (mode == CMCI && valid) { + limit = ctl & MC_CTL2_THRESHOLD; + if (delta < cmc_throttle && count >= limit && + limit < cc->max_threshold) { + limit = min(limit << 1, cc->max_threshold); + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } + cc->last_intr = ticks; + return; + } + + /* + * When the banks are polled, check to see if the threshold + * should be lowered. + */ + if (mode != POLLED) + return; + + /* If a CMCI occurred recently, do nothing for now. */ + if (delta < cmc_throttle) + return; + + /* + * Compute a new limit based on the average rate of events per + * cmc_throttle seconds since the last interrupt. + */ + if (valid) { + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + limit = count * cmc_throttle / delta; + if (limit <= 0) + limit = 1; + else if (limit > cc->max_threshold) + limit = cc->max_threshold; + } else + limit = 1; + if ((ctl & MC_CTL2_THRESHOLD) != limit) { + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } +} + +/* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are * reported immediately via mca_log(). The current thread must be - * pinned when this is called. The 'mcip' parameter indicates if we - * are being called from the MC exception handler. In that case this - * function returns true if the system is restartable. Otherwise, it - * returns a count of the number of valid MC records found. + * pinned when this is called. The 'mode' parameter indicates if we + * are being called from the MC exception handler, the CMCI handler, + * or the periodic poller. In the MC exception case this function + * returns true if the system is restartable. Otherwise, it returns a + * count of the number of valid MC records found. 
*/ static int -mca_scan(int mcip) +mca_scan(enum scan_mode mode) { struct mca_record rec; uint64_t mcg_cap, ucmask; - int count, i, recoverable; + int count, i, recoverable, valid; count = 0; recoverable = 1; ucmask = MC_STATUS_UC | MC_STATUS_PCC; /* When handling a MCE#, treat the OVER flag as non-restartable. */ - if (mcip) + if (mode == MCE) ucmask |= MC_STATUS_OVER; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - if (mca_check_status(i, &rec)) { + /* + * For a CMCI, only check banks this CPU is + * responsible for. + */ + if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i)) + continue; + + valid = mca_check_status(i, &rec); + if (valid) { count++; if (rec.mr_status & ucmask) { recoverable = 0; @@ -433,8 +537,15 @@ mca_scan(int mcip) } mca_record_entry(&rec); } + + /* + * If this is a bank this CPU monitors via CMCI, + * update the threshold. + */ + if (PCPU_GET(cmci_mask) & (1 << i)) + cmci_update(mode, i, valid, &rec); } - return (mcip ? recoverable : count); + return (mode == MCE ? 
recoverable : count); } /* @@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending continue; sched_bind(td, cpu); thread_unlock(td); - count += mca_scan(0); + count += mca_scan(POLLED); thread_lock(td); sched_unbind(td); } @@ -511,7 +622,24 @@ mca_startup(void *dummy) SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL); static void -mca_setup(void) +cmci_setup(uint64_t mcg_cap) +{ + int i; + + cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **), + M_MCA, M_WAITOK); + cmc_banks = mcg_cap & MCG_CAP_COUNT; + for (i = 0; i <= mp_maxid; i++) + cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks, + M_MCA, M_WAITOK | M_ZERO); + SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, + "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &cmc_throttle, 0, sysctl_positive_int, "I", + "Interval in seconds to throttle corrected MC interrupts"); +} + +static void +mca_setup(uint64_t mcg_cap) { mtx_init(&mca_lock, "mca", NULL, MTX_SPIN); @@ -522,13 +650,62 @@ mca_setup(void) "count", CTLFLAG_RD, &mca_count, 0, "Record count"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks, - 0, sysctl_mca_ticks, "I", + 0, sysctl_positive_int, "I", "Periodic interval in seconds to scan for machine checks"); SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_mca_scan, "I", "Force an immediate scan for machine checks"); + if (mcg_cap & MCG_CAP_CMCI_P) + cmci_setup(mcg_cap); +} + +/* + * See if we should monitor CMCI for this bank. If CMCI_EN is already + * set in MC_CTL2, then another CPU is responsible for this bank, so + * ignore it. If CMCI_EN returns zero after being set, then this bank + * does not support CMCI_EN. 
If this CPU sets CMCI_EN, then it should + * now monitor this bank. + */ +static void +cmci_monitor(int i) +{ + struct cmc_state *cc; + uint64_t ctl; + + KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); + + ctl = rdmsr(MSR_MC_CTL2(i)); + if (ctl & MC_CTL2_CMCI_EN) + /* Already monitored by another CPU. */ + return; + + /* Set the threshold to one event for now. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= MC_CTL2_CMCI_EN | 1; + wrmsr(MSR_MC_CTL2(i), ctl); + ctl = rdmsr(MSR_MC_CTL2(i)); + if (!(ctl & MC_CTL2_CMCI_EN)) + /* This bank does not support CMCI. */ + return; + + cc = &cmc_state[PCPU_GET(cpuid)][i]; + + /* Determine maximum threshold. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= 0x7fff; + wrmsr(MSR_MC_CTL2(i), ctl); + ctl = rdmsr(MSR_MC_CTL2(i)); + cc->max_threshold = ctl & MC_CTL2_THRESHOLD; + + /* Start off with a threshold of 1. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= 1; + wrmsr(MSR_MC_CTL2(i), ctl); + + /* Mark this bank as monitored. */ + PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i); } /* Must be executed on each CPU. */ @@ -554,14 +731,14 @@ mca_init(void) workaround_erratum383 = 1; if (cpu_feature & CPUID_MCA) { - if (PCPU_GET(cpuid) == 0) - mca_setup(); + PCPU_SET(cmci_mask, 0); - sched_pin(); mcg_cap = rdmsr(MSR_MCG_CAP); if (mcg_cap & MCG_CAP_CTL_P) /* Enable MCA features. */ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); + if (PCPU_GET(cpuid) == 0) + mca_setup(mcg_cap); /* * Disable logging of level one TLB parity (L1TP) errors by @@ -597,15 +774,34 @@ mca_init(void) if (!skip) wrmsr(MSR_MC_CTL(i), ctl); + + if (mcg_cap & MCG_CAP_CMCI_P) + cmci_monitor(i); + /* Clear all errors. */ wrmsr(MSR_MC_STATUS(i), 0); } - sched_unpin(); + + if (PCPU_GET(cmci_mask) != 0) + lapic_enable_cmc(); } load_cr4(rcr4() | CR4_MCE); } +/* + * The machine check registers for the BSP cannot be initialized until + * the local APIC is initialized. This happens at SI_SUB_CPU, + * SI_ORDER_SECOND. 
+ */ +static void +mca_init_bsp(void *arg __unused) +{ + + mca_init(); +} +SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL); + /* Called when a machine check exception fires. */ int mca_intr(void) @@ -624,7 +820,7 @@ mca_intr(void) } /* Scan the banks and check for any non-recoverable errors. */ - recoverable = mca_scan(1); + recoverable = mca_scan(MCE); mcg_status = rdmsr(MSR_MCG_STATUS); if (!(mcg_status & MCG_STATUS_RIPV)) recoverable = 0; @@ -633,3 +829,31 @@ mca_intr(void) wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP); return (recoverable); } + +/* Called for a CMCI (correctable machine check interrupt). */ +void +cmc_intr(void) +{ + struct mca_internal *mca; + int count; + + /* + * Serialize MCA bank scanning to prevent collisions from + * sibling threads. + */ + count = mca_scan(CMCI); + + /* If we found anything, log them to the console. */ + if (count != 0) { + mtx_lock_spin(&mca_lock); + STAILQ_FOREACH(mca, &mca_records, link) { + if (!mca->logged) { + mca->logged = 1; + mtx_unlock_spin(&mca_lock); + mca_log(&mca->rec); + mtx_lock_spin(&mca_lock); + } + } + mtx_unlock_spin(&mca_lock); + } +} Modified: stable/8/sys/amd64/include/apicreg.h ============================================================================== --- stable/8/sys/amd64/include/apicreg.h Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/include/apicreg.h Wed Jul 14 21:10:14 2010 (r210079) @@ -89,7 +89,7 @@ * 2C0 Reserved * 2D0 Reserved * 2E0 Reserved - * 2F0 Reserved + * 2F0 Local Vector Table (CMCI) R/W * 300 ICR_LOW Interrupt Command Reg. (0-31) R/W * 310 ICR_HI Interrupt Command Reg. 
(32-63) R/W * 320 Local Vector Table (Timer) R/W @@ -172,7 +172,7 @@ struct LAPIC { /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; - /* reserved */ PAD4; + u_int32_t lvt_cmci; PAD3; u_int32_t icr_lo; PAD3; u_int32_t icr_hi; PAD3; u_int32_t lvt_timer; PAD3; Modified: stable/8/sys/amd64/include/apicvar.h ============================================================================== --- stable/8/sys/amd64/include/apicvar.h Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/include/apicvar.h Wed Jul 14 21:10:14 2010 (r210079) @@ -108,8 +108,9 @@ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) +#define APIC_CMC_INT (APIC_LOCAL_INTS + 2) -#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2) +#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ #define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ #define IPI_INVLPG (APIC_IPI_INTS + 2) @@ -143,7 +144,8 @@ #define LVT_ERROR 3 #define LVT_PMC 4 #define LVT_THERMAL 5 -#define LVT_MAX LVT_THERMAL +#define LVT_CMCI 6 +#define LVT_MAX LVT_CMCI #ifndef LOCORE @@ -179,8 +181,8 @@ struct apic_enumerator { inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), - IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint), - IDTVEC(timerint); + IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), + IDTVEC(spuriousint), IDTVEC(timerint); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; @@ -210,6 +212,7 @@ void lapic_create(u_int apic_id, int boo void lapic_disable(void); void lapic_disable_pmc(void); void lapic_dump(const char *str); +void lapic_enable_cmc(void); int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); @@ -218,6 +221,7 @@ int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int 
delay); +void lapic_handle_cmc(void); void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); Modified: stable/8/sys/amd64/include/mca.h ============================================================================== --- stable/8/sys/amd64/include/mca.h Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/include/mca.h Wed Jul 14 21:10:14 2010 (r210079) @@ -46,6 +46,7 @@ struct mca_record { #ifdef _KERNEL +void cmc_intr(void); void mca_init(void); int mca_intr(void); Modified: stable/8/sys/amd64/include/pcpu.h ============================================================================== --- stable/8/sys/amd64/include/pcpu.h Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/include/pcpu.h Wed Jul 14 21:10:14 2010 (r210079) @@ -75,7 +75,8 @@ /* Pointer to the CPU LDT descriptor */ \ struct system_segment_descriptor *pc_ldt; \ /* Pointer to the CPU TSS descriptor */ \ - struct system_segment_descriptor *pc_tss \ + struct system_segment_descriptor *pc_tss; \ + u_int pc_cmci_mask /* MCx banks for CMCI */ \ PCPU_XEN_FIELDS #ifdef _KERNEL Modified: stable/8/sys/amd64/include/specialreg.h ============================================================================== --- stable/8/sys/amd64/include/specialreg.h Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/amd64/include/specialreg.h Wed Jul 14 21:10:14 2010 (r210079) @@ -385,7 +385,7 @@ #define MC_STATUS_VAL 0x8000000000000000 #define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ -#define MC_CTL2_THRESHOLD 0x0000000000003fff +#define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 /* Modified: stable/8/sys/i386/i386/apic_vector.s ============================================================================== --- stable/8/sys/i386/i386/apic_vector.s Wed Jul 14 20:55:45 2010 (r210078) +++ 
stable/8/sys/i386/i386/apic_vector.s Wed Jul 14 21:10:14 2010 (r210079) @@ -113,6 +113,19 @@ IDTVEC(timerint) jmp doreti /* + * Local APIC CMCI handler. + */ + .text + SUPERALIGN_TEXT +IDTVEC(cmcint) + PUSH_FRAME + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + call lapic_handle_cmc + MEXITCOUNT + jmp doreti + +/* * Local APIC error interrupt handler. */ .text Modified: stable/8/sys/i386/i386/local_apic.c ============================================================================== --- stable/8/sys/i386/i386/local_apic.c Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/i386/i386/local_apic.c Wed Jul 14 21:10:14 2010 (r210079) @@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$"); #include <machine/frame.h> #include <machine/intr_machdep.h> #include <machine/apicvar.h> +#include <machine/mca.h> #include <machine/md_var.h> #include <machine/smp.h> #include <machine/specialreg.h> @@ -124,6 +125,7 @@ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ + { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; static inthand_t *ioint_handlers[] = { @@ -231,6 +233,10 @@ lapic_init(vm_paddr_t addr) GSEL(GCODE_SEL, SEL_KPL)); /* XXX: Thermal interrupt */ + + /* Local APIC CMCI. 
*/ + setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); } /* @@ -256,7 +262,7 @@ lapic_create(u_int apic_id, int boot_cpu */ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; - for (i = 0; i < LVT_MAX; i++) { + for (i = 0; i <= LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } @@ -286,6 +292,7 @@ lapic_dump(const char* str) printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, lapic->lvt_pcint); + printf(" cmci: 0x%08x\n", lapic->lvt_cmci); } void @@ -337,6 +344,10 @@ lapic_setup(int boot) /* XXX: Thermal LVT */ + /* Program the CMCI LVT entry if present. */ + if (maxlvt >= LVT_CMCI) + lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci); + intr_restore(eflags); } @@ -858,6 +869,34 @@ lapic_timer_enable_intr(void) } void +lapic_handle_cmc(void) +{ + + lapic_eoi(); + cmc_intr(); +} + +/* + * Called from the mca_init() to activate the CMC interrupt if this CPU is + * responsible for monitoring any MC banks for CMC events. Since mca_init() + * is called prior to lapic_setup() during boot, this just needs to unmask + * this CPU's LVT_CMCI entry. 
+ */ +void +lapic_enable_cmc(void) +{ + u_int apic_id; + + apic_id = PCPU_GET(apic_id); + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0; + lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1; + if (bootverbose) + printf("lapic%u: CMCI unmasked\n", apic_id); +} + +void lapic_handle_error(void) { u_int32_t esr; Modified: stable/8/sys/i386/i386/machdep.c ============================================================================== --- stable/8/sys/i386/i386/machdep.c Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/i386/i386/machdep.c Wed Jul 14 21:10:14 2010 (r210079) @@ -328,7 +328,6 @@ cpu_startup(dummy) #ifndef XEN cpu_setregs(); #endif - mca_init(); } /* Modified: stable/8/sys/i386/i386/mca.c ============================================================================== --- stable/8/sys/i386/i386/mca.c Wed Jul 14 20:55:45 2010 (r210078) +++ stable/8/sys/i386/i386/mca.c Wed Jul 14 21:10:14 2010 (r210079) @@ -32,7 +32,11 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_apic.h" + #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -43,11 +47,31 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/taskqueue.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> #include <machine/cputypes.h> #include <machine/mca.h> #include <machine/md_var.h> #include <machine/specialreg.h> +/* Modes for mca_scan() */ +enum scan_mode { + POLLED, + MCE, + CMCI, +}; + +#ifdef DEV_APIC +/* + * State maintained for each monitored MCx bank to control the + * corrected machine check interrupt threshold. 
+ */ +struct cmc_state { + int max_threshold; + int last_intr; +}; +#endif + struct mca_internal { struct mca_record rec; int logged; @@ -80,18 +104,24 @@ static int mca_ticks = 3600; /* Check ho static struct task mca_task; static struct mtx mca_lock; +#ifdef DEV_APIC +static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */ +static int cmc_banks; +static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ +#endif + static int -sysctl_mca_ticks(SYSCTL_HANDLER_ARGS) +sysctl_positive_int(SYSCTL_HANDLER_ARGS) { int error, value; - value = mca_ticks; + value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value <= 0) return (EINVAL); - mca_ticks = value; + *(int *)arg1 = value; return (0); } @@ -400,32 +430,117 @@ mca_record_entry(const struct mca_record mtx_unlock_spin(&mca_lock); } +#ifdef DEV_APIC +/* + * Update the interrupt threshold for a CMCI. The strategy is to use + * a low trigger that interrupts as soon as the first event occurs. + * However, if a steady stream of events arrive, the threshold is + * increased until the interrupts are throttled to once every + * cmc_throttle seconds or the periodic scan. If a periodic scan + * finds that the threshold is too high, it is lowered. + */ +static void +cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) +{ + struct cmc_state *cc; + uint64_t ctl; + u_int delta; + int count, limit; + + /* Fetch the current limit for this bank. */ + cc = &cmc_state[PCPU_GET(cpuid)][bank]; + ctl = rdmsr(MSR_MC_CTL2(bank)); + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + delta = (u_int)(ticks - cc->last_intr); + + /* + * If an interrupt was received less than cmc_throttle seconds + * since the previous interrupt and the count from the current + * event is greater than or equal to the current threshold, + * double the threshold up to the max. 
+ */ + if (mode == CMCI && valid) { + limit = ctl & MC_CTL2_THRESHOLD; + if (delta < cmc_throttle && count >= limit && + limit < cc->max_threshold) { + limit = min(limit << 1, cc->max_threshold); + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } + cc->last_intr = ticks; + return; + } + + /* + * When the banks are polled, check to see if the threshold + * should be lowered. + */ + if (mode != POLLED) + return; + + /* If a CMCI occurred recently, do nothing for now. */ + if (delta < cmc_throttle) + return; + + /* + * Compute a new limit based on the average rate of events per + * cmc_throttle seconds since the last interrupt. + */ + if (valid) { + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + limit = count * cmc_throttle / delta; + if (limit <= 0) + limit = 1; + else if (limit > cc->max_threshold) + limit = cc->max_threshold; + } else + limit = 1; + if ((ctl & MC_CTL2_THRESHOLD) != limit) { + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } +} +#endif + /* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are * reported immediately via mca_log(). The current thread must be - * pinned when this is called. The 'mcip' parameter indicates if we - * are being called from the MC exception handler. In that case this - * function returns true if the system is restartable. Otherwise, it - * returns a count of the number of valid MC records found. + * pinned when this is called. The 'mode' parameter indicates if we + * are being called from the MC exception handler, the CMCI handler, + * or the periodic poller. In the MC exception case this function + * returns true if the system is restartable. Otherwise, it returns a + * count of the number of valid MC records found. 
*/ static int -mca_scan(int mcip) +mca_scan(enum scan_mode mode) { struct mca_record rec; uint64_t mcg_cap, ucmask; - int count, i, recoverable; + int count, i, recoverable, valid; count = 0; recoverable = 1; ucmask = MC_STATUS_UC | MC_STATUS_PCC; /* When handling a MCE#, treat the OVER flag as non-restartable. */ - if (mcip) + if (mode == MCE) ucmask |= MC_STATUS_OVER; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - if (mca_check_status(i, &rec)) { +#ifdef DEV_APIC + /* + * For a CMCI, only check banks this CPU is + * responsible for. + */ + if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i)) + continue; +#endif + + valid = mca_check_status(i, &rec); + if (valid) { count++; if (rec.mr_status & ucmask) { recoverable = 0; @@ -433,8 +548,17 @@ mca_scan(int mcip) } mca_record_entry(&rec); } + +#ifdef DEV_APIC + /* + * If this is a bank this CPU monitors via CMCI, + * update the threshold. + */ + if (PCPU_GET(cmci_mask) & (1 << i)) + cmci_update(mode, i, valid, &rec); +#endif } - return (mcip ? recoverable : count); + return (mode == MCE ? 
recoverable : count); } /* @@ -457,7 +581,7 @@ mca_scan_cpus(void *context, int pending continue; sched_bind(td, cpu); thread_unlock(td); - count += mca_scan(0); + count += mca_scan(POLLED); thread_lock(td); sched_unbind(td); } @@ -510,8 +634,27 @@ mca_startup(void *dummy) } SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL); +#ifdef DEV_APIC static void -mca_setup(void) +cmci_setup(uint64_t mcg_cap) +{ + int i; + + cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **), + M_MCA, M_WAITOK); + cmc_banks = mcg_cap & MCG_CAP_COUNT; + for (i = 0; i <= mp_maxid; i++) + cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks, + M_MCA, M_WAITOK | M_ZERO); + SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, + "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &cmc_throttle, 0, sysctl_positive_int, "I", + "Interval in seconds to throttle corrected MC interrupts"); +} +#endif + *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201007142110.o6ELAEYi081085>