Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 3 Feb 2011 08:39:34 -0800
From:      mdf@FreeBSD.org
To:        John Baldwin <jhb@freebsd.org>
Cc:        freebsd-current@freebsd.org
Subject:   Re: Panic with mca trap
Message-ID:  <AANLkTi=8qadb_xMy3cnRJ=PHffzTVgobZ4pEDSwfwkiU@mail.gmail.com>
In-Reply-To: <201102031009.52507.jhb@freebsd.org>
References:  <AANLkTik8QJF61gwodmF6iHU4s7y9ORsNZ0H7NW0o8Mxq@mail.gmail.com> <201102030805.31743.jhb@freebsd.org> <201102031009.52507.jhb@freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help
On Thu, Feb 3, 2011 at 7:09 AM, John Baldwin <jhb@freebsd.org> wrote:
> On Thursday, February 03, 2011 8:05:31 am John Baldwin wrote:
>> On Tuesday, February 01, 2011 11:58:12 am mdf@freebsd.org wrote:
>> > On a piece of hardware trying to verify basic build tests, we got an
>> > MCA exception that then panic'd the kernel due to WITNESS/INVARIANTS
>> > interaction.
>> >
>> > panic @ time 1296563157.510, thread 0xffffff0005540000: blockable
>> > sleep lock (sleep mutex) 128 @ /build/mnt/src/sys/vm/uma_core.c:1872
>> >
>> > Stack: --------------------------------------------------
>> > kernel:witness_checkorder+0x7a2
>> > kernel:_mtx_lock_flags+0x81
>> > kernel:uma_zalloc_arg+0x256
>> > kernel:malloc+0xc5
>> > kernel:mca_record_entry+0x30
>> > kernel:mca_scan+0xc9
>> > kernel:mca_intr+0x79
>> > kernel:trap+0x30b
>> > kernel:witness_checkorder+0x66
>> > kernel:_mtx_lock_spin_flags+0xa4
>> > kernel:witness_checkorder+0x2a8
>> > kernel:_mtx_lock_spin_flags+0xa4
>> > kernel:tdq_idled+0xe8
>> > kernel:sched_idletd+0x5b
>> > kernel:fork_exit+0x9b
>> >
>> > That's this bit of code in uma_zalloc_arg():
>> >
>> > #ifdef INVARIANTS
>> > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 ZONE_LOCK(zone);
>> > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 uma_dbg_alloc(zone, NU=
LL, item);
>> > =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 ZONE_UNLOCK(zone);
>> > #endif
>> >
>> >
>> > I don't know uma(9) well enough to know the best workaround. =A0Clearl=
y
>> > there are times we can be in uma_zalloc_arg() and taking a regular
>> > mutex is not acceptable. =A0But what to do for the uma_dbg_free() call
>> > so it's happy, and whether to guard taking the ZONE lock with M_NOWAIT
>> > or td_critnest > 0 or both is outside my current knowledge.
>> >
>> > I don't expect we'll see this panic again any time soon, but it would
>> > be nice to fix the story for WITNESS of when an M_NOWAIT allocation
>> > can be done.
>>
>> Actually, this is more my fault. =A0The machine check happened while the
>> interrupted thread was already in a critical section (hence the WITNESS
>> complaint). =A0However, it really isn't correct to be calling malloc() f=
rom an
>> arbitrary exception handler, especially one like MC# which can fire pret=
ty
>> much anywhere. =A0I think instead that we should use malloc() when polli=
ng the
>> machine check banks, but keep a pre-allocated pool of structures for use=
 with
>> MC# exceptions and CMC interrupts and replenish the pool via an asynchro=
nous
>> task.
>
> This is what I'm thinking:
>
> Index: mca.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> --- mca.c =A0 =A0 =A0 (revision 218221)
> +++ mca.c =A0 =A0 =A0 (working copy)
> @@ -85,6 +85,7 @@
> =A0static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
>
> =A0static int mca_count; =A0 =A0 =A0 =A0 =A0/* Number of records stored. =
*/
> +static int mca_banks; =A0 =A0 =A0 =A0 =A0/* Number of per-CPU register b=
anks. */
>
> =A0SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD, NULL, "Machine Check Archi=
tecture");
>
> @@ -102,6 +103,8 @@
> =A0SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RD, &workaround_erra=
tum383, 0,
> =A0 =A0 "Is the workaround for Erratum 383 on AMD Family 10h processors e=
nabled?");
>
> +static STAILQ_HEAD(, mca_internal) mca_freelist;
> +static int mca_freecount;
> =A0static STAILQ_HEAD(, mca_internal) mca_records;
> =A0static struct callout mca_timer;
> =A0static int mca_ticks =3D 3600; =A0 /* Check hourly by default. */
> @@ -111,7 +114,6 @@
>
> =A0#ifdef DEV_APIC
> =A0static struct cmc_state **cmc_state; =A0 /* Indexed by cpuid, bank */
> -static int cmc_banks;
> =A0static int cmc_throttle =3D 60; =A0/* Time in seconds to throttle CMCI=
. */
> =A0#endif
>
> @@ -415,21 +417,51 @@
> =A0 =A0 =A0 =A0return (1);
> =A0}
>
> -static void __nonnull(1)
> -mca_record_entry(const struct mca_record *record)
> +static void
> +mca_fill_freelist(void)
> =A0{
> =A0 =A0 =A0 =A0struct mca_internal *rec;
> + =A0 =A0 =A0 int desired;
>
> - =A0 =A0 =A0 rec =3D malloc(sizeof(*rec), M_MCA, M_NOWAIT);
> - =A0 =A0 =A0 if (rec =3D=3D NULL) {
> - =A0 =A0 =A0 =A0 =A0 =A0 =A0 printf("MCA: Unable to allocate space for a=
n event.\n");
> - =A0 =A0 =A0 =A0 =A0 =A0 =A0 mca_log(record);
> - =A0 =A0 =A0 =A0 =A0 =A0 =A0 return;
> + =A0 =A0 =A0 /*
> + =A0 =A0 =A0 =A0* Ensure we have at least one record for each bank and o=
ne
> + =A0 =A0 =A0 =A0* record per CPU.
> + =A0 =A0 =A0 =A0*/
> + =A0 =A0 =A0 desired =3D imax(mp_ncpus, mca_banks);
> + =A0 =A0 =A0 mtx_lock_spin(&mca_lock);
> + =A0 =A0 =A0 while (desired < mca_freecount) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 mtx_unlock_spin(&mca_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 rec =3D malloc(sizeof(*rec), M_MCA, M_WAITO=
K);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 mtx_lock_spin(&mca_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 STAILQ_INSERT_TAIL(&mca_freelist, rec, link=
);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 mca_freecount++;
> =A0 =A0 =A0 =A0}
> + =A0 =A0 =A0 mtx_unlock_spin(&mca_lock);
> +}
>
> +static void __nonnull(2)
> +mca_record_entry(enum scan_mode mode, const struct mca_record *record)
> +{
> + =A0 =A0 =A0 struct mca_internal *rec;
> +
> + =A0 =A0 =A0 if (mode =3D=3D POLLED) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 rec =3D malloc(sizeof(*rec), M_MCA, M_WAITO=
K);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 mtx_lock_spin(&mca_lock);
> + =A0 =A0 =A0 } else {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 mtx_lock_spin(&mca_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 rec =3D STAILQ_FIRST(&mca_freelist);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 if (rec =3D=3D NULL) {
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 mtx_unlock_spin(&mca_lock);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 printf("MCA: Unable to allo=
cate space for an event.\n");
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 mca_log(record);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 return;
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 }
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 STAILQ_REMOVE_HEAD(&mca_freelist, link);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 mca_freecount--;
> + =A0 =A0 =A0 }
> +
> =A0 =A0 =A0 =A0rec->rec =3D *record;
> =A0 =A0 =A0 =A0rec->logged =3D 0;
> - =A0 =A0 =A0 mtx_lock_spin(&mca_lock);
> =A0 =A0 =A0 =A0STAILQ_INSERT_TAIL(&mca_records, rec, link);
> =A0 =A0 =A0 =A0mca_count++;
> =A0 =A0 =A0 =A0mtx_unlock_spin(&mca_lock);
> @@ -551,7 +583,7 @@
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0recoverabl=
e =3D 0;
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0mca_log(&r=
ec);
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0}
> - =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 mca_record_entry(&rec);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 mca_record_entry(mode, &rec=
);
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0}
>
> =A0#ifdef DEV_APIC
> @@ -563,6 +595,8 @@
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0cmci_update(mode, i, valid=
, &rec);
> =A0#endif
> =A0 =A0 =A0 =A0}
> + =A0 =A0 =A0 if (mode =3D=3D POLLED)
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 mca_fill_freelist();
> =A0 =A0 =A0 =A0return (mode =3D=3D MCE ? recoverable : count);
> =A0}
>
> @@ -642,15 +676,14 @@
>
> =A0#ifdef DEV_APIC
> =A0static void
> -cmci_setup(uint64_t mcg_cap)
> +cmci_setup(void)
> =A0{
> =A0 =A0 =A0 =A0int i;
>
> =A0 =A0 =A0 =A0cmc_state =3D malloc((mp_maxid + 1) * sizeof(struct cmc_st=
ate **),
> =A0 =A0 =A0 =A0 =A0 =A0M_MCA, M_WAITOK);
> - =A0 =A0 =A0 cmc_banks =3D mcg_cap & MCG_CAP_COUNT;
> =A0 =A0 =A0 =A0for (i =3D 0; i <=3D mp_maxid; i++)
> - =A0 =A0 =A0 =A0 =A0 =A0 =A0 cmc_state[i] =3D malloc(sizeof(struct cmc_s=
tate) * cmc_banks,
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 cmc_state[i] =3D malloc(sizeof(struct cmc_s=
tate) * mca_banks,
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0M_MCA, M_WAITOK | M_ZERO);
> =A0 =A0 =A0 =A0SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID=
_AUTO,
> =A0 =A0 =A0 =A0 =A0 =A0"cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG=
_MPSAFE,
> @@ -672,10 +705,13 @@
> =A0 =A0 =A0 =A0 =A0 =A0CPUID_TO_FAMILY(cpu_id) =3D=3D 0x10 && amd10h_L1TP=
)
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0workaround_erratum383 =3D 1;
>
> + =A0 =A0 =A0 mca_banks =3D mcg_cap & MCG_CAP_COUNT;
> =A0 =A0 =A0 =A0mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
> =A0 =A0 =A0 =A0STAILQ_INIT(&mca_records);
> =A0 =A0 =A0 =A0TASK_INIT(&mca_task, 0, mca_scan_cpus, NULL);
> =A0 =A0 =A0 =A0callout_init(&mca_timer, CALLOUT_MPSAFE);
> + =A0 =A0 =A0 STAILQ_INIT(&mca_freelist);
> + =A0 =A0 =A0 mca_fill_freelist();
> =A0 =A0 =A0 =A0SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_=
AUTO,
> =A0 =A0 =A0 =A0 =A0 =A0"count", CTLFLAG_RD, &mca_count, 0, "Record count"=
);
> =A0 =A0 =A0 =A0SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID=
_AUTO,
> @@ -689,7 +725,7 @@
> =A0 =A0 =A0 =A0 =A0 =A0sysctl_mca_scan, "I", "Force an immediate scan for=
 machine checks");
> =A0#ifdef DEV_APIC
> =A0 =A0 =A0 =A0if (mcg_cap & MCG_CAP_CMCI_P)
> - =A0 =A0 =A0 =A0 =A0 =A0 =A0 cmci_setup(mcg_cap);
> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 cmci_setup();
> =A0#endif
> =A0}
>
> @@ -707,7 +743,7 @@
> =A0 =A0 =A0 =A0struct cmc_state *cc;
> =A0 =A0 =A0 =A0uint64_t ctl;
>
> - =A0 =A0 =A0 KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GE=
T(cpuid)));
> + =A0 =A0 =A0 KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GE=
T(cpuid)));
>
> =A0 =A0 =A0 =A0ctl =3D rdmsr(MSR_MC_CTL2(i));
> =A0 =A0 =A0 =A0if (ctl & MC_CTL2_CMCI_EN)
> @@ -751,7 +787,7 @@
> =A0 =A0 =A0 =A0struct cmc_state *cc;
> =A0 =A0 =A0 =A0uint64_t ctl;
>
> - =A0 =A0 =A0 KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GE=
T(cpuid)));
> + =A0 =A0 =A0 KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GE=
T(cpuid)));
>
> =A0 =A0 =A0 =A0/* Ignore banks not monitored by this CPU. */
> =A0 =A0 =A0 =A0if (!(PCPU_GET(cmci_mask) & 1 << i))

This generally looks right to me.  I can't test it locally since we're
trying to get a release out the door, and I think we've already
replaced the bad CPU on the machine that failed.

Thanks,
matthew



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?AANLkTi=8qadb_xMy3cnRJ=PHffzTVgobZ4pEDSwfwkiU>