Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 17 Nov 2025 01:15:13 -0800
From:      Mark Millard <marklmi@yahoo.com>
To:        "mmel@freebsd.org" <mmel@FreeBSD.org>
Cc:        Warner Losh <imp@bsdimp.com>, bob prohaska <fbsd@www.zefox.net>, "Herbert J. Skuhra" <herbert@gojira.at>, "freebsd-arm@freebsd.org" <freebsd-arm@freebsd.org>, FreeBSD Current <freebsd-current@freebsd.org>
Subject:   Re: Still seeing Failed assertion: "p[i] == 0" on armv7 buildworld
Message-ID:  <6F882958-E901-489B-A758-BCCFE5D01FBA@yahoo.com>
In-Reply-To: <0157ECE7-759F-4C00-9656-CB2ECA65E19E@yahoo.com>
References:  <aOu7s9roGCofdCOw@www.zefox.net> <aOvTG-20QRJtJJwf@int21h> <CANCZdfrJ8rph_rkT3Mk-sNYKNspoV15SvHWLsahzS0HnULi4ww@mail.gmail.com> <aO068RrAehdiHOoZ@www.zefox.net> <aRUJPryA4Vmu8dDD@www.zefox.net> <4957be52-e57f-4f5f-9626-d0f706480fe1@FreeBSD.org> <aRalt0YwsjV_mvMq@www.zefox.net> <87ldk9f4tt.wl-herbert@gojira.at> <CANCZdfrSaJ7snshhiV2r%2BEX_sazhJ-HFAK0e=q%2B-MOmP=uLKqg@mail.gmail.com> <aRnxOtZ8W2g6ZkVF@www.zefox.net> <CANCZdfqTZ311DGEQmH0FLKrh8csN-P=qwUt=RXGSTrehUfZi3g@mail.gmail.com> <0a93ab5f-3fdf-45a0-8b32-2df5ef4ad60a@FreeBSD.org> <0157ECE7-759F-4C00-9656-CB2ECA65E19E@yahoo.com>

next in thread | previous in thread | raw e-mail | index | archive | help

On Nov 17, 2025, at 00:24, Mark Millard <marklmi@yahoo.com> wrote:

> On Nov 16, 2025, at 13:11, Michal Meloun <mmel@FreeBSD.org> wrote:
> 
>> On 16.11.2025 18:51, Warner Losh wrote:
>>> Maybe try main with the following patch. Adrian noticed the TLS mismatch. I don't think it will matter, but TLS thread model stuff always gives me a big headache. If the following fails to apply, just copy the JEMALLOC_TLS_MODEL line from i386 to arm. The default changed elsewhere, but this wasn't updated here.
>>> Warner
>> 
>> Unfortunately, that doesn't help. I'm out of ideas on how to debug this, all of my attempts have failed.
>> 
>> The problem only occurs when Clang compiles a larger project, and it is intermittent. Attempts to compile the Clang-generated reproducer are always successful.
>> It's clear that the parallelism introduced by make plays a significant role. But the system never reached an OOM condition before failure.
>> 
>> I would be grateful for any help and ideas on what to do next.
>> Michal
> 
> [Note: The context is an official pkgbase distribution context
> and so the /usr/src/ is not tied to git. /usr/src-investigation/
> is a copy of /usr/src/ that was then modified. Also, this is
> via an armv7 chroot on the aarch64 Windows Dev Kit 2023, not
> via armv7-only hardware.]
> 
> The crude hack reported later below has shown the first failure
> indicated as happening during base_alloc_edata by reporting:
> 
> p[i] == 0 && which_base_extent_context == 0x11u
> 
> as the failure message.
> 
> 
> # diff -u /usr/src/contrib/jemalloc/include/jemalloc/internal/ehooks.h /usr/src-investigation/contrib/jemalloc/include/jemalloc/
> --- /usr/src/contrib/jemalloc/include/jemalloc/internal/ehooks.h 2025-11-12 02:24:28.000000000 -0800
> +++ /usr/src-investigation/contrib/jemalloc/include/jemalloc/internal/ehooks.h 2025-11-16 23:47:10.965711000 -0800
> @@ -1,6 +1,7 @@
> #ifndef JEMALLOC_INTERNAL_EHOOKS_H
> #define JEMALLOC_INTERNAL_EHOOKS_H
> 
> +#include <signal.h>
> #include "jemalloc/internal/atomic.h"
> #include "jemalloc/internal/extent_mmap.h"
> 
> @@ -158,6 +159,7 @@
>  * This isn't really ehooks-specific (i.e. anyone can check for zeroed memory).
>  * But incorrect zero information indicates an ehook bug.
>  */
> +__attribute__ ((visibility ("internal"))) extern volatile sig_atomic_t which_base_extent_context; // HACK FOR DEBUGGING USE
> static inline void
> ehooks_debug_zero_check(void *addr, size_t size) {
> assert(((uintptr_t)addr & PAGE_MASK) == 0);
> @@ -167,7 +169,45 @@
> /* Check the whole first page. */
> size_t *p = (size_t *)addr;
> for (size_t i = 0; i < PAGE / sizeof(size_t); i++) {
> - assert(p[i] == 0);
> +switch (which_base_extent_context)
> +{
> +case 0x10u: // base_alloc
> +    assert(p[i] == 0 && which_base_extent_context == 0x10u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +case 0x11u: // base_alloc_edata
> +    assert(p[i] == 0 && which_base_extent_context == 0x11u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +case 0x12u: // base_new
> +    assert(p[i] == 0 && which_base_extent_context == 0x12u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +case 0x13u: // base_boot
> +    assert(p[i] == 0 && which_base_extent_context == 0x13u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +case 0x20u: // extent_commit_wrapper
> +    assert(p[i] == 0 && which_base_extent_context == 0x20u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +case 0x21u: // extent_commit_zero
> +    assert(p[i] == 0 && which_base_extent_context == 0x21u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +case 0x22u: // ecache_alloc_grow
> +    assert(p[i] == 0 && which_base_extent_context == 0x22u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +case 0x00u: // None known
> +    assert(p[i] == 0 && which_base_extent_context == 0x00u);
> +    which_base_extent_context= 0x0u;
> +    break;
> +default: // Some other context
> +    assert(p[i] == 0 && which_base_extent_context != 0x00u);
> +    which_base_extent_context= 0x0u;
> +}
> + //assert(p[i] == 0);
> }
> /*
> * And 4 spots within.  There's a tradeoff here; the larger
> 
> 
> # diff -u /usr/src/contrib/jemalloc/src/base.c /usr/src-investigation/contrib/jemalloc/src/base.c
> --- /usr/src/contrib/jemalloc/src/base.c 2025-11-12 02:24:28.000000000 -0800
> +++ /usr/src-investigation/contrib/jemalloc/src/base.c 2025-11-16 23:50:14.396483000 -0800
> @@ -1,3 +1,4 @@
> +#include <signal.h>
> #include "jemalloc/internal/jemalloc_preamble.h"
> #include "jemalloc/internal/jemalloc_internal_includes.h"
> 
> @@ -340,12 +341,15 @@
> b0get(void) {
> return b0;
> }
> +
> +__attribute__ ((visibility ("internal"))) volatile sig_atomic_t which_base_extent_context=0x0u;   // HACK FOR DEBUGGING USE
> 
> base_t *
> base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
>     bool metadata_use_hooks) {
> pszind_t pind_last = 0;
> size_t extent_sn_next = 0;
> +which_base_extent_context= 0x12u;
> 
> /*
> * The base will contain the ehooks eventually, but it itself is
> @@ -476,12 +480,14 @@
>  */
> void *
> base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) {
> +which_base_extent_context= 0x10u;
> return base_alloc_impl(tsdn, base, size, alignment, NULL);
> }
> 
> edata_t *
> base_alloc_edata(tsdn_t *tsdn, base_t *base) {
> size_t esn;
> +which_base_extent_context= 0x11u;
> edata_t *edata = base_alloc_impl(tsdn, base, sizeof(edata_t),
>    EDATA_ALIGNMENT, &esn);
> if (edata == NULL) {
> @@ -523,6 +529,7 @@
> 
> bool
> base_boot(tsdn_t *tsdn) {
> +which_base_extent_context= 0x13u;
> b0 = base_new(tsdn, 0, (extent_hooks_t *)&ehooks_default_extent_hooks,
>    /* metadata_use_hooks */ true);
> return (b0 == NULL);
> 
> 
> # diff -u /usr/src/contrib/jemalloc/src/extent.c /usr/src-investigation/contrib/jemalloc/src/extent.c
> --- /usr/src/contrib/jemalloc/src/extent.c 2025-11-12 02:24:28.000000000 -0800
> +++ /usr/src-investigation/contrib/jemalloc/src/extent.c 2025-11-16 23:49:55.820658000 -0800
> @@ -1,3 +1,4 @@
> +#include <signal.h>
> #include "jemalloc/internal/jemalloc_preamble.h"
> #include "jemalloc/internal/jemalloc_internal_includes.h"
> 
> @@ -90,11 +91,14 @@
> assert(edata == NULL || edata_guarded_get(edata) == guarded);
> return edata;
> }
> +
> +__attribute__ ((visibility ("internal"))) extern volatile sig_atomic_t which_base_extent_context; // HACK FOR DEBUGGING USE
> 
> edata_t *
> ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
>     edata_t *expand_edata, size_t size, size_t alignment, bool zero,
>     bool guarded) {
> +which_base_extent_context= 0x22u;
> assert(size != 0);
> assert(alignment != 0);
> witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
> @@ -1114,6 +1118,7 @@
> bool
> extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
>     size_t offset, size_t length) {
> +which_base_extent_context= 0x20u;
> return extent_commit_impl(tsdn, ehooks, edata, offset, length,
>    /* growing_retained */ false);
> }
> @@ -1297,6 +1302,7 @@
> bool
> extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
>     bool commit, bool zero, bool growing_retained) {
> +which_base_extent_context= 0x21u;
> witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
>    WITNESS_RANK_CORE, growing_retained ? 1 : 0);


Well, with this hack, the behavior looks to have changed to
always fail, leading to an initial:

*** [libzpool.so.2.full] Error code 1

The hack may disturb things too much and may not be
sufficiently close to valid code for the context.

Using -j1 got a first failure message:

Failed assertion: p[i] == 0 && which_base_extent_context == 0x22u
That would be during ecache_alloc_grow. Still:

*** [libzpool.so.2.full] Error code 1


===
Mark Millard
marklmi at yahoo.com




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?6F882958-E901-489B-A758-BCCFE5D01FBA>