Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 4 Apr 2017 20:00:45 -0700
From:      Mark Millard <markmi@dsl-only.net>
To:        freebsd-arm <freebsd-arm@freebsd.org>, freebsd-hackers@freebsd.org
Subject:   The arm64 fork-then-swap-out-then-swap-in failures: a program source for exploring them
Message-ID:  <4DEA2D76-9F27-426D-A8D2-F07B16575FB9@dsl-only.net>

next in thread | raw e-mail | index | archive | help
Uncommenting/commenting parts of the below program allows
exploring the problems with fork-then-swap-out-then-in on
arm64.

Note: By swap-out I mean that zero RES(ident memory) results,
      for the process(es) of interest, as shown by
      "top -PCwaopid" .

I discovered recently that swapping-out just before the
fork() prevents the failure from the swapping after the
fork().

Note:
Without the fork() no problem happens. Without the later
swap-out no problem happens. Both are required. But some
activities before the fork() or between fork() and the
swap-out prevent the failures.

Some of the comments are based on a pine64+ 2GB context.
I use stress to force swap-outs during some sleeps in
the program. See also Bugzilla 217239 and 217138. (I now
expect that they have the same cause.)

In my environment I've seen the fork-then-swap-out/swap-in
failures on a pine64+ 2GB and a rpi3. They are repeatable
on both. I do not have access to server-class machines, or
any other arm64 machines.


// swap_testing5.c

// Built via (cc was clang 4.0 in my case):
//
// cc -g -std=c11 -Wpedantic -o swaptesting5 swap_testing5.c
// -O0 and -O2 also get the problem.

// Note: jemalloc's tcache needs to be enabled to get the failure.
//       But FreeBSD can get into a state where /etc/malloc.conf
//       -> 'tcache:false' is ineffective. Also: the allocation
//       size needs to be sufficiently small (<= SMALL_MAXCLASS)
//       to see the problem. Other comments are based on a specific
//       context (pine64+ 2GB).

#include <signal.h>     // for raise(.), SIGABRT (induce core dump)
#include <unistd.h>     // for fork(), sleep(.)
#include <sys/types.h>  // for pid_t
#include <sys/wait.h>   // for wait(.)

// Forward declarations for the memory/test code defined later in this file.
extern void test_setup(void);      // Sets up the memory byte patterns.
extern void test_check(void);      // Tests the memory byte patterns.
extern void memory_willneed(void); // For seeing if
                                   // posix_madvise(.,.,POSIX_MADV_WILLNEED)
                                   // makes a difference.

// Test driver: builds the byte patterns, checks them, forks, sleeps so that
// the process(es) can be forced to swap out by external memory pressure, and
// then re-checks the patterns in both the parent and the child.
//
// The commented-out calls below are experiment toggles, not dead code:
// enable/disable them to explore which activities prevent the failure.
int main(void) {
    sleep(30); // Potentially force swap-out here.
               // [Swap-out here does not avoid later failures.]

    test_setup();
    test_check(); // Before potential sleep(.)/swap-out or fork(.) [passes]

    sleep(30); // Potentially force swap-out here.
               // [Everything below passes if swapped-out here,
               //  no matter if there are later swap-outs
               //  or not.]

    pid_t pid = fork(); // To test no-fork use: = 0; no-fork does not fail.
    int wait_status = 0;

    // HERE: After fork; before sleep/swap-out/wait.

    // if (0 <  pid) memory_willneed(); // Does not prevent either parent or
                                     // child failure if enabled.

    // if (0 == pid) memory_willneed(); // Prevents both the parent and the
                                     // child failure. Disable to see
                                     // failure of both parent and child.
                                     // [Presuming no prior swap-out: that
                                     // would make everything pass.]

    // During sleep/wait: manually force this process to
    // swap out. I use something like:
    //     stress -m 1 --vm-bytes 1800M
    // in another shell and ^C'ing it after top shows the
    // swapped status desired. 1800M just happened to work
    // on the Pine64+ 2GB that I was using. I watch with
    // top -PCwaopid [checking for zero RES(ident memory)].

    if (0 < pid) {
        // Parent: sleep (shorter than the child), then wait for the child.
        sleep(30);    // Intend to swap-out during sleep.
        // test_check(); // Test in parent before child runs (longer sleep).
                      // This test fails if run for a failing region_size
                      // unless earlier preventing-activity happened.
        wait(&wait_status); // Only if test_check above passes or is
                            // disabled above.
    }
    // NOTE(review): wait(.) reports failure via its return value, which is
    // not examined here; the -1 != wait_status test presumably never
    // rejects in practice, leaving 0 <= pid (fork succeeded) as the
    // effective condition -- confirm intent.
    if (-1 != wait_status && 0 <= pid) {
        if (0 == pid) { sleep(90); } // Intend to swap-out during sleep.
        test_check(); // Fails for small-enough region_size, both
                      // parent and child processes, unless earlier
                      // preventing-activity happened.
    }
}

// The memory and test code follows.

#include <stddef.h>     // for size_t, NULL
#include <stdlib.h>     // for malloc(.), free(.)
#include <sys/mman.h>   // for POSIX_MADV_WILLNEED, posix_madvise(.,.,.)

// Size in bytes of each individually malloc'd region.
#define region_size (14u*1024u)
        // Bad dyn_region pattern, parent and child processes examples:
        // 256u, 2u*1024u, 4u*1024u, 8u*1024u, 9u*1024u, 12u*1024u, 14u*1024u
        // No failure examples:
        // 14u*1024u+1u, 15u*1024u, 16u*1024u, 32u*1024u, 256u*1024u*1024u

// Number of regions: as many whole regions as fit in 256 MiBytes.
#define num_regions (256u*1024u*1024u/region_size)

// Byte type used for the patterns; volatile so every access is performed.
typedef volatile unsigned char value_type;
struct region_struct { value_type array[region_size]; };
typedef struct region_struct region;

// Table of pointers to the allocated regions (filled in by test_setup).
static region * volatile dyn_regions[num_regions] = {NULL,};

// Expected pattern byte for offset v within a region: the low 8 bits of v
// with the least significant bit forced to 1.
// value avoids zero values: the bad values are zeros.
static value_type value(size_t v) { return (value_type)((v&0xFEu)|0x1u); }

void test_setup(void) {
    for(size_t i=3D0u; i<num_regions; i++) {
        dyn_regions[i] =3D malloc(sizeof(region));
        if (!dyn_regions[i]) raise(SIGABRT);

        for(size_t j=3D0u; j<region_size; j++) {
            (*dyn_regions[i]).array[j] =3D value(j);
        }
    }
}

// Calls posix_madvise(., region_size, POSIX_MADV_WILLNEED) on every region
// pointer in dyn_regions. The result of each call is deliberately ignored:
// this is only an experiment toggle used from main.
void memory_willneed(void) {
    for(size_t i=0u; i<num_regions; i++) {
        (void) posix_madvise(dyn_regions[i], region_size, POSIX_MADV_WILLNEED);
    }
}

// Results recorded by test_check; volatile file-scope objects so the values
// are stored in memory (e.g. for inspection in a core dump).
// Each (idx, pos) pair is a dyn_regions index plus an offset in that region.
static volatile size_t first_failure_idx = 0u; // dyn_regions index
static volatile size_t first_failure_pos = 0u; //   sub-array index
static volatile size_t after_bad_idx     = 0u; // dyn_regions index
static volatile size_t after_bad_pos     = 0u; //   sub-array index
static volatile size_t after_good_idx    = 0u; // dyn_regions index
static volatile size_t after_good_pos    = 0u; //   sub-array index

// Note: Some failing cases get (conjunctive notation):
//
//    0 == first_failure_idx < after_bad_idx < after_good_idx == num_regions
// && 0 == first_failure_pos && 0<=after_bad_pos<=region_size
// && after_good_pos==0
// && (after_bad_pos is a multiple of the page size in Bytes, here:
//     after_bad_pos==N*4096 for some non-negative integral value N)
//
// other failing cases instead fail with:
//
//    0 == first_failure_idx && num_regions == after_bad_idx == after_good_idx
// && 0 == first_failure_pos == after_bad_pos == after_good_pos
//
// after_bad_idx strongly tends to vary from failing run to failing run
// as does after_bad_pos.

// Note: The working cases get:
//
//    num_regions == first_failure_idx == after_bad_idx == after_good_idx
// && 0 == first_failure_pos == after_bad_pos == after_good_pos

// Verifies the byte patterns in all regions and records where they break.
//
// Three consecutive scans, each continuing from where the previous stopped:
//   1. first_failure_{idx,pos}: first byte that does NOT match value(pos).
//   2. after_bad_{idx,pos}:     first byte after that which DOES match.
//   3. after_good_{idx,pos}:    first byte after that which again does NOT
//                               match.
// A scan that runs off the end of the last region leaves idx == num_regions
// and pos == 0. The scans work directly on the volatile result variables.
//
// Raises SIGABRT (to induce a core dump) if any mismatch was found, i.e. if
// first_failure_idx != num_regions.
void test_check(void) {
    first_failure_idx = first_failure_pos = 0u;

    // Scan 1: skip the leading run of good (matching) bytes.
    while (first_failure_idx < num_regions) {
        while (  first_failure_pos < region_size
              && (  value(first_failure_pos)
                 == (*dyn_regions[first_failure_idx]).array[first_failure_pos]
                 )
              ) {
            first_failure_pos++;
        }

        // Stopped inside the region: a mismatching byte was found.
        if (region_size != first_failure_pos) break;

        first_failure_idx++;
        first_failure_pos = 0u;
    }

    after_bad_idx = first_failure_idx;
    after_bad_pos = first_failure_pos;

    // Scan 2: skip the run of bad (mismatching) bytes.
    while (after_bad_idx < num_regions) {
        while (  after_bad_pos < region_size
              && (  value(after_bad_pos)
                 != (*dyn_regions[after_bad_idx]).array[after_bad_pos]
                 )
              ) {
            after_bad_pos++;
        }

        // Stopped inside the region: a matching byte was found.
        if(region_size != after_bad_pos) break;

        after_bad_idx++;
        after_bad_pos = 0u;
    }

    after_good_idx = after_bad_idx;
    after_good_pos = after_bad_pos;

    // Scan 3: skip the following run of good (matching) bytes.
    while (after_good_idx < num_regions) {
        while (  after_good_pos < region_size
              && (  value(after_good_pos)
                 == (*dyn_regions[after_good_idx]).array[after_good_pos]
                 )
              ) {
            after_good_pos++;
        }

        // Stopped inside the region: another mismatching byte was found.
        if(region_size != after_good_pos) break;

        after_good_idx++;
        after_good_pos = 0u;
    }

    // Any mismatch at all (scan 1 did not reach the end) is a failure.
    if (num_regions != first_failure_idx) raise(SIGABRT);
}

===
Mark Millard
markmi at dsl-only.net




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?4DEA2D76-9F27-426D-A8D2-F07B16575FB9>