Date: Tue, 4 Apr 2017 20:00:45 -0700 From: Mark Millard <markmi@dsl-only.net> To: freebsd-arm <freebsd-arm@freebsd.org>, freebsd-hackers@freebsd.org Subject: The arm64 fork-then-swap-out-then-swap-in failures: a program source for exploring them Message-ID: <4DEA2D76-9F27-426D-A8D2-F07B16575FB9@dsl-only.net>
next in thread | raw e-mail | index | archive | help
/*
 * Uncommenting/commenting parts of the below program allows exploring
 * the problems with fork-then-swap-out-then-in on arm64.
 *
 * Note: By swap-out I mean that zero RES(ident memory) results, for the
 * process(es) of interest, as shown by "top -PCwaopid".
 *
 * I discovered recently that swapping-out just before the fork()
 * prevents the failure from the swapping after the fork().
 *
 * Note: Without the fork() no problem happens. Without the later
 * swap-out no problem happens. Both are required. But some activities
 * before the fork() or between fork() and the swap-out prevent the
 * failures.
 *
 * Some of the comments are based on a pine64+ 2GB context. I use stress
 * to force swap-outs during some sleeps in the program. See also
 * Bugzilla 217239 and 217138. (I now expect that they have the same
 * cause.)
 *
 * In my environment I've seen the fork-then-swap-out/swap-in failures
 * on a pine64+ 2GB and a rpi3. They are repeatable on both.
 *
 * I do not have access to server-class machines, or any other arm64
 * machines.
 */

// swap_testing5.c

// Built via (cc was clang 4.0 in my case):
//
// cc -g -std=c11 -Wpedantic -o swaptesting5 swap_testing5.c
// -O0 and -O2 also gets the problem.

// Note: jemalloc's tcache needs to be enabled to get the failure.
//       But FreeBSD can get into a state where /etc/malloc.conf
//       -> 'tcache:false' is ineffective. Also: the allocation
//       size needs to be sufficiently small (<= SMALL_MAXCLASS)
//       to see the problem. Other comments are based on a specific
//       context (pine64+ 2GB).

#include <signal.h>     // for raise(.), SIGABRT (induce core dump)
#include <unistd.h>     // for fork(), sleep(.)
#include <sys/types.h>  // for pid_t
#include <sys/wait.h>   // for wait(.)

extern void test_setup(void);       // Sets up the memory byte patterns.
extern void test_check(void);       // Tests the memory byte patterns.
extern void memory_willneed(void);  // For seeing if
                                    // posix_madvise(.,.,POSIX_MADV_WILLNEED)
                                    // makes a difference.

// Test driver. Sequence: sleep, set up and verify the byte patterns,
// sleep, fork(), then (parent) sleep and wait for the child, and finally
// (parent and child) verify the byte patterns again. The sleeps exist so
// the process can be forced to swap out by hand from another shell.
// Returns 0; a detected pattern mismatch is reported by test_check().
int main(void) {
    sleep(30); // Potentially force swap-out here.
               // [Swap-out here does not avoid later failures.]

    test_setup();
    test_check(); // Before potential sleep(.)/swap-out or fork(.) [passes]

    sleep(30); // Potentially force swap-out here.
               // [Everything below passes if swapped-out here,
               //  no matter if there are later swap-outs
               //  or not.]

    pid_t pid = fork(); // To test no-fork use: = 0; no-fork does not fail.
    int wait_status = 0;

    // HERE: After fork; before sleep/swap-out/wait.

    // if (0 < pid) memory_willneed(); // Does not prevent either parent or
                                       // child failure if enabled.

    // if (0 == pid) memory_willneed(); // Prevents both the parent and the
                                        // child failure. Disable to see
                                        // failure of both parent and child.
                                        // [Presuming no prior swap-out: that
                                        //  would make everything pass.]

    // During sleep/wait: manually force this process to
    // swap out. I use something like:
    //
    //     stress -m 1 --vm-bytes 1800M
    //
    // in another shell and ^C'ing it after top shows the
    // swapped status desired. 1800M just happened to work
    // on the Pine64+ 2GB that I was using. I watch with
    // top -PCwaopid [checking for zero RES(ident memory)].

    if (0 < pid) {
        // Parent process.
        sleep(30); // Intend to swap-out during sleep.

        // test_check(); // Test in parent before child runs (longer sleep).
                         // This test fails if run for a failing region_size
                         // unless earlier preventing-activity happened.

        // NOTE(review): the return value of wait() is ignored; only the
        // status word is examined afterwards. Left as in the original.
        wait(&wait_status); // Only if test_check above passes or is
                            // disabled above.
    }

    // Runs in the child (0 == pid) and in the parent (0 < pid); skipped
    // when fork() failed (pid is -1).
    if (-1 != wait_status && 0 <= pid) {
        if (0 == pid) { sleep(90); } // Intend to swap-out during sleep.

        test_check(); // Fails for small-enough region_size, both
                      // parent and child processes, unless earlier
                      // preventing-activity happened.
    }

    return 0;
}

// The memory and test code follows.

#include <stddef.h>     // for size_t, NULL
#include <stdlib.h>     // for malloc(.), free(.)
#include <sys/mman.h>   // for POSIX_MADV_WILLNEED, posix_madvise(.,.,.)
// Size in bytes of each separately malloc'd region.
#define region_size (14u*1024u)
            // Bad dyn_region pattern, parent and child processes examples:
            //  256u, 2u*1024u, 4u*1024u, 8u*1024u, 9u*1024u, 12u*1024u,
            //  14u*1024u
            // No failure examples:
            //  14u*1024u+1u, 15u*1024u, 16u*1024u, 32u*1024u,
            //  256u*1024u*1024u

// Number of regions: as many as fit in 256 MiB in total.
#define num_regions (256u*1024u*1024u/region_size)

typedef volatile unsigned char value_type;
struct region_struct { value_type array[region_size]; };
typedef struct region_struct region;

// One pointer per malloc'd region; filled in by test_setup().
static region * volatile dyn_regions[num_regions] = {NULL,};

// Expected byte for offset v within a region: the low 8 bits of v with
// bit 0 forced on.
// value avoids zero values: the bad values are zeros.
static unsigned char value(size_t v) {
    return (unsigned char)((v & 0xFEu) | 0x1u);
}

// Allocates every region with malloc() and fills each byte with
// value(offset). Raises SIGABRT if an allocation fails.
void test_setup(void) {
    for (size_t i = 0u; i < num_regions; i++) {
        dyn_regions[i] = malloc(sizeof(region));
        if (!dyn_regions[i]) raise(SIGABRT);

        for (size_t j = 0u; j < region_size; j++) {
            (*dyn_regions[i]).array[j] = value(j);
        }
    }
}

// Calls posix_madvise(., region_size, POSIX_MADV_WILLNEED) on every
// region; the result of each call is deliberately ignored.
void memory_willneed(void) {
    for (size_t i = 0u; i < num_regions; i++) {
        (void) posix_madvise(dyn_regions[i], region_size,
                             POSIX_MADV_WILLNEED);
    }
}

// Results of the most recent test_check() scan.
static volatile size_t first_failure_idx = 0u; // dyn_regions index
static volatile size_t first_failure_pos = 0u; // sub-array index

static volatile size_t after_bad_idx = 0u; // dyn_regions index
static volatile size_t after_bad_pos = 0u; // sub-array index

static volatile size_t after_good_idx = 0u; // dyn_regions index
static volatile size_t after_good_pos = 0u; // sub-array index

// Note: Some failing cases get (conjunctive notation):
//
// 0 == first_failure_idx < after_bad_idx < after_good_idx == num_regions
// && 0 == first_failure_pos && 0<=after_bad_pos<=region_size
// && after_good_pos==0
// && (after_bad_pos is a multiple of the page size in Bytes, here:
//     after_bad_pos==N*4096 for some non-negative integral value N)
//
// other failing cases instead fail with:
//
// 0 == first_failure_idx && num_regions == after_bad_idx == after_good_idx
// && 0 == first_failure_pos == after_bad_pos == after_good_pos
//
// after_bad_idx strongly tends to vary from failing run to failing run
// as does after_bad_pos.

// Note: The working cases get:
//
// num_regions == first_failure_idx == after_bad_idx == after_good_idx
// && 0 == first_failure_pos == after_bad_pos == after_good_pos

// Scans all regions in order and records three positions in the static
// variables declared before this function:
//   first_failure_*: first byte that differs from value(offset);
//   after_bad_*:     first matching byte at or after that point
//                    (end of the run of bad bytes);
//   after_good_*:    first differing byte at or after that point
//                    (end of the following run of good bytes).
// Each position is (num_regions, 0) when the scan reaches the end.
// Raises SIGABRT if any differing byte was found.
void test_check(void) {
    first_failure_pos = 0u;
    first_failure_idx = 0u;

    // Phase 1: skip bytes that still hold the expected pattern.
    while (first_failure_idx < num_regions) {
        while (   first_failure_pos < region_size
               && (   value(first_failure_pos)
                   == (*dyn_regions[first_failure_idx]).array[first_failure_pos]
                  )
              ) {
            first_failure_pos++;
        }

        // Stopped inside the region: a mismatch was found.
        if (region_size != first_failure_pos) break;

        first_failure_idx++;
        first_failure_pos = 0u;
    }

    after_bad_idx = first_failure_idx;
    after_bad_pos = first_failure_pos;

    // Phase 2: skip the run of bytes that do NOT match the pattern.
    while (after_bad_idx < num_regions) {
        while (   after_bad_pos < region_size
               && (   value(after_bad_pos)
                   != (*dyn_regions[after_bad_idx]).array[after_bad_pos]
                  )
              ) {
            after_bad_pos++;
        }

        if (region_size != after_bad_pos) break;

        after_bad_idx++;
        after_bad_pos = 0u;
    }

    after_good_idx = after_bad_idx;
    after_good_pos = after_bad_pos;

    // Phase 3: skip the following run of bytes that match again.
    while (after_good_idx < num_regions) {
        while (   after_good_pos < region_size
               && (   value(after_good_pos)
                   == (*dyn_regions[after_good_idx]).array[after_good_pos]
                  )
              ) {
            after_good_pos++;
        }

        if (region_size != after_good_pos) break;

        after_good_idx++;
        after_good_pos = 0u;
    }

    // Any mismatch at all: abort (SIGABRT) with the positions recorded.
    if (num_regions != first_failure_idx) raise(SIGABRT);
}

// ===
// Mark Millard
// markmi at dsl-only.net
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?4DEA2D76-9F27-426D-A8D2-F07B16575FB9>