Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 13 May 2019 03:23:33 -0700
From:      Mark Millard <marklmi@yahoo.com>
To:        Justin Hibbits <chmeeedalf@gmail.com>, FreeBSD PowerPC ML <freebsd-ppc@freebsd.org>
Subject:   An experiment in PowerMac G5 multi-socket/multi-core having better matching mftb() values
Message-ID:  <C7CD77C3-8A0F-44A0-926E-BE7C28FB69A3@yahoo.com>

next in thread | raw e-mail | index | archive | help
I've been experimenting with a alternate
technique of dealing with boot-time 970 family
PowerMac G5 tbr value synchronization across
sockets/cores. So far it has narrowed the
range significantly. I've reverted my hack for
tolerating the mismatches in order to see how
it goes.

I'm not aware of other contexts having the
threads-get-stuck-sleeping problem from the
tbr mismatch scale that can happen as things
are officially. And, if there are any,
I've no environment to test.

The technique definitely requires the
relationship between the mftb() value
changing rate and the time it takes to
store-release/load-acq each way between
an ap and the bsp to be such that the
round trip time is reasonably measurable,
with a useful combination of accuracy
and precision. Thus there are limits
to its generality if some other context
attempted something analogous.

Each ap does its own instance of the
process. No single delta would work.

It is also based on the expectation that
the store-release/load-acq each way takes
a non-trivial amount of the round trip
time, putting the bsp's activity in the
middle part of the round trip range.
(Interrupts disabled around the relevant
code.) What Ive seen suggests that this
is true.

I've included my exploratory code below.
It is based on my head -r345758 context.

The ap sends the bsp a mftb() value (that
the bsp only used as a flag that it is
time to send back its own mftb() value).
The ap also calculates the approximate
round trip ap-time for this exchange.
=46rom such the ap comes up with the
adjustment to its mftb() values to
approximate the mftb() value the bsp
provided. The ap then uses that as an
adjustment to the ap's tbr value (via
mttb()). So far the results seem to be
a sizable improvement.

[In experiments I've been labeling some variables
volatile, just to indicate that I generally do not
expect loads/stores to be skipped for them. This
does not mean that I'd observed any cases of just
holding a value in a register. This may produce
minor text mismatches with other files not shown
here.]


# svnlite diff /usr/src/sys/powerpc/powermac/platform_powermac.c =
/usr/src/sys/powerpc/powerpc/mp_machdep.c | more
Index: /usr/src/sys/powerpc/powermac/platform_powermac.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- /usr/src/sys/powerpc/powermac/platform_powermac.c   (revision =
345758)
+++ /usr/src/sys/powerpc/powermac/platform_powermac.c   (working copy)
@@ -55,7 +55,7 @@
=20
 #include "platform_if.h"
=20
-extern void *ap_pcpu;
+extern void * volatile ap_pcpu;
=20
 static int powermac_probe(platform_t);
 static int powermac_attach(platform_t);
@@ -333,6 +333,10 @@
        return (powermac_smp_fill_cpuref(cpuref, bsp));
 }
=20
+#ifdef __powerpc64__
+extern volatile int alternate_timebase_sync_style;
+#endif
+
 static int
 powermac_smp_start_cpu(platform_t plat, struct pcpu *pc)
 {
@@ -366,6 +370,19 @@
        }
=20
        ap_pcpu =3D pc;
+#ifdef __powerpc64__
+       switch (mfpvr()>>16)
+       {
+               case IBM970:
+               case IBM970FX:
+               case IBM970MP:
+                       alternate_timebase_sync_style=3D 1;
+                       break;
+               default:
+                       break;
+       }
+#endif
+       powerpc_sync();
=20
        if (rstvec_virtbase =3D=3D NULL)
                rstvec_virtbase =3D pmap_mapdev(0x80000000, PAGE_SIZE);
Index: /usr/src/sys/powerpc/powerpc/mp_machdep.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- /usr/src/sys/powerpc/powerpc/mp_machdep.c   (revision 345758)
+++ /usr/src/sys/powerpc/powerpc/mp_machdep.c   (working copy)
@@ -70,6 +70,13 @@
 static struct mtx ap_boot_mtx;
 struct pcb stoppcbs[MAXCPU];
=20
+#if defined(__powerpc64__) && defined(AIM)
+// Part of: Attempt a better-than-historical approximately equal =
timebase value for ap vs. bsp
+volatile int alternate_timebase_sync_style=3D 0;
+volatile uint64_t timebase_samples[2]; // 0: from ap; 1: from bsp.
+                                      // Consider separate cache lines?
+#endif
+
 void
 machdep_ap_bootstrap(void)
 {
@@ -77,19 +84,65 @@
        PCPU_SET(awake, 1);
        __asm __volatile("msync; isync");
=20
+#if defined(__powerpc64__) && defined(AIM)
+       // Attempt a better-than-historical approximately equal timebase =
value for ap vs. bsp
+       powerpc_sync();
+       isync();
+       if (alternate_timebase_sync_style) // Requires: timeframe with =
only one ap at a time
+       {
+               register_t oldmsr=3D intr_disable();
+
+               while (1u!=3Dtimebase_samples[1])
+                       ; // spin waiting for bsp to flag that ready to =
start.
+
+               //  Measure a round trip:: to the bsp and back.
+
+               isync(); // Be sure below mftb() result is not from =
earlier speculative execution.
+               atomic_store_rel_64(&timebase_samples[0], mftb()); // =
bsp waits for this before its mftb().
+
+               while (1u=3D=3Dtimebase_samples[1]) // expect bsp to =
have: 1u<mftb()
+                       ; // spin waiting for bsp's tbr value
+               // Mid-point of ap round trip and the bsp timebase value =
should be approximately equal
+               // when the tbr's are well matched, absent interruptions =
on both sides.
+
+               isync(); // Be sure below mftb() result is not from =
earlier speculative execution.
+               register_t const end_round_trip_time_on_ap=3D mftb(); // =
Allows estimate round-trip time.
+
+               int64_t const approx_round_trip_tbr_detla_on_ap=3D =
end_round_trip_time_on_ap-timebase_samples[0];
+               int64_t const ap_midpoint_tbr_value=3D =
timebase_samples[0] + approx_round_trip_tbr_detla_on_ap/2;
+
+               // Establish delta_to_match_bsp_example such that:
+               //           =
ap_midpoint_tbr_value+delta_to_match_bsp_example=3D=3Dtimebase_samples[1] =
(from bsp)
+               int64_t const delta_to_match_bsp_tbr_example=3D =
timebase_samples[1]-ap_midpoint_tbr_value;
+
+               isync(); // Be sure below mftb() result is not from =
earlier speculative execution.
+               mttb((int64_t)mftb()+delta_to_match_bsp_tbr_example); // =
Make the ap tbr adjustment.
+
+               atomic_store_rel_64(&timebase_samples[0], 0u); // Get =
ready for next ap in bsp loop
+               atomic_store_rel_64(&timebase_samples[1], 0u); // also =
flaging bsp that this ap is done
+
+               mtmsr(oldmsr);
+       }
+#endif
+
        while (ap_letgo =3D=3D 0)
                __asm __volatile("or 31,31,31");
        __asm __volatile("or 6,6,6");
=20
-       /*
-        * Set timebase as soon as possible to meet an implicit =
rendezvous
-        * from cpu_mp_unleash(), which sets ap_letgo and then =
immediately
-        * sets timebase.
-        *
-        * Note that this is instrinsically racy and is only relevant on
-        * platforms that do not support better mechanisms.
-        */
-       platform_smp_timebase_sync(ap_timebase, 1);
+#if defined(__powerpc64__) && defined(AIM)
+       if (!alternate_timebase_sync_style)
+#endif
+       {
+               /*
+                * Set timebase as soon as possible to meet an implicit =
rendezvous
+                * from cpu_mp_unleash(), which sets ap_letgo and then =
immediately
+                * sets timebase.
+                *
+                * Note that this is instrinsically racy and is only =
relevant on
+                * platforms that do not support better mechanisms.
+                */
+               platform_smp_timebase_sync(ap_timebase, 1);
+       }
=20
        /* Give platform code a chance to do anything else necessary */
        platform_smp_ap_init();
@@ -251,6 +304,34 @@
                                    pc->pc_cpuid, =
(uintmax_t)pc->pc_hwref,
                                    pc->pc_awake);
                        smp_cpus++;
+
+#if defined(__powerpc64__) && defined(AIM)
+                       // Part of: Attempt a better-than-historical =
approximately
+                       //          equal timebase value for ap vs. bsp
+                       powerpc_sync();
+                       isync();
+                       if (alternate_timebase_sync_style)
+                       {
+                               register_t oldmsr=3D intr_disable();
+
+                               =
atomic_store_rel_64(&timebase_samples[1], 1u); // flag ap that bsp is =
ready to start.
+
+                               while (0u=3D=3Dtimebase_samples[0]) // =
Expect on ap's: 0u<mftb()
+                                       ; // spin waiting for ap's tbr =
value to flag that it is time to send one back.
+
+                               isync(); // Be sure below mftb() result =
is not from earlier speculative execution.
+                               =
atomic_store_rel_64(&timebase_samples[1], mftb()); // Give ap the bsp =
tbr value
+                                                                        =
          // from the time frame.
+
+                               // Most of the rest of the usage is in =
machdep_ap_bootstrap,
+                               // other than controling =
alternate_timebase_sync_style.
+
+                               while (0u!=3Dtimebase_samples[1])
+                                       ; // spin waiting for ap's to be =
done with the samples.
+
+                               mtmsr(oldmsr);
+                       }
+#endif
                } else
                        CPU_SET(pc->pc_cpuid, &stopped_cpus);
        }
@@ -257,14 +338,22 @@
=20
        ap_awake =3D 1;
=20
-       /* Provide our current DEC and TB values for APs */
-       ap_timebase =3D mftb() + 10;
-       __asm __volatile("msync; isync");
+#if defined(__powerpc64__) && defined(AIM)
+       if (!alternate_timebase_sync_style)
+#endif
+       {
+               /* Provide our current DEC and TB values for APs */
+               ap_timebase =3D mftb() + 10;
+               __asm __volatile("msync; isync");
+       }
       =20
        /* Let APs continue */
        atomic_store_rel_int(&ap_letgo, 1);
=20
-       platform_smp_timebase_sync(ap_timebase, 0);
+#if defined(__powerpc64__) && defined(AIM)
+       if (!alternate_timebase_sync_style)
+#endif
+               platform_smp_timebase_sync(ap_timebase, 0);
=20
        while (ap_awake < smp_cpus)
                ;


=3D=3D=3D
Mark Millard
marklmi at yahoo.com
( dsl-only.net went
away in early 2018-Mar)




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?C7CD77C3-8A0F-44A0-926E-BE7C28FB69A3>