Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 29 Jul 2012 20:12:43 +0300
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        Bruce Evans <brde@optusnet.com.au>
Cc:        amd64@freebsd.org, Andriy Gapon <avg@freebsd.org>
Subject:   Re: Use fences for kernel tsc reads.
Message-ID:  <20120729171243.GO2676@deviant.kiev.zoral.com.ua>
In-Reply-To: <20120729123231.K1193@besplex.bde.org>
References:  <20120728160202.GI2676@deviant.kiev.zoral.com.ua> <20120729123231.K1193@besplex.bde.org>

next in thread | previous in thread | raw e-mail | index | archive | help

--/bcALokiWR3y46GP
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Sun, Jul 29, 2012 at 02:58:36PM +1000, Bruce Evans wrote:
> On Sat, 28 Jul 2012, Konstantin Belousov wrote:
> >...
> >Handling of usermode will follow later.
>=20
> I hesitate to mention that this doesn't pessimize all uses of rdtsc:
> - x86/isa/clock.c uses raw rdtsc32() for DELAY()
There, fence is not needed because we do not compare counters on
different cores. Note that delay_tc() explicitely performs pin when
tsc is going to be used.

> - x86/x86/mca.c uses raw rdtsc() for mca_check_status()
mca use of rdtsc() seems to tbe purely informational.

> >diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
> >index c253a96..101cbb3 100644
> >--- a/sys/x86/x86/tsc.c
> >+++ b/sys/x86/x86/tsc.c
> >...
> >@@ -328,15 +344,26 @@ init_TSC(void)
> >
> >#ifdef SMP
> >
> >-/* rmb is required here because rdtsc is not a serializing instruction.=
 */
> >+/*
> >+ * RDTSC is not a serializing instruction, so we need to drain
> >+ * instruction stream before executing it. It could be fixed by use of
> >+ * RDTSCP, except the instruction is not available everywhere.
>=20
> Sentence breaks are 2 spaces in KNF.  There was a recent thread about thi=
s.
Changed.

> RDTSC apparently doesn't need full serialization (else fences wouldn't
> be enough, and the necessary serialization would be slower).  The
> necessary serialization is very complex and not fully described above,
> but the above is not a good place to describe many details.  It already
> describes too much.  It is obvious that we shouldn't use large code or
> ifdefs to be portable in places where we don't care at all about
> efficiency.  Only delicate timing requirements would prevent us using
> the simple CPUID in such code.  For example, in code very much like
> this, we might need to _not_ slow things down, since we might want
> to see small differences between the TSCs on different CPUs.
This is argument to stop using do_cpuid(), since it causes relatively
large amount of unneeded memory writes. I inlined CPUID instead with
explicit register clobber list.

>=20
> >@@ -592,23 +628,55 @@ sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS)
> >SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW,
> >    0, 0, sysctl_machdep_tsc_freq, "QU", "Time Stamp Counter frequency");
> >
> >-static u_int
> >+static inline u_int
> >tsc_get_timecount(struct timecounter *tc __unused)
> >{
> >
> >	return (rdtsc32());
> >}
>=20
> Adding `inline' gives up to 5 (!) style bugs at different levels:
> - `inline' is usually spelled `__inline'.  The former is now standard,
>   but sys/cdefs.h never caught up with this and has better magic for
>   the latter.  And `inline' doesn't match most existing spellings.
There is no reasons to use __inline in the kernel code anymore. We are in
C99 mode always when compiling kernel.

> - the function is still forward-declared without `inline'.  The differenc
>   is confusing.
> - forward declaration of inline functions is nonsense.  It sort of asks
>   for early uses to be non-inline and late uses to be inline.  But
>   gcc now doesn't care much about the order, since -funit-at-a-time is
>   the default.
> - this function is never actually inline.  You added calls to the others
>   but not this.
> - if this function were inline, then this might happen automatically due
>   to -finline-functions-called-once.  See below.
>=20
> >
> >-static u_int
> >+static inline u_int
> >tsc_get_timecount_low(struct timecounter *tc)
> >{
> >	uint32_t rv;
> >
> >	__asm __volatile("rdtsc; shrd %%cl, %%edx, %0"
> >-	: "=3Da" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
> >+	    : "=3Da" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
> >	return (rv);
> >}
=2E..
Added inline to declaration of tsc_get_timecounter_low. And removed
'inline' from other functions. Compiler still generates exactly '*fence;
rdtsc' sequences. In truth, for _low variabnts, the load of tc_priv is
done between fence and rdtsc, but I kept it as is for now.

diff --git a/sys/amd64/include/cpufunc.h b/sys/amd64/include/cpufunc.h
index 94d4133..881fcd2 100644
--- a/sys/amd64/include/cpufunc.h
+++ b/sys/amd64/include/cpufunc.h
@@ -290,6 +290,13 @@ popcntq(u_long mask)
 }
=20
 static __inline void
+lfence(void)
+{
+
+	__asm __volatile("lfence" : : : "memory");
+}
+
+static __inline void
 mfence(void)
 {
=20
diff --git a/sys/i386/include/cpufunc.h b/sys/i386/include/cpufunc.h
index 62d268d..7cd3663 100644
--- a/sys/i386/include/cpufunc.h
+++ b/sys/i386/include/cpufunc.h
@@ -155,6 +155,13 @@ cpu_mwait(u_long extensions, u_int hints)
 }
=20
 static __inline void
+lfence(void)
+{
+
+	__asm __volatile("lfence" : : : "memory");
+}
+
+static __inline void
 mfence(void)
 {
=20
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index c253a96..3d8bd30 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -82,7 +82,11 @@ static void tsc_freq_changed(void *arg, const struct cf_=
level *level,
 static void tsc_freq_changing(void *arg, const struct cf_level *level,
     int *status);
 static unsigned tsc_get_timecount(struct timecounter *tc);
-static unsigned tsc_get_timecount_low(struct timecounter *tc);
+static inline unsigned tsc_get_timecount_low(struct timecounter *tc);
+static unsigned tsc_get_timecount_lfence(struct timecounter *tc);
+static unsigned tsc_get_timecount_low_lfence(struct timecounter *tc);
+static unsigned tsc_get_timecount_mfence(struct timecounter *tc);
+static unsigned tsc_get_timecount_low_mfence(struct timecounter *tc);
 static void tsc_levels_changed(void *arg, int unit);
=20
 static struct timecounter tsc_timecounter =3D {
@@ -262,6 +266,10 @@ probe_tsc_freq(void)
 		    (vm_guest =3D=3D VM_GUEST_NO &&
 		    CPUID_TO_FAMILY(cpu_id) >=3D 0x10))
 			tsc_is_invariant =3D 1;
+		if (cpu_feature & CPUID_SSE2) {
+			tsc_timecounter.tc_get_timecount =3D
+			    tsc_get_timecount_mfence;
+		}
 		break;
 	case CPU_VENDOR_INTEL:
 		if ((amd_pminfo & AMDPM_TSC_INVARIANT) !=3D 0 ||
@@ -271,6 +279,10 @@ probe_tsc_freq(void)
 		    (CPUID_TO_FAMILY(cpu_id) =3D=3D 0xf &&
 		    CPUID_TO_MODEL(cpu_id) >=3D 0x3))))
 			tsc_is_invariant =3D 1;
+		if (cpu_feature & CPUID_SSE2) {
+			tsc_timecounter.tc_get_timecount =3D
+			    tsc_get_timecount_lfence;
+		}
 		break;
 	case CPU_VENDOR_CENTAUR:
 		if (vm_guest =3D=3D VM_GUEST_NO &&
@@ -278,6 +290,10 @@ probe_tsc_freq(void)
 		    CPUID_TO_MODEL(cpu_id) >=3D 0xf &&
 		    (rdmsr(0x1203) & 0x100000000ULL) =3D=3D 0)
 			tsc_is_invariant =3D 1;
+		if (cpu_feature & CPUID_SSE2) {
+			tsc_timecounter.tc_get_timecount =3D
+			    tsc_get_timecount_lfence;
+		}
 		break;
 	}
=20
@@ -328,16 +344,31 @@ init_TSC(void)
=20
 #ifdef SMP
=20
-/* rmb is required here because rdtsc is not a serializing instruction. */
-#define	TSC_READ(x)			\
-static void				\
-tsc_read_##x(void *arg)			\
-{					\
-	uint32_t *tsc =3D arg;		\
-	u_int cpu =3D PCPU_GET(cpuid);	\
-					\
-	rmb();				\
-	tsc[cpu * 3 + x] =3D rdtsc32();	\
+/*
+ * RDTSC is not a serializing instruction, and does not drain
+ * instruction stream, so we need to drain the stream before executing
+ * it.  It could be fixed by use of RDTSCP, except the instruction is
+ * not available everywhere.
+ *
+ * Use CPUID for draining in the boot-time SMP constistency test.  The
+ * timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel
+ * and VIA) when SSE2 is present, and nothing on older machines which
+ * also do not issue RDTSC prematurely.  There, testing for SSE2 and
+ * vendor is too cumbersome, and we learn about TSC presence from
+ * CPUID.
+ *
+ * Do not use do_cpuid(), since we do not need CPUID results, which
+ * have to be written into memory with do_cpuid().
+ */
+#define	TSC_READ(x)							\
+static void								\
+tsc_read_##x(void *arg)							\
+{									\
+	uint32_t *tsc =3D arg;						\
+	u_int cpu =3D PCPU_GET(cpuid);					\
+									\
+	__asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx");	\
+	tsc[cpu * 3 + x] =3D rdtsc32();					\
 }
 TSC_READ(0)
 TSC_READ(1)
@@ -487,7 +518,16 @@ init:
 	for (shift =3D 0; shift < 31 && (tsc_freq >> shift) > max_freq; shift++)
 		;
 	if (shift > 0) {
-		tsc_timecounter.tc_get_timecount =3D tsc_get_timecount_low;
+		if (cpu_feature & CPUID_SSE2) {
+			if (cpu_vendor_id =3D=3D CPU_VENDOR_AMD) {
+				tsc_timecounter.tc_get_timecount =3D
+				    tsc_get_timecount_low_mfence;
+			} else {
+				tsc_timecounter.tc_get_timecount =3D
+				    tsc_get_timecount_low_lfence;
+			}
+		} else
+			tsc_timecounter.tc_get_timecount =3D tsc_get_timecount_low;
 		tsc_timecounter.tc_name =3D "TSC-low";
 		if (bootverbose)
 			printf("TSC timecounter discards lower %d bit(s)\n",
@@ -599,16 +639,48 @@ tsc_get_timecount(struct timecounter *tc __unused)
 	return (rdtsc32());
 }
=20
-static u_int
+static inline u_int
 tsc_get_timecount_low(struct timecounter *tc)
 {
 	uint32_t rv;
=20
 	__asm __volatile("rdtsc; shrd %%cl, %%edx, %0"
-	: "=3Da" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
+	    : "=3Da" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx");
 	return (rv);
 }
=20
+static u_int
+tsc_get_timecount_lfence(struct timecounter *tc __unused)
+{
+
+	lfence();
+	return (rdtsc32());
+}
+
+static u_int
+tsc_get_timecount_low_lfence(struct timecounter *tc)
+{
+
+	lfence();
+	return (tsc_get_timecount_low(tc));
+}
+
+static u_int
+tsc_get_timecount_mfence(struct timecounter *tc __unused)
+{
+
+	mfence();
+	return (rdtsc32());
+}
+
+static u_int
+tsc_get_timecount_low_mfence(struct timecounter *tc)
+{
+
+	mfence();
+	return (tsc_get_timecount_low(tc));
+}
+
 uint32_t
 cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
 {

--/bcALokiWR3y46GP
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.12 (FreeBSD)

iEYEARECAAYFAlAVbwsACgkQC3+MBN1Mb4jAEQCfY3F7EPgEz65p43knh0BvqT+1
nqoAoNqLT0k8T8bX7TXWPSd0V47uOOS2
=Mait
-----END PGP SIGNATURE-----

--/bcALokiWR3y46GP--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20120729171243.GO2676>