Date: Sun, 28 Jun 2015 05:04:09 +0000 (UTC)
From: Konstantin Belousov <kib@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r284901 - in head/sys: amd64/amd64 amd64/include i386/i386 i386/include
Message-ID: <201506280504.t5S549Cv042100@svn.freebsd.org>
Author: kib
Date: Sun Jun 28 05:04:08 2015
New Revision: 284901

URL: https://svnweb.freebsd.org/changeset/base/284901

Log:
  Remove an unneeded data dependency, currently imposed by
  atomic_load_acq(9) on its source, for x86.

  Right now, atomic_load_acq() on x86 is sequentially consistent with
  other atomics; the code ensures this by issuing a Store/Load barrier,
  implemented as a locked nop on the source.  Provide a separate
  primitive __storeload_barrier(), implemented as a locked nop on a
  cpu-private variable, and put __storeload_barrier() before the load.
  This keeps the seq_cst semantic but avoids introducing a false
  dependency on the source remaining unmodified until its later use.

  Note that the seq_cst property of x86 atomic_load_acq() is neither
  documented nor provided by the atomics implementations on other
  architectures, although some kernel code relies on the behaviour.
  This commit does not intend to change this.

  Reviewed by:	alc
  Discussed with:	bde
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks

Modified:
  head/sys/amd64/amd64/atomic.c
  head/sys/amd64/amd64/vm_machdep.c
  head/sys/amd64/include/atomic.h
  head/sys/i386/i386/vm_machdep.c
  head/sys/i386/include/atomic.h

Modified: head/sys/amd64/amd64/atomic.c
==============================================================================
--- head/sys/amd64/amd64/atomic.c	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/amd64/amd64/atomic.c	Sun Jun 28 05:04:08 2015	(r284901)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #undef ATOMIC_ASM
 
 /* Make atomic.h generate public functions */
+static __inline void __storeload_barrier(void);
 #define	WANT_FUNCTIONS
 #define	static
 #undef __inline

Modified: head/sys/amd64/amd64/vm_machdep.c
==============================================================================
--- head/sys/amd64/amd64/vm_machdep.c	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/amd64/amd64/vm_machdep.c	Sun Jun 28 05:04:08 2015	(r284901)
@@ -93,6 +93,8 @@ _Static_assert(OFFSETOF_CURTHREAD == off
     "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
 _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
     "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
+_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
+    "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)

Modified: head/sys/amd64/include/atomic.h
==============================================================================
--- head/sys/amd64/include/atomic.h	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/amd64/include/atomic.h	Sun Jun 28 05:04:08 2015	(r284901)
@@ -85,7 +85,7 @@ u_long	atomic_fetchadd_long(volatile u_l
 int	atomic_testandset_int(volatile u_int *p, u_int v);
 int	atomic_testandset_long(volatile u_long *p, u_int v);
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
+#define	ATOMIC_LOAD(TYPE)				\
 u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
 #define	ATOMIC_STORE(TYPE)				\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -245,53 +245,79 @@ atomic_testandset_long(volatile u_long *
  * We assume that a = b will do atomic loads and stores.  Due to the
  * IA32 memory model, a simple store guarantees release semantics.
  *
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels.  We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence.  For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels.  We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies.  In userspace, a word in the red zone on the stack
+ * (-8(%rsp)) is utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
  */
-#define	ATOMIC_STORE(TYPE)				\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__compiler_membar();				\
-	*p = v;						\
-}							\
-struct __hack
 
-#if defined(_KERNEL) && !defined(SMP)
+#if defined(_KERNEL)
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE tmp;					\
-							\
-	tmp = *p;					\
-	__compiler_membar();				\
-	return (tmp);					\
-}							\
-struct __hack
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in amd64/vm_machdep.c ensures that the value is correct.
+ */
+#define	OFFSETOF_MONITORBUF	0x180
 
-#else /* !(_KERNEL && !SMP) */
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE res;					\
-							\
-	__asm __volatile(MPLOCKED LOP			\
-	: "=a" (res),			/* 0 */		\
-	  "+m" (*p)			/* 1 */		\
-	: : "memory", "cc");				\
-	return (res);					\
-}							\
+	__asm __volatile("lock; addl $0,%%gs:%0"
+	    : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
+
+	__compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+	__asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL */
+
+#define	ATOMIC_LOAD(TYPE)				\
+static __inline u_##TYPE				\
+atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
+{							\
+	u_##TYPE res;					\
+							\
+	__storeload_barrier();				\
+	res = *p;					\
+	__compiler_membar();				\
+	return (res);					\
+}							\
 struct __hack
 
-#endif /* _KERNEL && !SMP */
+#define	ATOMIC_STORE(TYPE)				\
+static __inline void					\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \
+{							\
+							\
+	__compiler_membar();				\
+	*p = v;						\
+}							\
+struct __hack
 
 #endif /* KLD_MODULE || !__GNUCLIKE_ASM */
 
@@ -315,20 +341,19 @@ ATOMIC_ASM(clear,     long,  "andq %1,%0"
 ATOMIC_ASM(add,	     long,  "addq %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subq %1,%0",  "ir",  v);
 
-ATOMIC_LOAD(char, "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int, "cmpxchgl %0,%1");
-ATOMIC_LOAD(long, "cmpxchgq %0,%1");
-
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+#define	ATOMIC_LOADSTORE(TYPE)				\
+	ATOMIC_LOAD(TYPE);				\
+	ATOMIC_STORE(TYPE)
+
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
 
 #undef ATOMIC_ASM
 #undef ATOMIC_LOAD
 #undef ATOMIC_STORE
-
+#undef ATOMIC_LOADSTORE
 
 #ifndef WANT_FUNCTIONS
 
 /* Read the current value and store a new value in the destination. */

Modified: head/sys/i386/i386/vm_machdep.c
==============================================================================
--- head/sys/i386/i386/vm_machdep.c	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/i386/i386/vm_machdep.c	Sun Jun 28 05:04:08 2015	(r284901)
@@ -111,6 +111,8 @@ _Static_assert(OFFSETOF_CURTHREAD == off
     "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
 _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
     "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
+_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
+    "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
 static void cpu_reset_real(void);
 #ifdef SMP

Modified: head/sys/i386/include/atomic.h
==============================================================================
--- head/sys/i386/include/atomic.h	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/i386/include/atomic.h	Sun Jun 28 05:04:08 2015	(r284901)
@@ -87,7 +87,7 @@ int	atomic_cmpset_int(volatile u_int *ds
 u_int	atomic_fetchadd_int(volatile u_int *p, u_int v);
 int	atomic_testandset_int(volatile u_int *p, u_int v);
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
+#define	ATOMIC_LOAD(TYPE)				\
 u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
 #define	ATOMIC_STORE(TYPE)				\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -228,53 +228,78 @@ atomic_testandset_int(volatile u_int *p,
  * We assume that a = b will do atomic loads and stores.  Due to the
  * IA32 memory model, a simple store guarantees release semantics.
  *
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels.  We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence.  For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels.  We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies.  In userspace, a word at the top of the stack is
+ * utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
  */
-#define	ATOMIC_STORE(TYPE)				\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__compiler_membar();				\
-	*p = v;						\
-}							\
-struct __hack
+#if defined(_KERNEL)
 
-#if defined(_KERNEL) && !defined(SMP)
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in i386/vm_machdep.c ensures that the value is correct.
+ */
+#define	OFFSETOF_MONITORBUF	0x180
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE tmp;					\
-							\
-	tmp = *p;					\
-	__compiler_membar();				\
-	return (tmp);					\
-}							\
-struct __hack
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
 
-#else /* !(_KERNEL && !SMP) */
+	__asm __volatile("lock; addl $0,%%fs:%0"
+	    : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE res;					\
-							\
-	__asm __volatile(MPLOCKED LOP			\
-	: "=a" (res),			/* 0 */		\
-	  "+m" (*p)			/* 1 */		\
-	: : "memory", "cc");				\
-	return (res);					\
-}							\
+	__compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL */
+
+#define	ATOMIC_LOAD(TYPE)				\
+static __inline u_##TYPE				\
+atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
+{							\
+	u_##TYPE res;					\
+							\
+	__storeload_barrier();				\
+	res = *p;					\
+	__compiler_membar();				\
+	return (res);					\
+}							\
 struct __hack
 
-#endif /* _KERNEL && !SMP */
+#define	ATOMIC_STORE(TYPE)				\
+static __inline void					\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \
+{							\
+							\
+	__compiler_membar();				\
+	*p = v;						\
+}							\
+struct __hack
 
 #ifdef _KERNEL
 
@@ -511,19 +536,19 @@ ATOMIC_ASM(clear,     long,  "andl %1,%0"
 ATOMIC_ASM(add,	     long,  "addl %1,%0", "ir",  v);
 ATOMIC_ASM(subtract, long,  "subl %1,%0", "ir",  v);
 
-ATOMIC_LOAD(char, "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int, "cmpxchgl %0,%1");
-ATOMIC_LOAD(long, "cmpxchgl %0,%1");
-
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+#define	ATOMIC_LOADSTORE(TYPE)				\
+	ATOMIC_LOAD(TYPE);				\
+	ATOMIC_STORE(TYPE)
+
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
 
 #undef ATOMIC_ASM
 #undef ATOMIC_LOAD
 #undef ATOMIC_STORE
+#undef ATOMIC_LOADSTORE
 
 #ifndef WANT_FUNCTIONS
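
For illustration, the pattern the new ATOMIC_LOAD() expansion uses can be
reduced to a self-contained userland sketch.  This is not FreeBSD code: the
demo_* names are invented for the example, GCC/Clang inline assembly on
amd64 is assumed, and the barrier targets the stack red zone the same way
the !_KERNEL branch of the new header does.

#include <stdio.h>

/*
 * Illustrative store/load barrier: a locked add of zero to a scratch
 * word in the stack red zone.  The write is a no-op, so it cannot
 * clobber anything the compiler keeps there, while the LOCK prefix
 * drains the store buffer before later loads.
 */
static inline void
demo_storeload_barrier(void)
{
	__asm__ __volatile__("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
}

/*
 * Acquire load shaped like the new ATOMIC_LOAD() body: barrier first,
 * then a plain load, then a compiler-only barrier so later accesses
 * are not hoisted above the load.
 */
static inline unsigned int
demo_load_acq_int(volatile unsigned int *p)
{
	unsigned int res;

	demo_storeload_barrier();
	res = *p;
	__asm__ __volatile__("" : : : "memory");
	return (res);
}

static volatile unsigned int flag;

int
main(void)
{
	printf("flag = %u\n", demo_load_acq_int(&flag));
	return (0);
}

Because the locked addition hits a location no other code reads, the
barrier does not make later uses of the loaded variable depend on the
atomic's source line staying unmodified, which is the false dependency
the commit removes.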
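The vm_machdep.c hunks pair the open-coded OFFSETOF_MONITORBUF constant with
a compile-time check so the number cannot silently drift from the structure
layout.  The same idiom can be shown in isolation; struct demo_pcpu below is
a made-up stand-in for struct pcpu and its offsets are arbitrary.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical per-CPU structure standing in for struct pcpu. */
struct demo_pcpu {
	uint64_t	dp_curthread;
	uint64_t	dp_curpcb;
	char		dp_pad[112];
	char		dp_monitorbuf[128];
};

/*
 * Open-coded offset, as a header might hard-wire it to avoid pulling
 * in the structure definition everywhere the header is included.
 */
#define	DEMO_OFFSETOF_MONITORBUF	128

/* C11 compile-time check that the hard-wired value matches the layout. */
_Static_assert(DEMO_OFFSETOF_MONITORBUF ==
    offsetof(struct demo_pcpu, dp_monitorbuf),
    "DEMO_OFFSETOF_MONITORBUF does not correspond with offset of dp_monitorbuf.");

If the structure layout ever changes, the build fails at the assertion
instead of the barrier silently touching the wrong cache line.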