Date: Sun, 31 Jul 2016 13:49:28 +0300
From: Konstantin Belousov <kostikbel@gmail.com>
To: Mateusz Guzik <mjguzik@gmail.com>
Cc: jhb@freebsd.org, freebsd-current@freebsd.org
Subject: Re: [PATCH] randomized delay in locking primitives, take 2
Message-ID: <20160731104928.GW83214@kib.kiev.ua>
In-Reply-To: <20160731095706.GB9408@dft-labs.eu>
References: <20160731095706.GB9408@dft-labs.eu>
On Sun, Jul 31, 2016 at 11:57:06AM +0200, Mateusz Guzik wrote:
> The patch can be found inline below and also here:
> https://people.freebsd.org/~mjg/lock_backoff_complete.diff
>
> The previous version of the patch was posted here:
> https://lists.freebsd.org/pipermail/freebsd-current/2016-July/062320.html
>
> This time around, instead of a total hack, I have something of reasonable
> quality.
>
> The problem description and the way to fix it still stand, to quote:
> > If the lock is contended, primitives like __mtx_lock_sleep will spin
> > checking if the owner is running or the lock was freed. The problem is
> > that once it is discovered that the lock is free, multiple CPUs are
> > likely to try to do the atomic op, which will make it more costly for
> > everyone, and throughput suffers.
>
> > The standard thing to do is to have some sort of randomized delay so
> > that this kind of behaviour is reduced.
>
> So, this time around the lock_delay primitive accepts a configuration
> including the min/max spin count per iteration, the increment step per
> lock_delay invocation, and the total spins allowed during one execution
> of the locking primitive. The code "autotunes" with mp_ncpus on boot.
> Current values were determined after testing the code on an 80-way and
> a 40-way system. There is definitely room for improvement there, but
> the gain as it stands is sufficient to get the patch in.
>
> Parameters are settable per lock type.
>
> Converted locks are: mtx, sx, rwlocks.
>
> Trivial loops spinning as long as the lock owner is running are
> modified. The rest was left untouched in order to keep the patch
> simpler.
>
> The following locks were not touched: spin mutexes, thread locks and
> lockmgr. The first two could be, but I don't have any suitable
> benchmarks for them. lockmgr has support for spinning, but it is
> compiled out.
>
> In terms of results, this time around I got my hands on an 80-way
> machine (pig1 in the testing cluster). Vanilla kernel performance drops
> significantly with increased contention. For instance, the following is
> make -j 80 buildkernel of the 11.0 tree on a tmpfs:
> 7681.08s user 17040.02s system 7014% cpu 5:52.43 total
>
> And this is with the patched kernel:
> 4585.85s user 2365.95s system 5675% cpu 2:02.48 total
>
> Both results are reproducible.
>
> I would like to get this in as fast as possible.
>
> Note: lock_delay has a .cap parameter which is currently set to 0 and
> has no sysctl to control it. It is there for later, when it will replace
> the unmodified spinning code in sx and rwlocks.
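
For readers new to the technique, the following minimal userland sketch (illustrative only, not code from the patch; the names and constants are invented) shows the general shape of what is being added to the kernel's lock loops: after each failed acquisition attempt, wait a pseudo-random number of pause iterations below a bound that grows by a fixed step, so contending CPUs spread out their retries instead of all attempting the atomic operation at the same moment.

/*
 * Standalone illustration of randomized backoff on a test-and-set lock.
 * The DELAY_* constants play the role of the patch's lock_delay_config
 * fields; rand() and the x86 PAUSE hint are stand-ins for cpu_ticks()
 * and cpu_spinwait().  This is a sketch, not the kernel code itself.
 */
#include <stdatomic.h>
#include <stdlib.h>

#define DELAY_INITIAL   200
#define DELAY_STEP      100
#define DELAY_MAX       2000

static atomic_flag lock_bit = ATOMIC_FLAG_INIT;

static inline void
cpu_relax(void)
{
    /* x86 PAUSE hint; substitute the architecture's equivalent. */
    __asm__ __volatile__("pause");
}

static void
backoff_lock(void)
{
    unsigned int bound, i, spins;

    bound = DELAY_INITIAL;
    while (atomic_flag_test_and_set_explicit(&lock_bit,
        memory_order_acquire)) {
        /* Wait a random number of iterations below the current bound. */
        spins = (unsigned int)rand() % bound;
        for (i = 0; i < spins; i++)
            cpu_relax();
        /* Back off further after every failed attempt, up to a cap. */
        if (bound + DELAY_STEP <= DELAY_MAX)
            bound += DELAY_STEP;
    }
}

static void
backoff_unlock(void)
{
    atomic_flag_clear_explicit(&lock_bit, memory_order_release);
}

The patch below applies the same idea inside __mtx_lock_sleep() and the rwlock/sx spin loops, with the bounds supplied by a per-lock-class lock_delay_config.
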
>
> diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
> index 453add4..ca9319c 100644
> --- a/sys/kern/kern_mutex.c
> +++ b/sys/kern/kern_mutex.c
> @@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/resourcevar.h>
>  #include <sys/sched.h>
>  #include <sys/sbuf.h>
> +#include <sys/smp.h>
>  #include <sys/sysctl.h>
>  #include <sys/turnstile.h>
>  #include <sys/vmmeter.h>
> @@ -138,6 +139,37 @@ struct lock_class lock_class_mtx_spin = {
>  #endif
>  };
>
> +#ifdef ADAPTIVE_MUTEXES
> +static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
> +
> +static struct lock_delay_config mtx_delay = {
> +    .initial = 1000,
> +    .step = 500,
> +    .cap = 0,
> +    .min = 100,
> +    .max = 5000,
> +};
> +
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial,
> +    0, "");
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
> +    0, "");
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
> +    0, "");
> +SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
> +    0, "");
> +
> +static void
> +mtx_delay_sysinit(void *dummy)
> +{
> +
> +    mtx_delay.initial = mp_ncpus * 25;
> +    mtx_delay.min = mp_ncpus * 5;
> +    mtx_delay.max = mp_ncpus * 25 * 10;
> +}
> +LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
> +#endif
> +
>  /*
>   * System-wide mutexes
>   */
> @@ -408,8 +440,11 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
>      int contested = 0;
>      uint64_t waittime = 0;
>  #endif
> -#ifdef KDTRACE_HOOKS
> +#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
>      uint64_t spin_cnt = 0;
> +    uint64_t delay = 0;
> +#endif
> +#ifdef KDTRACE_HOOKS
>      uint64_t sleep_cnt = 0;
>      int64_t sleep_time = 0;
>      int64_t all_time = 0;
> @@ -471,12 +506,8 @@ __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
>          "spinning", "lockname:\"%s\"",
>          m->lock_object.lo_name);
>      while (mtx_owner(m) == owner &&
> -        TD_IS_RUNNING(owner)) {
> -        cpu_spinwait();
> -#ifdef KDTRACE_HOOKS
> -        spin_cnt++;
> -#endif
> -    }
> +        TD_IS_RUNNING(owner))
> +        lock_delay(&delay, &mtx_delay, &spin_cnt);
>      KTR_STATE0(KTR_SCHED, "thread",
>          sched_tdname((struct thread *)tid),
>          "running");
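
As a concrete example of the autotuning: on the 80-way machine mentioned in the submission, assuming mp_ncpus is 80, mtx_delay_sysinit() ends up with .initial = 80 * 25 = 2000, .min = 80 * 5 = 400 and .max = 80 * 25 * 10 = 20000 spin iterations, replacing the static defaults of 1000/100/5000, while .step is not autotuned and stays at 500. The rwlock and sx variants below compute the same values.
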
> diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
> index 496738d..cd93520 100644
> --- a/sys/kern/kern_rwlock.c
> +++ b/sys/kern/kern_rwlock.c
> @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/proc.h>
>  #include <sys/rwlock.h>
>  #include <sys/sched.h>
> +#include <sys/smp.h>
>  #include <sys/sysctl.h>
>  #include <sys/systm.h>
>  #include <sys/turnstile.h>
> @@ -65,15 +66,6 @@ PMC_SOFT_DECLARE( , , lock, failed);
>   */
>  #define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock))
>
> -#ifdef ADAPTIVE_RWLOCKS
> -static int rowner_retries = 10;
> -static int rowner_loops = 10000;
> -static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
> -    "rwlock debugging");
> -SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
> -SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
> -#endif
> -
>  #ifdef DDB
>  #include <ddb/ddb.h>
>
> @@ -100,6 +92,42 @@ struct lock_class lock_class_rw = {
>  #endif
>  };
>
> +#ifdef ADAPTIVE_RWLOCKS
> +static int rowner_retries = 10;
> +static int rowner_loops = 10000;
> +static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
> +    "rwlock debugging");
> +SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
> +SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
> +
> +static struct lock_delay_config rw_delay = {
> +    .initial = 1000,
> +    .step = 500,
> +    .cap = 0,
> +    .min = 100,
> +    .max = 5000,
> +};
> +
> +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_initial, CTLFLAG_RW, &rw_delay.initial,
> +    0, "");
> +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_step, CTLFLAG_RW, &rw_delay.step,
> +    0, "");
> +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_min, CTLFLAG_RW, &rw_delay.min,
> +    0, "");
> +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max,
> +    0, "");
> +
> +static void
> +rw_delay_sysinit(void *dummy)
> +{
> +
> +    rw_delay.initial = mp_ncpus * 25;
> +    rw_delay.min = mp_ncpus * 5;
> +    rw_delay.max = mp_ncpus * 25 * 10;
> +}
> +LOCK_DELAY_SYSINIT(rw_delay_sysinit);
> +#endif
> +
>  /*
>   * Return a pointer to the owning thread if the lock is write-locked or
>   * NULL if the lock is unlocked or read-locked.
> @@ -355,9 +383,12 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line)
>      int contested = 0;
>  #endif
>      uintptr_t v;
> +#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
> +    uint64_t spin_cnt = 0;
> +    uint64_t delay = 0;
> +#endif
>  #ifdef KDTRACE_HOOKS
>      uintptr_t state;
> -    uint64_t spin_cnt = 0;
>      uint64_t sleep_cnt = 0;
>      int64_t sleep_time = 0;
>      int64_t all_time = 0;
> @@ -437,12 +468,8 @@ __rw_rlock(volatile uintptr_t *c, const char *file, int line)
>          sched_tdname(curthread), "spinning",
>          "lockname:\"%s\"", rw->lock_object.lo_name);
>      while ((struct thread*)RW_OWNER(rw->rw_lock) ==
> -        owner && TD_IS_RUNNING(owner)) {
> -        cpu_spinwait();
> -#ifdef KDTRACE_HOOKS
> -        spin_cnt++;
> -#endif
> -    }
> +        owner && TD_IS_RUNNING(owner))
> +        lock_delay(&delay, &rw_delay, &spin_cnt);
>      KTR_STATE0(KTR_SCHED, "thread",
>          sched_tdname(curthread), "running");
>      continue;
> @@ -740,9 +767,12 @@ __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
>      uint64_t waittime = 0;
>      int contested = 0;
>  #endif
> +#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
> +    uint64_t spin_cnt = 0;
> +    uint64_t delay = 0;
> +#endif
>  #ifdef KDTRACE_HOOKS
>      uintptr_t state;
> -    uint64_t spin_cnt = 0;
>      uint64_t sleep_cnt = 0;
>      int64_t sleep_time = 0;
>      int64_t all_time = 0;
> @@ -798,12 +828,8 @@ __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
>          "spinning", "lockname:\"%s\"",
>          rw->lock_object.lo_name);
>      while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
> -        TD_IS_RUNNING(owner)) {
> -        cpu_spinwait();
> -#ifdef KDTRACE_HOOKS
> -        spin_cnt++;
> -#endif
> -    }
> +        TD_IS_RUNNING(owner))
> +        lock_delay(&delay, &rw_delay, &spin_cnt);
>      KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
>          "running");
>      continue;
> diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
> index a84df52..9a3465f 100644
> --- a/sys/kern/kern_sx.c
> +++ b/sys/kern/kern_sx.c
> @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/sched.h>
>  #include <sys/sleepqueue.h>
>  #include <sys/sx.h>
> +#include <sys/smp.h>
>  #include <sys/sysctl.h>
>
>  #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
> @@ -145,6 +146,33 @@ static u_int asx_loops = 10000;
>  static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
>  SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
>  SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
> +
> +static struct lock_delay_config sx_delay = {
> +    .initial = 1000,
> +    .step = 500,
> +    .cap = 0,
> +    .min = 100,
> +    .max = 5000,
> +};
> +
> +SYSCTL_INT(_debug_sx, OID_AUTO, delay_initial, CTLFLAG_RW, &sx_delay.initial,
> +    0, "");
> +SYSCTL_INT(_debug_sx, OID_AUTO, delay_step, CTLFLAG_RW, &sx_delay.step,
> +    0, "");
> +SYSCTL_INT(_debug_sx, OID_AUTO, delay_min, CTLFLAG_RW, &sx_delay.min,
> +    0, "");
> +SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max,
> +    0, "");
> +
> +static void
> +sx_delay_sysinit(void *dummy)
> +{
> +
> +    sx_delay.initial = mp_ncpus * 25;
> +    sx_delay.min = mp_ncpus * 5;
> +    sx_delay.max = mp_ncpus * 25 * 10;
> +}
> +LOCK_DELAY_SYSINIT(sx_delay_sysinit);
>  #endif
>
>  void
> @@ -513,9 +541,12 @@ _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
>      int contested = 0;
>  #endif
>      int error = 0;
> +#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
> +    uint64_t spin_cnt = 0;
> +    uint64_t delay = 0;
> +#endif
>  #ifdef KDTRACE_HOOKS
>      uintptr_t state;
> -    uint64_t spin_cnt = 0;
>      uint64_t sleep_cnt = 0;
>      int64_t sleep_time = 0;
>      int64_t all_time = 0;
> @@ -578,12 +609,8 @@ _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
>          sx->lock_object.lo_name);
>      GIANT_SAVE();
>      while (SX_OWNER(sx->sx_lock) == x &&
> -        TD_IS_RUNNING(owner)) {
> -        cpu_spinwait();
> -#ifdef KDTRACE_HOOKS
> -        spin_cnt++;
> -#endif
> -    }
> +        TD_IS_RUNNING(owner))
> +        lock_delay(&delay, &sx_delay, &spin_cnt);
>      KTR_STATE0(KTR_SCHED, "thread",
>          sched_tdname(curthread), "running");
>      continue;
> @@ -818,9 +845,12 @@ _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
>  #endif
>      uintptr_t x;
>      int error = 0;
> +#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS)
> +    uint64_t spin_cnt = 0;
> +    uint64_t delay = 0;
> +#endif

IMO it would be more maintainable to pack the local parameters for spin
iteration into some opaque structure.  You would also provide a function
to initialize the local structure, and pass it to lock_delay() together
with &lock_delay_config.
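
For what it is worth, one possible shape of the suggested opaque structure might look like the sketch below. The names lock_delay_arg and lock_delay_arg_init are hypothetical, not part of the posted patch; struct lock_delay_config is copied from the sys/lock.h hunk further down.

/*
 * Hypothetical packaging of the per-acquisition spin state, as suggested
 * above.  Illustrative only; only lock_delay_config comes from the patch.
 */
#include <sys/types.h>
#include <stdint.h>

struct lock_delay_config {
    u_int initial;
    u_int step;
    u_int cap;
    u_int min;
    u_int max;
};

struct lock_delay_arg {
    struct lock_delay_config *config;   /* per-lock-class tuning */
    uint64_t delay;                     /* current backoff bound */
    uint64_t spin_cnt;                  /* total spins done so far */
};

static inline void
lock_delay_arg_init(struct lock_delay_arg *la, struct lock_delay_config *lc)
{

    la->config = lc;
    la->delay = 0;
    la->spin_cnt = 0;
}

A caller would then call lock_delay_arg_init(&lda, &mtx_delay) once before its spin loop and pass only &lda to lock_delay() inside it, which also keeps the KDTRACE_HOOKS bookkeeping in one place.
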
>  #ifdef KDTRACE_HOOKS
>      uintptr_t state;
> -    uint64_t spin_cnt = 0;
>      uint64_t sleep_cnt = 0;
>      int64_t sleep_time = 0;
>      int64_t all_time = 0;
> @@ -888,12 +918,8 @@ _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
>          "lockname:\"%s\"", sx->lock_object.lo_name);
>      GIANT_SAVE();
>      while (SX_OWNER(sx->sx_lock) == x &&
> -        TD_IS_RUNNING(owner)) {
> -        cpu_spinwait();
> -#ifdef KDTRACE_HOOKS
> -        spin_cnt++;
> -#endif
> -    }
> +        TD_IS_RUNNING(owner))
> +        lock_delay(&delay, &sx_delay, &spin_cnt);
>      KTR_STATE0(KTR_SCHED, "thread",
>          sched_tdname(curthread), "running");
>      continue;
> diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c
> index e78d5a9..546881f 100644
> --- a/sys/kern/subr_lock.c
> +++ b/sys/kern/subr_lock.c
> @@ -103,6 +103,37 @@ lock_destroy(struct lock_object *lock)
>      lock->lo_flags &= ~LO_INITIALIZED;
>  }
>
> +uint64_t
> +lock_delay(uint64_t *delayp, struct lock_delay_config *l, uint64_t *spin_cntp)
> +{
> +    uint64_t i, delay, backoff, min, max, cap;

I am just curious there: do these locals indeed help?  Why not use the
l->xxx members directly?

> +
> +    delay = *delayp;
> +
> +    if (delay == 0)
> +        delay = l->initial;
> +    else {
> +        delay += l->step;
> +        max = l->max;
> +        if (delay > max)
> +            delay = max;
> +    }
> +
> +    backoff = cpu_ticks() % delay;

Did you test this on any 32bit architecture?  A 64bit division done on each
spin loop iteration there might give a much larger delay than intended, and
heat the core.  I.e., it would be useful to ensure that there is no
regression on small 32bit machines.
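
For reference, one way to avoid the 64-bit modulo in the spin path (purely illustrative, not something the posted patch does) is to keep the backoff bound in a 32-bit variable and draw the random spin count from a cheap 32-bit generator such as xorshift32:

#include <stdint.h>

/*
 * Illustrative 32-bit alternative to "cpu_ticks() % delay": one xorshift32
 * step per call and a 32-bit remainder, so no 64-bit division runs in the
 * spin loop.  The seed must start non-zero (e.g. from a per-CPU counter)
 * and bound is assumed to be non-zero, as lock_delay guarantees.
 */
static inline uint32_t
backoff_random(uint32_t *seedp, uint32_t bound)
{
    uint32_t x;

    x = *seedp;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    *seedp = x;
    return (x % bound);
}

A power-of-two bound would additionally allow replacing the remainder with a mask, removing the division altogether.
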
> +    min = l->min;
> +    if (backoff < min)
> +        backoff = min;
> +    cap = l->cap;
> +    if (cap > 0 && backoff > cap - *spin_cntp)
> +        backoff = cap - *spin_cntp;
> +    for (i = 0; i < backoff; i++)
> +        cpu_spinwait();
> +
> +    *delayp = delay;
> +    *spin_cntp += backoff;
> +    return (cap - *spin_cntp);

What is the purpose of this calculation?  All callers seem to ignore the
return value.

> +}
> +
>  #ifdef DDB
>  DB_SHOW_COMMAND(lock, db_show_lock)
>  {
> diff --git a/sys/sys/lock.h b/sys/sys/lock.h
> index 8d7a068..ffac5ba 100644
> --- a/sys/sys/lock.h
> +++ b/sys/sys/lock.h
> @@ -201,9 +201,23 @@ extern struct lock_class lock_class_lockmgr;
>
>  extern struct lock_class *lock_classes[];
>
> +extern int lock_delay_enabled;
> +
> +struct lock_delay_config {
> +    u_int initial;
> +    u_int step;
> +    u_int cap;
> +    u_int min;
> +    u_int max;
> +};
> +
> +#define LOCK_DELAY_SYSINIT(func) \
> +    SYSINIT(func##_ld, SI_SUB_LOCK, SI_ORDER_ANY, func, NULL)
> +
>  void lock_init(struct lock_object *, struct lock_class *,
>      const char *, const char *, int);
>  void lock_destroy(struct lock_object *);
> +uint64_t lock_delay(uint64_t *, struct lock_delay_config *, uint64_t *);
>  void spinlock_enter(void);
>  void spinlock_exit(void);
>  void witness_init(struct lock_object *, const char *);
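
On the unused return value: per the submission note, .cap seems intended for a later change that replaces the remaining bounded-spin code, in which case a caller could consume the returned remaining budget roughly as in the sketch below. This is illustrative only; it reuses identifiers from the quoted mutex code and presumes a config with cap > 0.

/*
 * Hypothetical bounded-spin caller: stop spinning on a running owner once
 * the per-acquisition budget (lock_delay_config.cap) is exhausted.  With
 * cap == 0, as in the posted patch, the return value never reaches zero
 * and the loop behaves like the unbounded form.
 */
uint64_t delay = 0, spin_cnt = 0;

while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) {
    if (lock_delay(&delay, &mtx_delay, &spin_cnt) == 0)
        break;  /* budget exhausted; fall back to blocking */
}
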