Date: Sun, 04 Jan 2015 13:15:13 +0100 From: Hans Petter Selasky <hps@selasky.org> To: Ivan Klymenko <fidaj@ukr.net> Cc: FreeBSD Current <freebsd-current@freebsd.org>, freebsd-arch@freebsd.org Subject: Re: [RFC] kern/kern_timeout.c rewrite in progress Message-ID: <54A92ED1.2070906@selasky.org> In-Reply-To: <54A53F4F.2000003@selasky.org> References: <54A1B38C.1000709@selasky.org> <20150101005613.4f788b0c@nonamehost.local> <54A49CA5.2060801@selasky.org> <54A4A002.8010802@selasky.org> <54A53F4F.2000003@selasky.org>
next in thread | previous in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format. --------------070009030809080804000507 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 7bit Hi, Please find attached an updated timeout patch which also updates clients in the kernel area to use the callout API properly, like cv_timedwait(). Previously there was some custom sleepqueue code in the callout subsystem. All of that has now been removed and we allow callouts to be protected by spinlocks. This allows us to tear down the callback like done with regular mutexes, and a "td_slpmutex" has been added to "struct thread" to atomically teardown the "td_slpcallout". Further the "TDF_TIMOFAIL" and "SWT_SLEEPQTIMO" states can now be completely removed. Summary of changes: 1) Make consistent callout API which also supports spinlocks for the callback function. This has been done to allow atomic callout stop of "td_slpcallout" without the need of many kernel threading quirks. 2) It is not allowed to migrate CPU if the timeout is restarted while the timeout callback is executing. Callouts must be stopped before CPU migration is allowed. Optionally drained. 3) Shared lock support has been removed, because it prevents atomic stop of the callback function. 4) A new API to drain callouts asynchronously has been added, called "callout_drain_async()". Please test and report any errors! Patch applies to FreeBSD-11-current as of today. Thank you! --HPS --------------070009030809080804000507 Content-Type: text/x-patch; name="kern_timeout_wip_v2.diff" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="kern_timeout_wip_v2.diff" Index: sys/ofed/include/linux/completion.h =================================================================== --- sys/ofed/include/linux/completion.h (revision 276531) +++ sys/ofed/include/linux/completion.h (working copy) @@ -105,7 +105,9 @@ if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); + sleepq_release(c); sleepq_set_timeout(c, end - ticks); + sleepq_lock(c); if (flags & SLEEPQ_INTERRUPTIBLE) { if (sleepq_timedwait_sig(c, 0) != 0) return (-ERESTARTSYS); Index: sys/kern/init_main.c =================================================================== --- sys/kern/init_main.c (revision 276531) +++ sys/kern/init_main.c (working copy) @@ -504,7 +504,8 @@ callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); - callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); + mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN); + callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0); /* Create credentials. */ p->p_ucred = crget(); Index: sys/kern/kern_condvar.c =================================================================== --- sys/kern/kern_condvar.c (revision 276531) +++ sys/kern/kern_condvar.c (working copy) @@ -313,15 +313,13 @@ DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); + sleepq_release(cvp); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { - if (class->lc_flags & LC_SLEEPABLE) - sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); - if (class->lc_flags & LC_SLEEPABLE) - sleepq_lock(cvp); } + sleepq_lock(cvp); rval = sleepq_timedwait(cvp, 0); #ifdef KTRACE @@ -383,15 +381,13 @@ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); + sleepq_release(cvp); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { - if (class->lc_flags & LC_SLEEPABLE) - sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); - if (class->lc_flags & LC_SLEEPABLE) - sleepq_lock(cvp); } + sleepq_lock(cvp); rval = sleepq_timedwait_sig(cvp, 0); #ifdef KTRACE Index: sys/kern/kern_lock.c =================================================================== --- sys/kern/kern_lock.c (revision 276531) +++ sys/kern/kern_lock.c (working copy) @@ -210,9 +210,11 @@ GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); - if ((flags & LK_TIMELOCK) && timo) + if ((flags & LK_TIMELOCK) && timo) { + sleepq_release(&lk->lock_object); sleepq_set_timeout(&lk->lock_object, timo); - + sleepq_lock(&lk->lock_object); + } /* * Decisional switch for real sleeping. */ Index: sys/kern/kern_switch.c =================================================================== --- sys/kern/kern_switch.c (revision 276531) +++ sys/kern/kern_switch.c (working copy) @@ -93,8 +93,6 @@ &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), ""); SCHED_STAT_DEFINE_VAR(sleepq, &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), ""); -SCHED_STAT_DEFINE_VAR(sleepqtimo, - &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), ""); SCHED_STAT_DEFINE_VAR(relinquish, &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), ""); SCHED_STAT_DEFINE_VAR(needresched, Index: sys/kern/kern_synch.c =================================================================== --- sys/kern/kern_synch.c (revision 276531) +++ sys/kern/kern_synch.c (working copy) @@ -236,13 +236,17 @@ * return from cursig(). */ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); - if (sbt != 0) - sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); + if (sbt != 0) + sleepq_set_timeout_sbt(ident, sbt, pr, flags); sleepq_lock(ident); + } else if (sbt != 0) { + sleepq_release(ident); + sleepq_set_timeout_sbt(ident, sbt, pr, flags); + sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); @@ -306,8 +310,11 @@ * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); - if (sbt != 0) + if (sbt != 0) { + sleepq_release(ident); sleepq_set_timeout_sbt(ident, sbt, pr, flags); + sleepq_lock(ident); + } /* * Can't call ktrace with any spin locks held so it can lock the Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c (revision 276531) +++ sys/kern/kern_thread.c (working copy) @@ -149,6 +149,9 @@ audit_thread_alloc(td); #endif umtx_thread_alloc(td); + + mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN); + callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0); return (0); } @@ -162,6 +165,10 @@ td = (struct thread *)mem; + /* make sure to drain any use of the "td->td_slpcallout" */ + callout_drain(&td->td_slpcallout); + mtx_destroy(&td->td_slpmutex); + #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { @@ -544,7 +551,6 @@ LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); - callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } Index: sys/kern/kern_timeout.c =================================================================== --- sys/kern/kern_timeout.c (revision 276531) +++ sys/kern/kern_timeout.c (working copy) @@ -54,6 +54,8 @@ #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mutex.h> +#include <sys/rmlock.h> +#include <sys/rwlock.h> #include <sys/proc.h> #include <sys/sdt.h> #include <sys/sleepqueue.h> @@ -124,37 +126,216 @@ */ u_int callwheelsize, callwheelmask; +typedef void callout_mutex_op_t(struct lock_object *); +typedef int callout_owned_op_t(struct lock_object *); + +struct callout_mutex_ops { + callout_mutex_op_t *lock; + callout_mutex_op_t *unlock; + callout_owned_op_t *owned; +}; + +enum { + CALLOUT_LC_UNUSED_0, + CALLOUT_LC_UNUSED_1, + CALLOUT_LC_UNUSED_2, + CALLOUT_LC_UNUSED_3, + CALLOUT_LC_SPIN, + CALLOUT_LC_MUTEX, + CALLOUT_LC_RW, + CALLOUT_LC_RM, +}; + +static void +callout_mutex_op_none(struct lock_object *lock) +{ +} + +static int +callout_owned_op_none(struct lock_object *lock) +{ + return (0); +} + +static void +callout_mutex_lock(struct lock_object *lock) +{ + mtx_lock((struct mtx *)lock); +} + +static void +callout_mutex_unlock(struct lock_object *lock) +{ + mtx_unlock((struct mtx *)lock); +} + +static void +callout_mutex_lock_spin(struct lock_object *lock) +{ + mtx_lock_spin((struct mtx *)lock); +} + +static void +callout_mutex_unlock_spin(struct lock_object *lock) +{ + mtx_unlock_spin((struct mtx *)lock); +} + +static int +callout_mutex_owned(struct lock_object *lock) +{ + return (mtx_owned((struct mtx *)lock)); +} + +static void +callout_rm_wlock(struct lock_object *lock) +{ + rm_wlock((struct rmlock *)lock); +} + +static void +callout_rm_wunlock(struct lock_object *lock) +{ + rm_wunlock((struct rmlock *)lock); +} + +static int +callout_rm_owned(struct lock_object *lock) +{ + return (rm_wowned((struct rmlock *)lock)); +} + +static void +callout_rw_wlock(struct lock_object *lock) +{ + rw_wlock((struct rwlock *)lock); +} + +static void +callout_rw_wunlock(struct lock_object *lock) +{ + rw_wunlock((struct rwlock *)lock); +} + +static int +callout_rw_owned(struct lock_object *lock) +{ + return (rw_wowned((struct rwlock *)lock)); +} + +static const struct callout_mutex_ops callout_mutex_ops[8] = { + [CALLOUT_LC_UNUSED_0] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_UNUSED_1] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_UNUSED_2] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_UNUSED_3] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_SPIN] = { + .lock = callout_mutex_lock_spin, + .unlock = callout_mutex_unlock_spin, + .owned = callout_mutex_owned, + }, + [CALLOUT_LC_MUTEX] = { + .lock = callout_mutex_lock, + .unlock = callout_mutex_unlock, + .owned = callout_mutex_owned, + }, + [CALLOUT_LC_RW] = { + .lock = callout_rw_wlock, + .unlock = callout_rw_wunlock, + .owned = callout_rw_owned, + }, + [CALLOUT_LC_RM] = { + .lock = callout_rm_wlock, + .unlock = callout_rm_wunlock, + .owned = callout_rm_owned, + }, +}; + +static void +callout_lock_client(int c_flags, struct lock_object *c_lock) +{ + callout_mutex_ops[CALLOUT_GET_LC(c_flags)].lock(c_lock); +} + +static void +callout_unlock_client(int c_flags, struct lock_object *c_lock) +{ + callout_mutex_ops[CALLOUT_GET_LC(c_flags)].unlock(c_lock); +} + +#ifdef SMP +static int +callout_lock_owned_client(int c_flags, struct lock_object *c_lock) +{ + return (callout_mutex_ops[CALLOUT_GET_LC(c_flags)].owned(c_lock)); +} +#endif + /* - * The callout cpu exec entities represent informations necessary for - * describing the state of callouts currently running on the CPU and the ones - * necessary for migrating callouts to the new callout cpu. In particular, - * the first entry of the array cc_exec_entity holds informations for callout - * running in SWI thread context, while the second one holds informations - * for callout running directly from hardware interrupt context. - * The cached informations are very important for deferring migration when - * the migrating callout is already running. + * The callout CPU exec structure represent information necessary for + * describing the state of callouts currently running on the CPU and + * for handling deferred callout restarts. + * + * In particular, the first entry of the array cc_exec_entity holds + * information for callouts running from the SWI thread context, while + * the second one holds information for callouts running directly from + * the hardware interrupt context. */ struct cc_exec { - struct callout *cc_next; + /* + * The "cc_curr" points to the currently executing callout and + * is protected by the "cc_lock" spinlock. If no callback is + * currently executing it is equal to "NULL". + */ struct callout *cc_curr; -#ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - sbintime_t ce_migration_time; - sbintime_t ce_migration_prec; -#endif - bool cc_cancel; - bool cc_waiting; + /* + * The "cc_restart_args" structure holds the argument for a + * deferred callback restart and is protected by the "cc_lock" + * spinlock. The structure is only valid if "cc_restart" is + * "true". If "cc_restart" is "false" the information in the + * "cc_restart_args" structure shall be ignored. + */ + struct callout_args cc_restart_args; + bool cc_restart; + /* + * The "cc_cancel" variable allows the currently pending + * callback to be atomically cancelled. This field is write + * protected by the "cc_lock" spinlock. + */ + bool cc_cancel; + /* + * The "cc_drain_fn" points to a function which shall be + * called with the argument stored in "cc_drain_arg" when an + * asynchronous drain is performed. This field is write + * protected by the "cc_lock" spinlock. + */ + callout_func_t *cc_drain_fn; + void *cc_drain_arg; }; /* - * There is one struct callout_cpu per cpu, holding all relevant + * There is one "struct callout_cpu" per CPU, holding all relevant * state for the callout processing thread on the individual CPU. */ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; + struct callout *cc_exec_next_dir; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; @@ -166,27 +347,7 @@ char cc_ktr_event_name[20]; }; -#define cc_exec_curr cc_exec_entity[0].cc_curr -#define cc_exec_next cc_exec_entity[0].cc_next -#define cc_exec_cancel cc_exec_entity[0].cc_cancel -#define cc_exec_waiting cc_exec_entity[0].cc_waiting -#define cc_exec_curr_dir cc_exec_entity[1].cc_curr -#define cc_exec_next_dir cc_exec_entity[1].cc_next -#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel -#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting - #ifdef SMP -#define cc_migration_func cc_exec_entity[0].ce_migration_func -#define cc_migration_arg cc_exec_entity[0].ce_migration_arg -#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu -#define cc_migration_time cc_exec_entity[0].ce_migration_time -#define cc_migration_prec cc_exec_entity[0].ce_migration_prec -#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func -#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg -#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu -#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time -#define cc_migration_prec_dir cc_exec_entity[1].ce_migration_prec - struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) @@ -211,62 +372,11 @@ static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); -/** - * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is cc_curr. - * If cc_curr is non-NULL, threads waiting in - * callout_drain() will be woken up as soon as the - * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and cc_lock held - * guarantees that the current callout will not run. - * The softclock() function sets this to 0 before it - * drops callout_lock to acquire c_lock, and it calls - * the handler only if curr_cancelled is still 0 after - * cc_lock is successfully acquired. - * cc_waiting - If a thread is waiting in callout_drain(), then - * callout_wait is nonzero. Set only when - * cc_curr is non-NULL. - */ - /* - * Resets the execution entity tied to a specific callout cpu. + * Kernel low level callwheel initialization called from cpu0 during + * kernel startup: */ static void -cc_cce_cleanup(struct callout_cpu *cc, int direct) -{ - - cc->cc_exec_entity[direct].cc_curr = NULL; - cc->cc_exec_entity[direct].cc_next = NULL; - cc->cc_exec_entity[direct].cc_cancel = false; - cc->cc_exec_entity[direct].cc_waiting = false; -#ifdef SMP - cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; - cc->cc_exec_entity[direct].ce_migration_time = 0; - cc->cc_exec_entity[direct].ce_migration_prec = 0; - cc->cc_exec_entity[direct].ce_migration_func = NULL; - cc->cc_exec_entity[direct].ce_migration_arg = NULL; -#endif -} - -/* - * Checks if migration is requested by a specific callout cpu. - */ -static int -cc_cce_migrating(struct callout_cpu *cc, int direct) -{ - -#ifdef SMP - return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); -#else - return (0); -#endif -} - -/* - * Kernel low level callwheel initialization - * called on cpu0 during kernel startup. - */ -static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; @@ -324,8 +434,6 @@ LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; - for (i = 0; i < 2; i++) - cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */ @@ -333,42 +441,12 @@ for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); - c->c_flags = CALLOUT_LOCAL_ALLOC; + c->c_flags |= CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } -#ifdef SMP /* - * Switches the cpu tied to a specific callout. - * The function expects a locked incoming callout cpu and returns with - * locked outcoming callout cpu. - */ -static struct callout_cpu * -callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) -{ - struct callout_cpu *new_cc; - - MPASS(c != NULL && cc != NULL); - CC_LOCK_ASSERT(cc); - - /* - * Avoid interrupts and preemption firing after the callout cpu - * is blocked in order to avoid deadlocks as the new thread - * may be willing to acquire the callout cpu lock. - */ - c->c_cpu = CPUBLOCK; - spinlock_enter(); - CC_UNLOCK(cc); - new_cc = CC_CPU(new_cpu); - CC_LOCK(new_cc); - spinlock_exit(); - c->c_cpu = new_cpu; - return (new_cc); -} -#endif - -/* * Start standard softclock thread. */ static void @@ -444,9 +522,8 @@ #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); + CC_LOCK(cc); /* Compute the buckets of the last scan and present times. */ firstb = callout_hash(cc->cc_lastscan); @@ -549,7 +626,7 @@ avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); + CC_UNLOCK(cc); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. @@ -562,49 +639,55 @@ callout_lock(struct callout *c) { struct callout_cpu *cc; - int cpu; - - for (;;) { - cpu = c->c_cpu; -#ifdef SMP - if (cpu == CPUBLOCK) { - while (c->c_cpu == CPUBLOCK) - cpu_spinwait(); - continue; - } -#endif - cc = CC_CPU(cpu); - CC_LOCK(cc); - if (cpu == c->c_cpu) - break; - CC_UNLOCK(cc); - } + cc = CC_CPU(c->c_cpu); + CC_LOCK(cc); return (cc); } -static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, - sbintime_t sbt, sbintime_t precision, void (*func)(void *), - void *arg, int cpu, int flags) +static struct callout_cpu * +callout_cc_add_locked(struct callout *c, struct callout_cpu *cc, + struct callout_args *coa, bool can_swap_cpu) { +#ifndef NO_EVENTTIMERS + sbintime_t sbt; +#endif int bucket; CC_LOCK_ASSERT(cc); - if (sbt < cc->cc_lastscan) - sbt = cc->cc_lastscan; - c->c_arg = arg; - c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); - if (flags & C_DIRECT_EXEC) - c->c_flags |= CALLOUT_DIRECT; - c->c_flags &= ~CALLOUT_PROCESSED; - c->c_func = func; - c->c_time = sbt; - c->c_precision = precision; + + /* update flags before swapping locks, if any */ + c->c_flags &= ~(CALLOUT_PROCESSED | CALLOUT_DIRECT | CALLOUT_DEFRESTART); + if (coa->flags & C_DIRECT_EXEC) + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING | CALLOUT_DIRECT); + else + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + +#ifdef SMP + /* + * Check if we are changing the CPU on which the callback + * should be executed and if we have a lock protecting us: + */ + if (can_swap_cpu != false && coa->cpu != c->c_cpu && + callout_lock_owned_client(c->c_flags, c->c_lock) != 0) { + CC_UNLOCK(cc); + c->c_cpu = coa->cpu; + cc = callout_lock(c); + } +#endif + if (coa->time < cc->cc_lastscan) + coa->time = cc->cc_lastscan; + c->c_arg = coa->arg; + c->c_func = coa->func; + c->c_time = coa->time; + c->c_precision = coa->precision; + bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); + + /* Ensure we are first to be scanned, if called via a callback */ if (cc->cc_bucket == bucket) cc->cc_exec_next_dir = c; #ifndef NO_EVENTTIMERS @@ -617,9 +700,10 @@ sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; - cpu_new_callout(cpu, sbt, c->c_time); + cpu_new_callout(coa->cpu, sbt, c->c_time); } #endif + return (cc); } static void @@ -626,8 +710,6 @@ callout_cc_del(struct callout *c, struct callout_cpu *cc) { - if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0) - return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } @@ -639,20 +721,10 @@ #endif int direct) { - struct rm_priotracker tracker; - void (*c_func)(void *); + callout_func_t *c_func; void *c_arg; - struct lock_class *class; struct lock_object *c_lock; - uintptr_t lock_status; int c_flags; -#ifdef SMP - struct callout_cpu *new_cc; - void (*new_func)(void *); - void *new_arg; - int flags, new_cpu; - sbintime_t new_prec, new_time; -#endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; @@ -663,37 +735,43 @@ KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) == (CALLOUT_PENDING | CALLOUT_ACTIVE), ("softclock_call_cc: pend|act %p %x", c, c->c_flags)); - class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; - lock_status = 0; - if (c->c_flags & CALLOUT_SHAREDLOCK) { - if (class == &lock_class_rm) - lock_status = (uintptr_t)&tracker; - else - lock_status = 1; - } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_flags = c->c_flags; - if (c->c_flags & CALLOUT_LOCAL_ALLOC) - c->c_flags = CALLOUT_LOCAL_ALLOC; - else - c->c_flags &= ~CALLOUT_PENDING; + + /* remove pending bit */ + c->c_flags &= ~CALLOUT_PENDING; + + /* reset our local state */ cc->cc_exec_entity[direct].cc_curr = c; cc->cc_exec_entity[direct].cc_cancel = false; - CC_UNLOCK(cc); + cc->cc_exec_entity[direct].cc_restart = false; + cc->cc_exec_entity[direct].cc_drain_fn = NULL; + cc->cc_exec_entity[direct].cc_drain_arg = NULL; + if (c_lock != NULL) { - class->lc_lock(c_lock, lock_status); + CC_UNLOCK(cc); + + /* unlocked region for switching locks */ + + callout_lock_client(c_flags, c_lock); + /* - * The callout may have been cancelled - * while we switched locks. + * Check if the callout may have been cancelled while + * we were switching locks. Even though the callout is + * specifying a lock, it might not be certain this + * lock is locked when starting and stopping callouts. */ + CC_LOCK(cc); if (cc->cc_exec_entity[direct].cc_cancel) { - class->lc_unlock(c_lock); - goto skip; + callout_unlock_client(c_flags, c_lock); + goto skip_cc_locked; } - /* The callout cannot be stopped now. */ + /* The callout cannot be stopped now! */ cc->cc_exec_entity[direct].cc_cancel = true; + CC_UNLOCK(cc); + if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; @@ -708,6 +786,8 @@ c, c_func, c_arg); } } else { + CC_UNLOCK(cc); + /* unlocked region */ #ifdef CALLOUT_PROFILING (*mpcalls)++; #endif @@ -740,85 +820,40 @@ #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); + + /* + * At this point the callback structure might have been freed, + * so we need to check the previously copied value of + * "c->c_flags": + */ if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0) - class->lc_unlock(c_lock); -skip: + callout_unlock_client(c_flags, c_lock); + CC_LOCK(cc); + +skip_cc_locked: KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); cc->cc_exec_entity[direct].cc_curr = NULL; - if (cc->cc_exec_entity[direct].cc_waiting) { + + /* Check if there is anything which needs draining */ + if (cc->cc_exec_entity[direct].cc_drain_fn != NULL) { /* - * There is someone waiting for the - * callout to complete. - * If the callout was scheduled for - * migration just cancel it. + * Unlock the CPU callout last, so that any use of + * structures belonging to the callout are complete: */ - if (cc_cce_migrating(cc, direct)) { - cc_cce_cleanup(cc, direct); - - /* - * It should be assert here that the callout is not - * destroyed but that is not easy. - */ - c->c_flags &= ~CALLOUT_DFRMIGRATION; - } - cc->cc_exec_entity[direct].cc_waiting = false; CC_UNLOCK(cc); - wakeup(&cc->cc_exec_entity[direct].cc_waiting); + /* call drain function unlocked */ + cc->cc_exec_entity[direct].cc_drain_fn( + cc->cc_exec_entity[direct].cc_drain_arg); CC_LOCK(cc); - } else if (cc_cce_migrating(cc, direct)) { - KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, - ("Migrating legacy callout %p", c)); -#ifdef SMP - /* - * If the callout was scheduled for - * migration just perform it now. - */ - new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; - new_time = cc->cc_exec_entity[direct].ce_migration_time; - new_prec = cc->cc_exec_entity[direct].ce_migration_prec; - new_func = cc->cc_exec_entity[direct].ce_migration_func; - new_arg = cc->cc_exec_entity[direct].ce_migration_arg; - cc_cce_cleanup(cc, direct); - - /* - * It should be assert here that the callout is not destroyed - * but that is not easy. - * - * As first thing, handle deferred callout stops. - */ - if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) { - CTR3(KTR_CALLOUT, - "deferred cancelled %p func %p arg %p", - c, new_func, new_arg); - callout_cc_del(c, cc); - return; - } - c->c_flags &= ~CALLOUT_DFRMIGRATION; - - new_cc = callout_cpu_switch(c, cc, new_cpu); - flags = (direct) ? C_DIRECT_EXEC : 0; - callout_cc_add(c, new_cc, new_time, new_prec, new_func, - new_arg, new_cpu, flags); - CC_UNLOCK(new_cc); - CC_LOCK(cc); -#else - panic("migration should not happen"); -#endif + } else if (c_flags & CALLOUT_LOCAL_ALLOC) { + /* return callout back to freelist */ + callout_cc_del(c, cc); + } else if (cc->cc_exec_entity[direct].cc_restart) { + /* [re-]schedule callout, if any */ + cc = callout_cc_add_locked(c, cc, + &cc->cc_exec_entity[direct].cc_restart_args, false); } - /* - * If the current callout is locally allocated (from - * timeout(9)) then put it on the freelist. - * - * Note: we need to check the cached copy of c_flags because - * if it was not local, then it's not safe to deref the - * callout pointer. - */ - KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 || - c->c_flags == CALLOUT_LOCAL_ALLOC, - ("corrupted callout")); - if (c_flags & CALLOUT_LOCAL_ALLOC) - callout_cc_del(c, cc); } /* @@ -899,10 +934,11 @@ /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); - callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); + callout_reset(new, to_ticks, ftn, arg); + return (handle); } @@ -910,6 +946,7 @@ untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; + bool match; /* * Check for a handle that was initialized @@ -920,9 +957,11 @@ return; cc = callout_lock(handle.callout); - if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) + match = (handle.callout->c_func == ftn && handle.callout->c_arg == arg); + CC_UNLOCK(cc); + + if (match) callout_stop(handle.callout); - CC_UNLOCK(cc); } void @@ -931,6 +970,119 @@ handle->callout = NULL; } +static int +callout_restart_async(struct callout *c, struct callout_args *coa, + callout_func_t *drain_fn, void *drain_arg) +{ + struct callout_cpu *cc; + int cancelled; + int direct; + + cc = callout_lock(c); + + /* Figure out if the callout is direct or not */ + direct = ((c->c_flags & CALLOUT_DIRECT) != 0); + + /* + * Check if the callback is currently scheduled for + * completion: + */ + if (cc->cc_exec_entity[direct].cc_curr == c) { + /* + * Try to prevent the callback from running by setting + * the "cc_cancel" variable to "true". Also check if + * the callout was previously subject to a deferred + * callout restart: + */ + if (cc->cc_exec_entity[direct].cc_cancel == false || + (c->c_flags & CALLOUT_DEFRESTART) != 0) { + cc->cc_exec_entity[direct].cc_cancel = true; + cancelled = 1; + } else { + cancelled = 0; + } + + /* + * Prevent callback restart if "callout_drain_xxx()" + * is being called or we are stopping the callout or + * the callback was preallocated by us: + */ + if (cc->cc_exec_entity[direct].cc_drain_fn != NULL || + coa == NULL || (c->c_flags & CALLOUT_LOCAL_ALLOC) != 0) { + CTR4(KTR_CALLOUT, "%s %p func %p arg %p", + cancelled ? "cancelled and draining" : "draining", + c, c->c_func, c->c_arg); + + /* clear old flags, if any */ + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING | + CALLOUT_DEFRESTART | CALLOUT_PROCESSED); + + /* clear restart flag, if any */ + cc->cc_exec_entity[direct].cc_restart = false; + + /* set drain function, if any */ + if (drain_fn != NULL) { + cc->cc_exec_entity[direct].cc_drain_fn = drain_fn; + cc->cc_exec_entity[direct].cc_drain_arg = drain_arg; + cancelled |= 2; /* XXX define the value */ + } + } else { + CTR4(KTR_CALLOUT, "%s %p func %p arg %p", + cancelled ? "cancelled and restarting" : "restarting", + c, c->c_func, c->c_arg); + + /* get us back into the game */ + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING | + CALLOUT_DEFRESTART); + c->c_flags &= ~CALLOUT_PROCESSED; + + /* enable deferred restart */ + cc->cc_exec_entity[direct].cc_restart = true; + + /* store arguments for the deferred restart, if any */ + cc->cc_exec_entity[direct].cc_restart_args = *coa; + } + } else { + /* stop callout */ + if (c->c_flags & CALLOUT_PENDING) { + /* + * The callback has not yet been executed, and + * we simply just need to unlink it: + */ + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else { + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); + } + cancelled = 1; + } else { + cancelled = 0; + } + + /* [re-]schedule callout, if any */ + if (coa != NULL) { + cc = callout_cc_add_locked(c, cc, coa, true); + } else { + /* clear old flags, if any */ + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING | + CALLOUT_DEFRESTART | CALLOUT_PROCESSED); + + /* return callback to pre-allocated list, if any */ + if ((c->c_flags & CALLOUT_LOCAL_ALLOC) && cancelled != 0) { + callout_cc_del(c, cc); + } + } + + CTR4(KTR_CALLOUT, "%s %p func %p arg %p", + cancelled ? "rescheduled" : "scheduled", + c, c->c_func, c->c_arg); + } + CC_UNLOCK(cc); + return (cancelled); +} + /* * New interface; clients allocate their own callout structures. * @@ -949,25 +1101,32 @@ */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, - void (*ftn)(void *), void *arg, int cpu, int flags) + callout_func_t *ftn, void *arg, int cpu, int flags) { - sbintime_t to_sbt, pr; - struct callout_cpu *cc; - int cancelled, direct; + struct callout_args coa; - cancelled = 0; - if (flags & C_ABSOLUTE) { - to_sbt = sbt; + /* store arguments for callout add function */ + coa.func = ftn; + coa.arg = arg; + coa.precision = precision; + coa.flags = flags; + coa.cpu = cpu; + + /* compute the rest of the arguments needed */ + if (coa.flags & C_ABSOLUTE) { + coa.time = sbt; } else { - if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) + sbintime_t pr; + + if ((coa.flags & C_HARDCLOCK) && (sbt < tick_sbt)) sbt = tick_sbt; - if ((flags & C_HARDCLOCK) || + if ((coa.flags & C_HARDCLOCK) || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { - to_sbt = getsbinuptime(); + coa.time = getsbinuptime(); /* Add safety belt for the case of hz > 1000. */ - to_sbt += tc_tick_sbt - tick_sbt; + coa.time += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* @@ -977,101 +1136,29 @@ * active ones. */ #ifdef __LP64__ - to_sbt = DPCPU_GET(hardclocktime); + coa.time = DPCPU_GET(hardclocktime); #else spinlock_enter(); - to_sbt = DPCPU_GET(hardclocktime); + coa.time = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif - if ((flags & C_HARDCLOCK) == 0) - to_sbt += tick_sbt; + if ((coa.flags & C_HARDCLOCK) == 0) + coa.time += tick_sbt; } else - to_sbt = sbinuptime(); - if (SBT_MAX - to_sbt < sbt) - to_sbt = SBT_MAX; + coa.time = sbinuptime(); + if (SBT_MAX - coa.time < sbt) + coa.time = SBT_MAX; else - to_sbt += sbt; - pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : - sbt >> C_PRELGET(flags)); - if (pr > precision) - precision = pr; + coa.time += sbt; + pr = ((C_PRELGET(coa.flags) < 0) ? sbt >> tc_precexp : + sbt >> C_PRELGET(coa.flags)); + if (pr > coa.precision) + coa.precision = pr; } - /* - * Don't allow migration of pre-allocated callouts lest they - * become unbalanced. - */ - if (c->c_flags & CALLOUT_LOCAL_ALLOC) - cpu = c->c_cpu; - direct = (c->c_flags & CALLOUT_DIRECT) != 0; - KASSERT(!direct || c->c_lock == NULL, - ("%s: direct callout %p has lock", __func__, c)); - cc = callout_lock(c); - if (cc->cc_exec_entity[direct].cc_curr == c) { - /* - * We're being asked to reschedule a callout which is - * currently in progress. If there is a lock then we - * can cancel the callout if it has not really started. - */ - if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel) - cancelled = cc->cc_exec_entity[direct].cc_cancel = true; - if (cc->cc_exec_entity[direct].cc_waiting) { - /* - * Someone has called callout_drain to kill this - * callout. Don't reschedule. - */ - CTR4(KTR_CALLOUT, "%s %p func %p arg %p", - cancelled ? "cancelled" : "failed to cancel", - c, c->c_func, c->c_arg); - CC_UNLOCK(cc); - return (cancelled); - } - } - if (c->c_flags & CALLOUT_PENDING) { - if ((c->c_flags & CALLOUT_PROCESSED) == 0) { - if (cc->cc_exec_next_dir == c) - cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); - LIST_REMOVE(c, c_links.le); - } else - TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); - cancelled = 1; - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); - } -#ifdef SMP - /* - * If the callout must migrate try to perform it immediately. - * If the callout is currently running, just defer the migration - * to a more appropriate moment. - */ - if (c->c_cpu != cpu) { - if (cc->cc_exec_entity[direct].cc_curr == c) { - cc->cc_exec_entity[direct].ce_migration_cpu = cpu; - cc->cc_exec_entity[direct].ce_migration_time - = to_sbt; - cc->cc_exec_entity[direct].ce_migration_prec - = precision; - cc->cc_exec_entity[direct].ce_migration_func = ftn; - cc->cc_exec_entity[direct].ce_migration_arg = arg; - c->c_flags |= CALLOUT_DFRMIGRATION; - CTR6(KTR_CALLOUT, - "migration of %p func %p arg %p in %d.%08x to %u deferred", - c, c->c_func, c->c_arg, (int)(to_sbt >> 32), - (u_int)(to_sbt & 0xffffffff), cpu); - CC_UNLOCK(cc); - return (cancelled); - } - cc = callout_cpu_switch(c, cc, cpu); - } -#endif - - callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); - CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", - cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), - (u_int)(to_sbt & 0xffffffff)); - CC_UNLOCK(cc); - - return (cancelled); + /* get callback started, if any */ + return (callout_restart_async(c, &coa, NULL, NULL)); } /* @@ -1090,189 +1177,79 @@ } int -_callout_stop_safe(struct callout *c, int safe) +callout_stop(struct callout *c) { - struct callout_cpu *cc, *old_cc; - struct lock_class *class; - int direct, sq_locked, use_lock; + /* get callback stopped, if any */ + return (callout_restart_async(c, NULL, NULL, NULL)); +} - /* - * Some old subsystems don't hold Giant while running a callout_stop(), - * so just discard this check for the moment. - */ - if (!safe && c->c_lock != NULL) { - if (c->c_lock == &Giant.lock_object) - use_lock = mtx_owned(&Giant); - else { - use_lock = 1; - class = LOCK_CLASS(c->c_lock); - class->lc_assert(c->c_lock, LA_XLOCKED); - } - } else - use_lock = 0; - direct = (c->c_flags & CALLOUT_DIRECT) != 0; - sq_locked = 0; - old_cc = NULL; -again: - cc = callout_lock(c); +static void +callout_drain_function(void *arg) +{ + wakeup(arg); +} - /* - * If the callout was migrating while the callout cpu lock was - * dropped, just drop the sleepqueue lock and check the states - * again. - */ - if (sq_locked != 0 && cc != old_cc) { -#ifdef SMP - CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); - sq_locked = 0; - old_cc = NULL; - goto again; -#else - panic("migration should not happen"); -#endif - } +int +callout_drain_async(struct callout *c, callout_func_t *fn, void *arg) +{ + /* get callback stopped, if any */ + return (callout_restart_async(c, NULL, fn, arg) & 2); +} - /* - * If the callout isn't pending, it's not on the queue, so - * don't attempt to remove it from the queue. We can try to - * stop it by other means however. - */ - if (!(c->c_flags & CALLOUT_PENDING)) { - c->c_flags &= ~CALLOUT_ACTIVE; +int +callout_drain(struct callout *c) +{ + int cancelled; + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "Draining callout"); + + callout_lock_client(c->c_flags, c->c_lock); + + /* at this point the "c->c_cpu" field is not changing */ + + cancelled = callout_drain_async(c, &callout_drain_function, c); + + if (cancelled != 0) { + struct callout_cpu *cc; + int direct; + + CTR3(KTR_CALLOUT, "need to drain %p func %p arg %p", + c, c->c_func, c->c_arg); + + cc = callout_lock(c); + direct = ((c->c_flags & CALLOUT_DIRECT) != 0); + /* - * If it wasn't on the queue and it isn't the current - * callout, then we can't stop it, so just bail. + * We've gotten our callout CPU lock, it is safe to + * drop the initial lock: */ - if (cc->cc_exec_entity[direct].cc_curr != c) { - CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", - c, c->c_func, c->c_arg); - CC_UNLOCK(cc); - if (sq_locked) - sleepq_release( - &cc->cc_exec_entity[direct].cc_waiting); - return (0); - } + callout_unlock_client(c->c_flags, c->c_lock); - if (safe) { - /* - * The current callout is running (or just - * about to run) and blocking is allowed, so - * just wait for the current invocation to - * finish. - */ - while (cc->cc_exec_entity[direct].cc_curr == c) { - /* - * Use direct calls to sleepqueue interface - * instead of cv/msleep in order to avoid - * a LOR between cc_lock and sleepqueue - * chain spinlocks. This piece of code - * emulates a msleep_spin() call actually. - * - * If we already have the sleepqueue chain - * locked, then we can safely block. If we - * don't already have it locked, however, - * we have to drop the cc_lock to lock - * it. This opens several races, so we - * restart at the beginning once we have - * both locks. If nothing has changed, then - * we will end up back here with sq_locked - * set. - */ - if (!sq_locked) { - CC_UNLOCK(cc); - sleepq_lock( - &cc->cc_exec_entity[direct].cc_waiting); - sq_locked = 1; - old_cc = cc; - goto again; - } + /* Wait for drain to complete */ - /* - * Migration could be cancelled here, but - * as long as it is still not sure when it - * will be packed up, just let softclock() - * take care of it. - */ - cc->cc_exec_entity[direct].cc_waiting = true; - DROP_GIANT(); - CC_UNLOCK(cc); - sleepq_add( - &cc->cc_exec_entity[direct].cc_waiting, - &cc->cc_lock.lock_object, "codrain", - SLEEPQ_SLEEP, 0); - sleepq_wait( - &cc->cc_exec_entity[direct].cc_waiting, - 0); - sq_locked = 0; - old_cc = NULL; + while (cc->cc_exec_entity[direct].cc_curr == c) + msleep_spin(c, (struct mtx *)&cc->cc_lock, "codrain", 0); - /* Reacquire locks previously released. */ - PICKUP_GIANT(); - CC_LOCK(cc); - } - } else if (use_lock && - !cc->cc_exec_entity[direct].cc_cancel) { - /* - * The current callout is waiting for its - * lock which we hold. Cancel the callout - * and return. After our caller drops the - * lock, the callout will be skipped in - * softclock(). - */ - cc->cc_exec_entity[direct].cc_cancel = true; - CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", - c, c->c_func, c->c_arg); - KASSERT(!cc_cce_migrating(cc, direct), - ("callout wrongly scheduled for migration")); - CC_UNLOCK(cc); - KASSERT(!sq_locked, ("sleepqueue chain locked")); - return (1); - } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) { - c->c_flags &= ~CALLOUT_DFRMIGRATION; - CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", - c, c->c_func, c->c_arg); - CC_UNLOCK(cc); - return (1); - } - CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", - c, c->c_func, c->c_arg); CC_UNLOCK(cc); - KASSERT(!sq_locked, ("sleepqueue chain still locked")); - return (0); + } else { + callout_unlock_client(c->c_flags, c->c_lock); } - if (sq_locked) - sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); - CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if ((c->c_flags & CALLOUT_PROCESSED) == 0) { - if (cc->cc_exec_next_dir == c) - cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); - LIST_REMOVE(c, c_links.le); - } else - TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); - callout_cc_del(c, cc); - CC_UNLOCK(cc); - return (1); + return (cancelled & 1); } void callout_init(struct callout *c, int mpsafe) { - bzero(c, sizeof *c); if (mpsafe) { - c->c_lock = NULL; - c->c_flags = CALLOUT_RETURNUNLOCKED; + _callout_init_lock(c, NULL, CALLOUT_RETURNUNLOCKED); } else { - c->c_lock = &Giant.lock_object; - c->c_flags = 0; + _callout_init_lock(c, &Giant.lock_object, 0); } - c->c_cpu = timeout_cpu; } void @@ -1279,15 +1256,26 @@ _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); + KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0, + ("callout_init_lock: bad flags 0x%08x", flags)); + flags &= CALLOUT_RETURNUNLOCKED; + if (lock != NULL) { + struct lock_class *class = LOCK_CLASS(lock); + if (class == &lock_class_mtx_sleep) + flags |= CALLOUT_SET_LC(CALLOUT_LC_MUTEX); + else if (class == &lock_class_mtx_spin) + flags |= CALLOUT_SET_LC(CALLOUT_LC_SPIN); + else if (class == &lock_class_rm) + flags |= CALLOUT_SET_LC(CALLOUT_LC_RM); + else if (class == &lock_class_rw) + flags |= CALLOUT_SET_LC(CALLOUT_LC_RW); + else + panic("callout_init_lock: Unsupported lock class '%s'\n", class->lc_name); + } else { + flags |= CALLOUT_SET_LC(CALLOUT_LC_UNUSED_0); + } c->c_lock = lock; - KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, - ("callout_init_lock: bad flags %d", flags)); - KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, - ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); - KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & - (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", - __func__)); - c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); + c->c_flags = flags; c->c_cpu = timeout_cpu; } Index: sys/kern/subr_sleepqueue.c =================================================================== --- sys/kern/subr_sleepqueue.c (revision 276531) +++ sys/kern/subr_sleepqueue.c (working copy) @@ -152,7 +152,8 @@ */ static int sleepq_catch_signals(void *wchan, int pri); static int sleepq_check_signals(void); -static int sleepq_check_timeout(void); +static int sleepq_check_timeout(struct thread *); +static void sleepq_stop_timeout(struct thread *); #ifdef INVARIANTS static void sleepq_dtor(void *mem, int size, void *arg); #endif @@ -373,17 +374,14 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr, int flags) { - struct sleepqueue_chain *sc; struct thread *td; td = curthread; - sc = SC_LOOKUP(wchan); - mtx_assert(&sc->sc_lock, MA_OWNED); - MPASS(TD_ON_SLEEPQ(td)); - MPASS(td->td_sleepqueue == NULL); - MPASS(wchan != NULL); + + mtx_lock_spin(&td->td_slpmutex); callout_reset_sbt_on(&td->td_slpcallout, sbt, pr, sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); + mtx_unlock_spin(&td->td_slpmutex); } /* @@ -559,11 +557,8 @@ * Check to see if we timed out. */ static int -sleepq_check_timeout(void) +sleepq_check_timeout(struct thread *td) { - struct thread *td; - - td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* @@ -573,28 +568,21 @@ td->td_flags &= ~TDF_TIMEOUT; return (EWOULDBLOCK); } - - /* - * If TDF_TIMOFAIL is set, the timeout ran after we had - * already been woken up. - */ - if (td->td_flags & TDF_TIMOFAIL) - td->td_flags &= ~TDF_TIMOFAIL; - - /* - * If callout_stop() fails, then the timeout is running on - * another CPU, so synchronize with it to avoid having it - * accidentally wake up a subsequent sleep. - */ - else if (callout_stop(&td->td_slpcallout) == 0) { - td->td_flags |= TDF_TIMEOUT; - TD_SET_SLEEPING(td); - mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL); - } return (0); } /* + * Atomically stop the timeout by using a mutex. + */ +static void +sleepq_stop_timeout(struct thread *td) +{ + mtx_lock_spin(&td->td_slpmutex); + callout_stop(&td->td_slpcallout); + mtx_unlock_spin(&td->td_slpmutex); +} + +/* * Check to see if we were awoken by a signal. */ static int @@ -664,9 +652,11 @@ MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); - rval = sleepq_check_timeout(); + rval = sleepq_check_timeout(td); thread_unlock(td); + sleepq_stop_timeout(td); + return (rval); } @@ -677,12 +667,18 @@ int sleepq_timedwait_sig(void *wchan, int pri) { + struct thread *td; int rcatch, rvalt, rvals; + td = curthread; + rcatch = sleepq_catch_signals(wchan, pri); - rvalt = sleepq_check_timeout(); + rvalt = sleepq_check_timeout(td); rvals = sleepq_check_signals(); - thread_unlock(curthread); + thread_unlock(td); + + sleepq_stop_timeout(td); + if (rcatch) return (rcatch); if (rvals) @@ -889,64 +885,49 @@ static void sleepq_timeout(void *arg) { - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - struct thread *td; - void *wchan; - int wakeup_swapper; + struct thread *td = arg; + int wakeup_swapper = 0; - td = arg; - wakeup_swapper = 0; CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); - /* - * First, see if the thread is asleep and get the wait channel if - * it is. - */ + /* Handle the three cases which can happen */ + thread_lock(td); - if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { - wchan = td->td_wchan; - sc = SC_LOOKUP(wchan); - THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); - sq = sleepq_lookup(wchan); - MPASS(sq != NULL); - td->td_flags |= TDF_TIMEOUT; - wakeup_swapper = sleepq_resume_thread(sq, td, 0); - thread_unlock(td); - if (wakeup_swapper) - kick_proc0(); - return; - } + if (TD_ON_SLEEPQ(td)) { + if (TD_IS_SLEEPING(td)) { + struct sleepqueue_chain *sc; + struct sleepqueue *sq; + void *wchan; - /* - * If the thread is on the SLEEPQ but isn't sleeping yet, it - * can either be on another CPU in between sleepq_add() and - * one of the sleepq_*wait*() routines or it can be in - * sleepq_catch_signals(). - */ - if (TD_ON_SLEEPQ(td)) { - td->td_flags |= TDF_TIMEOUT; - thread_unlock(td); - return; + /* + * Case I - thread is asleep and needs to be + * awoken: + */ + wchan = td->td_wchan; + sc = SC_LOOKUP(wchan); + THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); + sq = sleepq_lookup(wchan); + MPASS(sq != NULL); + td->td_flags |= TDF_TIMEOUT; + wakeup_swapper = sleepq_resume_thread(sq, td, 0); + } else { + /* + * Case II - cancel going to sleep by setting + * the timeout flag because the target thread + * is not asleep yet. It can be on another CPU + * in between sleepq_add() and one of the + * sleepq_*wait*() routines or it can be in + * sleepq_catch_signals(). + */ + td->td_flags |= TDF_TIMEOUT; + } + } else { + /* + * Case III - thread is already woken up by a wakeup + * call and should not timeout. Nothing to do! + */ } - - /* - * Now check for the edge cases. First, if TDF_TIMEOUT is set, - * then the other thread has already yielded to us, so clear - * the flag and resume it. If TDF_TIMEOUT is not set, then the - * we know that the other thread is not on a sleep queue, but it - * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL - * to let it know that the timeout has already run and doesn't - * need to be canceled. - */ - if (td->td_flags & TDF_TIMEOUT) { - MPASS(TD_IS_SLEEPING(td)); - td->td_flags &= ~TDF_TIMEOUT; - TD_CLR_SLEEPING(td); - wakeup_swapper = setrunnable(td); - } else - td->td_flags |= TDF_TIMOFAIL; thread_unlock(td); if (wakeup_swapper) kick_proc0(); Index: sys/sys/_callout.h =================================================================== --- sys/sys/_callout.h (revision 276531) +++ sys/sys/_callout.h (working copy) @@ -46,6 +46,17 @@ SLIST_HEAD(callout_slist, callout); TAILQ_HEAD(callout_tailq, callout); +typedef void callout_func_t(void *); + +struct callout_args { + sbintime_t time; /* absolute time for the event */ + sbintime_t precision; /* delta allowed wrt opt */ + void *arg; /* function argument */ + callout_func_t *func; /* function to call */ + int flags; /* flags passed to callout_reset() */ + int cpu; /* CPU we're scheduled on */ +}; + struct callout { union { LIST_ENTRY(callout) le; @@ -52,13 +63,13 @@ SLIST_ENTRY(callout) sle; TAILQ_ENTRY(callout) tqe; } c_links; - sbintime_t c_time; /* ticks to the event */ + sbintime_t c_time; /* absolute time for the event */ sbintime_t c_precision; /* delta allowed wrt opt */ void *c_arg; /* function argument */ - void (*c_func)(void *); /* function to call */ - struct lock_object *c_lock; /* lock to handle */ + callout_func_t *c_func; /* function to call */ + struct lock_object *c_lock; /* callback lock */ int c_flags; /* state of this entry */ - volatile int c_cpu; /* CPU we're scheduled on */ + int c_cpu; /* CPU we're scheduled on */ }; #endif Index: sys/sys/callout.h =================================================================== --- sys/sys/callout.h (revision 276531) +++ sys/sys/callout.h (working copy) @@ -45,10 +45,12 @@ #define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ #define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ #define CALLOUT_RETURNUNLOCKED 0x0010 /* handler returns with mtx unlocked */ -#define CALLOUT_SHAREDLOCK 0x0020 /* callout lock held in shared mode */ -#define CALLOUT_DFRMIGRATION 0x0040 /* callout in deferred migration mode */ +#define CALLOUT_UNUSED_5 0x0020 /* --available-- */ +#define CALLOUT_DEFRESTART 0x0040 /* callout restart is deferred */ #define CALLOUT_PROCESSED 0x0080 /* callout in wheel or processing list? */ #define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */ +#define CALLOUT_SET_LC(x) (((x) & 7) << 16) /* set lock class */ +#define CALLOUT_GET_LC(x) (((x) >> 16) & 7) /* get lock class */ #define C_DIRECT_EXEC 0x0001 /* direct execution of callout */ #define C_PRELBITS 7 @@ -65,7 +67,8 @@ #ifdef _KERNEL #define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE) #define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE) -#define callout_drain(c) _callout_stop_safe(c, 1) +int callout_drain(struct callout *); +int callout_drain_async(struct callout *, callout_func_t *, void *); void callout_init(struct callout *, int); void _callout_init_lock(struct callout *, struct lock_object *, int); #define callout_init_mtx(c, mtx, flags) \ @@ -79,7 +82,7 @@ NULL, (flags)) #define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) int callout_reset_sbt_on(struct callout *, sbintime_t, sbintime_t, - void (*)(void *), void *, int, int); + callout_func_t *, void *, int, int); #define callout_reset_sbt(c, sbt, pr, fn, arg, flags) \ callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), (c)->c_cpu, (flags)) #define callout_reset_sbt_curcpu(c, sbt, pr, fn, arg, flags) \ @@ -103,8 +106,7 @@ int callout_schedule_on(struct callout *, int, int); #define callout_schedule_curcpu(c, on_tick) \ callout_schedule_on((c), (on_tick), PCPU_GET(cpuid)) -#define callout_stop(c) _callout_stop_safe(c, 0) -int _callout_stop_safe(struct callout *, int); +int callout_stop(struct callout *); void callout_process(sbintime_t now); #endif Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h (revision 276531) +++ sys/sys/proc.h (working copy) @@ -308,6 +308,7 @@ } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval struct callout td_slpcallout; /* (h) Callout for sleep. */ + struct mtx td_slpmutex; /* (h) Mutex for sleep callout */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ @@ -364,7 +365,7 @@ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ -#define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ +#define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ @@ -704,7 +705,7 @@ #define SWT_OWEPREEMPT 2 /* Switching due to opepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ -#define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ +#define SWT_UNUSED5 5 /* --available-- */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ --------------070009030809080804000507--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?54A92ED1.2070906>