Date: Wed, 11 Mar 2020 16:32:40 -0700 From: Eric Joyner <erj@freebsd.org> To: Mark Johnston <markj@freebsd.org> Cc: freebsd-net@freebsd.org, Hans Petter Selasky <hps@selasky.org>, John Baldwin <jhb@freebsd.org>, shurd <shurd@freebsd.org>, Drew Gallatin <gallatin@netflix.com>, Gleb Smirnoff <glebius@freebsd.org> Subject: Re: Issue with epoch_drain_callbacks and unloading iavf(4) [using iflib] Message-ID: <CAKdFRZjdiz_axuweksNUHis7jPKXHqOmhQg%2BQWzpVnsKY%2Bcrmg@mail.gmail.com> In-Reply-To: <20200212222219.GE83892@raichu> References: <CAKdFRZjxp=mTkUzFU8qsacP86OQOC9vCDCQ%2BO2iF7svRRGDK8w@mail.gmail.com> <0e2e97f2-df75-3c6f-9bdd-e8c2ab7bf79e@selasky.org> <CAKdFRZi3UoRuz=OXnBG=NVcJe605x9OwrLmdCyD98mDeTpbf0Q@mail.gmail.com> <a6523ed6-9d61-d1b4-5822-5787cf5c0e43@selasky.org> <20200130030911.GA15281@spy> <CA%2Bb0zg-1CQ81dsNGv_O3ebLLko6Piei0A1NCPZUT5JH8EOyntw@mail.gmail.com> <CA%2Bb0zg809EGMS1Ngr38BSb1yNpDqxbCnAv9eC%2BcDwbMQ5t%2BqXQ@mail.gmail.com> <20200212222219.GE83892@raichu>
next in thread | previous in thread | raw e-mail | index | archive | help
On Wed, Feb 12, 2020 at 2:22 PM Mark Johnston <markj@freebsd.org> wrote: > On Mon, Feb 10, 2020 at 01:30:47PM -0800, Eric Joyner wrote: > > On Fri, Jan 31, 2020 at 11:44 AM Eric Joyner <erj@freebsd.org> wrote: > > > Hi Mark, > > > > > > I applied your patch to a FreeBSD-current VM and reran the test I was > > > running originally, and I still see the same behavior. > > > > > > Your patch doesn't appear to fix the problem. > > > > > > - Eric > > > > > > > *bump* > > Sorry, I had missed this. I think this implies that even the epoch > callback tasks are not getting a chance to run, but I guess that makes > sense: all grouptask threads run at PI_SOFT, a realtime priority, so the > iflib and epoch callback task threads are all running at the same > priority. Unless iflib voluntarily yields the CPU it'll keep getting > scheduled. > > Here's an updated patch which also allows the priority of group task > threads to be specified. I don't think it makes much sense to have more > than one per-CPU group task in the system without that parameter. With > this, epoch callbacks should run at a slightly higher priority than > iflib. > > The combined patch also fixes an annoying problem where interface > destruction can appear to hang for a short period (~0.5s). This is > because epoch callbacks are scheduled off hardclock, but hardclock > interrupts run at a very low frequency on an idle CPU. 
> > diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c > index 0a477e1d6f7b..4ed7e7e11a3e 100644 > --- a/sys/kern/subr_epoch.c > +++ b/sys/kern/subr_epoch.c > @@ -74,15 +74,19 @@ typedef struct epoch_record { > volatile struct epoch_tdlist er_tdlist; > volatile uint32_t er_gen; > uint32_t er_cpuid; > + int er_drain_state; > } __aligned(EPOCH_ALIGN) *epoch_record_t; > > +#define EPOCH_DRAIN_START 2 > +#define EPOCH_DRAIN_RUNNING 1 > +#define EPOCH_DRAIN_DONE 0 > + > struct epoch { > struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); > epoch_record_t e_pcpu_record; > int e_idx; > int e_flags; > struct sx e_drain_sx; > - struct mtx e_drain_mtx; > volatile int e_drain_count; > const char *e_name; > }; > @@ -335,7 +339,6 @@ epoch_alloc(const char *name, int flags) > epoch->e_idx = epoch_count; > epoch->e_name = name; > sx_init(&epoch->e_drain_sx, "epoch-drain-sx"); > - mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF); > allepochs[epoch_count++] = epoch; > return (epoch); > } > @@ -348,7 +351,6 @@ epoch_free(epoch_t epoch) > allepochs[epoch->e_idx] = NULL; > epoch_wait(global_epoch); > uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record); > - mtx_destroy(&epoch->e_drain_mtx); > sx_destroy(&epoch->e_drain_sx); > free(epoch, M_EPOCH); > } > @@ -699,14 +701,24 @@ epoch_call_task(void *arg __unused) > epoch_t epoch; > ck_stack_t cb_stack; > int i, npending, total; > + bool draining; > + > + KASSERT(curthread->td_pinned > 0, > + ("%s: callback task thread is not pinned", __func__)); > > ck_stack_init(&cb_stack); > critical_enter(); > epoch_enter(global_epoch); > - for (total = i = 0; i < epoch_count; i++) { > + for (total = i = 0, draining = false; i < epoch_count; i++) { > if (__predict_false((epoch = allepochs[i]) == NULL)) > continue; > er = epoch_currecord(epoch); > + if (atomic_load_int(&er->er_drain_state) == > EPOCH_DRAIN_START) { > + atomic_store_int(&er->er_drain_state, > + EPOCH_DRAIN_RUNNING); > + draining = true; > + } > + > record = 
&er->er_record; > if ((npending = record->n_pending) == 0) > continue; > @@ -728,6 +740,20 @@ epoch_call_task(void *arg __unused) > next = CK_STACK_NEXT(cursor); > entry->function(entry); > } > + > + if (__predict_false(draining)) { > + epoch_enter(global_epoch); > + for (i = 0; i < epoch_count; i++) { > + if (__predict_false((epoch = allepochs[i]) == > NULL)) > + continue; > + er = epoch_currecord(epoch); > + if (atomic_load_int(&er->er_drain_state) == > + EPOCH_DRAIN_RUNNING) > + atomic_store_int(&er->er_drain_state, > + EPOCH_DRAIN_DONE); > + } > + epoch_exit(global_epoch); > + } > } > > int > @@ -769,27 +795,18 @@ in_epoch(epoch_t epoch) > } > > static void > -epoch_drain_cb(struct epoch_context *ctx) > +epoch_drain_handler(struct ck_epoch *global __unused, > + ck_epoch_record_t *cr __unused, void *arg __unused) > { > - struct epoch *epoch = > - __containerof(ctx, struct epoch_record, > er_drain_ctx)->er_parent; > - > - if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) { > - mtx_lock(&epoch->e_drain_mtx); > - wakeup(epoch); > - mtx_unlock(&epoch->e_drain_mtx); > - } > + maybe_yield(); > } > > void > epoch_drain_callbacks(epoch_t epoch) > { > epoch_record_t er; > - struct thread *td; > - int was_bound; > - int old_pinned; > - int old_cpu; > - int cpu; > + int cpu, state; > + bool pending; > > WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, > "epoch_drain_callbacks() may sleep!"); > @@ -802,45 +819,28 @@ epoch_drain_callbacks(epoch_t epoch) > return; > #endif > DROP_GIANT(); > - > sx_xlock(&epoch->e_drain_sx); > - mtx_lock(&epoch->e_drain_mtx); > > - td = curthread; > - thread_lock(td); > - old_cpu = PCPU_GET(cpuid); > - old_pinned = td->td_pinned; > - was_bound = sched_is_bound(td); > - sched_unbind(td); > - td->td_pinned = 0; > + /* Make sure that all pending callbacks are available. 
*/ > + ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_drain_handler, > NULL); > > - CPU_FOREACH(cpu) > - epoch->e_drain_count++; > CPU_FOREACH(cpu) { > er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); > - sched_bind(td, cpu); > - epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx); > + atomic_store_int(&er->er_drain_state, EPOCH_DRAIN_START); > + GROUPTASK_ENQUEUE(DPCPU_ID_PTR(cpu, epoch_cb_task)); > } > > - /* restore CPU binding, if any */ > - if (was_bound != 0) { > - sched_bind(td, old_cpu); > - } else { > - /* get thread back to initial CPU, if any */ > - if (old_pinned != 0) > - sched_bind(td, old_cpu); > - sched_unbind(td); > - } > - /* restore pinned after bind */ > - td->td_pinned = old_pinned; > - > - thread_unlock(td); > - > - while (epoch->e_drain_count != 0) > - msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0); > + do { > + pending = false; > + CPU_FOREACH(cpu) { > + er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); > + state = atomic_load_int(&er->er_drain_state); > + if (state != EPOCH_DRAIN_DONE) > + pending = true; > + } > + pause("edrain", 1); > + } while (pending); > > - mtx_unlock(&epoch->e_drain_mtx); > sx_xunlock(&epoch->e_drain_sx); > - > PICKUP_GIANT(); > } > diff --git a/sys/kern/subr_gtaskqueue.c b/sys/kern/subr_gtaskqueue.c > index 9bf4b0ca3ad3..f2e7f74325fa 100644 > --- a/sys/kern/subr_gtaskqueue.c > +++ b/sys/kern/subr_gtaskqueue.c > @@ -54,8 +54,8 @@ static void gtaskqueue_thread_loop(void *arg); > static int task_is_running(struct gtaskqueue *queue, struct gtask > *gtask); > static void gtaskqueue_drain_locked(struct gtaskqueue *queue, struct > gtask *gtask); > > -TASKQGROUP_DEFINE(softirq, mp_ncpus, 1); > -TASKQGROUP_DEFINE(config, 1, 1); > +TASKQGROUP_DEFINE(softirq, mp_ncpus, 1, PI_SOFT); > +TASKQGROUP_DEFINE(config, 1, 1, PI_SOFT); > > struct gtaskqueue_busy { > struct gtask *tb_running; > @@ -612,7 +612,7 @@ struct taskq_bind_task { > }; > > static void > -taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu) > 
+taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu, int > pri) > { > struct taskqgroup_cpu *qcpu; > > @@ -620,7 +620,7 @@ taskqgroup_cpu_create(struct taskqgroup *qgroup, int > idx, int cpu) > LIST_INIT(&qcpu->tgc_tasks); > qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK, > taskqueue_thread_enqueue, &qcpu->tgc_taskq); > - gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT, > + gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, pri, > "%s_%d", qgroup->tqg_name, idx); > qcpu->tgc_cpu = cpu; > } > @@ -900,7 +900,7 @@ taskqgroup_config_init(void *arg) > LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks, > grouptask, gt_list); > qgroup->tqg_queue[0].tgc_cnt = 0; > - taskqgroup_cpu_create(qgroup, 0, 0); > + taskqgroup_cpu_create(qgroup, 0, 0, PI_SOFT); > > qgroup->tqg_cnt = 1; > qgroup->tqg_stride = 1; > @@ -910,7 +910,7 @@ SYSINIT(taskqgroup_config_init, SI_SUB_TASKQ, > SI_ORDER_SECOND, > taskqgroup_config_init, NULL); > > static int > -_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) > +_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int > pri) > { > LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL); > struct grouptask *gtask; > @@ -948,7 +948,7 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, > int stride) > */ > cpu = old_cpu; > for (i = old_cnt; i < cnt; i++) { > - taskqgroup_cpu_create(qgroup, i, cpu); > + taskqgroup_cpu_create(qgroup, i, cpu, pri); > > for (k = 0; k < stride; k++) > cpu = CPU_NEXT(cpu); > @@ -1001,12 +1001,12 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int > cnt, int stride) > } > > int > -taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) > +taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri) > { > int error; > > mtx_lock(&qgroup->tqg_lock); > - error = _taskqgroup_adjust(qgroup, cnt, stride); > + error = _taskqgroup_adjust(qgroup, cnt, stride, pri); > mtx_unlock(&qgroup->tqg_lock); > > return (error); > diff --git 
a/sys/net/iflib.c b/sys/net/iflib.c > index a9fc8090c48a..1163425bf15a 100644 > --- a/sys/net/iflib.c > +++ b/sys/net/iflib.c > @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); > #include <sys/types.h> > #include <sys/bus.h> > #include <sys/eventhandler.h> > +#include <sys/interrupt.h> > #include <sys/kernel.h> > #include <sys/lock.h> > #include <sys/mutex.h> > @@ -560,8 +561,8 @@ MODULE_VERSION(iflib, 1); > MODULE_DEPEND(iflib, pci, 1, 1, 1); > MODULE_DEPEND(iflib, ether, 1, 1, 1); > > -TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1); > -TASKQGROUP_DEFINE(if_config_tqg, 1, 1); > +TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1, PI_SWI(SWI_NET)); > +TASKQGROUP_DEFINE(if_config_tqg, 1, 1, PI_SWI(SWI_NET)); > > #ifndef IFLIB_DEBUG_COUNTERS > #ifdef INVARIANTS > diff --git a/sys/sys/gtaskqueue.h b/sys/sys/gtaskqueue.h > index 96fd57dfb76d..a57bcfc1881a 100644 > --- a/sys/sys/gtaskqueue.h > +++ b/sys/sys/gtaskqueue.h > @@ -79,7 +79,8 @@ int taskqgroup_attach_cpu(struct taskqgroup *qgroup, > void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask > *gtask); > struct taskqgroup *taskqgroup_create(const char *name); > void taskqgroup_destroy(struct taskqgroup *qgroup); > -int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride); > +int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, > + int pri); > void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, > gtask_fn_t *fn, const char *name); > void taskqgroup_config_gtask_deinit(struct grouptask *gtask); > @@ -100,7 +101,7 @@ void taskqgroup_config_gtask_deinit(struct > grouptask *gtask); > #define TASKQGROUP_DECLARE(name) \ > extern struct taskqgroup *qgroup_##name > > -#define TASKQGROUP_DEFINE(name, cnt, stride) \ > +#define TASKQGROUP_DEFINE(name, cnt, stride, pri) \ > \ > struct taskqgroup *qgroup_##name; \ > \ > @@ -116,7 +117,7 @@ SYSINIT(taskqgroup_##name, SI_SUB_TASKQ, > SI_ORDER_FIRST, \ > static void \ > taskqgroup_adjust_##name(void *arg) \ > { \ > - 
taskqgroup_adjust(qgroup_##name, (cnt), (stride)); \ > + taskqgroup_adjust(qgroup_##name, (cnt), (stride), (pri)); \ > } \ > \ > SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY, \ > Mark, I did get some time to get back and retry this; however your second patch still doesn't solve the problem. Looking into it a bit, it looks like the kldunload process isn't hitting the code you've changed; it's hanging in epoch_wait_preempt() in if_detach_internal(), which is immediately before epoch_drain_callbacks(). I did a kernel dump while it was hanging, and this is the backtrace for the kldunload process: #0 sched_switch (td=0xfffffe00da78b700, flags=<optimized out>) at /usr/src/sys/kern/sched_ule.c:2147 #1 0xffffffff80ba2419 in mi_switch (flags=256) at /usr/src/sys/kern/kern_synch.c:533 #2 0xffffffff80bc749c in sched_bind (td=0xfffffe00da78b700, cpu=31) at /usr/src/sys/kern/sched_ule.c:2729 #3 0xffffffff80bdaf5b in epoch_block_handler_preempt (global=<optimized out>, cr=0xfffffe0036387b00, arg=<optimized out>) at /usr/src/sys/kern/subr_epoch.c:516 #4 0xffffffff803b602d in epoch_block (global=0xfffff80003be2680, cr=<optimized out>, cb=<unavailable>, ct=<unavailable>) at /usr/src/sys/contrib/ck/src/ck_epoch.c:416 #5 ck_epoch_synchronize_wait (global=0xfffff80003be2680, cb=<optimized out>, ct=<optimized out>) at /usr/src/sys/contrib/ck/src/ck_epoch.c:465 #6 0xffffffff80bdaabe in epoch_wait_preempt (epoch=0xfffff80003be2680) at /usr/src/sys/kern/subr_epoch.c:628 #7 0xffffffff80ca0fea in if_detach_internal (ifp=0xfffff80074893800, vmove=<optimized out>, ifcp=0x0) at /usr/src/sys/net/if.c:1123 #8 0xffffffff80ca0dad in if_detach (ifp=<unavailable>) at /usr/src/sys/net/if.c:1063 #9 0xffffffff80cbcc46 in iflib_device_deregister (ctx=0xfffff800711f0000) at /usr/src/sys/net/iflib.c:5127 #10 0xffffffff80bcd95e in DEVICE_DETACH (dev=0xfffff80004388700) at ./device_if.h:234 #11 device_detach (dev=0xfffff80004388700) at /usr/src/sys/kern/subr_bus.c:3049 #12 0xffffffff80bccefd in 
devclass_driver_deleted (busclass=0xfffff80003f67d80, dc=0xfffff8000d1bc280, driver=0xffffffff8232e9e0 <i40e_write_nvm_aq+80>) at /usr/src/sys/kern/subr_bus.c:1235 #13 0xffffffff80bccddf in devclass_delete_driver (busclass=0xfffff80003f67d80, driver=0xffffffff8232e9e0 <i40e_write_nvm_aq+80>) at /usr/src/sys/kern/subr_bus.c:1310 #14 0xffffffff80bd2ddc in driver_module_handler (mod=0xfffff8000db94a00, what=1, arg=0xffffffff8232e9b0 <i40e_write_nvm_aq+32>) at /usr/src/sys/kern/subr_bus.c:5229 #15 0xffffffff80b72ea2 in module_unload (mod=0xfffff8000db94a00) at /usr/src/sys/kern/kern_module.c:261 #16 0xffffffff80b63b8b in linker_file_unload (file=0xfffff8000d833c00, flags=<optimized out>) at /usr/src/sys/kern/kern_linker.c:700 #17 0xffffffff80b64f7d in kern_kldunload (td=<optimized out>, fileid=4, flags=0) at /usr/src/sys/kern/kern_linker.c:1153 #18 0xffffffff8103b8dd in syscallenter (td=<optimized out>) at /usr/src/sys/amd64/amd64/../../kern/subr_syscall.c:163 #19 amd64_syscall (td=0xfffffe00da78b700, traced=0) at /usr/src/sys/amd64/amd64/trap.c:1162 #20 <signal handler called> #21 0x00000008002de6da in ?? () - Eric
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAKdFRZjdiz_axuweksNUHis7jPKXHqOmhQg%2BQWzpVnsKY%2Bcrmg>