Date: Wed, 12 Feb 2020 17:22:19 -0500
From: Mark Johnston <markj@freebsd.org>
To: Eric Joyner <erj@freebsd.org>
Cc: freebsd-net@freebsd.org, Hans Petter Selasky <hps@selasky.org>,
    John Baldwin <jhb@freebsd.org>, shurd <shurd@freebsd.org>,
    Drew Gallatin <gallatin@netflix.com>, Gleb Smirnoff <glebius@freebsd.org>
Subject: Re: Issue with epoch_drain_callbacks and unloading iavf(4) [using iflib]
Message-ID: <20200212222219.GE83892@raichu>
In-Reply-To: <CA+b0zg809EGMS1Ngr38BSb1yNpDqxbCnAv9eC+cDwbMQ5t+qXQ@mail.gmail.com>
References: <CAKdFRZjxp=mTkUzFU8qsacP86OQOC9vCDCQ+O2iF7svRRGDK8w@mail.gmail.com>
    <0e2e97f2-df75-3c6f-9bdd-e8c2ab7bf79e@selasky.org>
    <CAKdFRZi3UoRuz=OXnBG=NVcJe605x9OwrLmdCyD98mDeTpbf0Q@mail.gmail.com>
    <a6523ed6-9d61-d1b4-5822-5787cf5c0e43@selasky.org>
    <20200130030911.GA15281@spy>
    <CA+b0zg-1CQ81dsNGv_O3ebLLko6Piei0A1NCPZUT5JH8EOyntw@mail.gmail.com>
    <CA+b0zg809EGMS1Ngr38BSb1yNpDqxbCnAv9eC+cDwbMQ5t+qXQ@mail.gmail.com>
On Mon, Feb 10, 2020 at 01:30:47PM -0800, Eric Joyner wrote:
> On Fri, Jan 31, 2020 at 11:44 AM Eric Joyner <erj@freebsd.org> wrote:
> > Hi Mark,
> >
> > I applied your patch to a FreeBSD-current VM and reran the test I
> > was running originally, and I still see the same behavior.
> >
> > Your patch doesn't appear to fix the problem.
> >
> > - Eric
> >
>
> *bump*

Sorry, I had missed this.

I think this implies that even the epoch callback tasks are not getting
a chance to run, which makes sense: all grouptask threads run at
PI_SOFT, a realtime priority, so the iflib and epoch callback task
threads all run at the same priority, and unless iflib voluntarily
yields the CPU it will keep getting scheduled.

Here's an updated patch which also allows the priority of group task
threads to be specified.  I don't think it makes much sense to have
more than one per-CPU group task in the system without such a
parameter.  With this change, epoch callbacks should run at a slightly
higher priority than iflib.

The combined patch also fixes an annoying problem where interface
destruction can appear to hang for a short period (~0.5s): epoch
callbacks are scheduled from hardclock, but hardclock interrupts fire
at a very low frequency on an idle CPU.
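To make the priorities concrete: the epoch callback tasks run on the
softirq taskqgroup, which stays at PI_SOFT below, while iflib's
taskqgroups move down to PI_SWI(SWI_NET).  Since a numerically larger
priority value is weaker, the callback threads should now be able to
preempt the iflib threads.  Excerpted from the patch for illustration:

        TASKQGROUP_DEFINE(softirq, mp_ncpus, 1, PI_SOFT);
        TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1, PI_SWI(SWI_NET));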
diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
index 0a477e1d6f7b..4ed7e7e11a3e 100644
--- a/sys/kern/subr_epoch.c
+++ b/sys/kern/subr_epoch.c
@@ -74,15 +74,19 @@ typedef struct epoch_record {
 	volatile struct epoch_tdlist er_tdlist;
 	volatile uint32_t er_gen;
 	uint32_t er_cpuid;
+	int er_drain_state;
 } __aligned(EPOCH_ALIGN)	*epoch_record_t;
 
+#define	EPOCH_DRAIN_START	2
+#define	EPOCH_DRAIN_RUNNING	1
+#define	EPOCH_DRAIN_DONE	0
+
 struct epoch {
 	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
 	epoch_record_t e_pcpu_record;
 	int	e_idx;
 	int	e_flags;
 	struct sx e_drain_sx;
-	struct mtx e_drain_mtx;
 	volatile int e_drain_count;
 	const char *e_name;
 };
@@ -335,7 +339,6 @@ epoch_alloc(const char *name, int flags)
 	epoch->e_idx = epoch_count;
 	epoch->e_name = name;
 	sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
-	mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
 	allepochs[epoch_count++] = epoch;
 	return (epoch);
 }
@@ -348,7 +351,6 @@ epoch_free(epoch_t epoch)
 	allepochs[epoch->e_idx] = NULL;
 	epoch_wait(global_epoch);
 	uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
-	mtx_destroy(&epoch->e_drain_mtx);
 	sx_destroy(&epoch->e_drain_sx);
 	free(epoch, M_EPOCH);
 }
@@ -699,14 +701,24 @@ epoch_call_task(void *arg __unused)
 	epoch_t epoch;
 	ck_stack_t cb_stack;
 	int i, npending, total;
+	bool draining;
+
+	KASSERT(curthread->td_pinned > 0,
+	    ("%s: callback task thread is not pinned", __func__));
 
 	ck_stack_init(&cb_stack);
 	critical_enter();
 	epoch_enter(global_epoch);
-	for (total = i = 0; i < epoch_count; i++) {
+	for (total = i = 0, draining = false; i < epoch_count; i++) {
 		if (__predict_false((epoch = allepochs[i]) == NULL))
 			continue;
 		er = epoch_currecord(epoch);
+		if (atomic_load_int(&er->er_drain_state) == EPOCH_DRAIN_START) {
+			atomic_store_int(&er->er_drain_state,
+			    EPOCH_DRAIN_RUNNING);
+			draining = true;
+		}
+
 		record = &er->er_record;
 		if ((npending = record->n_pending) == 0)
 			continue;
@@ -728,6 +740,20 @@ epoch_call_task(void *arg __unused)
 		next = CK_STACK_NEXT(cursor);
 		entry->function(entry);
 	}
+
+	if (__predict_false(draining)) {
+		epoch_enter(global_epoch);
+		for (i = 0; i < epoch_count; i++) {
+			if (__predict_false((epoch = allepochs[i]) == NULL))
+				continue;
+			er = epoch_currecord(epoch);
+			if (atomic_load_int(&er->er_drain_state) ==
+			    EPOCH_DRAIN_RUNNING)
+				atomic_store_int(&er->er_drain_state,
+				    EPOCH_DRAIN_DONE);
+		}
+		epoch_exit(global_epoch);
+	}
 }
 
 int
@@ -769,27 +795,18 @@ in_epoch(epoch_t epoch)
 }
 
 static void
-epoch_drain_cb(struct epoch_context *ctx)
+epoch_drain_handler(struct ck_epoch *global __unused,
+    ck_epoch_record_t *cr __unused, void *arg __unused)
 {
-	struct epoch *epoch =
-	    __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
-
-	if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
-		mtx_lock(&epoch->e_drain_mtx);
-		wakeup(epoch);
-		mtx_unlock(&epoch->e_drain_mtx);
-	}
+	maybe_yield();
 }
 
 void
 epoch_drain_callbacks(epoch_t epoch)
 {
 	epoch_record_t er;
-	struct thread *td;
-	int was_bound;
-	int old_pinned;
-	int old_cpu;
-	int cpu;
+	int cpu, state;
+	bool pending;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "epoch_drain_callbacks() may sleep!");
@@ -802,45 +819,28 @@ epoch_drain_callbacks(epoch_t epoch)
 		return;
 #endif
 	DROP_GIANT();
-
 	sx_xlock(&epoch->e_drain_sx);
-	mtx_lock(&epoch->e_drain_mtx);
 
-	td = curthread;
-	thread_lock(td);
-	old_cpu = PCPU_GET(cpuid);
-	old_pinned = td->td_pinned;
-	was_bound = sched_is_bound(td);
-	sched_unbind(td);
-	td->td_pinned = 0;
+	/* Make sure that all pending callbacks are available. */
+	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_drain_handler, NULL);
 
-	CPU_FOREACH(cpu)
-		epoch->e_drain_count++;
 	CPU_FOREACH(cpu) {
 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
-		sched_bind(td, cpu);
-		epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
+		atomic_store_int(&er->er_drain_state, EPOCH_DRAIN_START);
+		GROUPTASK_ENQUEUE(DPCPU_ID_PTR(cpu, epoch_cb_task));
 	}
 
-	/* restore CPU binding, if any */
-	if (was_bound != 0) {
-		sched_bind(td, old_cpu);
-	} else {
-		/* get thread back to initial CPU, if any */
-		if (old_pinned != 0)
-			sched_bind(td, old_cpu);
-		sched_unbind(td);
-	}
-	/* restore pinned after bind */
-	td->td_pinned = old_pinned;
-
-	thread_unlock(td);
-
-	while (epoch->e_drain_count != 0)
-		msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
+	do {
+		pending = false;
+		CPU_FOREACH(cpu) {
+			er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
+			state = atomic_load_int(&er->er_drain_state);
+			if (state != EPOCH_DRAIN_DONE)
+				pending = true;
+		}
+		pause("edrain", 1);
+	} while (pending);
 
-	mtx_unlock(&epoch->e_drain_mtx);
 	sx_xunlock(&epoch->e_drain_sx);
-
 	PICKUP_GIANT();
 }
diff --git a/sys/kern/subr_gtaskqueue.c b/sys/kern/subr_gtaskqueue.c
index 9bf4b0ca3ad3..f2e7f74325fa 100644
--- a/sys/kern/subr_gtaskqueue.c
+++ b/sys/kern/subr_gtaskqueue.c
@@ -54,8 +54,8 @@ static void	gtaskqueue_thread_loop(void *arg);
 static int	task_is_running(struct gtaskqueue *queue, struct gtask *gtask);
 static void	gtaskqueue_drain_locked(struct gtaskqueue *queue, struct gtask *gtask);
 
-TASKQGROUP_DEFINE(softirq, mp_ncpus, 1);
-TASKQGROUP_DEFINE(config, 1, 1);
+TASKQGROUP_DEFINE(softirq, mp_ncpus, 1, PI_SOFT);
+TASKQGROUP_DEFINE(config, 1, 1, PI_SOFT);
 
 struct gtaskqueue_busy {
 	struct gtask	*tb_running;
@@ -612,7 +612,7 @@ struct taskq_bind_task {
 };
 
 static void
-taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
+taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu, int pri)
 {
 	struct taskqgroup_cpu *qcpu;
 
@@ -620,7 +620,7 @@ taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
 	LIST_INIT(&qcpu->tgc_tasks);
 	qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
 	    taskqueue_thread_enqueue, &qcpu->tgc_taskq);
-	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
+	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, pri,
"%s_%d", qgroup->tqg_name, idx); qcpu->tgc_cpu = cpu; } @@ -900,7 +900,7 @@ taskqgroup_config_init(void *arg) LIST_SWAP(>ask_head, &qgroup->tqg_queue[0].tgc_tasks, grouptask, gt_list); qgroup->tqg_queue[0].tgc_cnt = 0; - taskqgroup_cpu_create(qgroup, 0, 0); + taskqgroup_cpu_create(qgroup, 0, 0, PI_SOFT); qgroup->tqg_cnt = 1; qgroup->tqg_stride = 1; @@ -910,7 +910,7 @@ SYSINIT(taskqgroup_config_init, SI_SUB_TASKQ, SI_ORDER_SECOND, taskqgroup_config_init, NULL); static int -_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) +_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri) { LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL); struct grouptask *gtask; @@ -948,7 +948,7 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) */ cpu = old_cpu; for (i = old_cnt; i < cnt; i++) { - taskqgroup_cpu_create(qgroup, i, cpu); + taskqgroup_cpu_create(qgroup, i, cpu, pri); for (k = 0; k < stride; k++) cpu = CPU_NEXT(cpu); @@ -1001,12 +1001,12 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) } int -taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) +taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri) { int error; mtx_lock(&qgroup->tqg_lock); - error = _taskqgroup_adjust(qgroup, cnt, stride); + error = _taskqgroup_adjust(qgroup, cnt, stride, pri); mtx_unlock(&qgroup->tqg_lock); return (error); diff --git a/sys/net/iflib.c b/sys/net/iflib.c index a9fc8090c48a..1163425bf15a 100644 --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/bus.h> #include <sys/eventhandler.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> @@ -560,8 +561,8 @@ MODULE_VERSION(iflib, 1); MODULE_DEPEND(iflib, pci, 1, 1, 1); MODULE_DEPEND(iflib, ether, 1, 1, 1); -TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1); -TASKQGROUP_DEFINE(if_config_tqg, 1, 1); +TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1, PI_SWI(SWI_NET)); +TASKQGROUP_DEFINE(if_config_tqg, 1, 1, PI_SWI(SWI_NET)); #ifndef IFLIB_DEBUG_COUNTERS #ifdef INVARIANTS diff --git a/sys/sys/gtaskqueue.h b/sys/sys/gtaskqueue.h index 96fd57dfb76d..a57bcfc1881a 100644 --- a/sys/sys/gtaskqueue.h +++ b/sys/sys/gtaskqueue.h @@ -79,7 +79,8 @@ int taskqgroup_attach_cpu(struct taskqgroup *qgroup, void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask); struct taskqgroup *taskqgroup_create(const char *name); void taskqgroup_destroy(struct taskqgroup *qgroup); -int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride); +int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, + int pri); void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, const char *name); void taskqgroup_config_gtask_deinit(struct grouptask *gtask); @@ -100,7 +101,7 @@ void taskqgroup_config_gtask_deinit(struct grouptask *gtask); #define TASKQGROUP_DECLARE(name) \ extern struct taskqgroup *qgroup_##name -#define TASKQGROUP_DEFINE(name, cnt, stride) \ +#define TASKQGROUP_DEFINE(name, cnt, stride, pri) \ \ struct taskqgroup *qgroup_##name; \ \ @@ -116,7 +117,7 @@ SYSINIT(taskqgroup_##name, SI_SUB_TASKQ, SI_ORDER_FIRST, \ static void \ taskqgroup_adjust_##name(void *arg) \ { \ - taskqgroup_adjust(qgroup_##name, (cnt), (stride)); \ + taskqgroup_adjust(qgroup_##name, (cnt), (stride), (pri)); \ } \ \ SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY, \