Date:      Wed, 12 Feb 2020 17:22:19 -0500
From:      Mark Johnston <markj@freebsd.org>
To:        Eric Joyner <erj@freebsd.org>
Cc:        freebsd-net@freebsd.org, Hans Petter Selasky <hps@selasky.org>, John Baldwin <jhb@freebsd.org>, shurd <shurd@freebsd.org>, Drew Gallatin <gallatin@netflix.com>, Gleb Smirnoff <glebius@freebsd.org>
Subject:   Re: Issue with epoch_drain_callbacks and unloading iavf(4) [using iflib]
Message-ID:  <20200212222219.GE83892@raichu>
In-Reply-To: <CA+b0zg809EGMS1Ngr38BSb1yNpDqxbCnAv9eC+cDwbMQ5t+qXQ@mail.gmail.com>
References:  <CAKdFRZjxp=mTkUzFU8qsacP86OQOC9vCDCQ+O2iF7svRRGDK8w@mail.gmail.com> <0e2e97f2-df75-3c6f-9bdd-e8c2ab7bf79e@selasky.org> <CAKdFRZi3UoRuz=OXnBG=NVcJe605x9OwrLmdCyD98mDeTpbf0Q@mail.gmail.com> <a6523ed6-9d61-d1b4-5822-5787cf5c0e43@selasky.org> <20200130030911.GA15281@spy> <CA+b0zg-1CQ81dsNGv_O3ebLLko6Piei0A1NCPZUT5JH8EOyntw@mail.gmail.com> <CA+b0zg809EGMS1Ngr38BSb1yNpDqxbCnAv9eC+cDwbMQ5t+qXQ@mail.gmail.com>

On Mon, Feb 10, 2020 at 01:30:47PM -0800, Eric Joyner wrote:
> On Fri, Jan 31, 2020 at 11:44 AM Eric Joyner <erj@freebsd.org> wrote:
> > Hi Mark,
> >
> > I applied your patch to a FreeBSD-current VM and reran the test I was
> > running originally, and I still see the same behavior.
> >
> > Your patch doesn't appear to fix the problem.
> >
> > - Eric
> >
> 
>   *bump*

Sorry, I had missed this.  I think this implies that even the epoch
callback tasks are not getting a chance to run, but I guess that makes
sense: all grouptask threads run at PI_SOFT, a realtime priority, so the
iflib and epoch callback task threads are all running at the same
priority.  Unless iflib voluntarily yields the CPU, it'll keep getting
scheduled.
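
For reference, this is where every group taskqueue thread gets its
priority today, in sys/kern/subr_gtaskqueue.c; the patch below turns
the hard-coded PI_SOFT into a parameter:

	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
	    "%s_%d", qgroup->tqg_name, idx);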

Here's an updated patch which also allows the priority of group task
threads to be specified.  I don't think it makes much sense to have more
than one per-CPU taskqgroup in the system without such a parameter.  With
this, epoch callbacks should run at a slightly higher priority than
iflib.
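
For illustration, a consumer of the changed TASKQGROUP_DEFINE() now
names its priority explicitly.  A hypothetical per-CPU group meant to
run below the epoch callback threads would look much like the iflib
hunk further down ("example_tqg" is a made-up name):

#include <sys/smp.h>		/* mp_ncpus */
#include <sys/interrupt.h>	/* PI_SWI() and SWI_NET, as in the iflib hunk */
#include <sys/gtaskqueue.h>	/* TASKQGROUP_DEFINE() */

/*
 * PI_SWI(SWI_NET) is a weaker priority than PI_SOFT, so the epoch
 * callback threads, which stay at PI_SOFT, can preempt these.
 */
TASKQGROUP_DEFINE(example_tqg, mp_ncpus, 1, PI_SWI(SWI_NET));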

The combined patch also fixes an annoying problem where interface
destruction can appear to hang for a short period (~0.5s).  This is
because epoch callbacks are scheduled off hardclock, but hardclock
interrupts run at a very low frequency on an idle CPU.  With the patch,
epoch_drain_callbacks() enqueues each CPU's callback task directly
rather than waiting for hardclock to get around to it.
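
To make the drain handshake in the subr_epoch.c hunks easier to
follow, here is a minimal userspace sketch of the per-CPU state
machine.  The EPOCH_DRAIN_* values and the er_drain_state field are
taken from the patch; everything else is illustrative scaffolding,
not kernel code:

#include <stdatomic.h>
#include <stdbool.h>

#define	EPOCH_DRAIN_START	2	/* drain requested for this CPU */
#define	EPOCH_DRAIN_RUNNING	1	/* callback task saw the request */
#define	EPOCH_DRAIN_DONE	0	/* this CPU's callbacks have run */

struct record_sketch {
	_Atomic int er_drain_state;
};

/*
 * Callback-task side (epoch_call_task() in the patch): acknowledge
 * the request, run the pending callbacks, then mark the record done.
 */
static void
cb_task_sketch(struct record_sketch *er)
{
	if (atomic_load(&er->er_drain_state) == EPOCH_DRAIN_START)
		atomic_store(&er->er_drain_state, EPOCH_DRAIN_RUNNING);
	/* ... execute pending epoch callbacks ... */
	if (atomic_load(&er->er_drain_state) == EPOCH_DRAIN_RUNNING)
		atomic_store(&er->er_drain_state, EPOCH_DRAIN_DONE);
}

/*
 * Drain side (epoch_drain_callbacks() in the patch): after kicking
 * every CPU's callback task, poll until no record is still pending.
 */
static bool
drain_pending_sketch(struct record_sketch *recs, int ncpus)
{
	for (int i = 0; i < ncpus; i++)
		if (atomic_load(&recs[i].er_drain_state) != EPOCH_DRAIN_DONE)
			return (true);
	return (false);
}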

diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
index 0a477e1d6f7b..4ed7e7e11a3e 100644
--- a/sys/kern/subr_epoch.c
+++ b/sys/kern/subr_epoch.c
@@ -74,15 +74,19 @@ typedef struct epoch_record {
 	volatile struct epoch_tdlist er_tdlist;
 	volatile uint32_t er_gen;
 	uint32_t er_cpuid;
+	int er_drain_state;
 } __aligned(EPOCH_ALIGN)     *epoch_record_t;
 
+#define	EPOCH_DRAIN_START	2
+#define	EPOCH_DRAIN_RUNNING	1
+#define	EPOCH_DRAIN_DONE	0
+
 struct epoch {
 	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
 	epoch_record_t e_pcpu_record;
 	int	e_idx;
 	int	e_flags;
 	struct sx e_drain_sx;
-	struct mtx e_drain_mtx;
 	volatile int e_drain_count;
 	const char *e_name;
 };
@@ -335,7 +339,6 @@ epoch_alloc(const char *name, int flags)
 	epoch->e_idx = epoch_count;
 	epoch->e_name = name;
 	sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
-	mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
 	allepochs[epoch_count++] = epoch;
 	return (epoch);
 }
@@ -348,7 +351,6 @@ epoch_free(epoch_t epoch)
 	allepochs[epoch->e_idx] = NULL;
 	epoch_wait(global_epoch);
 	uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
-	mtx_destroy(&epoch->e_drain_mtx);
 	sx_destroy(&epoch->e_drain_sx);
 	free(epoch, M_EPOCH);
 }
@@ -699,14 +701,24 @@ epoch_call_task(void *arg __unused)
 	epoch_t epoch;
 	ck_stack_t cb_stack;
 	int i, npending, total;
+	bool draining;
+
+	KASSERT(curthread->td_pinned > 0,
+	    ("%s: callback task thread is not pinned", __func__));
 
 	ck_stack_init(&cb_stack);
 	critical_enter();
 	epoch_enter(global_epoch);
-	for (total = i = 0; i < epoch_count; i++) {
+	for (total = i = 0, draining = false; i < epoch_count; i++) {
 		if (__predict_false((epoch = allepochs[i]) == NULL))
 			continue;
 		er = epoch_currecord(epoch);
+		if (atomic_load_int(&er->er_drain_state) == EPOCH_DRAIN_START) {
+			atomic_store_int(&er->er_drain_state,
+			    EPOCH_DRAIN_RUNNING);
+			draining = true;
+		}
+
 		record = &er->er_record;
 		if ((npending = record->n_pending) == 0)
 			continue;
@@ -728,6 +740,20 @@ epoch_call_task(void *arg __unused)
 		next = CK_STACK_NEXT(cursor);
 		entry->function(entry);
 	}
+
+	if (__predict_false(draining)) {
+		epoch_enter(global_epoch);
+		for (i = 0; i < epoch_count; i++) {
+			if (__predict_false((epoch = allepochs[i]) == NULL))
+				continue;
+			er = epoch_currecord(epoch);
+			if (atomic_load_int(&er->er_drain_state) ==
+			    EPOCH_DRAIN_RUNNING)
+				atomic_store_int(&er->er_drain_state,
+				    EPOCH_DRAIN_DONE);
+		}
+		epoch_exit(global_epoch);
+	}
 }
 
 int
@@ -769,27 +795,18 @@ in_epoch(epoch_t epoch)
 }
 
 static void
-epoch_drain_cb(struct epoch_context *ctx)
+epoch_drain_handler(struct ck_epoch *global __unused,
+    ck_epoch_record_t *cr __unused, void *arg __unused)
 {
-	struct epoch *epoch =
-	    __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
-
-	if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
-		mtx_lock(&epoch->e_drain_mtx);
-		wakeup(epoch);
-		mtx_unlock(&epoch->e_drain_mtx);
-	}
+	maybe_yield();
 }
 
 void
 epoch_drain_callbacks(epoch_t epoch)
 {
 	epoch_record_t er;
-	struct thread *td;
-	int was_bound;
-	int old_pinned;
-	int old_cpu;
-	int cpu;
+	int cpu, state;
+	bool pending;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "epoch_drain_callbacks() may sleep!");
@@ -802,45 +819,28 @@ epoch_drain_callbacks(epoch_t epoch)
 		return;
 #endif
 	DROP_GIANT();
-
 	sx_xlock(&epoch->e_drain_sx);
-	mtx_lock(&epoch->e_drain_mtx);
 
-	td = curthread;
-	thread_lock(td);
-	old_cpu = PCPU_GET(cpuid);
-	old_pinned = td->td_pinned;
-	was_bound = sched_is_bound(td);
-	sched_unbind(td);
-	td->td_pinned = 0;
+	/* Make sure that all pending callbacks are available. */
+	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_drain_handler, NULL);
 
-	CPU_FOREACH(cpu)
-		epoch->e_drain_count++;
 	CPU_FOREACH(cpu) {
 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
-		sched_bind(td, cpu);
-		epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
+		atomic_store_int(&er->er_drain_state, EPOCH_DRAIN_START);
+		GROUPTASK_ENQUEUE(DPCPU_ID_PTR(cpu, epoch_cb_task));
 	}
 
-	/* restore CPU binding, if any */
-	if (was_bound != 0) {
-		sched_bind(td, old_cpu);
-	} else {
-		/* get thread back to initial CPU, if any */
-		if (old_pinned != 0)
-			sched_bind(td, old_cpu);
-		sched_unbind(td);
-	}
-	/* restore pinned after bind */
-	td->td_pinned = old_pinned;
-
-	thread_unlock(td);
-
-	while (epoch->e_drain_count != 0)
-		msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
+	do {
+		pending = false;
+		CPU_FOREACH(cpu) {
+			er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
+			state = atomic_load_int(&er->er_drain_state);
+			if (state != EPOCH_DRAIN_DONE)
+				pending = true;
+		}
+		pause("edrain", 1);
+	} while (pending);
 
-	mtx_unlock(&epoch->e_drain_mtx);
 	sx_xunlock(&epoch->e_drain_sx);
-
 	PICKUP_GIANT();
 }
diff --git a/sys/kern/subr_gtaskqueue.c b/sys/kern/subr_gtaskqueue.c
index 9bf4b0ca3ad3..f2e7f74325fa 100644
--- a/sys/kern/subr_gtaskqueue.c
+++ b/sys/kern/subr_gtaskqueue.c
@@ -54,8 +54,8 @@ static void	gtaskqueue_thread_loop(void *arg);
 static int	task_is_running(struct gtaskqueue *queue, struct gtask *gtask);
 static void	gtaskqueue_drain_locked(struct gtaskqueue *queue, struct gtask *gtask);
 
-TASKQGROUP_DEFINE(softirq, mp_ncpus, 1);
-TASKQGROUP_DEFINE(config, 1, 1);
+TASKQGROUP_DEFINE(softirq, mp_ncpus, 1, PI_SOFT);
+TASKQGROUP_DEFINE(config, 1, 1, PI_SOFT);
 
 struct gtaskqueue_busy {
 	struct gtask		*tb_running;
@@ -612,7 +612,7 @@ struct taskq_bind_task {
 };
 
 static void
-taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
+taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu, int pri)
 {
 	struct taskqgroup_cpu *qcpu;
 
@@ -620,7 +620,7 @@ taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
 	LIST_INIT(&qcpu->tgc_tasks);
 	qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
 	    taskqueue_thread_enqueue, &qcpu->tgc_taskq);
-	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
+	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, pri,
 	    "%s_%d", qgroup->tqg_name, idx);
 	qcpu->tgc_cpu = cpu;
 }
@@ -900,7 +900,7 @@ taskqgroup_config_init(void *arg)
 	LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
 	    grouptask, gt_list);
 	qgroup->tqg_queue[0].tgc_cnt = 0;
-	taskqgroup_cpu_create(qgroup, 0, 0);
+	taskqgroup_cpu_create(qgroup, 0, 0, PI_SOFT);
 
 	qgroup->tqg_cnt = 1;
 	qgroup->tqg_stride = 1;
@@ -910,7 +910,7 @@ SYSINIT(taskqgroup_config_init, SI_SUB_TASKQ, SI_ORDER_SECOND,
 	taskqgroup_config_init, NULL);
 
 static int
-_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
+_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri)
 {
 	LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
 	struct grouptask *gtask;
@@ -948,7 +948,7 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 	 */
 	cpu = old_cpu;
 	for (i = old_cnt; i < cnt; i++) {
-		taskqgroup_cpu_create(qgroup, i, cpu);
+		taskqgroup_cpu_create(qgroup, i, cpu, pri);
 
 		for (k = 0; k < stride; k++)
 			cpu = CPU_NEXT(cpu);
@@ -1001,12 +1001,12 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 }
 
 int
-taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
+taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri)
 {
 	int error;
 
 	mtx_lock(&qgroup->tqg_lock);
-	error = _taskqgroup_adjust(qgroup, cnt, stride);
+	error = _taskqgroup_adjust(qgroup, cnt, stride, pri);
 	mtx_unlock(&qgroup->tqg_lock);
 
 	return (error);
diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index a9fc8090c48a..1163425bf15a 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/types.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
+#include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
@@ -560,8 +561,8 @@ MODULE_VERSION(iflib, 1);
 MODULE_DEPEND(iflib, pci, 1, 1, 1);
 MODULE_DEPEND(iflib, ether, 1, 1, 1);
 
-TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
-TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
+TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1, PI_SWI(SWI_NET));
+TASKQGROUP_DEFINE(if_config_tqg, 1, 1, PI_SWI(SWI_NET));
 
 #ifndef IFLIB_DEBUG_COUNTERS
 #ifdef INVARIANTS
diff --git a/sys/sys/gtaskqueue.h b/sys/sys/gtaskqueue.h
index 96fd57dfb76d..a57bcfc1881a 100644
--- a/sys/sys/gtaskqueue.h
+++ b/sys/sys/gtaskqueue.h
@@ -79,7 +79,8 @@ int	taskqgroup_attach_cpu(struct taskqgroup *qgroup,
 void	taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask);
 struct taskqgroup *taskqgroup_create(const char *name);
 void	taskqgroup_destroy(struct taskqgroup *qgroup);
-int	taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride);
+int	taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride,
+	    int pri);
 void	taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask,
 	    gtask_fn_t *fn, const char *name);
 void	taskqgroup_config_gtask_deinit(struct grouptask *gtask);
@@ -100,7 +101,7 @@ void	taskqgroup_config_gtask_deinit(struct grouptask *gtask);
 #define TASKQGROUP_DECLARE(name)			\
 extern struct taskqgroup *qgroup_##name
 
-#define TASKQGROUP_DEFINE(name, cnt, stride)				\
+#define TASKQGROUP_DEFINE(name, cnt, stride, pri)			\
 									\
 struct taskqgroup *qgroup_##name;					\
 									\
@@ -116,7 +117,7 @@ SYSINIT(taskqgroup_##name, SI_SUB_TASKQ, SI_ORDER_FIRST,		\
 static void								\
 taskqgroup_adjust_##name(void *arg)					\
 {									\
-	taskqgroup_adjust(qgroup_##name, (cnt), (stride));		\
+	taskqgroup_adjust(qgroup_##name, (cnt), (stride), (pri));	\
 }									\
 									\
 SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY,		\


