Date:      Wed, 11 Mar 2020 16:32:40 -0700
From:      Eric Joyner <erj@freebsd.org>
To:        Mark Johnston <markj@freebsd.org>
Cc:        freebsd-net@freebsd.org, Hans Petter Selasky <hps@selasky.org>,  John Baldwin <jhb@freebsd.org>, shurd <shurd@freebsd.org>,  Drew Gallatin <gallatin@netflix.com>, Gleb Smirnoff <glebius@freebsd.org>
Subject:   Re: Issue with epoch_drain_callbacks and unloading iavf(4) [using iflib]
Message-ID:  <CAKdFRZjdiz_axuweksNUHis7jPKXHqOmhQg+QWzpVnsKY+crmg@mail.gmail.com>
In-Reply-To: <20200212222219.GE83892@raichu>
References:  <CAKdFRZjxp=mTkUzFU8qsacP86OQOC9vCDCQ+O2iF7svRRGDK8w@mail.gmail.com> <0e2e97f2-df75-3c6f-9bdd-e8c2ab7bf79e@selasky.org> <CAKdFRZi3UoRuz=OXnBG=NVcJe605x9OwrLmdCyD98mDeTpbf0Q@mail.gmail.com> <a6523ed6-9d61-d1b4-5822-5787cf5c0e43@selasky.org> <20200130030911.GA15281@spy> <CA+b0zg-1CQ81dsNGv_O3ebLLko6Piei0A1NCPZUT5JH8EOyntw@mail.gmail.com> <CA+b0zg809EGMS1Ngr38BSb1yNpDqxbCnAv9eC+cDwbMQ5t+qXQ@mail.gmail.com> <20200212222219.GE83892@raichu>

On Wed, Feb 12, 2020 at 2:22 PM Mark Johnston <markj@freebsd.org> wrote:

> On Mon, Feb 10, 2020 at 01:30:47PM -0800, Eric Joyner wrote:
> > On Fri, Jan 31, 2020 at 11:44 AM Eric Joyner <erj@freebsd.org> wrote:
> > > Hi Mark,
> > >
> > > I applied your patch to a FreeBSD-current VM and reran the test I was
> > > running originally, and I still see the same behavior.
> > >
> > > Your patch doesn't appear to fix the problem.
> > >
> > > - Eric
> > >
> >
> >   *bump*
>
> Sorry, I had missed this.  I think this implies that even the epoch
> callback tasks are not getting a chance to run, which makes sense: all
> grouptask threads run at PI_SOFT, a realtime priority, so the iflib and
> epoch callback task threads are all running at the same priority.
> Unless iflib voluntarily yields the CPU, it'll keep getting scheduled.
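>
> To make that concrete: FreeBSD priorities are "lower number wins", and
> with the patch below the epoch callback tasks stay at PI_SOFT while
> iflib moves to PI_SWI(SWI_NET), which is numerically larger.  A sketch
> of the relationship (not new code; both values come from
> sys/priority.h):
>
>         PI_SOFT            /* epoch callback tasks; all grouptasks today */
>         PI_SWI(SWI_NET)    /* iflib after this patch; slightly lower priority */
>
> Both remain in the realtime interrupt-thread range, so ordinary
> timeshare threads still never preempt them.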
>
> Here's an updated patch which also allows the priority of group task
> threads to be specified.  I don't think it makes much sense to have
> more than one per-CPU group task in the system without that parameter.
> With this, epoch callbacks should run at a slightly higher priority
> than iflib.
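>
> For illustration, a caller now passes the priority as a new fourth
> argument.  A hypothetical consumer (the real conversions are in the
> patch below) would look like:
>
>         TASKQGROUP_DEFINE(example_tqg, mp_ncpus, 1, PI_SWI(SWI_NET));
>         /* or, when resizing an already-defined group: */
>         taskqgroup_adjust(qgroup_example_tqg, mp_ncpus, 1, PI_SWI(SWI_NET));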
>
> The combined patch also fixes an annoying problem where interface
> destruction can appear to hang for a short period (~0.5s).  This is
> because epoch callbacks are scheduled off hardclock, but hardclock
> interrupts run at a very low frequency on an idle CPU.
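>
> In other words, the old drain path queued a callback on each CPU and
> then had to wait for hardclock to notice the pending work, while the
> new path below kicks each per-CPU callback task directly.  Simplified
> from the patch:
>
>         /* before: callback sits until hardclock schedules epoch_cb_task */
>         epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
>
>         /* after: flag the record and run the callback task immediately */
>         atomic_store_int(&er->er_drain_state, EPOCH_DRAIN_START);
>         GROUPTASK_ENQUEUE(DPCPU_ID_PTR(cpu, epoch_cb_task));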
>
> diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
> index 0a477e1d6f7b..4ed7e7e11a3e 100644
> --- a/sys/kern/subr_epoch.c
> +++ b/sys/kern/subr_epoch.c
> @@ -74,15 +74,19 @@ typedef struct epoch_record {
>         volatile struct epoch_tdlist er_tdlist;
>         volatile uint32_t er_gen;
>         uint32_t er_cpuid;
> +       int er_drain_state;
>  } __aligned(EPOCH_ALIGN)     *epoch_record_t;
>
> +#define        EPOCH_DRAIN_START       2
> +#define        EPOCH_DRAIN_RUNNING     1
> +#define        EPOCH_DRAIN_DONE        0
> +
>  struct epoch {
>         struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
>         epoch_record_t e_pcpu_record;
>         int     e_idx;
>         int     e_flags;
>         struct sx e_drain_sx;
> -       struct mtx e_drain_mtx;
>         volatile int e_drain_count;
>         const char *e_name;
>  };
> @@ -335,7 +339,6 @@ epoch_alloc(const char *name, int flags)
>         epoch->e_idx = epoch_count;
>         epoch->e_name = name;
>         sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
> -       mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
>         allepochs[epoch_count++] = epoch;
>         return (epoch);
>  }
> @@ -348,7 +351,6 @@ epoch_free(epoch_t epoch)
>         allepochs[epoch->e_idx] = NULL;
>         epoch_wait(global_epoch);
>         uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
> -       mtx_destroy(&epoch->e_drain_mtx);
>         sx_destroy(&epoch->e_drain_sx);
>         free(epoch, M_EPOCH);
>  }
> @@ -699,14 +701,24 @@ epoch_call_task(void *arg __unused)
>         epoch_t epoch;
>         ck_stack_t cb_stack;
>         int i, npending, total;
> +       bool draining;
> +
> +       KASSERT(curthread->td_pinned > 0,
> +           ("%s: callback task thread is not pinned", __func__));
>
>         ck_stack_init(&cb_stack);
>         critical_enter();
>         epoch_enter(global_epoch);
> -       for (total = i = 0; i < epoch_count; i++) {
> +       for (total = i = 0, draining = false; i < epoch_count; i++) {
>                 if (__predict_false((epoch = allepochs[i]) == NULL))
>                         continue;
>                 er = epoch_currecord(epoch);
> +               if (atomic_load_int(&er->er_drain_state) == EPOCH_DRAIN_START) {
> +                       atomic_store_int(&er->er_drain_state,
> +                           EPOCH_DRAIN_RUNNING);
> +                       draining = true;
> +               }
> +
>                 record = &er->er_record;
>                 if ((npending = record->n_pending) == 0)
>                         continue;
> @@ -728,6 +740,20 @@ epoch_call_task(void *arg __unused)
>                 next = CK_STACK_NEXT(cursor);
>                 entry->function(entry);
>         }
> +
> +       if (__predict_false(draining)) {
> +               epoch_enter(global_epoch);
> +               for (i = 0; i < epoch_count; i++) {
> +                       if (__predict_false((epoch = allepochs[i]) == NULL))
> +                               continue;
> +                       er = epoch_currecord(epoch);
> +                       if (atomic_load_int(&er->er_drain_state) ==
> +                           EPOCH_DRAIN_RUNNING)
> +                               atomic_store_int(&er->er_drain_state,
> +                                   EPOCH_DRAIN_DONE);
> +               }
> +               epoch_exit(global_epoch);
> +       }
>  }
>
>  int
> @@ -769,27 +795,18 @@ in_epoch(epoch_t epoch)
>  }
>
>  static void
> -epoch_drain_cb(struct epoch_context *ctx)
> +epoch_drain_handler(struct ck_epoch *global __unused,
> +    ck_epoch_record_t *cr __unused, void *arg __unused)
>  {
> -       struct epoch *epoch =
> -           __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
> -
> -       if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
> -               mtx_lock(&epoch->e_drain_mtx);
> -               wakeup(epoch);
> -               mtx_unlock(&epoch->e_drain_mtx);
> -       }
> +       maybe_yield();
>  }
>
>  void
>  epoch_drain_callbacks(epoch_t epoch)
>  {
>         epoch_record_t er;
> -       struct thread *td;
> -       int was_bound;
> -       int old_pinned;
> -       int old_cpu;
> -       int cpu;
> +       int cpu, state;
> +       bool pending;
>
>         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
>             "epoch_drain_callbacks() may sleep!");
> @@ -802,45 +819,28 @@ epoch_drain_callbacks(epoch_t epoch)
>                 return;
>  #endif
>         DROP_GIANT();
> -
>         sx_xlock(&epoch->e_drain_sx);
> -       mtx_lock(&epoch->e_drain_mtx);
>
> -       td = curthread;
> -       thread_lock(td);
> -       old_cpu = PCPU_GET(cpuid);
> -       old_pinned = td->td_pinned;
> -       was_bound = sched_is_bound(td);
> -       sched_unbind(td);
> -       td->td_pinned = 0;
> +       /* Make sure that all pending callbacks are available. */
> +       ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_drain_handler, NULL);
>
> -       CPU_FOREACH(cpu)
> -               epoch->e_drain_count++;
>         CPU_FOREACH(cpu) {
>                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
> -               sched_bind(td, cpu);
> -               epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
> +               atomic_store_int(&er->er_drain_state, EPOCH_DRAIN_START);
> +               GROUPTASK_ENQUEUE(DPCPU_ID_PTR(cpu, epoch_cb_task));
>         }
>
> -       /* restore CPU binding, if any */
> -       if (was_bound != 0) {
> -               sched_bind(td, old_cpu);
> -       } else {
> -               /* get thread back to initial CPU, if any */
> -               if (old_pinned != 0)
> -                       sched_bind(td, old_cpu);
> -               sched_unbind(td);
> -       }
> -       /* restore pinned after bind */
> -       td->td_pinned = old_pinned;
> -
> -       thread_unlock(td);
> -
> -       while (epoch->e_drain_count != 0)
> -               msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
> +       do {
> +               pending = false;
> +               CPU_FOREACH(cpu) {
> +                       er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
> +                       state = atomic_load_int(&er->er_drain_state);
> +                       if (state != EPOCH_DRAIN_DONE)
> +                               pending = true;
> +               }
> +               pause("edrain", 1);
> +       } while (pending);
>
> -       mtx_unlock(&epoch->e_drain_mtx);
>         sx_xunlock(&epoch->e_drain_sx);
> -
>         PICKUP_GIANT();
>  }
> diff --git a/sys/kern/subr_gtaskqueue.c b/sys/kern/subr_gtaskqueue.c
> index 9bf4b0ca3ad3..f2e7f74325fa 100644
> --- a/sys/kern/subr_gtaskqueue.c
> +++ b/sys/kern/subr_gtaskqueue.c
> @@ -54,8 +54,8 @@ static void   gtaskqueue_thread_loop(void *arg);
>  static int     task_is_running(struct gtaskqueue *queue, struct gtask *gtask);
>  static void    gtaskqueue_drain_locked(struct gtaskqueue *queue, struct gtask *gtask);
>
> -TASKQGROUP_DEFINE(softirq, mp_ncpus, 1);
> -TASKQGROUP_DEFINE(config, 1, 1);
> +TASKQGROUP_DEFINE(softirq, mp_ncpus, 1, PI_SOFT);
> +TASKQGROUP_DEFINE(config, 1, 1, PI_SOFT);
>
>  struct gtaskqueue_busy {
>         struct gtask            *tb_running;
> @@ -612,7 +612,7 @@ struct taskq_bind_task {
>  };
>
>  static void
> -taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
> +taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu, int pri)
>  {
>         struct taskqgroup_cpu *qcpu;
>
> @@ -620,7 +620,7 @@ taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
>         LIST_INIT(&qcpu->tgc_tasks);
>         qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
>             taskqueue_thread_enqueue, &qcpu->tgc_taskq);
> -       gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
> +       gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, pri,
>             "%s_%d", qgroup->tqg_name, idx);
>         qcpu->tgc_cpu = cpu;
>  }
> @@ -900,7 +900,7 @@ taskqgroup_config_init(void *arg)
>         LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
>             grouptask, gt_list);
>         qgroup->tqg_queue[0].tgc_cnt = 0;
> -       taskqgroup_cpu_create(qgroup, 0, 0);
> +       taskqgroup_cpu_create(qgroup, 0, 0, PI_SOFT);
>
>         qgroup->tqg_cnt = 1;
>         qgroup->tqg_stride = 1;
> @@ -910,7 +910,7 @@ SYSINIT(taskqgroup_config_init, SI_SUB_TASKQ, SI_ORDER_SECOND,
>         taskqgroup_config_init, NULL);
>
>  static int
> -_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
> +_taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri)
>  {
>         LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
>         struct grouptask *gtask;
> @@ -948,7 +948,7 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
>          */
>         cpu = old_cpu;
>         for (i = old_cnt; i < cnt; i++) {
> -               taskqgroup_cpu_create(qgroup, i, cpu);
> +               taskqgroup_cpu_create(qgroup, i, cpu, pri);
>
>                 for (k = 0; k < stride; k++)
>                         cpu = CPU_NEXT(cpu);
> @@ -1001,12 +1001,12 @@ _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
>  }
>
>  int
> -taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
> +taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride, int pri)
>  {
>         int error;
>
>         mtx_lock(&qgroup->tqg_lock);
> -       error = _taskqgroup_adjust(qgroup, cnt, stride);
> +       error = _taskqgroup_adjust(qgroup, cnt, stride, pri);
>         mtx_unlock(&qgroup->tqg_lock);
>
>         return (error);
> diff --git a/sys/net/iflib.c b/sys/net/iflib.c
> index a9fc8090c48a..1163425bf15a 100644
> --- a/sys/net/iflib.c
> +++ b/sys/net/iflib.c
> @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/types.h>
>  #include <sys/bus.h>
>  #include <sys/eventhandler.h>
> +#include <sys/interrupt.h>
>  #include <sys/kernel.h>
>  #include <sys/lock.h>
>  #include <sys/mutex.h>
> @@ -560,8 +561,8 @@ MODULE_VERSION(iflib, 1);
>  MODULE_DEPEND(iflib, pci, 1, 1, 1);
>  MODULE_DEPEND(iflib, ether, 1, 1, 1);
>
> -TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
> -TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
> +TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1, PI_SWI(SWI_NET));
> +TASKQGROUP_DEFINE(if_config_tqg, 1, 1, PI_SWI(SWI_NET));
>
>  #ifndef IFLIB_DEBUG_COUNTERS
>  #ifdef INVARIANTS
> diff --git a/sys/sys/gtaskqueue.h b/sys/sys/gtaskqueue.h
> index 96fd57dfb76d..a57bcfc1881a 100644
> --- a/sys/sys/gtaskqueue.h
> +++ b/sys/sys/gtaskqueue.h
> @@ -79,7 +79,8 @@ int   taskqgroup_attach_cpu(struct taskqgroup *qgroup,
>  void   taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask);
>  struct taskqgroup *taskqgroup_create(const char *name);
>  void   taskqgroup_destroy(struct taskqgroup *qgroup);
> -int    taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride);
> +int    taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride,
> +           int pri);
>  void   taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask,
>             gtask_fn_t *fn, const char *name);
>  void   taskqgroup_config_gtask_deinit(struct grouptask *gtask);
> @@ -100,7 +101,7 @@ void        taskqgroup_config_gtask_deinit(struct grouptask *gtask);
>  #define TASKQGROUP_DECLARE(name)                       \
>  extern struct taskqgroup *qgroup_##name
>
> -#define TASKQGROUP_DEFINE(name, cnt, stride)                           \
> +#define TASKQGROUP_DEFINE(name, cnt, stride, pri)                      \
>                                                                         \
>  struct taskqgroup *qgroup_##name;                                      \
>                                                                         \
> @@ -116,7 +117,7 @@ SYSINIT(taskqgroup_##name, SI_SUB_TASKQ, SI_ORDER_FIRST,            \
>  static void                                                            \
>  taskqgroup_adjust_##name(void *arg)                                    \
>  {                                                                      \
> -       taskqgroup_adjust(qgroup_##name, (cnt), (stride));              \
> +       taskqgroup_adjust(qgroup_##name, (cnt), (stride), (pri));       \
>  }                                                                      \
>                                                                         \
>  SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY,               \
>

Mark,

I did get some time to come back and retry this; however, your second
patch still doesn't solve the problem.  Looking into it a bit, it
appears the kldunload process never reaches the code you've changed: it
hangs in epoch_wait_preempt() in if_detach_internal(), which runs
immediately before epoch_drain_callbacks().
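
Roughly, the ordering looks like this (a simplified sketch of
if_detach_internal() reconstructed from the backtrace below, not the
exact if.c source):

        /* detach path, per frames #6-#7 of the backtrace */
        epoch_wait_preempt(net_epoch_preempt);     /* if.c:1123 -- hangs here */
        /* ... */
        epoch_drain_callbacks(net_epoch_preempt);  /* your patched code; never reached */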

I did a kernel dump while it was hanging, and this is the backtrace for the
kldunload process:

#0  sched_switch (td=0xfffffe00da78b700, flags=<optimized out>) at /usr/src/sys/kern/sched_ule.c:2147
#1  0xffffffff80ba2419 in mi_switch (flags=256) at /usr/src/sys/kern/kern_synch.c:533
#2  0xffffffff80bc749c in sched_bind (td=0xfffffe00da78b700, cpu=31) at /usr/src/sys/kern/sched_ule.c:2729
#3  0xffffffff80bdaf5b in epoch_block_handler_preempt (global=<optimized out>, cr=0xfffffe0036387b00, arg=<optimized out>) at /usr/src/sys/kern/subr_epoch.c:516
#4  0xffffffff803b602d in epoch_block (global=0xfffff80003be2680, cr=<optimized out>, cb=<unavailable>, ct=<unavailable>) at /usr/src/sys/contrib/ck/src/ck_epoch.c:416
#5  ck_epoch_synchronize_wait (global=0xfffff80003be2680, cb=<optimized out>, ct=<optimized out>) at /usr/src/sys/contrib/ck/src/ck_epoch.c:465
#6  0xffffffff80bdaabe in epoch_wait_preempt (epoch=0xfffff80003be2680) at /usr/src/sys/kern/subr_epoch.c:628
#7  0xffffffff80ca0fea in if_detach_internal (ifp=0xfffff80074893800, vmove=<optimized out>, ifcp=0x0) at /usr/src/sys/net/if.c:1123
#8  0xffffffff80ca0dad in if_detach (ifp=<unavailable>) at /usr/src/sys/net/if.c:1063
#9  0xffffffff80cbcc46 in iflib_device_deregister (ctx=0xfffff800711f0000) at /usr/src/sys/net/iflib.c:5127
#10 0xffffffff80bcd95e in DEVICE_DETACH (dev=0xfffff80004388700) at ./device_if.h:234
#11 device_detach (dev=0xfffff80004388700) at /usr/src/sys/kern/subr_bus.c:3049
#12 0xffffffff80bccefd in devclass_driver_deleted (busclass=0xfffff80003f67d80, dc=0xfffff8000d1bc280, driver=0xffffffff8232e9e0 <i40e_write_nvm_aq+80>) at /usr/src/sys/kern/subr_bus.c:1235
#13 0xffffffff80bccddf in devclass_delete_driver (busclass=0xfffff80003f67d80, driver=0xffffffff8232e9e0 <i40e_write_nvm_aq+80>) at /usr/src/sys/kern/subr_bus.c:1310
#14 0xffffffff80bd2ddc in driver_module_handler (mod=0xfffff8000db94a00, what=1, arg=0xffffffff8232e9b0 <i40e_write_nvm_aq+32>) at /usr/src/sys/kern/subr_bus.c:5229
#15 0xffffffff80b72ea2 in module_unload (mod=0xfffff8000db94a00) at /usr/src/sys/kern/kern_module.c:261
#16 0xffffffff80b63b8b in linker_file_unload (file=0xfffff8000d833c00, flags=<optimized out>) at /usr/src/sys/kern/kern_linker.c:700
#17 0xffffffff80b64f7d in kern_kldunload (td=<optimized out>, fileid=4, flags=0) at /usr/src/sys/kern/kern_linker.c:1153
#18 0xffffffff8103b8dd in syscallenter (td=<optimized out>) at /usr/src/sys/amd64/amd64/../../kern/subr_syscall.c:163
#19 amd64_syscall (td=0xfffffe00da78b700, traced=0) at /usr/src/sys/amd64/amd64/trap.c:1162
#20 <signal handler called>
#21 0x00000008002de6da in ?? ()

- Eric