Date:      Mon, 30 Jul 2012 16:17:21 +0200
From:      Davide Italiano <davide@freebsd.org>
To:        attilio@freebsd.org
Cc:        svn-src-projects@freebsd.org, src-committers@freebsd.org
Subject:   Re: svn commit: r238907 - projects/calloutng/sys/kern
Message-ID:  <CACYV=-HmOwZ=E8Pw3-mUw0994SbvZaA3eMfcwM0fDTu_zykBJg@mail.gmail.com>
In-Reply-To: <CAJ-FndBJNNBNDUEDsDBUvwoVExZpnXmoJmpY58gE3QQbw3hRGA@mail.gmail.com>
References:  <201207301350.q6UDobCI099069@svn.freebsd.org> <CAJ-FndBJNNBNDUEDsDBUvwoVExZpnXmoJmpY58gE3QQbw3hRGA@mail.gmail.com>

On Mon, Jul 30, 2012 at 4:02 PM, Attilio Rao <attilio@freebsd.org> wrote:
> On 7/30/12, Davide Italiano <davide@freebsd.org> wrote:
>> Author: davide
>> Date: Mon Jul 30 13:50:37 2012
>> New Revision: 238907
>> URL: http://svn.freebsd.org/changeset/base/238907
>>
>> Log:
>>   - Fix a LOR deadlock dropping the callout lock while executing the
>>     handler directly from hardware interrupt context.
>>
>>   - Add migration support for callouts which run from hw interrupt
>>     context.
>>
>>   - Refactor a couple of comments to reflect the new world order.
>>
>>   TODO: implement proper stats for direct callouts.
>>
>>   Just a note: I'd like to thank flo@ for his invaluable help in testing
>>   and reporting issues, mav@ who helped me in tackling the problem, and
>>   Yahoo! which provided access to the machines on which tests were run.
>>
>>   Reported by:                avg, flo [1]
>>   Discussed with:             mav
>>
>> Modified:
>>   projects/calloutng/sys/kern/kern_timeout.c
>>
>> Modified: projects/calloutng/sys/kern/kern_timeout.c
>> ==============================================================================
>> --- projects/calloutng/sys/kern/kern_timeout.c        Mon Jul 30 12:25:20 2012  (r238906)
>> +++ projects/calloutng/sys/kern/kern_timeout.c        Mon Jul 30 13:50:37 2012  (r238907)
>> @@ -91,45 +91,62 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpca
>>  int callwheelsize, callwheelmask;
>>
>>  /*
>> - * The callout cpu migration entity represents informations necessary for
>> - * describing the migrating callout to the new callout cpu.
>> + * The callout cpu exec entities represent informations necessary for
>> + * describing the state of callouts currently running on the CPU and the ones
>> + * necessary for migrating callouts to the new callout cpu. In particular,
>> + * the first entry of the array cc_exec_entity holds informations for callout
>> + * running in SWI thread context, while the second one holds informations
>> + * for callout running directly from hardware interrupt context.
>>   * The cached informations are very important for deferring migration when
>>   * the migrating callout is already running.
>>   */
>> -struct cc_mig_ent {
>> +struct cc_exec {
>> +     struct callout          *cc_next;
>> +     struct callout          *cc_curr;
>>  #ifdef SMP
>>       void                    (*ce_migration_func)(void *);
>>       void                    *ce_migration_arg;
>>       int                     ce_migration_cpu;
>>       struct bintime          ce_migration_time;
>>  #endif
>> +     int                     cc_cancel;
>> +     int                     cc_waiting;
>>  };
>>
>>  /*
>> - * There is one struct callout_cpu per cpu, holding all relevant
>> + * There is one struct callou_cpu per cpu, holding all relevant
>>   * state for the callout processing thread on the individual CPU.
>>   */
>>  struct callout_cpu {
>> -     struct cc_mig_ent       cc_migrating_entity;
>> +     struct cc_exec          cc_exec_entity[2];
>>       struct mtx              cc_lock;
>>       struct callout          *cc_callout;
>>       struct callout_tailq    *cc_callwheel;
>>       struct callout_tailq    cc_expireq;
>>       struct callout_list     cc_callfree;
>> -     struct callout          *cc_next;
>> -     struct callout          *cc_curr;
>>       struct bintime          cc_firstevent;
>>       struct bintime          cc_lastscan;
>>       void                    *cc_cookie;
>> -     int                     cc_cancel;
>> -     int                     cc_waiting;
>>  };
>>
>> +#define      cc_exec_curr            cc_exec_entity[0].cc_curr
>> +#define      cc_exec_next            cc_exec_entity[0].cc_next
>> +#define      cc_exec_cancel          cc_exec_entity[0].cc_cancel
>> +#define      cc_exec_waiting         cc_exec_entity[0].cc_waiting
>> +#define      cc_exec_curr_dir        cc_exec_entity[1].cc_curr
>> +#define      cc_exec_next_dir        cc_exec_entity[1].cc_next
>> +#define      cc_exec_cancel_dir      cc_exec_entity[1].cc_cancel
>> +#define      cc_exec_waiting_dir     cc_exec_entity[1].cc_waiting
>> +
>>  #ifdef SMP
>> -#define      cc_migration_func       cc_migrating_entity.ce_migration_func
>> -#define      cc_migration_arg        cc_migrating_entity.ce_migration_arg
>> -#define      cc_migration_cpu        cc_migrating_entity.ce_migration_cpu
>> -#define      cc_migration_time       cc_migrating_entity.ce_migration_time
>> +#define      cc_migration_func       cc_exec_entity[0].ce_migration_func
>> +#define      cc_migration_arg        cc_exec_entity[0].ce_migration_arg
>> +#define      cc_migration_cpu        cc_exec_entity[0].ce_migration_cpu
>> +#define      cc_migration_time       cc_exec_entity[0].ce_migration_time
>> +#define      cc_migration_func_dir   cc_exec_entity[1].ce_migration_func
>> +#define      cc_migration_arg_dir    cc_exec_entity[1].ce_migration_arg
>> +#define      cc_migration_cpu_dir    cc_exec_entity[1].ce_migration_cpu
>> +#define      cc_migration_time_dir   cc_exec_entity[1].ce_migration_time
>>
>>  struct callout_cpu cc_cpu[MAXCPU];
>>  #define      CPUBLOCK        MAXCPU
>> @@ -156,6 +173,9 @@ struct callout_cpu cc_cpu;
>>
>>  static int timeout_cpu;
>>  void (*callout_new_inserted)(int cpu, struct bintime bt) = NULL;
>> +static struct callout *
>> +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
>> +    int *lockcalls, int *gcalls, int direct);
>>
>>  static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
>>
>> @@ -180,15 +200,18 @@ static MALLOC_DEFINE(M_CALLOUT, "callout
>>   * Resets the migration entity tied to a specific callout cpu.
>>   */
>>  static void
>> -cc_cme_cleanup(struct callout_cpu *cc)
>> +cc_cme_cleanup(struct callout_cpu *cc, int direct)
>>  {
>> -
>> +
>> +     cc->cc_exec_entity[direct].cc_curr = NULL;
>> +     cc->cc_exec_entity[direct].cc_next = NULL;
>> +     cc->cc_exec_entity[direct].cc_cancel = 0;
>> +     cc->cc_exec_entity[direct].cc_waiting = 0;
>>  #ifdef SMP
>> -     cc->cc_migration_cpu = CPUBLOCK;
>> -     cc->cc_migration_time.sec = 0;
>> -     cc->cc_migration_time.frac = 0;
>> -     cc->cc_migration_func = NULL;
>> -     cc->cc_migration_arg = NULL;
>> +     cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
>> +     bintime_clear(&cc->cc_exec_entity[direct].ce_migration_time);
>> +     cc->cc_exec_entity[direct].ce_migration_func = NULL;
>> +     cc->cc_exec_entity[direct].ce_migration_arg = NULL;
>>  #endif
>>  }
>>
>> @@ -196,11 +219,12 @@ cc_cme_cleanup(struct callout_cpu *cc)
>>   * Checks if migration is requested by a specific callout cpu.
>>   */
>>  static int
>> -cc_cme_migrating(struct callout_cpu *cc)
>> +cc_cme_migrating(struct callout_cpu *cc, int direct)
>>  {
>>
>>  #ifdef SMP
>> -     return (cc->cc_migration_cpu != CPUBLOCK);
>> +
>> +     return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
>>  #else
>>       return (0);
>>  #endif
>> @@ -246,7 +270,8 @@ callout_cpu_init(struct callout_cpu *cc)
>>               TAILQ_INIT(&cc->cc_callwheel[i]);
>>       }
>>       TAILQ_INIT(&cc->cc_expireq);
>> -     cc_cme_cleanup(cc);
>> +     for (i = 0; i < 2; i++)
>> +             cc_cme_cleanup(cc, i);
>>       if (cc->cc_callout == NULL)
>>               return;
>>       for (i = 0; i < ncallout; i++) {
>> @@ -355,7 +380,8 @@ callout_process(struct bintime *now)
>>       struct callout *tmp;
>>       struct callout_cpu *cc;
>>       struct callout_tailq *sc;
>> -     int cpu, first, future, last, need_softclock;
>> +     int cpu, first, future, gcalls, mpcalls, last, lockcalls,
>> +         need_softclock;
>>
>>       /*
>>        * Process callouts at a very low cpu priority, so we don't keep the
>> @@ -376,7 +402,8 @@ callout_process(struct bintime *now)
>>       first &= callwheelmask;
>>       for (;;) {
>>               sc = &cc->cc_callwheel[first];
>> -             TAILQ_FOREACH(tmp, sc, c_links.tqe) {
>> +             tmp = TAILQ_FIRST(sc);
>> +             while (tmp != NULL) {
>>                       next = tmp->c_time;
>>                       bintime_sub(&next, &tmp->c_precision);
>>                       if (bintime_cmp(&next, now, <=)) {
>> @@ -385,17 +412,20 @@ callout_process(struct bintime *now)
>>                                * directly from hardware interrupt context.
>>                                */
>>                               if (tmp->c_flags & CALLOUT_DIRECT) {
>> -                                     tmp->c_func(tmp->c_arg);
>>                                       TAILQ_REMOVE(sc, tmp, c_links.tqe);
>> -                                     tmp->c_flags &= ~CALLOUT_PENDING;
>> +                                     tmp = softclock_call_cc(tmp, cc,
>> +                                         &mpcalls, &lockcalls, &gcalls, 1);
>>                               } else {
>>                                       TAILQ_INSERT_TAIL(&cc->cc_expireq,
>>                                           tmp, c_staiter);
>>                                       TAILQ_REMOVE(sc, tmp, c_links.tqe);
>>                                       tmp->c_flags |= CALLOUT_PROCESSED;
>>                                       need_softclock = 1;
>> +                                     tmp = TAILQ_NEXT(tmp, c_links.tqe);
>>                               }
>>                       }
>> +                     else
>> +                             tmp = TAILQ_NEXT(tmp, c_links.tqe);
>>               }
>>               if (first == last)
>>                       break;
>> @@ -561,11 +591,12 @@ callout_cc_add(struct callout *c, struct
>>  }
>>
>>  static void
>> -callout_cc_del(struct callout *c, struct callout_cpu *cc)
>> +callout_cc_del(struct callout *c, struct callout_cpu *cc, int direct)
>>  {
>> -
>> -     if (cc->cc_next == c)
>> -             cc->cc_next = TAILQ_NEXT(c, c_staiter);
>> +     if (direct && cc->cc_exec_next_dir == c)
>> +             cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>> +     else if (!direct && cc->cc_exec_next == c)
>> +             cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>>       if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
>>               c->c_func = NULL;
>>               SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
>> @@ -574,13 +605,13 @@ callout_cc_del(struct callout *c, struct
>>
>>  static struct callout *
>>  softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
>> -    int *lockcalls, int *gcalls)
>> +    int *lockcalls, int *gcalls, int direct)
>>  {
>>       void (*c_func)(void *);
>>       void *c_arg;
>>       struct lock_class *class;
>>       struct lock_object *c_lock;
>> -     int c_flags, sharedlock;
>> +     int c_flags, flags, sharedlock;
>>  #ifdef SMP
>>       struct callout_cpu *new_cc;
>>       void (*new_func)(void *);
>> @@ -595,7 +626,14 @@ softclock_call_cc(struct callout *c, str
>>       static timeout_t *lastfunc;
>>  #endif
>>
>> -     cc->cc_next = TAILQ_NEXT(c, c_staiter);
>> +     if (direct)
>> +             cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>> +     else {
>> +             if ((c->c_flags & CALLOUT_PROCESSED) == 0)
>> +                     cc->cc_exec_next = TAILQ_NEXT(c, c_links.tqe);
>> +             else
>> +                     cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>> +     }
>>       class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
>>       sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
>>       c_lock = c->c_lock;
>> @@ -606,8 +644,8 @@ softclock_call_cc(struct callout *c, str
>>               c->c_flags = CALLOUT_LOCAL_ALLOC;
>>       else
>>               c->c_flags &= ~CALLOUT_PENDING;
>> -     cc->cc_curr = c;
>> -     cc->cc_cancel = 0;
>> +     cc->cc_exec_entity[direct].cc_curr = c;
>> +     cc->cc_exec_entity[direct].cc_cancel = 0;
>>       CC_UNLOCK(cc);
>>       if (c_lock != NULL) {
>>               class->lc_lock(c_lock, sharedlock);
>> @@ -615,13 +653,12 @@ softclock_call_cc(struct callout *c, str
>>                * The callout may have been cancelled
>>                * while we switched locks.
>>                */
>> -             if (cc->cc_cancel) {
>> +             if (cc->cc_exec_entity[direct].cc_cancel) {
>>                       class->lc_unlock(c_lock);
>>                       goto skip;
>>               }
>>               /* The callout cannot be stopped now. */
>> -             cc->cc_cancel = 1;
>> -
>> +             cc->cc_exec_entity[direct].cc_cancel = 1;
>>               if (c_lock == &Giant.lock_object) {
>>                       (*gcalls)++;
>>                       CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
>> @@ -639,11 +676,13 @@ softclock_call_cc(struct callout *c, str
>>  #ifdef DIAGNOSTIC
>>       binuptime(&bt1);
>>  #endif
>> -     THREAD_NO_SLEEPING();
>> +     if (!direct)
>> +             THREAD_NO_SLEEPING();
>>       SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
>>       c_func(c_arg);
>>       SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
>> -     THREAD_SLEEPING_OK();
>> +     if (!direct)
>> +             THREAD_SLEEPING_OK();
>
> I didn't look into the full patch, however I have a question about
> this: was it necessary because now this is running in interrupt
> context so it can catch the nested sleeping check  case? Or there is
> something else?
> I think it would be a good thing to keep up THREAD_NO_SLEEPING() also
> in the direct dispatch case.
>
> Attilio
>
>
> --
> Peace can only be achieved by understanding - A. Einstein

Thanks for the comment, Attilio.
Yes, it's exactly what you thought. If the direct flag is equal to one,
you know you're processing a callout that runs directly from hardware
interrupt context. In this case the running thread cannot sleep, and it
very likely already has the TDP_NOSLEEPING flag set, so calling
THREAD_NO_SLEEPING() again would fail its KASSERT() and panic a kernel
compiled with INVARIANTS.
When running from SWI context (direct equal to zero) the code remains
the same as before.
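For reference, here is a simplified sketch of the mechanism involved
(modelled on the TDP_NOSLEEPING bracket in sys/proc.h; the exact
assertion messages may differ):

    /*
     * THREAD_NO_SLEEPING() asserts that the flag is not already set
     * before setting it. When the handler is dispatched directly from
     * hardware interrupt context, the interrupted thread may already
     * have TDP_NOSLEEPING set, so a second call trips the KASSERT().
     */
    #define THREAD_NO_SLEEPING() do {                                 \
            KASSERT(!(curthread->td_pflags & TDP_NOSLEEPING),         \
                ("nested no sleeping"));                              \
            curthread->td_pflags |= TDP_NOSLEEPING;                   \
    } while (0)

    #define THREAD_SLEEPING_OK() do {                                 \
            KASSERT((curthread->td_pflags & TDP_NOSLEEPING),          \
                ("nested sleeping ok"));                              \
            curthread->td_pflags &= ~TDP_NOSLEEPING;                  \
    } while (0)

Skipping this bracket when direct != 0 avoids the nested-set assertion
while keeping the protection for the SWI path.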
I think what I'm doing works under the assumption that the running
thread never sleeps. Do you suggest some other way to handle this?

Davide
