From owner-p4-projects@FreeBSD.ORG Tue Jun 23 23:13:33 2009
Date: Tue, 23 Jun 2009 23:13:32 GMT
Message-Id: <200906232313.n5NNDW9x083237@repoman.freebsd.org>
From: Prashant Vaibhav <pvaibhav@FreeBSD.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 165014 for review

http://perforce.freebsd.org/chv.cgi?CH=165014

Change 165014 by pvaibhav@pvaibhav_matrix on 2009/06/23 23:12:45

	Syncing branch with local copy. Implemented in this commit are:

	1. A generic priority queue implementation based on binary heaps.
	   See the comments in sys/bheap.h for more information; a rough
	   sketch of the idea follows this description.
	2. The callout/timeout scheme, modified to use the new binary
	   heap-based priority queue instead of the old "callout wheel"
	   data structure.

	Not everything has been heavily tested yet; help with testing on
	SMP systems would be appreciated.
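The new sys/bheap.h is added by this change but its contents are not part
of the diff below, so its exact interface is not reproduced here. As a
rough, illustrative sketch of the idea only (this is not the bheap.h code;
every name in it, such as tick_heap, th_push and th_pop_min, is invented
for the example), an array-based binary min-heap keyed on an absolute
expiry tick can be as small as this:

/*
 * Illustrative sketch only: an array-based binary min-heap keyed on an
 * absolute expiry tick.  NOT the sys/bheap.h implementation; all names
 * here are made up for the example.
 */
#include <stdio.h>

struct tick_heap {
	int	*th_keys;	/* expiry ticks; th_keys[0] is the minimum */
	int	 th_size;	/* number of elements currently stored */
	int	 th_cap;	/* capacity of the backing array */
};

static void
th_swap(int *a, int *b)
{
	int t = *a; *a = *b; *b = t;
}

/* Insert a key and restore the heap property by sifting it up. */
static void
th_push(struct tick_heap *h, int key)
{
	int i;

	if (h->th_size == h->th_cap)
		return;			/* full; a real queue would grow */
	i = h->th_size++;
	h->th_keys[i] = key;
	while (i > 0 && h->th_keys[(i - 1) / 2] > h->th_keys[i]) {
		th_swap(&h->th_keys[(i - 1) / 2], &h->th_keys[i]);
		i = (i - 1) / 2;
	}
}

/* Remove and return the smallest key (caller checks th_size > 0 first). */
static int
th_pop_min(struct tick_heap *h)
{
	int i, child, min = h->th_keys[0];

	h->th_keys[0] = h->th_keys[--h->th_size];
	i = 0;
	for (;;) {
		child = 2 * i + 1;
		if (child >= h->th_size)
			break;
		if (child + 1 < h->th_size &&
		    h->th_keys[child + 1] < h->th_keys[child])
			child++;
		if (h->th_keys[i] <= h->th_keys[child])
			break;
		th_swap(&h->th_keys[i], &h->th_keys[child]);
		i = child;
	}
	return (min);
}

int
main(void)
{
	int mem[8];
	struct tick_heap h = { mem, 0, 8 };

	th_push(&h, 150);	/* pretend these are absolute expiry ticks */
	th_push(&h, 100);
	th_push(&h, 125);
	while (h.th_size > 0)
		printf("expire at tick %d\n", th_pop_min(&h));
	return (0);
}

MINHEAP_INSERT() and MINHEAP_FIRST()/MINHEAP_REMOVE() in the diff below play
analogous roles: the earliest expiry is always at the root and can be read in
O(1), while insertion and removal re-heapify in O(log n), instead of hashing
callouts into the buckets of a fixed-size callout wheel.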
Affected files ...

.. //depot/projects/soc2009/calloutapi/src/sys/kern/kern_timeout.c#2 edit
.. //depot/projects/soc2009/calloutapi/src/sys/sys/bheap.h#1 add
.. //depot/projects/soc2009/calloutapi/src/sys/sys/callout.h#2 edit

Differences ...

==== //depot/projects/soc2009/calloutapi/src/sys/kern/kern_timeout.c#2 (text+ko) ====

@@ -64,9 +64,6 @@
 SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0,
     "struct callout *");
 
-static int avg_depth;
-SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
-    "Average number of items examined per softclock call. Units = 1/1000");
 static int avg_gcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
     "Average number of Giant callouts made per softclock call. Units = 1/1000");
@@ -80,17 +77,15 @@
  * TODO:
  *	allocate more timeout table slots when table overflows.
  */
-int callwheelsize, callwheelbits, callwheelmask;
-
 struct callout_cpu {
 	struct mtx		cc_lock;
-	struct callout		*cc_callout;
-	struct callout_tailq	*cc_callwheel;
-	struct callout_list	cc_callfree;
-	struct callout		*cc_next;
+	struct callout		*cc_callout;	/* pointer to local callouts */
+	struct callout**	cc_pq_mem;	/* memory for the priority Q */
+	struct callout_queue	cc_callqueue;	/* master priority queue */
+	struct callout_list	cc_callfree;	/* available local callouts */
 	struct callout		*cc_curr;
 	void			*cc_cookie;
-	int			cc_softticks;
+	int			cc_monoticks;	/* monotonic increasing tick */
 	int			cc_cancel;
 	int			cc_waiting;
 };
@@ -141,19 +136,15 @@
 	timeout_cpu = PCPU_GET(cpuid);
 	cc = CC_CPU(timeout_cpu);
-	/*
-	 * Calculate callout wheel size
-	 */
-	for (callwheelsize = 1, callwheelbits = 0;
-	     callwheelsize < ncallout;
-	     callwheelsize <<= 1, ++callwheelbits)
-		;
-	callwheelmask = callwheelsize - 1;
 
+	/* Reserve space for locally allocated callouts (timeout()) */
 	cc->cc_callout = (struct callout *)v;
 	v = (caddr_t)(cc->cc_callout + ncallout);
-	cc->cc_callwheel = (struct callout_tailq *)v;
-	v = (caddr_t)(cc->cc_callwheel + callwheelsize);
+
+	/* Reserve space for callout queue's internal array */
+	cc->cc_pq_mem = (struct callout**)v;
+	v = (caddr_t)(cc->cc_pq_mem + ncallout);
+
 	return(v);
 }
@@ -164,17 +155,26 @@
 	int i;
 
 	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
+
+	/*
+	 * Initialize the priority queue
+	 */
+	MINHEAP_INIT(&cc->cc_callqueue, callout, ncallout, cc->cc_pq_mem);
+
+	/* Set monoticks to 0 */
+	cc->cc_monoticks = 0;
+
+	/*
+	 * Initialize list of available local callouts
+	 */
 	SLIST_INIT(&cc->cc_callfree);
-	for (i = 0; i < callwheelsize; i++) {
-		TAILQ_INIT(&cc->cc_callwheel[i]);
-	}
 	if (cc->cc_callout == NULL)
 		return;
 	for (i = 0; i < ncallout; i++) {
 		c = &cc->cc_callout[i];
 		callout_init(c, 0);
 		c->c_flags = CALLOUT_LOCAL_ALLOC;
-		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links);
 	}
 }
@@ -220,8 +220,8 @@
 	    INTR_MPSAFE, &cc->cc_cookie))
 		panic("died while creating standard software ithreads");
 	cc->cc_callout = NULL;	/* Only cpu0 handles timeout(). */
-	cc->cc_callwheel = malloc(
-	    sizeof(struct callout_tailq) * callwheelsize, M_CALLOUT,
+	cc->cc_pq_mem = (struct callout**) malloc(
+	    sizeof(struct callout*) * ncallout, M_CALLOUT,
 	    M_WAITOK);
 	callout_cpu_init(cc);
 }
@@ -235,29 +235,34 @@
 {
 	struct callout_cpu *cc;
 	int need_softclock;
-	int bucket;
 
 	/*
 	 * Process callouts at a very low cpu priority, so we don't keep the
 	 * relatively high clock interrupt priority any longer than necessary.
 	 */
-	need_softclock = 0;
 	cc = CC_SELF();
 	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	for (; (cc->cc_softticks - ticks) < 0; cc->cc_softticks++) {
-		bucket = cc->cc_softticks & callwheelmask;
-		if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) {
-			need_softclock = 1;
-			break;
-		}
-	}
-	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+
+	cc->cc_monoticks++; /* XXX: Can we just use the global 'ticks'? */
+	need_softclock = 0;
+
+	/*
+	 * Check if the first pending callout's time has already arrived (or
+	 * has passed). If yes, schedule the softclock() which will then
+	 * expire this (and any remaining) callout as appropriate.
+	 */
+	if (MINHEAP_EMPTY(&cc->cc_callqueue))
+		need_softclock = 0;
+	else if (MINHEAP_FIRST(&cc->cc_callqueue)->c_time <= cc->cc_monoticks)
+		need_softclock = 1;
 
 	/*
-	 * swi_sched acquires the thread lock, so we don't want to call it
-	 * with cc_lock held; incorrect locking order.
+	 * swi_sched acquires the thread lock, so we don't want to call
+	 * it with cc_lock held; incorrect locking order.
 	 */
-	if (need_softclock)
+	if (need_softclock == 1) {
+		mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
 		swi_sched(cc->cc_cookie, 0);
+	}
 }
 
 static struct callout_cpu *
@@ -278,29 +283,29 @@
 }
 
 /*
- * The callout mechanism is based on the work of Adam M. Costello and
- * George Varghese, published in a technical report entitled "Redesigning
- * the BSD Callout and Timer Facilities" and modified slightly for inclusion
- * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
- * used in this implementation was published by G. Varghese and T. Lauck in
- * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
- * the Efficient Implementation of a Timer Facility" in the Proceedings of
- * the 11th ACM Annual Symposium on Operating Systems Principles,
- * Austin, Texas Nov 1987.
+ * The callout mechanism is a work-in-progress, based on the idea of using
+ * binary heap based priority queues containing callouts to expire in the
+ * future. The expiry time is (currently) specified as the number of ticks
+ * of the hardware clock (HZ/sec). This is an absolute value.
+ *
+ * The softclock() function checks that the first pending (ie. highest priority
+ * callout) in the queue has an expiry time that has either arrived or already
+ * passed. If yes, that callout is handled. Following this, the callout is
+ * removed from the queue and the next callout is checked in a similar fashion,
+ * until the expiry time of the next-in-line callout is in the future.
+ *
+ * For more information, ref: http://wiki.freebsd.org/SOC2009PrashantVaibhav
  */
 
 /*
  * Software (low priority) clock interrupt.
- * Run periodic events from timeout queue.
  */
 void
 softclock(void *arg)
 {
 	struct callout_cpu *cc;
 	struct callout *c;
-	struct callout_tailq *bucket;
 	int curticks;
-	int steps;	/* #steps since we last allowed interrupts */
 	int depth;
 	int mpcalls;
 	int lockcalls;
@@ -320,159 +325,144 @@
 	lockcalls = 0;
 	gcalls = 0;
 	depth = 0;
-	steps = 0;
 	cc = (struct callout_cpu *)arg;
 	CC_LOCK(cc);
-	while (cc->cc_softticks != ticks) {
-		/*
-		 * cc_softticks may be modified by hard clock, so cache
-		 * it while we work on a given bucket.
-		 */
-		curticks = cc->cc_softticks;
-		cc->cc_softticks++;
-		bucket = &cc->cc_callwheel[curticks & callwheelmask];
-		c = TAILQ_FIRST(bucket);
-		while (c) {
-			depth++;
-			if (c->c_time != curticks) {
-				c = TAILQ_NEXT(c, c_links.tqe);
-				++steps;
-				if (steps >= MAX_SOFTCLOCK_STEPS) {
-					cc->cc_next = c;
-					/* Give interrupts a chance. */
-					CC_UNLOCK(cc);
-					;	/* nothing */
-					CC_LOCK(cc);
-					c = cc->cc_next;
-					steps = 0;
-				}
-			} else {
-				void (*c_func)(void *);
-				void *c_arg;
-				struct lock_class *class;
-				struct lock_object *c_lock;
-				int c_flags, sharedlock;
+	/*
+	 * ticks may be modified by hard clock, so cache it
+	 */
+	curticks = cc->cc_monoticks;
+	CTR1(KTR_CALLOUT, "softclock run at %d", curticks);
+	while (!MINHEAP_EMPTY(&cc->cc_callqueue)) {
+		c = MINHEAP_FIRST(&cc->cc_callqueue);
+		if (c->c_time > curticks) {
+			/*
+			 * We've reached a callout whose expiry time is
+			 * in the future, so we stop.
+			 */
+			break;
+		} else {
+			/*
+			 * This callout needs to be expired now
+			 */
+			void (*c_func)(void *);
+			void *c_arg;
+			struct lock_class *class;
+			struct lock_object *c_lock;
+			int c_flags, sharedlock;
 
-				cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
-				TAILQ_REMOVE(bucket, c, c_links.tqe);
-				class = (c->c_lock != NULL) ?
-				    LOCK_CLASS(c->c_lock) : NULL;
-				sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ?
-				    0 : 1;
-				c_lock = c->c_lock;
-				c_func = c->c_func;
-				c_arg = c->c_arg;
-				c_flags = c->c_flags;
-				if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
-					c->c_flags = CALLOUT_LOCAL_ALLOC;
-				} else {
-					c->c_flags =
-					    (c->c_flags & ~CALLOUT_PENDING);
+			MINHEAP_REMOVE(&cc->cc_callqueue, c, c_qe, c_time);
+			class = (c->c_lock != NULL) ?
+			    LOCK_CLASS(c->c_lock) : NULL;
+			sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
+			c_lock = c->c_lock;
+			c_func = c->c_func;
+			c_arg = c->c_arg;
+			c_flags = c->c_flags;
+
+			if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+				c->c_flags = CALLOUT_LOCAL_ALLOC;
+			else
+				c->c_flags = (c->c_flags & ~CALLOUT_PENDING);
+			cc->cc_curr = c;
+			cc->cc_cancel = 0;
+			CC_UNLOCK(cc);
+			if (c_lock != NULL) {
+				class->lc_lock(c_lock, sharedlock);
+				/*
+				 * The callout may have been cancelled
+				 * while we switched locks.
+				 */
+				if (cc->cc_cancel) {
+					class->lc_unlock(c_lock);
+					goto skip;
 				}
-				cc->cc_curr = c;
-				cc->cc_cancel = 0;
-				CC_UNLOCK(cc);
-				if (c_lock != NULL) {
-					class->lc_lock(c_lock, sharedlock);
-					/*
-					 * The callout may have been cancelled
-					 * while we switched locks.
-					 */
-					if (cc->cc_cancel) {
-						class->lc_unlock(c_lock);
-						goto skip;
-					}
-					/* The callout cannot be stopped now. */
-					cc->cc_cancel = 1;
+				/* The callout cannot be stopped now. */
+				cc->cc_cancel = 1;
 
-					if (c_lock == &Giant.lock_object) {
-						gcalls++;
-						CTR3(KTR_CALLOUT,
-						    "callout %p func %p arg %p",
-						    c, c_func, c_arg);
-					} else {
-						lockcalls++;
-						CTR3(KTR_CALLOUT, "callout lock"
-						    " %p func %p arg %p",
-						    c, c_func, c_arg);
-					}
+				if (c_lock == &Giant.lock_object) {
+					gcalls++;
+					CTR3(KTR_CALLOUT,
+					    "callout %p func %p arg %p",
+					    c, c_func, c_arg);
 				} else {
-					mpcalls++;
-					CTR3(KTR_CALLOUT,
-					    "callout mpsafe %p func %p arg %p",
-					    c, c_func, c_arg);
+					lockcalls++;
+					CTR3(KTR_CALLOUT, "callout lock"
+					    " %p func %p arg %p",
+					    c, c_func, c_arg);
 				}
+			} else {
+				mpcalls++;
+				CTR3(KTR_CALLOUT,
+				    "callout mpsafe %p func %p arg %p",
+				    c, c_func, c_arg);
+			}
 #ifdef DIAGNOSTIC
-				binuptime(&bt1);
+			binuptime(&bt1);
 #endif
-				THREAD_NO_SLEEPING();
-				SDT_PROBE(callout_execute, kernel, ,
-				    callout_start, c, 0, 0, 0, 0);
-				c_func(c_arg);
-				SDT_PROBE(callout_execute, kernel, ,
-				    callout_end, c, 0, 0, 0, 0);
-				THREAD_SLEEPING_OK();
+			THREAD_NO_SLEEPING();
+			SDT_PROBE(callout_execute, kernel, ,
+			    callout_start, c, 0, 0, 0, 0);
+			c_func(c_arg);
+			SDT_PROBE(callout_execute, kernel, ,
+			    callout_end, c, 0, 0, 0, 0);
+			THREAD_SLEEPING_OK();
 #ifdef DIAGNOSTIC
-				binuptime(&bt2);
-				bintime_sub(&bt2, &bt1);
-				if (bt2.frac > maxdt) {
-					if (lastfunc != c_func ||
-					    bt2.frac > maxdt * 2) {
-						bintime2timespec(&bt2, &ts2);
-						printf(
-			"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
-						    c_func, c_arg,
-						    (intmax_t)ts2.tv_sec,
-						    ts2.tv_nsec);
-					}
-					maxdt = bt2.frac;
-					lastfunc = c_func;
+			binuptime(&bt2);
+			bintime_sub(&bt2, &bt1);
+			if (bt2.frac > maxdt) {
+				if (lastfunc != c_func ||
+				    bt2.frac > maxdt * 2)
+				{
+					bintime2timespec(&bt2, &ts2);
+					printf(
+					    "Expensive timeout(9) function:"
+					    "%p(%p) %jd.%09ld s\n",
+					    c_func, c_arg,
+					    (intmax_t)ts2.tv_sec,
+					    ts2.tv_nsec);
 				}
+				maxdt = bt2.frac;
+				lastfunc = c_func;
+			}
 #endif
-				CTR1(KTR_CALLOUT, "callout %p finished", c);
-				if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
-					class->lc_unlock(c_lock);
-			skip:
-				CC_LOCK(cc);
+			CTR1(KTR_CALLOUT, "callout %p finished", c);
+			if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
+				class->lc_unlock(c_lock);
+skip:
+			CC_LOCK(cc);
+			/*
+			 * If the current callout is locally
+			 * allocated (from timeout(9))
+			 * then put it on the freelist.
+			 *
+			 * Note: we need to check the cached
+			 * copy of c_flags because if it was not
+			 * local, then it's not safe to deref the
+			 * callout pointer.
+			 */
+			if (c_flags & CALLOUT_LOCAL_ALLOC) {
+				KASSERT(c->c_flags == CALLOUT_LOCAL_ALLOC,
+				    ("corrupted callout"));
+				c->c_func = NULL;
+				SLIST_INSERT_HEAD(&cc->cc_callfree, c,
+				    c_links);
+			}
+			cc->cc_curr = NULL;
+			if (cc->cc_waiting) {
 				/*
-				 * If the current callout is locally
-				 * allocated (from timeout(9))
-				 * then put it on the freelist.
-				 *
-				 * Note: we need to check the cached
-				 * copy of c_flags because if it was not
-				 * local, then it's not safe to deref the
-				 * callout pointer.
+				 * There is someone waiting
+				 * for the callout to complete.
 				 */
-				if (c_flags & CALLOUT_LOCAL_ALLOC) {
-					KASSERT(c->c_flags ==
-					    CALLOUT_LOCAL_ALLOC,
-					    ("corrupted callout"));
-					c->c_func = NULL;
-					SLIST_INSERT_HEAD(&cc->cc_callfree, c,
-					    c_links.sle);
-				}
-				cc->cc_curr = NULL;
-				if (cc->cc_waiting) {
-					/*
-					 * There is someone waiting
-					 * for the callout to complete.
-					 */
-					cc->cc_waiting = 0;
-					CC_UNLOCK(cc);
-					wakeup(&cc->cc_waiting);
-					CC_LOCK(cc);
-				}
-				steps = 0;
-				c = cc->cc_next;
+				cc->cc_waiting = 0;
+				CC_UNLOCK(cc);
+				wakeup(&cc->cc_waiting);
+				CC_LOCK(cc);
 			}
 		}
-	}
-	avg_depth += (depth * 1000 - avg_depth) >> 8;
+	};
 	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
 	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
 	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
-	cc->cc_next = NULL;
 	CC_UNLOCK(cc);
 }
@@ -509,7 +499,7 @@
 	if (new == NULL)
 		/* XXX Attempt to malloc first */
 		panic("timeout table full");
-	SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
+	SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links);
 	callout_reset(new, to_ticks, ftn, arg);
 	handle.callout = new;
 	CC_UNLOCK(cc);
@@ -597,12 +587,10 @@
 		}
 	}
 	if (c->c_flags & CALLOUT_PENDING) {
-		if (cc->cc_next == c) {
-			cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
-		}
-		TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
-		    c_links.tqe);
-
+		/* TODO: Just modify the key of existing callouts instead of
+		 * removing and re-inserting it
+		 */
+		MINHEAP_REMOVE(&cc->cc_callqueue, c, c_qe, c_time);
 		cancelled = 1;
 		c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 	}
@@ -622,11 +610,12 @@
 	c->c_arg = arg;
 	c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
 	c->c_func = ftn;
-	c->c_time = ticks + to_ticks;
-	TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask],
-	    c, c_links.tqe);
-	CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d",
-	    cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks);
+	c->c_time = cc->cc_monoticks + to_ticks;
+
+	MINHEAP_INSERT(&cc->cc_callqueue, c, c_qe, c_time);
+
+	CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p at %d",
+	    cancelled ? "re" : "", c, c->c_func, c->c_arg, c->c_time);
 	CC_UNLOCK(cc);
 
 	return (cancelled);
@@ -766,18 +755,14 @@
 	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 
-	if (cc->cc_next == c) {
-		cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
-	}
-	TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
-	    c_links.tqe);
+	MINHEAP_REMOVE(&cc->cc_callqueue, c, c_qe, c_time);
 
 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 	    c, c->c_func, c->c_arg);
 
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
 		c->c_func = NULL;
-		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links);
 	}
 	CC_UNLOCK(cc);
 	return (1);
@@ -838,48 +823,13 @@
 adjust_timeout_calltodo(time_change)
 	struct timeval *time_change;
 {
-	register struct callout *p;
-	unsigned long delta_ticks;
-
-	/*
-	 * How many ticks were we asleep?
-	 * (stolen from tvtohz()).
-	 */
-
-	/* Don't do anything */
-	if (time_change->tv_sec < 0)
-		return;
-	else if (time_change->tv_sec <= LONG_MAX / 1000000)
-		delta_ticks = (time_change->tv_sec * 1000000 +
-			       time_change->tv_usec + (tick - 1)) / tick + 1;
-	else if (time_change->tv_sec <= LONG_MAX / hz)
-		delta_ticks = time_change->tv_sec * hz +
-			      (time_change->tv_usec + (tick - 1)) / tick + 1;
-	else
-		delta_ticks = LONG_MAX;
-
-	if (delta_ticks > INT_MAX)
-		delta_ticks = INT_MAX;
-
-	/*
-	 * Now rip through the timer calltodo list looking for timers
-	 * to expire.
+	/*
+	 * pvaibhav: This function is no longer required because all timers
+	 * whose expiry times are in the past or present are automatically
+	 * going to be expired on wake up when callout_tick() runs and then
+	 * subsequently calls softclock().
 	 */
-	/* don't collide with softclock() */
-	CC_LOCK(cc);
-	for (p = calltodo.c_next; p != NULL; p = p->c_next) {
-		p->c_time -= delta_ticks;
-
-		/* Break if the timer had more time on it than delta_ticks */
-		if (p->c_time > 0)
-			break;
-
-		/* take back the ticks the timer didn't use (p->c_time <= 0) */
-		delta_ticks = -p->c_time;
-	}
-	CC_UNLOCK(cc);
-	return;
 }
 #endif /* APM_FIXUP_CALLTODO */

==== //depot/projects/soc2009/calloutapi/src/sys/sys/callout.h#2 (text+ko) ====

@@ -39,23 +39,23 @@
 #define _SYS_CALLOUT_H_
 
 #include <sys/queue.h>
+#include <sys/bheap.h>
 
 struct lock_object;
 
 SLIST_HEAD(callout_list, callout);
-TAILQ_HEAD(callout_tailq, callout);
+MINHEAP_HEAD(callout_queue, callout);
+MINHEAP_ENTRY(callout_queue_entry);
 
 struct callout {
-	union {
-		SLIST_ENTRY(callout) sle;
-		TAILQ_ENTRY(callout) tqe;
-	} c_links;
 	int	c_time;				/* ticks to the event */
 	void	*c_arg;				/* function argument */
 	void	(*c_func)(void *);		/* function to call */
 	struct lock_object *c_lock;		/* lock to handle */
 	int	c_flags;			/* state of this entry */
 	volatile int c_cpu;			/* CPU we're scheduled on */
+	SLIST_ENTRY(callout) c_links;
+	struct callout_queue_entry c_qe;	/* data for priority queue */
 };
 
 #define	CALLOUT_LOCAL_ALLOC	0x0001 /* was allocated from callfree */