Date: Thu, 19 May 2011 10:53:14 GMT From: Rudolf Polzer <rpolzer@mucke-novak.de> To: freebsd-gnats-submit@FreeBSD.org Subject: ports/157176: Heartbeat crashes when clock_t (signed 32bit int) rolls over Message-ID: <201105191053.p4JArEeQ017765@red.freebsd.org> Resent-Message-ID: <201105191100.p4JB0QF1012234@freefall.freebsd.org>
next in thread | raw e-mail | index | archive | help
>Number: 157176 >Category: ports >Synopsis: Heartbeat crashes when clock_t (signed 32bit int) rolls over >Confidential: no >Severity: critical >Priority: medium >Responsible: freebsd-ports-bugs >State: open >Quarter: >Keywords: >Date-Required: >Class: sw-bug >Submitter-Id: current-users >Arrival-Date: Thu May 19 11:00:25 UTC 2011 >Closed-Date: >Last-Modified: >Originator: Rudolf Polzer >Release: 7.4-RELEASE >Organization: Mucke-Novak >Environment: FreeBSD bamgmtbsd1-tst.ba.vftest.net 7.4-RELEASE FreeBSD 7.4-RELEASE #0: Fri Feb 18 01:55:22 UTC 2011 root@driscoll.cse.buffalo.edu:/usr/obj/usr/src/sys/GENERIC amd64 >Description: On FreeBSD amd64, after about 196 days uptime, the clock_t data type rolls over from 2147483647 (0x7FFFFFFF) to -2147483648 (0x80000000). When this happens, cl_times()'s return value, which is unsigned long (64bit) jumps from 0x000000007FFFFFFF to 0xFFFFFFFF80000000, wreaking all sorts of havoc, e.g. total spam of messages like: May 18 10:00:05 baweb01-tev heartbeat: [84855]: WARN: Gmain_timeout_dispatch: Dispatch function for memory stats was delayed 144115187989455872 ms (> 5000 ms) before being called (GSource: 0x802531318) May 18 10:00:05 baweb01-tev heartbeat: [84855]: WARN: Gmain_timeout_dispatch: Dispatch function for memory stats was delayed 144115187989455879 ms (> 5000 ms) before being called (GSource: 0x802531318) till heartbeat finally shuts down. Furthermore, on this platform, ./configure does not detect that the rollover workaround has to be used, as sizeof(long) is 8, which for some odd reason is the condition for "clock_t is long enough". Note: I placed the same bug report upstream on http://developerbugs.linux-foundation.org/show_bug.cgi?id=2596 >How-To-Repeat: Also, for testing purposes, I provided the source code of a "fake uptime" preload library to fake the return value of times(). That way, the problem can be reproduced without having access to a FreeBSD amd64 system with an uptime of 196 days. Usage: gcc -shared -o /tmp/fu.so fu.c -Os -fPIC -s -Wall -Wextra env LD_PRELOAD=/tmp/fu.so FU_TIMES=0x7FFF0000 /usr/local/etc/rc.d/heartbeat start Note that the library may need changes if libc is not /lib/libc.so.7 Source code: #include <sys/times.h> #include <err.h> #include <stdbool.h> #include <stdlib.h> #include <dlfcn.h> typedef clock_t (times_t) (struct tms *buffer); static times_t *orig_times = NULL; static bool initialized = false; static clock_t offset = 0; static void init() { struct tms t; if(initialized) return; initialized = true; orig_times = (times_t *) dlfunc(RTLD_NEXT, "times"); if(!orig_times) { void *libc = dlopen("/lib/libc.so.7", RTLD_LAZY); orig_times = (times_t *) dlfunc(libc, "times"); } if(!orig_times) errx(1, "cannot find times()"); const char *p = getenv("FU_TIMES"); if(p) { clock_t ret = times(&t); offset = strtoll(p, NULL, 0) - ret; } else offset = 0; initialized = true; } clock_t times(struct tms *buffer) { init(); clock_t ret = orig_times(buffer); return ret + offset; } >Fix: I attached a patch doing the following changes: - fix to ./configure to check size of clock_t, not long - changed return type of cl_times() from unsigned long to clock_t - make all callers able to cope with that change - time_longclock() is the only caller that doesn't immediately convert cl_times()'s return value to a clock_t, so only that one needed changes - changed wraparound logic to no longer rely on unsigned long being "good enough". Instead, the static variables are changed to longclock_t, and sign extension is explicitly prevented by an AND with the maximum "unsigned" clock_t value It fixes the issue in our lab, and should do no harm on other platforms. Patch attached with submission follows: diff -ru STABLE-2.1.4.orig/include/clplumbing/longclock.h include/clplumbing/longclock.h --- STABLE-2.1.4.orig/include/clplumbing/longclock.h 2011-05-12 16:19:17.000000000 +0200 +++ include/clplumbing/longclock.h 2011-05-17 11:02:47.000000000 +0200 @@ -37,7 +37,7 @@ * * The functions provided here are: * - * unsigned long cl_times(void); + * clock_t cl_times(void); * A rational wrapper for the times(2) call * for those cases where only the return value * is wanted. @@ -79,7 +79,7 @@ * * extern const longclock_t zero_longclock; */ -extern unsigned long cl_times(void); +extern clock_t cl_times(void); #ifdef CLOCK_T_IS_LONG_ENOUGH # ifndef HAVE_LONGCLOCK_ARITHMETIC diff -ru STABLE-2.1.4.orig/lib/clplumbing/longclock.c lib/clplumbing/longclock.c --- STABLE-2.1.4.orig/lib/clplumbing/longclock.c 2011-05-12 16:19:17.000000000 +0200 +++ lib/clplumbing/longclock.c 2011-05-17 11:25:06.000000000 +0200 @@ -68,7 +68,7 @@ # define TIMES_PARAM &dummy_longclock_tms_struct #endif -unsigned long +clock_t cl_times(void) /* Make times(2) behave rationally on Linux */ { clock_t ret; @@ -108,7 +107,7 @@ } errno = save_errno; #endif /* DISABLE_TIMES_KLUDGE */ - return (unsigned long)ret; + return ret; } #ifdef CLOCK_T_IS_LONG_ENOUGH @@ -124,8 +133,9 @@ #define BITSPERBYTE 8 #define WRAPSHIFT (BITSPERBYTE*sizeof(clock_t)) -#define MAXIMUMULONG ((unsigned long)~(0UL)) -#define MINJUMP ((MAXIMUMULONG/100UL)*99UL) +#define WRAPAMOUNT (((longclock_t) 1) << WRAPSHIFT) +#define MAXIMUMCLOCK (WRAPAMOUNT - 1) +#define MINJUMP ((MAXIMUMCLOCK/100UL)*99UL) longclock_t time_longclock(void) @@ -136,20 +146,19 @@ * because then this can recurse infinitely; that is why the * cl_log call is where it is; found by Simon Graham. */ static gboolean calledbefore = FALSE; - static unsigned long lasttimes = 0L; - static unsigned long wrapcount = 0L; + static longclock_t lasttimes = 0L; + static longclock_t wrapcount = 0L; static unsigned long callcount = 0L; - static longclock_t lc_wrapcount = 0L; - unsigned long timesval; + longclock_t timesval; ++callcount; - timesval = (unsigned long) cl_times(); + timesval = ((longclock_t) cl_times()) & MAXIMUMCLOCK; /* this AND prevents sign extension */ if (calledbefore && timesval < lasttimes) { - clock_t jumpbackby = lasttimes - timesval; + longclock_t jumpbackby = lasttimes - timesval; - if (jumpbackby < (clock_t)MINJUMP) { + if (jumpbackby < MINJUMP) { /* Kernel weirdness */ cl_log(LOG_CRIT , "%s: clock_t from times(2) appears to" @@ -172,8 +181,7 @@ to double update of wrapcount! */ lasttimes = timesval; - ++wrapcount; - lc_wrapcount = ((longclock_t)wrapcount) << WRAPSHIFT; + wrapcount += WRAPAMOUNT; cl_log(LOG_INFO , "%s: clock_t wrapped around (uptime)." @@ -184,7 +192,7 @@ lasttimes = timesval; calledbefore = TRUE; } - return (lc_wrapcount | (longclock_t)timesval); + return (wrapcount | timesval); } #endif /* ! CLOCK_T_IS_LONG_ENOUGH */ --- STABLE-2.1.4.orig/configure.in 2011-05-17 13:49:34.000000000 +0200 +++ configure.in 2011-05-17 13:49:37.000000000 +0200 @@ -1726,6 +1726,7 @@ AC_CHECK_SIZEOF(int) AC_CHECK_SIZEOF(long) AC_CHECK_SIZEOF(long long) +AC_CHECK_SIZEOF(clock_t, [], [#include <sys/times.h>]) AC_ARG_ENABLE([all], [ --enable-all Activate ALL features @@ -2387,7 +2388,7 @@ AC_CHECK_LIB(evs, evs_initialize , , [openais_intalled="no"]) AM_CONDITIONAL(BUILD_OPENAIS_MODULE, test "x${openais_installed}" = "xyes") AC_MSG_CHECKING(if clock_t is long enough) -if test $ac_cv_sizeof_long -ge 8; then +if test $ac_cv_sizeof_clock_t -ge 8; then AC_MSG_RESULT(yes) AC_DEFINE(CLOCK_T_IS_LONG_ENOUGH, 1, [Set if CLOCK_T is adequate by itself for the "indefinite future" (>= 100 years)]) else >Release-Note: >Audit-Trail: >Unformatted:
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201105191053.p4JArEeQ017765>