Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 19 May 2011 10:53:14 GMT
From:      Rudolf Polzer <rpolzer@mucke-novak.de>
To:        freebsd-gnats-submit@FreeBSD.org
Subject:   ports/157176: Heartbeat crashes when clock_t (signed 32bit int) rolls over
Message-ID:  <201105191053.p4JArEeQ017765@red.freebsd.org>
Resent-Message-ID: <201105191100.p4JB0QF1012234@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         157176
>Category:       ports
>Synopsis:       Heartbeat crashes when clock_t (signed 32bit int) rolls over
>Confidential:   no
>Severity:       critical
>Priority:       medium
>Responsible:    freebsd-ports-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Thu May 19 11:00:25 UTC 2011
>Closed-Date:
>Last-Modified:
>Originator:     Rudolf Polzer
>Release:        7.4-RELEASE
>Organization:
Mucke-Novak
>Environment:
FreeBSD bamgmtbsd1-tst.ba.vftest.net 7.4-RELEASE FreeBSD 7.4-RELEASE #0: Fri Feb 18 01:55:22 UTC 2011     root@driscoll.cse.buffalo.edu:/usr/obj/usr/src/sys/GENERIC  amd64

>Description:
On FreeBSD amd64, after about 196 days uptime, the clock_t data type rolls over
from 2147483647 (0x7FFFFFFF) to -2147483648 (0x80000000).

When this happens, cl_times()'s return value, which is unsigned long (64bit)
jumps from 0x000000007FFFFFFF to 0xFFFFFFFF80000000, wreaking all sorts of
havoc, e.g. total spam of messages like:

May 18 10:00:05 baweb01-tev heartbeat: [84855]: WARN: Gmain_timeout_dispatch:
Dispatch function for memory stats was delayed 144115187989455872 ms (> 5000
ms) before being called (GSource: 0x802531318)
May 18 10:00:05 baweb01-tev heartbeat: [84855]: WARN: Gmain_timeout_dispatch:
Dispatch function for memory stats was delayed 144115187989455879 ms (> 5000
ms) before being called (GSource: 0x802531318)

till heartbeat finally shuts down.

Furthermore, on this platform, ./configure does not detect that the rollover
workaround has to be used, as sizeof(long) is 8, which for some odd reason is
the condition for "clock_t is long enough".

Note: I placed the same bug report upstream on http://developerbugs.linux-foundation.org/show_bug.cgi?id=2596
>How-To-Repeat:
Also, for testing purposes, I provided the source code of a "fake uptime"
preload library to fake the return value of times(). That way, the problem can
be reproduced without having access to a FreeBSD amd64 system with an uptime of
196 days. Usage:

gcc -shared -o /tmp/fu.so fu.c -Os -fPIC -s -Wall -Wextra
env LD_PRELOAD=/tmp/fu.so FU_TIMES=0x7FFF0000 /usr/local/etc/rc.d/heartbeat
start

Note that the library may need changes if libc is not /lib/libc.so.7

Source code:


#include <sys/times.h>
#include <err.h>
#include <stdbool.h>
#include <stdlib.h>
#include <dlfcn.h>

typedef clock_t (times_t) (struct tms *buffer);
static times_t *orig_times = NULL;
static bool initialized = false;
static clock_t offset = 0;

static void init()
{
        struct tms t;

        if(initialized)
                return;
        initialized = true;

        orig_times = (times_t *) dlfunc(RTLD_NEXT, "times");
        if(!orig_times)
        {
                void *libc = dlopen("/lib/libc.so.7", RTLD_LAZY);
                orig_times = (times_t *) dlfunc(libc, "times");
        }
        if(!orig_times)
                errx(1, "cannot find times()");

        const char *p = getenv("FU_TIMES");
        if(p)
        {
                clock_t ret = times(&t);
                offset = strtoll(p, NULL, 0) - ret;
        }
        else
                offset = 0;

        initialized = true;
}

clock_t times(struct tms *buffer)
{
        init();
        clock_t ret = orig_times(buffer);
        return ret + offset;
}
>Fix:
I attached a patch doing the following changes:

- fix to ./configure to check size of clock_t, not long
- changed return type of cl_times() from unsigned long to clock_t
- make all callers able to cope with that change - time_longclock() is the only
caller that doesn't immediately convert cl_times()'s return value to a clock_t,
so only that one needed changes
- changed wraparound logic to no longer rely on unsigned long being "good
enough". Instead, the static variables are changed to longclock_t, and sign
extension is explicitly prevented by an AND with the maximum "unsigned" clock_t
value

It fixes the issue in our lab, and should do no harm on other platforms.

Patch attached with submission follows:

diff -ru STABLE-2.1.4.orig/include/clplumbing/longclock.h include/clplumbing/longclock.h
--- STABLE-2.1.4.orig/include/clplumbing/longclock.h	2011-05-12 16:19:17.000000000 +0200
+++ include/clplumbing/longclock.h	2011-05-17 11:02:47.000000000 +0200
@@ -37,7 +37,7 @@
  *
  *	The functions provided here are:
  *
- *	unsigned long	cl_times(void);
+ *	clock_t		cl_times(void);
  *			A rational wrapper for the times(2) call
  *			for those cases where only the return value
  *			is wanted.
@@ -79,7 +79,7 @@
  *
  *	extern const longclock_t	zero_longclock;
  */
-extern unsigned long cl_times(void);
+extern clock_t cl_times(void);
 
 #ifdef CLOCK_T_IS_LONG_ENOUGH
 #	ifndef	HAVE_LONGCLOCK_ARITHMETIC
diff -ru STABLE-2.1.4.orig/lib/clplumbing/longclock.c lib/clplumbing/longclock.c
--- STABLE-2.1.4.orig/lib/clplumbing/longclock.c	2011-05-12 16:19:17.000000000 +0200
+++ lib/clplumbing/longclock.c	2011-05-17 11:25:06.000000000 +0200
@@ -68,7 +68,7 @@
 #	define	TIMES_PARAM	&dummy_longclock_tms_struct
 #endif
 
-unsigned long
+clock_t
 cl_times(void)	/* Make times(2) behave rationally on Linux */
 {
 	clock_t		ret;
@@ -108,7 +107,7 @@
 	}
 	errno = save_errno;
 #endif /* DISABLE_TIMES_KLUDGE */
-	return (unsigned long)ret;
+	return ret;
 }
 
 #ifdef CLOCK_T_IS_LONG_ENOUGH
@@ -124,8 +133,9 @@
 
 #define	BITSPERBYTE	8
 #define	WRAPSHIFT	(BITSPERBYTE*sizeof(clock_t))
-#define MAXIMUMULONG	((unsigned long)~(0UL))
-#define MINJUMP		((MAXIMUMULONG/100UL)*99UL)
+#define WRAPAMOUNT	(((longclock_t) 1) << WRAPSHIFT)
+#define MAXIMUMCLOCK	(WRAPAMOUNT - 1)
+#define MINJUMP		((MAXIMUMCLOCK/100UL)*99UL)
 
 longclock_t
 time_longclock(void)
@@ -136,20 +146,19 @@
 	 * because then this can recurse infinitely; that is why the
 	 * cl_log call is where it is; found by Simon Graham. */
 	static	gboolean	calledbefore	= FALSE;
-	static	unsigned long	lasttimes	= 0L;
-	static	unsigned long	wrapcount	= 0L;
+	static	longclock_t	lasttimes	= 0L;
+	static	longclock_t	wrapcount	= 0L;
 	static	unsigned long	callcount	= 0L;
-	static	longclock_t	lc_wrapcount	= 0L;
-	unsigned long		timesval;
+	longclock_t		timesval;
 
 	++callcount;
 
-	timesval = (unsigned long) cl_times();
+	timesval = ((longclock_t) cl_times()) & MAXIMUMCLOCK; /* this AND prevents sign extension */
 
 	if (calledbefore && timesval < lasttimes)  {
-		clock_t		jumpbackby = lasttimes - timesval;
+		longclock_t	jumpbackby = lasttimes - timesval;
 
-		if (jumpbackby < (clock_t)MINJUMP) {
+		if (jumpbackby < MINJUMP) {
 			/* Kernel weirdness */
 			cl_log(LOG_CRIT
 			,	"%s: clock_t from times(2) appears to"
@@ -172,8 +181,7 @@
 			   to double update of wrapcount! */
 
 			lasttimes = timesval;
-			++wrapcount;
-			lc_wrapcount = ((longclock_t)wrapcount) << WRAPSHIFT;
+			wrapcount += WRAPAMOUNT;
 
 			cl_log(LOG_INFO
 			,	"%s: clock_t wrapped around (uptime)."
@@ -184,7 +192,7 @@
 		lasttimes = timesval;
 		calledbefore = TRUE;
 	}
-	return (lc_wrapcount | (longclock_t)timesval);
+	return (wrapcount | timesval);
 }
 #endif	/* ! CLOCK_T_IS_LONG_ENOUGH */
 
--- STABLE-2.1.4.orig/configure.in	2011-05-17 13:49:34.000000000 +0200
+++ configure.in	2011-05-17 13:49:37.000000000 +0200
@@ -1726,6 +1726,7 @@
 AC_CHECK_SIZEOF(int)
 AC_CHECK_SIZEOF(long)
 AC_CHECK_SIZEOF(long long)
+AC_CHECK_SIZEOF(clock_t, [], [#include <sys/times.h>])
 
 AC_ARG_ENABLE([all],
 [  --enable-all Activate ALL features
@@ -2387,7 +2388,7 @@
 AC_CHECK_LIB(evs, evs_initialize , , [openais_intalled="no"])
 AM_CONDITIONAL(BUILD_OPENAIS_MODULE, test "x${openais_installed}" = "xyes")
 
 AC_MSG_CHECKING(if clock_t is long enough)
-if test $ac_cv_sizeof_long -ge 8; then
+if test $ac_cv_sizeof_clock_t -ge 8; then
   AC_MSG_RESULT(yes)
   AC_DEFINE(CLOCK_T_IS_LONG_ENOUGH, 1, [Set if CLOCK_T is adequate by itself for the "indefinite future" (>= 100 years)])
 else


>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201105191053.p4JArEeQ017765>