Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 28 Sep 2012 08:48:35 -0400
From:      John Baldwin <jhb@freebsd.org>
To:        Anton Yuzhaninov <citrin@citrin.ru>
Cc:        freebsd-stable@freebsd.org
Subject:   Re: Problem with IPMI KCS driver
Message-ID:  <201209280848.35380.jhb@freebsd.org>
In-Reply-To: <506573DD.2030808@citrin.ru>
References:  <503DE2AB.6030702@citrin.ru> <201208290825.44198.jhb@freebsd.org> <506573DD.2030808@citrin.ru>

next in thread | previous in thread | raw e-mail | index | archive | help
On Friday, September 28, 2012 5:54:37 am Anton Yuzhaninov wrote:
> On 29.08.2012 16:25, John Baldwin wrote:
> > On Wednesday, August 29, 2012 5:36:43 am Anton Yuzhaninov wrote:
> >> We use servers witch motherboard Supermicro X8DTT-H and meet with such 
problem:
> >> when watchdogd started, server is rebooted by IPMI watchdog several times 
per week.
> >>
> >> After some debugging I've found, that sometimes IPMI driver entered 
endless
> >> loop, and watchdogd have no chances to reset watchdog timer.
> >> In such situation top show:
> >>
> >> PID USERNAME      PRI NICE   SIZE    RES STATE   C   TIME   WCPU COMMAND
> >> ...
> >> 113 root          -16    -     0K    16K CPU4    4  17:18 99.17% ipmi0: 
kcs
> >>
> >> Endless loop located in file /sys/dev/ipmi/ipmi_kcs.c and function
> >> kcs_wait_for_obf():
> >>
> >>           int status, start = ticks;
> >>
> >>           status = INB(sc, KCS_CTL_STS);
> >>           if (state == 0) {
> >>                   /* WAIT FOR OBF = 0 */
> >>                   while (ticks - start<  MAX_TIMEOUT&&  status&  
KCS_STATUS_OBF) {
> >>                           DELAY(100);
> >>                           status = INB(sc, KCS_CTL_STS);
> >>                   }
> >>           } else {
> >>                   /* WAIT FOR OBF = 1 */
> >>                   while (ticks - start<  MAX_TIMEOUT&&
> >>                       !(status&  KCS_STATUS_OBF)) {
> >>                           DELAY(100);
> >>                           status = INB(sc, KCS_CTL_STS);
> >>                   }
> >>           }
> >>
> >> It seems to be, that this loop intended to run no more than MAX_TIMEOUT 
ticks.
> >> but by some reason this timeout does not works and loop runs until 
reboot.
> >>
> >> Questions:
> >> 1. Is it correct to check ticks to implement timeout here?
> >> 2. how to fix this timeout?
> >
> > Hmm.  Can you try this:
> >
> > Index: kern/kern_clock.c
> > ===================================================================
> > --- kern/kern_clock.c	(revision 239819)
> > +++ kern/kern_clock.c	(working copy)
> > @@ -382,7 +382,7 @@
> >   int	stathz;
> >   int	profhz;
> >   int	profprocs;
> > -int	ticks;
> > +volatile int	ticks;
> >   int	psratio;
> >
> >   static DPCPU_DEFINE(int, pcputicks);	/* Per-CPU version of ticks. */
> > @@ -469,7 +469,7 @@
> >   hardclock(int usermode, uintfptr_t pc)
> >   {
> >
> > -	atomic_add_int((volatile int *)&ticks, 1);
> > +	atomic_add_int(&ticks, 1);
> >   	hardclock_cpu(usermode);
> >   	tc_ticktock(1);
> >   	cpu_tick_calibration();
> > Index: sys/kernel.h
> > ===================================================================
> > --- sys/kernel.h	(revision 239819)
> > +++ sys/kernel.h	(working copy)
> > @@ -63,7 +63,7 @@
> >   extern int stathz;			/* statistics clock's frequency */
> >   extern int profhz;			/* profiling clock's frequency */
> >   extern int profprocs;			/* number of process's profiling */
> > -extern int ticks;
> > +extern volatile int ticks;
> >
> >   #endif /* _KERNEL */
> >
> >
> 
> With
> extern volatile int ticks
> 
> Infinite loop repeated not so often, as before, but still repeated.
> 
> Symptoms is same:
> 
> $ ps -ax -o pid,comm,wchan,state,\%cpu | grep ipmi
>    113 ipmi0: kcs    -      RL   100.0
>   1317 watchdogd     ipmire Ds    0.0
> 
> DDB trace for pid 113:
> Tracing pid 113 tid 100359 td 0xffffff0007913470
> cpustop_handler() at cpustop_handler+0x37
> ipi_nmi_handler() at ipi_nmi_handler+0x30
> trap() at trap+0x345
> nmi_calltrap() at nmi_calltrap+0x8
> --- trap 0x13, rip = 0xffffffff809c6e64, rsp = 0xffffffff80fd1ec0, rbp = 
> 0xffffff88425d4b30 ---
> DELAY() at DELAY+0x64
> kcs_wait_for_obf() at kcs_wait_for_obf+0xb6
> kcs_read_byte() at kcs_read_byte+0x7d
> kcs_loop() at kcs_loop+0x372
> fork_exit() at fork_exit+0x135
> fork_trampoline() at fork_trampoline+0xe
> 
> I can type cont from ddb, wait some time, enter to ddb - trace for pid 113 
will 
> be same.
> 
> kcs_wait_for_obf() at kcs_wait_for_obf+0xb6 point to 
> /usr/src/sys/dev/ipmi/ipmi_kcs.c:94
> 
>   91                 while (ticks - start < MAX_TIMEOUT &&
>   92                     !(status & KCS_STATUS_OBF)) {
>   93                         DELAY(100);
>   94                         status = INB(sc, KCS_CTL_STS);
>   95                 }

Hummm.  I'm a bit out of ideas then.  Even the volatile change is a bug that
could have been confirmed (to see if volatile was preventing the compiler
from caching the value of 'ticks') by examining the assembly.

Well, maybe this.  This just avoids using 'ticks' altogether and depends on
DELAY(100) doing what it says:

Index: ipmi.c
===================================================================
--- ipmi.c	(revision 240999)
+++ ipmi.c	(working copy)
@@ -693,7 +693,7 @@
 	req = ipmi_alloc_driver_request(IPMI_ADDR(IPMI_APP_REQUEST, 0),
 	    IPMI_GET_DEVICE_ID, 0, 15);
 
-	error = ipmi_submit_driver_request(sc, req, MAX_TIMEOUT);
+	error = ipmi_submit_driver_request(sc, req, MAX_TIMEOUT * hz);
 	if (error == EWOULDBLOCK) {
 		device_printf(dev, "Timed out waiting for GET_DEVICE_ID\n");
 		ipmi_free_request(req);
Index: ipmi_kcs.c
===================================================================
--- ipmi_kcs.c	(revision 240999)
+++ ipmi_kcs.c	(working copy)
@@ -54,19 +54,22 @@
 static int
 kcs_wait_for_ibf(struct ipmi_softc *sc, int state)
 {
-	int status, start = ticks;
+	int i, status;
 
 	status = INB(sc, KCS_CTL_STS);
 	if (state == 0) {
 		/* WAIT FOR IBF = 0 */
-		while (ticks - start < MAX_TIMEOUT && status & KCS_STATUS_IBF) {
+		for (i = 0; i < MAX_TIMEOUT * 10000 && status & KCS_STATUS_IBF;
+		     i++) {
 			DELAY(100);
 			status = INB(sc, KCS_CTL_STS);
+			if (status & KCS_STATUS_IBF)
+				break;
 		}
 	} else {
 		/* WAIT FOR IBF = 1 */
-		while (ticks - start < MAX_TIMEOUT &&
-		    !(status & KCS_STATUS_IBF)) {
+		for (i = 0; i < MAX_TIMEOUT * 10000 &&
+		    !(status & KCS_STATUS_IBF); i++) {
 			DELAY(100);
 			status = INB(sc, KCS_CTL_STS);
 		}
@@ -77,19 +80,20 @@
 static int
 kcs_wait_for_obf(struct ipmi_softc *sc, int state)
 {
-	int status, start = ticks;
+	int i, status;
 
 	status = INB(sc, KCS_CTL_STS);
 	if (state == 0) {
 		/* WAIT FOR OBF = 0 */
-		while (ticks - start < MAX_TIMEOUT && status & KCS_STATUS_OBF) {
+		for (i = 0; i < MAX_TIMEOUT * 10000 && status & KCS_STATUS_OBF;
+		     i++) {
 			DELAY(100);
 			status = INB(sc, KCS_CTL_STS);
 		}
 	} else {
 		/* WAIT FOR OBF = 1 */
-		while (ticks - start < MAX_TIMEOUT &&
-		    !(status & KCS_STATUS_OBF)) {
+		for (i = 0; i < MAX_TIMEOUT * 10000 &&
+		    !(status & KCS_STATUS_OBF); i++) {
 			DELAY(100);
 			status = INB(sc, KCS_CTL_STS);
 		}
Index: ipmivars.h
===================================================================
--- ipmivars.h	(revision 240999)
+++ ipmivars.h	(working copy)
@@ -219,7 +219,7 @@
 	((sc)->ipmi_io_res[1] != NULL ? OUTB_MULTIPLE(sc, x, value) :	\
 	    OUTB_SINGLE(sc, x, value))
 
-#define MAX_TIMEOUT 3 * hz
+#define MAX_TIMEOUT 3		/* Seconds */
 
 int	ipmi_attach(device_t);
 int	ipmi_detach(device_t);

-- 
John Baldwin



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201209280848.35380.jhb>