Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 19 Aug 2002 01:18:22 +0200 (CEST)
From:      =?ISO-8859-1?Q?G=E9rard_Roudier?= <groudier@free.fr>
To:        Pete French <pfrench@firstcallgroup.co.uk>
Cc:        freebsd-bugs@FreeBSD.ORG, <njl@FreeBSD.ORG>, <webadmin@firstcallgroup.co.uk>
Subject:   Re: kern/37043: Latest stable causes SCSI bus freeze on sym0 when running SMP
Message-ID:  <20020819004743.K200-100000@localhost.my.domain>
In-Reply-To: <E17g8ry-000OSd-00@mailhost.firstcallgroup.co.uk>

next in thread | previous in thread | raw e-mail | index | archive | help

On Sat, 17 Aug 2002, Pete French wrote:

> > Synopsis: Latest stable causes SCSI bus freeze on sym0 when running SMP
> > State-Changed-From-To: feedback->closed
> > State-Changed-By: njl
> > State-Changed-When: Fri Aug 16 13:14:57 PDT 2002
> > State-Changed-Why:
>
> > Workaround is to not share interrupts between ATA and SCSI controllers.
> > This is not a complete fix so we should revisit this if others have the
> > same trouble in the future.
>
> Its not specificly ATA controllers - everyone else who had the problem
> was sharing interrupts with Ethernet adapters as I recall. But the fix
> does work.

It is indeed not a fix, but some last chance workaround.
(The patch against sym is at the end of this email)

Basically, the code tries to detect an interrupt stall and if such seems
to happen, it installs the work-around that just polls the interrupt
status of the chip 100 times per second.

Btw, the ncr had this just hardcoded since day one, but I disliked it for
the reason it can hide severe hardware or software flaws.

As PCI interrupt trigerring relies on level sensitive logic, an interrupt
stall should never happen. The risk is rather an interrupt storm if any
interrupt condition is not properly handled by software.

IMO, if an interrupt stall happens in PCI, then the cause can be either a
flawed/misconfigured piece of hardware that doesn't implement the correct
triggerring or a software bug that leaves the interrupt masked somewhere.

(IIRC, the problem didn't show up with IO/APIC but only happenned using
the legacy interrupt controller.)

May-be, users that get their system fixed by this work-around in sym
should also report a description of their system hardware and software.
This may help find out where the actual flaw actually is.


Regards,
  G=E9rard.

PS: The first line of the patch, i.e.:

+#define SYM_CONF_HANDLE_INTR_STALL

should be removed, if it happens that it will be worthwhile to commit this
code, in order to allow to conditionnaly compile the workaround.

--------------------- PATCH --------------------------

--- sym_hipd.c.orig=09Sun Jun  9 18:37:50 2002
+++ sym_hipd.c=09Sun Jun  9 16:36:07 2002
@@ -1,3 +1,7 @@
+#define SYM_CONF_HANDLE_INTR_STALL
+#if 0
+#define DEBUG_INTR_STALL
+#endif
 /*
  *  Device driver optimized for the Symbios/LSI 53C896/53C895A/53C1010
  *  PCI-SCSI controllers.
@@ -1922,6 +1926,17 @@
 =09struct sym_tblmove abrt_tbl;=09/* Table for the MOV of it =09*/
 =09struct sym_tblsel  abrt_sel;=09/* Sync params for selection=09*/
 =09u_char=09=09istat_sem;=09/* Tells the chip to stop (SEM)=09*/
+
+#ifdef SYM_CONF_HANDLE_INTR_STALL
+=09int stall_state;=09/* State of the algorithm */
+=09int stall_count;=09/* Number of intr stall observed */
+=09u_long intr_count;=09/* Real interrupt counter */
+=09u_long intr_prevc;=09/* Previous counter seen from clock hanlder */
+=09u_long clock_curr;=09/* Our clock in ticks */
+=09u_long clock_stall;=09/* Clock value at a possible stall */
+=09struct callout_handle clock_ch;/* Kernel timer alchemy :) */
+#define SYM_CLOCK_TICK=09((hz+99)/100)
+#endif
 };

 #define HCB_BA(np, lbl)=09    (np->hcb_ba      + offsetof(struct sym_hcb, =
lbl))
@@ -2513,6 +2528,10 @@
 static void sym_nvram_setup_target (hcb_p np, int targ, struct sym_nvram *=
nvp);
 static int sym_read_nvram (hcb_p np, struct sym_nvram *nvp);

+#ifdef SYM_CONF_HANDLE_INTR_STALL
+static void sym_clock_handler(void *arg);
+#endif
+
 /*
  *  Print something which allows to retrieve the controler type,
  *  unit, target, lun concerned by a kernel message.
@@ -4216,6 +4235,9 @@
 static void sym_intr(void *arg)
 {
 =09if (DEBUG_FLAGS & DEBUG_TINY) printf ("[");
+#ifdef SYM_CONF_HANDLE_INTR_STALL
+=09++((hcb_p)arg)->intr_count;
+#endif
 =09sym_intr1((hcb_p) arg);
 =09if (DEBUG_FLAGS & DEBUG_TINY) printf ("]");
 =09return;
@@ -9509,6 +9531,13 @@
 =09=09goto attach_failed;

 =09/*
+=09 * No comments for this one. :)
+=09 */
+#ifdef SYM_CONF_HANDLE_INTR_STALL
+=09np->clock_ch =3D timeout(sym_clock_handler, (caddr_t)np, SYM_CLOCK_TICK=
);
+#endif
+
+=09/*
 =09 *  Sigh! we are done.
 =09 */
 =09return 0;
@@ -10410,3 +10439,91 @@
 }

 #endif=09/* SYM_CONF_NVRAM_SUPPORT */
+
+#ifdef SYM_CONF_HANDLE_INTR_STALL
+/*
+ * The below code tries to detect interrupt stalls.
+ *
+ * It assumes that an interrupt condition raised
+ * in the chip interrupt status that is not serviced
+ * for 0.2 second is a possible stall.
+ *
+ * If such happens 5 times, it installs a work-around
+ * that forces interrupt service each time an interrupt
+ * condition is present in the chip interrupt status.
+ */
+
+static void sym_clock_handler(void *arg)
+{
+=09int s;
+=09hcb_p np;
+=09u_char istat;
+=09int intr_prevc;
+
+=09np =3D arg;
+=09if (!np)
+=09=09return;
+
+=09s =3D splcam();
+
+=09/*
+=09 * Update our clock and interrupt counter copy.
+=09 */
+=09intr_prevc =3D np->intr_prevc;
+=09np->intr_prevc =3D np->intr_count;
+=09np->clock_curr +=3D SYM_CLOCK_TICK;
+
+=09/*
+=09 * Read the chip interrupt status.
+=09 */
+=09istat =3D INB (nc_istat) & (INTF|SIP|DIP);
+
+=09/*
+=09 * Try to detect interrupt stalls.
+=09 */
+=09switch(np->stall_state) {
+=09default:
+=09case 0:=09/* Wait for the first unserviced interrupt condition */
+=09=09np->stall_count =3D 0;
+
+=09case 2:=09/* Wait for subsequent ones */
+=09=09if (istat) {
+=09=09=09np->clock_stall =3D np->clock_curr;
+=09=09=09np->stall_state =3D 1;
+=09=09}
+=09=09break;
+
+=09case 1:=09/* Detect a possible interrupt stall */
+#ifndef DEBUG_INTR_STALL
+=09=09if (intr_prevc !=3D np->intr_count || !istat) {
+=09=09=09np->stall_state =3D 2;
+=09=09=09break;
+=09=09}
+#endif
+=09=09if (((int)(np->clock_curr - np->clock_stall)) < (hz+4)/5)
+=09=09=09break;
+
+=09=09++np->stall_count;
+=09=09if (np->stall_count < 5) {
+=09=09=09np->stall_state =3D 2;
+=09=09=09printf("%s: interrupt stall, forcing service.\n",
+=09=09=09       sym_name(np));
+=09=09}
+=09=09else {
+=09=09=09np->stall_state =3D 3;
+=09=09=09printf("%s: interrupt stall, installing workaround.\n",
+=09=09=09       sym_name(np));
+=09=09}
+=09=09sym_intr1(np);
+=09=09break;
+
+=09case 3:=09/* Force service if interrupt condition is pending */
+=09=09if (istat)
+=09=09=09sym_intr1(np);
+=09=09break;
+=09}
+
+=09np->clock_ch =3D timeout(sym_clock_handler, (caddr_t)np, SYM_CLOCK_TICK=
);
+=09splx(s);
+}
+#endif /* SYM_CONF_HANDLE_INTR_STALL */


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-bugs" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20020819004743.K200-100000>