Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 26 Jul 2011 23:17:15 +0300
From:      Alexander Motin <mav@FreeBSD.org>
To:        lev@FreeBSD.org
Cc:        freebsd-hardware@freebsd.org
Subject:   Re: ahci.ko / geom_mirror / zfs hangs up system when one of HDDs faults.
Message-ID:  <4E2F20CB.30906@FreeBSD.org>
In-Reply-To: <125242768.20110724101257@serebryakov.spb.ru>
References:  <1981757790.20110720013856@serebryakov.spb.ru> <4E29A3D6.1080609@FreeBSD.org> <2710115660.20110723004620@serebryakov.spb.ru> <4E2B4B38.70207@FreeBSD.org> <125242768.20110724101257@serebryakov.spb.ru>

next in thread | previous in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.
--------------040609030808030904090208
Content-Type: text/plain; charset=windows-1251
Content-Transfer-Encoding: 8bit

Lev Serebryakov wrote:
> You wrote 24 июля 2011 г., 2:29:12:
>>>   I'm not sure that it is possible to update the firmware on these
>>> drives. And the MoBo BIOS looks like the latest one.
>> Then I have no idea what to do about the cause of the errors. As for the
>> consequences, I've tried to simulate a similar problem (device detected, but
>> doesn't respond). Recovery (dropping the failed device) took a lot of time,
>> but finally (after about 10 minutes) it succeeded and ZFS continued
>> operation without that drive. After that I've just committed one patch
>> to the HEAD and sent another one to freebsd-scsi@ for review. That, I
>> hope, should significantly (down to 1-2 minutes) speed up that process.
> 
>> How long have you waited before and after making that screenshot?
>   About an hour and a half -- the server stopped responding to
>  HTTP/SSH/SMTP/POP3 (but still responded to pings and traceroute), I
>  requested access to the remote console, tech support provided such access,
>  and this whole process took more than an hour.

Not sure it is related to your case, but the attached patch fixes a timeout
handling problem I found while testing a Marvell 88SE912x controller.
In my test scenario, without this patch some commands could get stuck inside
the controller indefinitely.

-- 
Alexander Motin

--------------040609030808030904090208
Content-Type: text/plain;
 name="ahci_wrong_ccs.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="ahci_wrong_ccs.patch"

Index: dev/ahci/ahci.c
===================================================================
--- dev/ahci/ahci.c	(revision 224305)
+++ dev/ahci/ahci.c	(working copy)
@@ -1879,12 +1879,13 @@
 			device_printf(dev, "Poll timeout on slot %d port %d\n",
 			    slot->slot, port);
 			device_printf(dev, "is %08x cs %08x ss %08x "
-			    "rs %08x tfd %02x serr %08x\n",
+			    "rs %08x tfd %02x serr %08x cmd %08x\n",
 			    ATA_INL(ch->r_mem, AHCI_P_IS),
 			    ATA_INL(ch->r_mem, AHCI_P_CI),
 			    ATA_INL(ch->r_mem, AHCI_P_SACT), ch->rslots,
 			    ATA_INL(ch->r_mem, AHCI_P_TFD),
-			    ATA_INL(ch->r_mem, AHCI_P_SERR));
+			    ATA_INL(ch->r_mem, AHCI_P_SERR),
+			    ATA_INL(ch->r_mem, AHCI_P_CMD));
 			et = AHCI_ERR_TIMEOUT;
 		}
 
@@ -1960,8 +1961,12 @@
 		ccs = (ATA_INL(ch->r_mem, AHCI_P_CMD) & AHCI_P_CMD_CCS_MASK)
 		    >> AHCI_P_CMD_CCS_SHIFT;
 		if ((sstatus & (1 << slot->slot)) != 0 || ccs == slot->slot ||
-		    ch->fbs_enabled)
+		    ch->fbs_enabled || ch->wrongccs)
 			slot->state = AHCI_SLOT_EXECUTING;
+		else if ((ch->rslots & (1 << ccs)) == 0) {
+			ch->wrongccs = 1;
+			slot->state = AHCI_SLOT_EXECUTING;
+		}
 
 		callout_reset(&slot->timeout,
 		    (int)slot->ccb->ccb_h.timeout * hz / 2000,
@@ -1971,10 +1976,12 @@
 
 	device_printf(dev, "Timeout on slot %d port %d\n",
 	    slot->slot, slot->ccb->ccb_h.target_id & 0x0f);
-	device_printf(dev, "is %08x cs %08x ss %08x rs %08x tfd %02x serr %08x\n",
+	device_printf(dev, "is %08x cs %08x ss %08x rs %08x tfd %02x "
+	    "serr %08x cmd %08x\n",
 	    ATA_INL(ch->r_mem, AHCI_P_IS), ATA_INL(ch->r_mem, AHCI_P_CI),
 	    ATA_INL(ch->r_mem, AHCI_P_SACT), ch->rslots,
-	    ATA_INL(ch->r_mem, AHCI_P_TFD), ATA_INL(ch->r_mem, AHCI_P_SERR));
+	    ATA_INL(ch->r_mem, AHCI_P_TFD), ATA_INL(ch->r_mem, AHCI_P_SERR),
+	    ATA_INL(ch->r_mem, AHCI_P_CMD));
 
 	/* Handle frozen command. */
 	if (ch->frozen) {
@@ -1987,7 +1994,7 @@
 		}
 		xpt_done(fccb);
 	}
-	if (!ch->fbs_enabled) {
+	if (!ch->fbs_enabled && !ch->wrongccs) {
 		/* Without FBS we know real timeout source. */
 		ch->fatalerr = 1;
 		/* Handle command with timeout. */
@@ -2585,6 +2592,7 @@
 		xpt_release_simq(ch->sim, TRUE);
 	ch->eslots = 0;
 	ch->toslots = 0;
+	ch->wrongccs = 0;
 	ch->fatalerr = 0;
 	/* Tell the XPT about the event */
 	xpt_async(AC_BUS_RESET, ch->path, NULL);
Index: dev/ahci/ahci.h
===================================================================
--- dev/ahci/ahci.h	(revision 224305)
+++ dev/ahci/ahci.h	(working copy)
@@ -426,6 +426,7 @@
 	int			resetting;	/* Hard-reset in progress. */
 	int			resetpolldiv;	/* Hard-reset poll divider. */
 	int			listening;	/* SUD bit is cleared. */
+	int			wrongccs;	/* CCS field in CMD was wrong */
 	union ccb		*frozen;	/* Frozen command */
 	struct callout		pm_timer;	/* Power management events */
 	struct callout		reset_timer;	/* Hard-reset timeout */

--------------040609030808030904090208--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?4E2F20CB.30906>