Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 25 Oct 2010 15:55:07 +0000
From:      Mark Johnston <mjohnston@sandvine.com>
To:        "freebsd-scsi@freebsd.org" <freebsd-scsi@freebsd.org>
Subject:   [Patch] camcontrol - determine the defect list length
Message-ID:  <649630C24D5E884F85DD13645FE903A7640B244F@wtl-exch-2.sandvine.com>

next in thread | raw e-mail | index | archive | help
Hi,

Below is a patch for camcontrol from the tree at my work. It was introduced=
 several years ago to resolve some issues that we were seeing in Adaptec's =
RAID controller firmware. The change is to the readdefects() function - whe=
n sending the request to read the defect list, instead of using a 65000-ent=
ry buffer, we wrote a new function to query the drive and determine the siz=
e of the defect list before allocating any memory. Apparently, if we send a=
 maximum length longer than the number of bytes in the defect list, the cam=
 call returns an overflow error due to what is probably a firmware bug.

It's probable that this issue has been fixed in Adaptec's firmware for a wh=
ile now, and that this change is no longer necessary to us. However, it's c=
onceivable that other devices could have the same problem, and as far as I =
can see, this patch shouldn't introduce any regressions - it essentially co=
nsists of sending an additional CAM request before getting the defect list.

I'd like to know whether this patch is worth trying to get committed into F=
reeBSD. If anyone has some comments or suggestions on the patch, I'd highly=
 appreciate it.

Thanks,
-Mark

--- camcontrol.c@@/main/RELENG_8/RELENG_8_1/svos_8_1/0	2010-10-21 10:15:24.=
000000000 -0400
+++ camcontrol.c@@/main/RELENG_8/RELENG_8_1/svos_8_1/1	2010-10-21 15:39:04.=
000000000 -0400
@@ -202,6 +202,8 @@ static int scanlun_or_reset_dev(int bus,
 #ifndef MINIMALISTIC
 static int readdefects(struct cam_device *device, int argc, char **argv,
 		       char *combinedopt, int retry_count, int timeout);
+static int readdefectlen(struct cam_device *device, int retry_count,=20
+			 int timeout, cam_argmask args, int *ret_len);
 static void modepage(struct cam_device *device, int argc, char **argv,
 		     char *combinedopt, int retry_count, int timeout);
 static int scsicmd(struct cam_device *device, int argc, char **argv,=20
@@ -1722,7 +1724,7 @@ readdefects(struct cam_device *device, i
 	union ccb *ccb =3D NULL;
 	struct scsi_read_defect_data_10 *rdd_cdb;
 	u_int8_t *defect_list =3D NULL;
-	u_int32_t dlist_length =3D 65000;
+	u_int32_t dlist_length =3D 0;
 	u_int32_t returned_length =3D 0;
 	u_int32_t num_returned =3D 0;
 	u_int8_t returned_format;
@@ -1764,12 +1766,23 @@ readdefects(struct cam_device *device, i
=20
 	ccb =3D cam_getccb(device);
=20
-	/*
-	 * Hopefully 65000 bytes is enough to hold the defect list.  If it
-	 * isn't, the disk is probably dead already.  We'd have to go with
-	 * 12 byte command (i.e. alloc_length is 32 bits instead of 16)
-	 * to hold them all.
-	 */
+	/* get the defect list length
+	 * We need to do this as a workaround for a bug in either the Adaptec RAI=
D or Hitachi scsi
+	 * firmware. For some reason, if we give them a max length greater than t=
he number of bytes
+	 * in the dlist + the header length, the cam call returns an overflow err=
or.
+	 * It doesn't look like it's in the driver, as this error seems to be pop=
ulated by the SRB
+	 * error, and the SRB is an interpreted byte array stored in the FIB, lea=
ding me to believe
+	 * that it is directly populated by the firmware.
+	 */
+	if((error =3D readdefectlen(device, retry_count, timeout, arglist, &dlist=
_length)))
+	{
+		warnx("unable to get defect list length");
+		goto defect_bailout;
+	}
+=09
+	/* the +=3D4 is to hold the header */
+	dlist_length +=3D4;
+=09
 	defect_list =3D malloc(dlist_length);
 	if (defect_list =3D=3D NULL) {
 		warnx("can't malloc memory for defect list");
@@ -2004,6 +2017,170 @@ defect_bailout:
=20
 	return(error);
 }
+
+/* returns the number of bytes in the appropriate defect list */
+static int readdefectlen(struct cam_device *device, int retry_count,=20
+			 int timeout, cam_argmask args, int *ret_len)
+{
+	struct scsi_read_defect_data_10 *rdd_cdb;
+	u_int8_t *defect_list =3D NULL;
+	union ccb *ccb =3D NULL;
+
+	u_int8_t returned_format;
+	/*4 bytes will hold the defect header*/
+	u_int32_t dlist_length =3D 4;
+	u_int32_t returned_length =3D 0;
+=09
+	int error =3D 0;
+	int lists_specified =3D 0;
+
+	ccb =3D cam_getccb(device);
+
+	defect_list =3D malloc(dlist_length);
+	if (defect_list =3D=3D NULL) {
+		warnx("can't malloc memory for defect list");
+		error =3D 1;
+		goto defect_len_bailout;
+	}
+
+	rdd_cdb =3D(struct scsi_read_defect_data_10 *)&ccb->csio.cdb_io.cdb_bytes=
;
+
+	/*Essentially just delete stuff from here on*/
+	/*
+	 * cam_getccb() zeros the CCB header only.  So we need to zero the
+	 * payload portion of the ccb.
+	 */
+	bzero(&(&ccb->ccb_h)[1],
+	      sizeof(struct ccb_scsiio) - sizeof(struct ccb_hdr));
+
+	cam_fill_csio(&ccb->csio,
+		      /*retries*/ retry_count,
+		      /*cbfcnp*/ NULL,
+		      /*flags*/ CAM_DIR_IN | ((arglist & CAM_ARG_ERR_RECOVER) ?
+					      CAM_PASS_ERR_RECOVER : 0),
+		      /*tag_action*/ MSG_SIMPLE_Q_TAG,
+		      /*data_ptr*/ defect_list,
+		      /*dxfer_len*/ dlist_length,
+		      /*sense_len*/ SSD_FULL_SIZE,
+		      /*cdb_len*/ sizeof(struct scsi_read_defect_data_10),
+		      /*timeout*/ timeout ? timeout : 5000);
+
+	rdd_cdb->opcode =3D READ_DEFECT_DATA_10;
+	if (args & CAM_ARG_FORMAT_BLOCK)
+		rdd_cdb->format =3D SRDD10_BLOCK_FORMAT;
+	else if (args & CAM_ARG_FORMAT_BFI)
+		rdd_cdb->format =3D SRDD10_BYTES_FROM_INDEX_FORMAT;
+	else if (args & CAM_ARG_FORMAT_PHYS)
+		rdd_cdb->format =3D SRDD10_PHYSICAL_SECTOR_FORMAT;
+	else {
+		error =3D 1;
+		warnx("no defect list format specified");
+		goto defect_len_bailout;
+	}
+	if (args & CAM_ARG_PLIST) {
+		rdd_cdb->format |=3D SRDD10_PLIST;
+		lists_specified++;
+	}
+
+	if (args & CAM_ARG_GLIST) {
+		rdd_cdb->format |=3D SRDD10_GLIST;
+		lists_specified++;
+	}
+
+	scsi_ulto2b(dlist_length, rdd_cdb->alloc_length);
+
+	/* Disable freezing the device queue */
+	ccb->ccb_h.flags |=3D CAM_DEV_QFRZDIS;
+
+	if (cam_send_ccb(device, ccb) < 0) {
+		perror("error reading defect list");
+
+		if (args & CAM_ARG_VERBOSE) {
+			cam_error_print(device, ccb, CAM_ESF_ALL,
+					CAM_EPF_ALL, stderr);
+		}
+
+		error =3D 1;
+		goto defect_len_bailout;
+	}
+
+	returned_length =3D scsi_2btoul(((struct
+		scsi_read_defect_data_hdr_10 *)defect_list)->length);
+
+	returned_format =3D ((struct scsi_read_defect_data_hdr_10 *)
+			defect_list)->format;
+
+	if (((ccb->ccb_h.status & CAM_STATUS_MASK) =3D=3D CAM_SCSI_STATUS_ERROR)
+	 && (ccb->csio.scsi_status =3D=3D SCSI_STATUS_CHECK_COND)
+	 && ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) !=3D 0)) {
+		struct scsi_sense_data *sense;
+		int error_code, sense_key, asc, ascq;
+
+		sense =3D &ccb->csio.sense_data;
+		scsi_extract_sense(sense, &error_code, &sense_key, &asc, &ascq);
+
+		/*
+		 * According to the SCSI spec, if the disk doesn't support
+		 * the requested format, it will generally return a sense
+		 * key of RECOVERED ERROR, and an additional sense code
+		 * of "DEFECT LIST NOT FOUND".  So, we check for that, and
+		 * also check to make sure that the returned length is
+		 * greater than 0, and then print out whatever format the
+		 * disk gave us.
+		 */
+		if ((sense_key =3D=3D SSD_KEY_RECOVERED_ERROR)
+		 && (asc =3D=3D 0x1c) && (ascq =3D=3D 0x00)
+		 && (returned_length > 0)) {
+			warnx("requested defect format not available");
+			switch(returned_format & SRDDH10_DLIST_FORMAT_MASK) {
+			case SRDD10_BLOCK_FORMAT:
+				warnx("Device returned block format");
+				break;
+			case SRDD10_BYTES_FROM_INDEX_FORMAT:
+				warnx("Device returned bytes from index"
+				      " format");
+				break;
+			case SRDD10_PHYSICAL_SECTOR_FORMAT:
+				warnx("Device returned physical sector format");
+				break;
+			default:
+				error =3D 1;
+				warnx("Device returned unknown defect"
+				     " data format %#x", returned_format);
+				goto defect_len_bailout;
+				break; /* NOTREACHED */
+			}
+		} else {
+			error =3D 1;
+			warnx("Error returned from read defect data command");
+			if (args & CAM_ARG_VERBOSE)
+				cam_error_print(device, ccb, CAM_ESF_ALL,
+						CAM_EPF_ALL, stderr);
+			goto defect_len_bailout;
+		}
+	} else if ((ccb->ccb_h.status & CAM_STATUS_MASK) !=3D CAM_REQ_CMP) {
+		error =3D 1;
+		warnx("Error returned from read defect data command");
+		if (args & CAM_ARG_VERBOSE)
+			cam_error_print(device, ccb, CAM_ESF_ALL,
+					CAM_EPF_ALL, stderr);
+		goto defect_len_bailout;
+	}
+
+	*ret_len =3D returned_length;
+	return 0;
+defect_len_bailout:
+
+	if (defect_list !=3D NULL)
+		free(defect_list);
+
+	if (ccb !=3D NULL)
+		cam_freeccb(ccb);
+
+	return(error);
+
+}
+
 #endif /* MINIMALISTIC */
=20
 #if 0



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?649630C24D5E884F85DD13645FE903A7640B244F>