Date: Wed, 6 Sep 2017 09:19:54 +0000 (UTC) From: Scott Long <scottl@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r323215 - in head/sys/dev: mpr mps Message-ID: <201709060919.v869Jsk9060601@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: scottl Date: Wed Sep 6 09:19:54 2017 New Revision: 323215 URL: https://svnweb.freebsd.org/changeset/base/323215 Log: Checkpoint the next phase in debug message cleanup, this time focusing on error recovery messages. Sponsored by: Netflix Modified: head/sys/dev/mpr/mpr_sas.c head/sys/dev/mps/mps_sas.c Modified: head/sys/dev/mpr/mpr_sas.c ============================================================================== --- head/sys/dev/mpr/mpr_sas.c Wed Sep 6 07:19:22 2017 (r323214) +++ head/sys/dev/mpr/mpr_sas.c Wed Sep 6 09:19:54 2017 (r323215) @@ -1286,33 +1286,39 @@ mprsas_logical_unit_reset_complete(struct mpr_softc *s * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { - mpr_dprint(sc, MPR_ERROR, "%s: cm_flags = %#x for LUN reset! " + mpr_dprint(sc, MPR_RECOVERY|MPR_ERROR, + "%s: cm_flags = %#x for LUN reset! " "This should not happen!\n", __func__, tm->cm_flags); mprsas_free_tm(sc, tm); return; } if (reply == NULL) { - mprsas_log_command(tm, MPR_RECOVERY, "NULL reset reply for tm " - "%p\n", tm); + mpr_dprint(sc, MPR_RECOVERY, "NULL reset reply for tm %p\n", + tm); if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ + mpr_dprint(sc, MPR_RECOVERY, "Hardware undergoing " + "reset, ignoring NULL LUN reset reply\n"); targ->tm = NULL; mprsas_free_tm(sc, tm); } else { /* we should have gotten a reply. */ + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "NULL reply on " + "LUN reset attempt, resetting controller\n"); mpr_reinit(sc); } return; } - mprsas_log_command(tm, MPR_RECOVERY, + mpr_dprint(sc, MPR_RECOVERY, "logical unit reset status 0x%x code 0x%x count %u\n", le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); - - /* See if there are any outstanding commands for this LUN. + + /* + * See if there are any outstanding commands for this LUN. * This could be made more efficient by using a per-LU data * structure of some sort. */ @@ -1322,34 +1328,36 @@ mprsas_logical_unit_reset_complete(struct mpr_softc *s } if (cm_count == 0) { - mprsas_log_command(tm, MPR_RECOVERY|MPR_INFO, - "logical unit %u finished recovery after reset\n", - tm->cm_lun, tm); + mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, + "Finished recovery after LUN reset for target %u\n", + targ->tid); - mprsas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, + mprsas_announce_reset(sc, AC_SENT_BDR, targ->tid, tm->cm_lun); - /* we've finished recovery for this logical unit. check and + /* + * We've finished recovery for this logical unit. check and * see if some other logical unit has a timedout command * that needs to be processed. */ cm = TAILQ_FIRST(&targ->timedout_commands); if (cm) { + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, + "More commands to abort for target %u\n", targ->tid); mprsas_send_abort(sc, tm, cm); - } - else { + } else { targ->tm = NULL; mprsas_free_tm(sc, tm); } - } - else { + } else { /* if we still have commands for this LUN, the reset * effectively failed, regardless of the status reported. * Escalate to a target reset. */ - mprsas_log_command(tm, MPR_RECOVERY, - "logical unit reset complete for tm %p, but still have %u " - "command(s)\n", tm, cm_count); + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, + "logical unit reset complete for target %u, but still " + "have %u command(s), sending target reset\n", targ->tid, + cm_count); mprsas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET); } @@ -1381,46 +1389,53 @@ mprsas_target_reset_complete(struct mpr_softc *sc, str } if (reply == NULL) { - mprsas_log_command(tm, MPR_RECOVERY, "NULL reset reply for tm " - "%p\n", tm); + mpr_dprint(sc, MPR_RECOVERY, + "NULL target reset reply for tm %p TaskMID %u\n", + tm, le16toh(req->TaskMID)); if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ + mpr_dprint(sc, MPR_RECOVERY, "Hardware undergoing " + "reset, ignoring NULL target reset reply\n"); targ->tm = NULL; mprsas_free_tm(sc, tm); } else { /* we should have gotten a reply. */ + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "NULL reply on " + "target reset attempt, resetting controller\n"); mpr_reinit(sc); } return; } - mprsas_log_command(tm, MPR_RECOVERY, + mpr_dprint(sc, MPR_RECOVERY, "target reset status 0x%x code 0x%x count %u\n", le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); if (targ->outstanding == 0) { - /* we've finished recovery for this target and all + /* + * We've finished recovery for this target and all * of its logical units. */ - mprsas_log_command(tm, MPR_RECOVERY|MPR_INFO, - "recovery finished after target reset\n"); + mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, + "Finished reset recovery for target %u\n", targ->tid); mprsas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, CAM_LUN_WILDCARD); targ->tm = NULL; mprsas_free_tm(sc, tm); - } - else { - /* after a target reset, if this target still has + } else { + /* + * After a target reset, if this target still has * outstanding commands, the reset effectively failed, * regardless of the status reported. escalate. */ - mprsas_log_command(tm, MPR_RECOVERY, - "target reset complete for tm %p, but still have %u " - "command(s)\n", tm, targ->outstanding); + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, + "Target reset complete for target %u, but still have %u " + "command(s), resetting controller\n", targ->tid, + targ->outstanding); mpr_reinit(sc); } } @@ -1450,20 +1465,20 @@ mprsas_send_reset(struct mpr_softc *sc, struct mpr_com /* XXX Need to handle invalid LUNs */ MPR_SET_LUN(req->LUN, tm->cm_lun); tm->cm_targ->logical_unit_resets++; - mprsas_log_command(tm, MPR_RECOVERY|MPR_INFO, - "sending logical unit reset\n"); + mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, + "Sending logical unit reset to target %u lun %d\n", + target->tid, tm->cm_lun); tm->cm_complete = mprsas_logical_unit_reset_complete; mprsas_prepare_for_tm(sc, tm, target, tm->cm_lun); - } - else if (type == MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET) { + } else if (type == MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET) { /* * Target reset method = * SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; tm->cm_targ->target_resets++; - mprsas_log_command(tm, MPR_RECOVERY|MPR_INFO, - "sending target reset\n"); + mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, + "Sending target reset to target %u\n", target->tid); tm->cm_complete = mprsas_target_reset_complete; mprsas_prepare_for_tm(sc, tm, target, CAM_LUN_WILDCARD); } @@ -1472,12 +1487,11 @@ mprsas_send_reset(struct mpr_softc *sc, struct mpr_com return -1; } - mpr_dprint(sc, MPR_INFO, "to target %u handle 0x%04x\n", target->tid, - target->handle); if (target->encl_level_valid) { - mpr_dprint(sc, MPR_INFO, "At enclosure level %d, slot %d, " - "connector name (%4s)\n", target->encl_level, - target->encl_slot, target->connector_name); + mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, + "At enclosure level %d, slot %d, connector name (%4s)\n", + target->encl_level, target->encl_slot, + target->connector_name); } tm->cm_data = NULL; @@ -1490,7 +1504,7 @@ mprsas_send_reset(struct mpr_softc *sc, struct mpr_com err = mpr_map_command(sc, tm); if (err) - mprsas_log_command(tm, MPR_RECOVERY, + mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, "error %d sending reset type %u\n", err, type); return err; @@ -1517,7 +1531,7 @@ mprsas_abort_complete(struct mpr_softc *sc, struct mpr * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) { - mprsas_log_command(tm, MPR_RECOVERY, + mpr_dprint(sc, MPR_RECOVERY|MPR_ERROR, "cm_flags = %#x for abort %p TaskMID %u!\n", tm->cm_flags, tm, le16toh(req->TaskMID)); mprsas_free_tm(sc, tm); @@ -1525,22 +1539,25 @@ mprsas_abort_complete(struct mpr_softc *sc, struct mpr } if (reply == NULL) { - mprsas_log_command(tm, MPR_RECOVERY, + mpr_dprint(sc, MPR_RECOVERY, "NULL abort reply for tm %p TaskMID %u\n", tm, le16toh(req->TaskMID)); if ((sc->mpr_flags & MPR_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ + mpr_dprint(sc, MPR_RECOVERY, "Hardware undergoing " + "reset, ignoring NULL abort reply\n"); targ->tm = NULL; mprsas_free_tm(sc, tm); - } - else { + } else { /* we should have gotten a reply. */ + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, "NULL reply on " + "abort attempt, resetting controller\n"); mpr_reinit(sc); } return; } - mprsas_log_command(tm, MPR_RECOVERY, + mpr_dprint(sc, MPR_RECOVERY, "abort TaskMID %u status 0x%x code 0x%x count %u\n", le16toh(req->TaskMID), le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), @@ -1548,31 +1565,27 @@ mprsas_abort_complete(struct mpr_softc *sc, struct mpr cm = TAILQ_FIRST(&tm->cm_targ->timedout_commands); if (cm == NULL) { - /* if there are no more timedout commands, we're done with + /* + * if there are no more timedout commands, we're done with * error recovery for this target. */ - mprsas_log_command(tm, MPR_RECOVERY, - "finished recovery after aborting TaskMID %u\n", - le16toh(req->TaskMID)); - + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, + "Finished abort recovery for target %u\n", targ->tid); targ->tm = NULL; mprsas_free_tm(sc, tm); - } - else if (le16toh(req->TaskMID) != cm->cm_desc.Default.SMID) { + } else if (le16toh(req->TaskMID) != cm->cm_desc.Default.SMID) { /* abort success, but we have more timedout commands to abort */ - mprsas_log_command(tm, MPR_RECOVERY, - "continuing recovery after aborting TaskMID %u\n", - le16toh(req->TaskMID)); - + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, + "Continuing abort recovery for target %u\n", targ->tid); mprsas_send_abort(sc, tm, cm); - } - else { - /* we didn't get a command completion, so the abort + } else { + /* + * we didn't get a command completion, so the abort * failed as far as we're concerned. escalate. */ - mprsas_log_command(tm, MPR_RECOVERY, - "abort failed for TaskMID %u tm %p\n", - le16toh(req->TaskMID), tm); + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, + "Abort failed for target %u, sending logical unit reset\n", + targ->tid); mprsas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET); @@ -1591,7 +1604,8 @@ mprsas_send_abort(struct mpr_softc *sc, struct mpr_com targ = cm->cm_targ; if (targ->handle == 0) { - mpr_dprint(sc, MPR_ERROR,"%s null devhandle for target_id %d\n", + mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, + "%s null devhandle for target_id %d\n", __func__, cm->cm_ccb->ccb_h.target_id); return -1; } @@ -1622,13 +1636,11 @@ mprsas_send_abort(struct mpr_softc *sc, struct mpr_com targ->aborts++; - mpr_dprint(sc, MPR_INFO, "Sending reset from %s for target ID %d\n", - __func__, targ->tid); mprsas_prepare_for_tm(sc, tm, targ, tm->cm_lun); err = mpr_map_command(sc, tm); if (err) - mpr_dprint(sc, MPR_RECOVERY, + mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, "error %d sending abort for cm %p SMID %u\n", err, cm, req->TaskMID); return err; @@ -1637,17 +1649,21 @@ mprsas_send_abort(struct mpr_softc *sc, struct mpr_com static void mprsas_scsiio_timeout(void *data) { + sbintime_t elapsed, now; + union ccb *ccb; struct mpr_softc *sc; struct mpr_command *cm; struct mprsas_target *targ; cm = (struct mpr_command *)data; sc = cm->cm_sc; + ccb = cm->cm_ccb; + now = sbinuptime(); MPR_FUNCTRACE(sc); mtx_assert(&sc->mpr_mtx, MA_OWNED); - mpr_dprint(sc, MPR_XINFO, "Timeout checking cm %p\n", cm); + mpr_dprint(sc, MPR_XINFO|MPR_RECOVERY, "Timeout checking cm %p\n", cm); /* * Run the interrupt handler to make sure it's not pending. This @@ -1669,13 +1685,15 @@ mprsas_scsiio_timeout(void *data) targ = cm->cm_targ; targ->timeouts++; - mprsas_log_command(cm, MPR_ERROR, "command timeout %d cm %p target " - "%u, handle(0x%04x)\n", cm->cm_ccb->ccb_h.timeout, cm, targ->tid, - targ->handle); + elapsed = now - ccb->ccb_h.qos.sim_data; + mprsas_log_command(cm, MPR_INFO|MPR_RECOVERY, + "Command timeout on target %u(0x%04x), %d set, %d.%d elapsed\n", + targ->tid, targ->handle, ccb->ccb_h.timeout, + sbintime_getsec(elapsed), elapsed & 0xffffffff); if (targ->encl_level_valid) { - mpr_dprint(sc, MPR_ERROR, "At enclosure level %d, slot %d, " - "connector name (%4s)\n", targ->encl_level, targ->encl_slot, - targ->connector_name); + mpr_dprint(sc, MPR_INFO|MPR_RECOVERY, + "At enclosure level %d, slot %d, connector name (%4s)\n", + targ->encl_level, targ->encl_slot, targ->connector_name); } /* XXX first, check the firmware state, to see if it's still @@ -1693,10 +1711,13 @@ mprsas_scsiio_timeout(void *data) "processing by tm %p\n", cm, targ->tm); } else if ((targ->tm = mprsas_alloc_tm(sc)) != NULL) { - mpr_dprint(sc, MPR_RECOVERY, "timedout cm %p allocated tm %p\n", - cm, targ->tm); /* start recovery by aborting the first timedout command */ + mpr_dprint(sc, MPR_RECOVERY|MPR_INFO, + "Sending abort to target %u for SMID %d\n", targ->tid, + cm->cm_desc.Default.SMID); + mpr_dprint(sc, MPR_RECOVERY, "timedout cm %p allocated tm %p\n", + cm, targ->tm); mprsas_send_abort(sc, targ->tm, cm); } else { @@ -1709,8 +1730,8 @@ mprsas_scsiio_timeout(void *data) * more credits than disks in an enclosure, and limit * ourselves to one TM per target for recovery. */ - mpr_dprint(sc, MPR_RECOVERY, "timedout cm %p failed to " - "allocate a tm\n", cm); + mpr_dprint(sc, MPR_ERROR|MPR_RECOVERY, + "timedout cm %p failed to allocate a tm\n", cm); } } @@ -1843,6 +1864,7 @@ mprsas_build_nvme_unmap(struct mpr_softc *sc, struct m cm->cm_desc.Default.RequestFlags = MPI26_REQ_DESCRIPT_FLAGS_PCIE_ENCAPSULATED; + csio->ccb_h.qos.sim_data = sbinuptime(); #if __FreeBSD_version >= 1000029 callout_reset_sbt(&cm->cm_callout, SBT_1MS * ccb->ccb_h.timeout, 0, mprsas_scsiio_timeout, cm, 0); @@ -2161,6 +2183,7 @@ mprsas_action_scsiio(struct mprsas_softc *sassc, union cm->cm_desc.SCSIIO.DevHandle = htole16(targ->handle); } + csio->ccb_h.qos.sim_data = sbinuptime(); #if __FreeBSD_version >= 1000029 callout_reset_sbt(&cm->cm_callout, SBT_1MS * ccb->ccb_h.timeout, 0, mprsas_scsiio_timeout, cm, 0); @@ -2936,9 +2959,14 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mp * avoiding getting into an infinite retry loop. */ mprsas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); - mprsas_log_command(cm, MPR_INFO, - "terminated ioc %x loginfo %x scsi %x state %x xfer %u\n", - le16toh(rep->IOCStatus), le32toh(rep->IOCLogInfo), + mpr_dprint(sc, MPR_INFO, + "Controller reported %s status for target %u SMID %u, " + "loginfo %x\n", ((rep->IOCStatus & MPI2_IOCSTATUS_MASK) == + MPI2_IOCSTATUS_SCSI_IOC_TERMINATED) ? "IOC_TERMINATED" : + "EXT_TERMINATED", target_id, cm->cm_desc.Default.SMID, + le32toh(rep->IOCLogInfo)); + mpr_dprint(sc, MPR_XINFO, + "SCSIStatus %x SCSIState %x xfercount %u\n", rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); break; Modified: head/sys/dev/mps/mps_sas.c ============================================================================== --- head/sys/dev/mps/mps_sas.c Wed Sep 6 07:19:22 2017 (r323214) +++ head/sys/dev/mps/mps_sas.c Wed Sep 6 09:19:54 2017 (r323215) @@ -1227,33 +1227,39 @@ mpssas_logical_unit_reset_complete(struct mps_softc *s * XXXSL So should it be an assertion? */ if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { - mps_dprint(sc, MPS_ERROR, "%s: cm_flags = %#x for LUN reset! " - "This should not happen!\n", __func__, tm->cm_flags); + mps_dprint(sc, MPS_RECOVERY|MPS_ERROR, + "%s: cm_flags = %#x for LUN reset! " + "This should not happen!\n", __func__, tm->cm_flags); mpssas_free_tm(sc, tm); return; } if (reply == NULL) { - mpssas_log_command(tm, MPS_RECOVERY, - "NULL reset reply for tm %p\n", tm); + mps_dprint(sc, MPS_RECOVERY, "NULL reset reply for tm %p\n", + tm); if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ + mps_dprint(sc, MPS_RECOVERY, "Hardware undergoing " + "reset, ignoring NULL LUN reset reply\n"); targ->tm = NULL; mpssas_free_tm(sc, tm); } else { /* we should have gotten a reply. */ + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, "NULL reply on " + "LUN reset attempt, resetting controller\n"); mps_reinit(sc); } return; } - mpssas_log_command(tm, MPS_RECOVERY, + mps_dprint(sc, MPS_RECOVERY, "logical unit reset status 0x%x code 0x%x count %u\n", le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); - /* See if there are any outstanding commands for this LUN. + /* + * See if there are any outstanding commands for this LUN. * This could be made more efficient by using a per-LU data * structure of some sort. */ @@ -1263,34 +1269,37 @@ mpssas_logical_unit_reset_complete(struct mps_softc *s } if (cm_count == 0) { - mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, - "logical unit %u finished recovery after reset\n", - tm->cm_lun, tm); + mps_dprint(sc, MPS_RECOVERY|MPS_INFO, + "Finished recovery after LUN reset for target %u\n", + targ->tid); - mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, - tm->cm_lun); + mpssas_announce_reset(sc, AC_SENT_BDR, targ->tid, tm->cm_lun); - /* we've finished recovery for this logical unit. check and + /* + * We've finished recovery for this logical unit. check and * see if some other logical unit has a timedout command * that needs to be processed. */ cm = TAILQ_FIRST(&targ->timedout_commands); if (cm) { + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, + "More commands to abort for target %u\n", + targ->tid); mpssas_send_abort(sc, tm, cm); - } - else { + } else { targ->tm = NULL; mpssas_free_tm(sc, tm); } - } - else { - /* if we still have commands for this LUN, the reset + } else { + /* + * If we still have commands for this LUN, the reset * effectively failed, regardless of the status reported. * Escalate to a target reset. */ - mpssas_log_command(tm, MPS_RECOVERY, - "logical unit reset complete for tm %p, but still have %u command(s)\n", - tm, cm_count); + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, + "logical unit reset complete for target %u, but still " + "have %u command(s), sending target reset\n", targ->tid, + cm_count); mpssas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET); } @@ -1322,21 +1331,25 @@ mpssas_target_reset_complete(struct mps_softc *sc, str } if (reply == NULL) { - mpssas_log_command(tm, MPS_RECOVERY, - "NULL reset reply for tm %p\n", tm); + mps_dprint(sc, MPS_RECOVERY, + "NULL target reset reply for tm %pi TaskMID %u\n", + tm, le16toh(req->TaskMID)); if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ + mps_dprint(sc, MPS_RECOVERY, "Hardware undergoing " + "reset, ignoring NULL target reset reply\n"); targ->tm = NULL; mpssas_free_tm(sc, tm); - } - else { + } else { /* we should have gotten a reply. */ + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, "NULL reply on " + "target reset attempt, resetting controller\n"); mps_reinit(sc); } return; } - mpssas_log_command(tm, MPS_RECOVERY, + mps_dprint(sc, MPS_RECOVERY, "target reset status 0x%x code 0x%x count %u\n", le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), le32toh(reply->TerminationCount)); @@ -1345,23 +1358,24 @@ mpssas_target_reset_complete(struct mps_softc *sc, str /* we've finished recovery for this target and all * of its logical units. */ - mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, - "recovery finished after target reset\n"); + mps_dprint(sc, MPS_RECOVERY|MPS_INFO, + "Finished reset recovery for target %u\n", targ->tid); mpssas_announce_reset(sc, AC_SENT_BDR, tm->cm_targ->tid, CAM_LUN_WILDCARD); targ->tm = NULL; mpssas_free_tm(sc, tm); - } - else { - /* after a target reset, if this target still has + } else { + /* + * After a target reset, if this target still has * outstanding commands, the reset effectively failed, * regardless of the status reported. escalate. */ - mpssas_log_command(tm, MPS_RECOVERY, - "target reset complete for tm %p, but still have %u command(s)\n", - tm, targ->outstanding); + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, + "Target reset complete for target %u, but still have %u " + "command(s), resetting controller\n", targ->tid, + targ->outstanding); mps_reinit(sc); } } @@ -1391,24 +1405,23 @@ mpssas_send_reset(struct mps_softc *sc, struct mps_com /* XXX Need to handle invalid LUNs */ MPS_SET_LUN(req->LUN, tm->cm_lun); tm->cm_targ->logical_unit_resets++; - mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, - "sending logical unit reset\n"); + mps_dprint(sc, MPS_RECOVERY|MPS_INFO, + "Sending logical unit reset to target %u lun %d\n", + target->tid, tm->cm_lun); tm->cm_complete = mpssas_logical_unit_reset_complete; mpssas_prepare_for_tm(sc, tm, target, tm->cm_lun); - } - else if (type == MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET) { + } else if (type == MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET) { /* * Target reset method = * SAS Hard Link Reset / SATA Link Reset */ req->MsgFlags = MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET; tm->cm_targ->target_resets++; - mpssas_log_command(tm, MPS_RECOVERY|MPS_INFO, - "sending target reset\n"); + mps_dprint(sc, MPS_RECOVERY|MPS_INFO, + "Sending target reset to target %u\n", target->tid); tm->cm_complete = mpssas_target_reset_complete; mpssas_prepare_for_tm(sc, tm, target, CAM_LUN_WILDCARD); - } - else { + } else { mps_dprint(sc, MPS_ERROR, "unexpected reset type 0x%x\n", type); return -1; } @@ -1422,7 +1435,7 @@ mpssas_send_reset(struct mps_softc *sc, struct mps_com err = mps_map_command(sc, tm); if (err) - mpssas_log_command(tm, MPS_RECOVERY, + mps_dprint(sc, MPS_ERROR|MPS_RECOVERY, "error %d sending reset type %u\n", err, type); @@ -1450,7 +1463,7 @@ mpssas_abort_complete(struct mps_softc *sc, struct mps * task management commands don't have S/G lists. */ if ((tm->cm_flags & MPS_CM_FLAGS_ERROR_MASK) != 0) { - mpssas_log_command(tm, MPS_RECOVERY, + mps_dprint(sc, MPS_RECOVERY, "cm_flags = %#x for abort %p TaskMID %u!\n", tm->cm_flags, tm, le16toh(req->TaskMID)); mpssas_free_tm(sc, tm); @@ -1458,22 +1471,25 @@ mpssas_abort_complete(struct mps_softc *sc, struct mps } if (reply == NULL) { - mpssas_log_command(tm, MPS_RECOVERY, + mps_dprint(sc, MPS_RECOVERY, "NULL abort reply for tm %p TaskMID %u\n", tm, le16toh(req->TaskMID)); if ((sc->mps_flags & MPS_FLAGS_DIAGRESET) != 0) { /* this completion was due to a reset, just cleanup */ + mps_dprint(sc, MPS_RECOVERY, "Hardware undergoing " + "reset, ignoring NULL abort reply\n"); targ->tm = NULL; mpssas_free_tm(sc, tm); - } - else { + } else { /* we should have gotten a reply. */ + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, "NULL reply on " + "abort attempt, resetting controller\n"); mps_reinit(sc); } return; } - mpssas_log_command(tm, MPS_RECOVERY, + mps_dprint(sc, MPS_RECOVERY, "abort TaskMID %u status 0x%x code 0x%x count %u\n", le16toh(req->TaskMID), le16toh(reply->IOCStatus), le32toh(reply->ResponseCode), @@ -1481,31 +1497,28 @@ mpssas_abort_complete(struct mps_softc *sc, struct mps cm = TAILQ_FIRST(&tm->cm_targ->timedout_commands); if (cm == NULL) { - /* if there are no more timedout commands, we're done with + /* + * If there are no more timedout commands, we're done with * error recovery for this target. */ - mpssas_log_command(tm, MPS_RECOVERY, - "finished recovery after aborting TaskMID %u\n", - le16toh(req->TaskMID)); + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, + "Finished abort recovery for target %u\n", targ->tid); targ->tm = NULL; mpssas_free_tm(sc, tm); - } - else if (le16toh(req->TaskMID) != cm->cm_desc.Default.SMID) { + } else if (le16toh(req->TaskMID) != cm->cm_desc.Default.SMID) { /* abort success, but we have more timedout commands to abort */ - mpssas_log_command(tm, MPS_RECOVERY, - "continuing recovery after aborting TaskMID %u\n", - le16toh(req->TaskMID)); + mps_dprint(sc, MPS_INFO|MPS_RECOVERY, + "Continuing abort recovery for target %u\n", targ->tid); mpssas_send_abort(sc, tm, cm); - } - else { + } else { /* we didn't get a command completion, so the abort * failed as far as we're concerned. escalate. */ - mpssas_log_command(tm, MPS_RECOVERY, - "abort failed for TaskMID %u tm %p\n", - le16toh(req->TaskMID), tm); + mps_dprint(sc, MPS_RECOVERY, + "Abort failed for target %u, sending logical unit reset\n", + targ->tid); mpssas_send_reset(sc, tm, MPI2_SCSITASKMGMT_TASKTYPE_LOGICAL_UNIT_RESET); @@ -1523,7 +1536,8 @@ mpssas_send_abort(struct mps_softc *sc, struct mps_com targ = cm->cm_targ; if (targ->handle == 0) { - mps_dprint(sc, MPS_ERROR,"%s null devhandle for target_id %d\n", + mps_dprint(sc, MPS_ERROR|MPS_RECOVERY, + "%s null devhandle for target_id %d\n", __func__, cm->cm_ccb->ccb_h.target_id); return -1; } @@ -1553,13 +1567,11 @@ mpssas_send_abort(struct mps_softc *sc, struct mps_com targ->aborts++; - mps_dprint(sc, MPS_INFO, "Sending reset from %s for target ID %d\n", - __func__, targ->tid); mpssas_prepare_for_tm(sc, tm, targ, tm->cm_lun); err = mps_map_command(sc, tm); if (err) - mps_dprint(sc, MPS_RECOVERY, + mps_dprint(sc, MPS_ERROR|MPS_RECOVERY, "error %d sending abort for cm %p SMID %u\n", err, cm, req->TaskMID); return err; @@ -1568,17 +1580,21 @@ mpssas_send_abort(struct mps_softc *sc, struct mps_com static void mpssas_scsiio_timeout(void *data) { + sbintime_t elapsed, now; + union ccb *ccb; struct mps_softc *sc; struct mps_command *cm; struct mpssas_target *targ; cm = (struct mps_command *)data; sc = cm->cm_sc; + ccb = cm->cm_ccb; + now = sbinuptime(); MPS_FUNCTRACE(sc); mtx_assert(&sc->mps_mtx, MA_OWNED); - mps_dprint(sc, MPS_XINFO, "Timeout checking cm %p\n", sc); + mps_dprint(sc, MPS_XINFO|MPS_RECOVERY, "Timeout checking cm %p\n", sc); /* * Run the interrupt handler to make sure it's not pending. This @@ -1600,9 +1616,11 @@ mpssas_scsiio_timeout(void *data) targ = cm->cm_targ; targ->timeouts++; - mpssas_log_command(cm, MPS_ERROR, "command timeout %d cm %p target " - "%u, handle(0x%04x)\n", cm->cm_ccb->ccb_h.timeout, cm, targ->tid, - targ->handle); + elapsed = now - ccb->ccb_h.qos.sim_data; + mpssas_log_command(cm, MPS_INFO|MPS_RECOVERY, + "Command timeout on target %u(0x%04x) %d set, %d.%d elapsed\n", + targ->tid, targ->handle, ccb->ccb_h.timeout, + sbintime_getsec(elapsed), elapsed & 0xffffffff); /* XXX first, check the firmware state, to see if it's still * operational. if not, do a diag reset. @@ -1618,15 +1636,16 @@ mpssas_scsiio_timeout(void *data) mps_dprint(sc, MPS_RECOVERY, "queued timedout cm %p for processing by tm %p\n", cm, targ->tm); - } - else if ((targ->tm = mpssas_alloc_tm(sc)) != NULL) { + } else if ((targ->tm = mpssas_alloc_tm(sc)) != NULL) { + mps_dprint(sc, MPS_RECOVERY|MPS_INFO, + "Sending abort to target %u for SMID %d\n", targ->tid, + cm->cm_desc.Default.SMID); mps_dprint(sc, MPS_RECOVERY, "timedout cm %p allocated tm %p\n", cm, targ->tm); /* start recovery by aborting the first timedout command */ mpssas_send_abort(sc, targ->tm, cm); - } - else { + } else { /* XXX queue this target up for recovery once a TM becomes * available. The firmware only has a limited number of * HighPriority credits for the high priority requests used @@ -1636,7 +1655,7 @@ mpssas_scsiio_timeout(void *data) * more credits than disks in an enclosure, and limit * ourselves to one TM per target for recovery. */ - mps_dprint(sc, MPS_RECOVERY, + mps_dprint(sc, MPS_ERROR|MPS_RECOVERY, "timedout cm %p failed to allocate a tm\n", cm); } @@ -1900,6 +1919,7 @@ mpssas_action_scsiio(struct mpssas_softc *sassc, union if (csio->bio != NULL) biotrack(csio->bio, __func__); #endif + csio->ccb_h.qos.sim_data = sbinuptime(); callout_reset_sbt(&cm->cm_callout, SBT_1MS * ccb->ccb_h.timeout, 0, mpssas_scsiio_timeout, cm, 0); @@ -2469,9 +2489,14 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mp * avoiding getting into an infinite retry loop. */ mpssas_set_ccbstatus(ccb, CAM_REQ_CMP_ERR); - mpssas_log_command(cm, MPS_INFO, - "terminated ioc %x loginfo %x scsi %x state %x xfer %u\n", - le16toh(rep->IOCStatus), le32toh(rep->IOCLogInfo), + mps_dprint(sc, MPS_INFO, + "Controller reported %s status for target %u SMID %u, " + "loginfo %x\n", ((rep->IOCStatus & MPI2_IOCSTATUS_MASK) == + MPI2_IOCSTATUS_SCSI_IOC_TERMINATED) ? "IOC_TERMINATED" : + "EXT_TERMINATED", target_id, cm->cm_desc.Default.SMID, + le32toh(rep->IOCLogInfo)); + mps_dprint(sc, MPS_XINFO, + "SCSIStatus %x SCSIState %x xfercount %u\n", rep->SCSIStatus, rep->SCSIState, le32toh(rep->TransferCount)); break;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201709060919.v869Jsk9060601>