Date: Mon, 8 Jul 2019 20:20:02 +0000 (UTC) From: Warner Losh <imp@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r349849 - in head/sys/dev: mpr mps Message-ID: <201907082020.x68KK2WE000426@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: imp Date: Mon Jul 8 20:20:01 2019 New Revision: 349849 URL: https://svnweb.freebsd.org/changeset/base/349849 Log: Fix bugs in recovery path and improve cm tracking Eliminate the TIMEDOUT state. This state really conveyed two different concepts: I timed out during recovery (and my command got put on the recovery queue), and I timed out diring discovery (which doesn't). Separate those two concepts into two flags. Use the TIMEDOUT flag to fail requests as timed out. Use the on queue flag to remove them from the queue. In mps_intr_locked for MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY message type, when completing commands, ignore the ones that are not in state INQUEUE. They were already completed as part of the recovery process. When we complete them twice, we wind up with entries on the free queue that are marked as busy, trigging asserts. Reviewed by: scottl (earlier version, just for mpr) Differential Revision: https://reviews.freebsd.org/D20785 Modified: head/sys/dev/mpr/mpr.c head/sys/dev/mpr/mpr_sas.c head/sys/dev/mpr/mpr_sas_lsi.c head/sys/dev/mpr/mprvar.h head/sys/dev/mps/mps.c head/sys/dev/mps/mps_sas.c head/sys/dev/mps/mps_sas_lsi.c head/sys/dev/mps/mpsvar.h Modified: head/sys/dev/mpr/mpr.c ============================================================================== --- head/sys/dev/mpr/mpr.c Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mpr/mpr.c Mon Jul 8 20:20:01 2019 (r349849) @@ -2617,12 +2617,17 @@ mpr_intr_locked(void *data) } else { cm = &sc->commands[ le16toh(desc->AddressReply.SMID)]; - if (cm->cm_state != MPR_CM_STATE_TIMEDOUT) - cm->cm_state = MPR_CM_STATE_BUSY; - cm->cm_reply = reply; - cm->cm_reply_data = - le32toh(desc->AddressReply. - ReplyFrameAddress); + if (cm->cm_state == MPR_CM_STATE_INQUEUE) { + cm->cm_reply = reply; + cm->cm_reply_data = + le32toh(desc->AddressReply. + ReplyFrameAddress); + } else { + mpr_dprint(sc, MPR_RECOVERY, + "Bad state for ADDRESS_REPLY status," + " ignoring state %d cm %p\n", + cm->cm_state, cm); + } } break; } Modified: head/sys/dev/mpr/mpr_sas.c ============================================================================== --- head/sys/dev/mpr/mpr_sas.c Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mpr/mpr_sas.c Mon Jul 8 20:20:01 2019 (r349849) @@ -1692,7 +1692,7 @@ mprsas_scsiio_timeout(void *data) * and been re-used, though this is unlikely. */ mpr_intr_locked(sc); - if (cm->cm_state != MPR_CM_STATE_INQUEUE) { + if (cm->cm_flags & MPR_CM_FLAGS_ON_RECOVERY) { mprsas_log_command(cm, MPR_XINFO, "SCSI command %p almost timed out\n", cm); return; @@ -1721,7 +1721,7 @@ mprsas_scsiio_timeout(void *data) * operational. if not, do a diag reset. */ mprsas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT); - cm->cm_state = MPR_CM_STATE_TIMEDOUT; + cm->cm_flags |= MPR_CM_FLAGS_ON_RECOVERY | MPR_CM_FLAGS_TIMEDOUT; TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery); if (targ->tm != NULL) { @@ -2529,9 +2529,11 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mp TAILQ_REMOVE(&cm->cm_targ->commands, cm, cm_link); ccb->ccb_h.status &= ~(CAM_STATUS_MASK | CAM_SIM_QUEUED); - if (cm->cm_state == MPR_CM_STATE_TIMEDOUT) { + if (cm->cm_flags & MPR_CM_FLAGS_ON_RECOVERY) { TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery); - cm->cm_state = MPR_CM_STATE_BUSY; + KASSERT(cm->cm_state == MPR_CM_STATE_BUSY, + ("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state)); + cm->cm_flags &= ~MPR_CM_FLAGS_ON_RECOVERY; if (cm->cm_reply != NULL) mprsas_log_command(cm, MPR_RECOVERY, "completed timedout cm %p ccb %p during recovery " @@ -2817,7 +2819,7 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mp * retry counter), the only difference is what gets printed * on the console. */ - if (cm->cm_state == MPR_CM_STATE_TIMEDOUT) + if (cm->cm_flags & MPR_CM_FLAGS_TIMEDOUT) mprsas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT); else mprsas_set_ccbstatus(ccb, CAM_REQ_ABORTED); Modified: head/sys/dev/mpr/mpr_sas_lsi.c ============================================================================== --- head/sys/dev/mpr/mpr_sas_lsi.c Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mpr/mpr_sas_lsi.c Mon Jul 8 20:20:01 2019 (r349849) @@ -1017,7 +1017,7 @@ mprsas_add_device(struct mpr_softc *sc, u16 handle, u8 cm = &sc->commands[i]; if (cm->cm_flags & MPR_CM_FLAGS_SATA_ID_TIMEOUT) { targ->timeouts++; - cm->cm_state = MPR_CM_STATE_TIMEDOUT; + cm->cm_flags |= MPR_CM_FLAGS_TIMEDOUT; if ((targ->tm = mprsas_alloc_tm(sc)) != NULL) { mpr_dprint(sc, MPR_INFO, "%s: sending Target " @@ -1244,9 +1244,11 @@ mprsas_ata_id_timeout(struct mpr_softc *sc, struct mpr /* * The Abort Task cannot be sent from here because the driver has not * completed setting up targets. Instead, the command is flagged so - * that special handling will be used to send the abort. + * that special handling will be used to send the abort. Now that + * this command has timed out, it's no longer in the queue. */ cm->cm_flags |= MPR_CM_FLAGS_SATA_ID_TIMEOUT; + cm->cm_state = MPR_CM_STATE_BUSY; } static int Modified: head/sys/dev/mpr/mprvar.h ============================================================================== --- head/sys/dev/mpr/mprvar.h Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mpr/mprvar.h Mon Jul 8 20:20:01 2019 (r349849) @@ -275,11 +275,12 @@ struct mpr_command { #define MPR_CM_FLAGS_ERROR_MASK MPR_CM_FLAGS_CHAIN_FAILED #define MPR_CM_FLAGS_USE_CCB (1 << 9) #define MPR_CM_FLAGS_SATA_ID_TIMEOUT (1 << 10) +#define MPR_CM_FLAGS_ON_RECOVERY (1 << 12) +#define MPR_CM_FLAGS_TIMEDOUT (1 << 13) u_int cm_state; #define MPR_CM_STATE_FREE 0 #define MPR_CM_STATE_BUSY 1 -#define MPR_CM_STATE_TIMEDOUT 2 -#define MPR_CM_STATE_INQUEUE 3 +#define MPR_CM_STATE_INQUEUE 2 bus_dmamap_t cm_dmamap; struct scsi_sense_data *cm_sense; uint64_t *nvme_error_response; Modified: head/sys/dev/mps/mps.c ============================================================================== --- head/sys/dev/mps/mps.c Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mps/mps.c Mon Jul 8 20:20:01 2019 (r349849) @@ -2479,13 +2479,24 @@ mps_intr_locked(void *data) (MPI2_EVENT_NOTIFICATION_REPLY *) reply); } else { + /* + * Ignore commands not in INQUEUE state + * since they've already been completed + * via another path. + */ cm = &sc->commands[ le16toh(desc->AddressReply.SMID)]; - if (cm->cm_state != MPS_CM_STATE_TIMEDOUT) + if (cm->cm_state == MPS_CM_STATE_INQUEUE) { cm->cm_state = MPS_CM_STATE_BUSY; - cm->cm_reply = reply; - cm->cm_reply_data = le32toh( - desc->AddressReply.ReplyFrameAddress); + cm->cm_reply = reply; + cm->cm_reply_data = le32toh( + desc->AddressReply.ReplyFrameAddress); + } else { + mps_dprint(sc, MPS_RECOVERY, + "Bad state for ADDRESS_REPLY status," + " ignoring state %d cm %p\n", + cm->cm_state, cm); + } } break; } Modified: head/sys/dev/mps/mps_sas.c ============================================================================== --- head/sys/dev/mps/mps_sas.c Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mps/mps_sas.c Mon Jul 8 20:20:01 2019 (r349849) @@ -1602,7 +1602,7 @@ mpssas_scsiio_timeout(void *data) * and been re-used, though this is unlikely. */ mps_intr_locked(sc); - if (cm->cm_state != MPS_CM_STATE_INQUEUE) { + if (cm->cm_flags & MPS_CM_FLAGS_ON_RECOVERY) { mpssas_log_command(cm, MPS_XINFO, "SCSI command %p almost timed out\n", cm); return; @@ -1626,7 +1626,7 @@ mpssas_scsiio_timeout(void *data) * operational. if not, do a diag reset. */ mpssas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT); - cm->cm_state = MPS_CM_STATE_TIMEDOUT; + cm->cm_flags |= MPS_CM_FLAGS_ON_RECOVERY | MPS_CM_FLAGS_TIMEDOUT; TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery); if (targ->tm != NULL) { @@ -2040,9 +2040,11 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mp biotrack(ccb->csio.bio, __func__); #endif - if (cm->cm_state == MPS_CM_STATE_TIMEDOUT) { + if (cm->cm_flags & MPS_CM_FLAGS_ON_RECOVERY) { TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery); - cm->cm_state = MPS_CM_STATE_BUSY; + KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, + ("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state)); + cm->cm_flags &= ~MPS_CM_FLAGS_ON_RECOVERY; if (cm->cm_reply != NULL) mpssas_log_command(cm, MPS_RECOVERY, "completed timedout cm %p ccb %p during recovery " @@ -2325,7 +2327,7 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mp * retry counter), the only difference is what gets printed * on the console. */ - if (cm->cm_state == MPS_CM_STATE_TIMEDOUT) + if (cm->cm_flags & MPS_CM_FLAGS_TIMEDOUT) mpssas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT); else mpssas_set_ccbstatus(ccb, CAM_REQ_ABORTED); Modified: head/sys/dev/mps/mps_sas_lsi.c ============================================================================== --- head/sys/dev/mps/mps_sas_lsi.c Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mps/mps_sas_lsi.c Mon Jul 8 20:20:01 2019 (r349849) @@ -790,7 +790,7 @@ mpssas_add_device(struct mps_softc *sc, u16 handle, u8 cm = &sc->commands[i]; if (cm->cm_flags & MPS_CM_FLAGS_SATA_ID_TIMEOUT) { targ->timeouts++; - cm->cm_state = MPS_CM_STATE_TIMEDOUT; + cm->cm_flags |= MPS_CM_FLAGS_TIMEDOUT; if ((targ->tm = mpssas_alloc_tm(sc)) != NULL) { mps_dprint(sc, MPS_INFO, "%s: sending Target " @@ -1017,9 +1017,11 @@ mpssas_ata_id_timeout(struct mps_softc *sc, struct mps /* * The Abort Task cannot be sent from here because the driver has not * completed setting up targets. Instead, the command is flagged so - * that special handling will be used to send the abort. + * that special handling will be used to send the abort. Now that + * this command has timed out, it's no longer in the queue. */ cm->cm_flags |= MPS_CM_FLAGS_SATA_ID_TIMEOUT; + cm->cm_state = MPS_CM_STATE_BUSY; } static int Modified: head/sys/dev/mps/mpsvar.h ============================================================================== --- head/sys/dev/mps/mpsvar.h Mon Jul 8 20:01:28 2019 (r349848) +++ head/sys/dev/mps/mpsvar.h Mon Jul 8 20:20:01 2019 (r349849) @@ -242,11 +242,12 @@ struct mps_command { #define MPS_CM_FLAGS_ERROR_MASK MPS_CM_FLAGS_CHAIN_FAILED #define MPS_CM_FLAGS_USE_CCB (1 << 10) #define MPS_CM_FLAGS_SATA_ID_TIMEOUT (1 << 11) +#define MPS_CM_FLAGS_ON_RECOVERY (1 << 12) +#define MPS_CM_FLAGS_TIMEDOUT (1 << 13) u_int cm_state; #define MPS_CM_STATE_FREE 0 #define MPS_CM_STATE_BUSY 1 -#define MPS_CM_STATE_TIMEDOUT 2 -#define MPS_CM_STATE_INQUEUE 3 +#define MPS_CM_STATE_INQUEUE 2 bus_dmamap_t cm_dmamap; struct scsi_sense_data *cm_sense; TAILQ_HEAD(, mps_chain) cm_chain_list; @@ -545,7 +546,8 @@ mps_free_command(struct mps_softc *sc, struct mps_comm { struct mps_chain *chain, *chain_temp; - KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("state not busy\n")); + KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, + ("state not busy: %d\n", cm->cm_state)); if (cm->cm_reply != NULL) mps_free_reply(sc, cm->cm_reply_data); @@ -581,7 +583,7 @@ mps_alloc_command(struct mps_softc *sc) return (NULL); KASSERT(cm->cm_state == MPS_CM_STATE_FREE, - ("mps: Allocating busy command\n")); + ("mps: Allocating busy command: %d\n", cm->cm_state)); TAILQ_REMOVE(&sc->req_list, cm, cm_link); cm->cm_state = MPS_CM_STATE_BUSY; @@ -594,7 +596,8 @@ mps_free_high_priority_command(struct mps_softc *sc, s { struct mps_chain *chain, *chain_temp; - KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("state not busy\n")); + KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, + ("state not busy: %d\n", cm->cm_state)); if (cm->cm_reply != NULL) mps_free_reply(sc, cm->cm_reply_data); @@ -623,7 +626,7 @@ mps_alloc_high_priority_command(struct mps_softc *sc) return (NULL); KASSERT(cm->cm_state == MPS_CM_STATE_FREE, - ("mps: Allocating busy command\n")); + ("mps: Allocating high priority busy command: %d\n", cm->cm_state)); TAILQ_REMOVE(&sc->high_priority_req_list, cm, cm_link); cm->cm_state = MPS_CM_STATE_BUSY;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201907082020.x68KK2WE000426>