Date: Mon, 15 Mar 2021 05:11:20 GMT From: Wei Hu <whu@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org Subject: git: 919a160fc622 - stable/12 - Hyper-V: storvsc: Enhance srb_status code handling. Message-ID: <202103150511.12F5BKQM073065@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch stable/12 has been updated by whu: URL: https://cgit.FreeBSD.org/src/commit/?id=919a160fc6224a4bf281134b3b9584ffb7ebba3d commit 919a160fc6224a4bf281134b3b9584ffb7ebba3d Author: Wei Hu <whu@FreeBSD.org> AuthorDate: 2020-08-31 09:05:45 +0000 Commit: Wei Hu <whu@FreeBSD.org> CommitDate: 2021-03-15 05:08:43 +0000 Hyper-V: storvsc: Enhance srb_status code handling. In hv_storvsc_io_request() when coring, prevent changing of the send channel from the base channel to another one. storvsc_poll always probes on the base channel. Based upon conversations with Microsoft, changed the handling of srb_status codes. Most we should never get, others yes. All are treated as retry-able except for two. We should not get these statuses, but if we ever do, the I/O state is not known. Submitted by: Alexander Sideropoulos <Alexander.Sideropoulos@netapp.com> Reviewed by: trasz, allanjude, whu MFC after: 1 week Sponsored by: Netapp Inc Differential Revision: https://reviews.freebsd.org/D25756 (cherry picked from commit 2a0ce39d086ffe13782c9dc1e24bb240abbe790a) --- sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c | 158 +++++++++++++++++++++--- sys/dev/hyperv/storvsc/hv_vstorage.h | 34 ++++- 2 files changed, 168 insertions(+), 24 deletions(-) diff --git a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c index cab86500ace1..968de9d14e7b 100644 --- a/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c +++ b/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c @@ -151,6 +151,12 @@ SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN, static int hv_storvsc_chan_cnt = 0; SYSCTL_INT(_hw_storvsc, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hv_storvsc_chan_cnt, 0, "# of channels to use"); +#ifdef DIAGNOSTIC +static int hv_storvsc_srb_status = -1; +SYSCTL_INT(_hw_storvsc, OID_AUTO, srb_status, CTLFLAG_RW, + &hv_storvsc_srb_status, 0, "srb_status to inject"); +TUNABLE_INT("hw_storvsc.srb_status", &hv_storvsc_srb_status); +#endif /* DIAGNOSTIC */ #define STORVSC_MAX_IO \ vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size, \ @@ -704,7 +710,16 @@ hv_storvsc_io_request(struct storvsc_softc *sc, vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan; - outgoing_channel = sc->hs_sel_chan[ch_sel]; + /* + * If we are panic'ing, then we are dumping core. Since storvsc_polls + * always uses sc->hs_chan, then we must send to that channel or a poll + * timeout will occur. + */ + if (panicstr) { + outgoing_channel = sc->hs_chan; + } else { + outgoing_channel = sc->hs_sel_chan[ch_sel]; + } mtx_unlock(&request->softc->hs_lock); if (request->prp_list.gpa_range.gpa_len) { @@ -2155,26 +2170,133 @@ storvsc_io_done(struct hv_storvsc_request *reqp) ccb->ccb_h.status &= ~CAM_SIM_QUEUED; ccb->ccb_h.status &= ~CAM_STATUS_MASK; int srb_status = SRB_STATUS(vm_srb->srb_status); +#ifdef DIAGNOSTIC + if (hv_storvsc_srb_status != -1) { + srb_status = SRB_STATUS(hv_storvsc_srb_status & 0x3f); + hv_storvsc_srb_status = -1; + } +#endif /* DIAGNOSTIC */ if (vm_srb->scsi_status == SCSI_STATUS_OK) { if (srb_status != SRB_STATUS_SUCCESS) { - /* - * If there are errors, for example, invalid LUN, - * host will inform VM through SRB status. - */ - if (bootverbose) { - if (srb_status == SRB_STATUS_INVALID_LUN) { - xpt_print(ccb->ccb_h.path, - "invalid LUN %d for op: %s\n", - vm_srb->lun, - scsi_op_desc(cmd->opcode, NULL)); - } else { - xpt_print(ccb->ccb_h.path, - "Unknown SRB flag: %d for op: %s\n", - srb_status, - scsi_op_desc(cmd->opcode, NULL)); - } + bool log_error = true; + switch (srb_status) { + case SRB_STATUS_PENDING: + /* We should never get this */ + panic("storvsc_io_done: SRB_STATUS_PENDING"); + break; + case SRB_STATUS_ABORTED: + /* + * storvsc doesn't support aborts yet + * but if we ever get this status + * the I/O is complete - treat it as a + * timeout + */ + ccb->ccb_h.status |= CAM_CMD_TIMEOUT; + break; + case SRB_STATUS_ABORT_FAILED: + /* We should never get this */ + panic("storvsc_io_done: SRB_STATUS_ABORT_FAILED"); + break; + case SRB_STATUS_ERROR: + /* + * We should never get this. + * Treat it as a CAM_UNREC_HBA_ERROR. + * It will be retried + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + case SRB_STATUS_BUSY: + /* Host is busy. Delay and retry */ + ccb->ccb_h.status |= CAM_BUSY; + break; + case SRB_STATUS_INVALID_REQUEST: + case SRB_STATUS_INVALID_PATH_ID: + case SRB_STATUS_NO_DEVICE: + case SRB_STATUS_INVALID_TARGET_ID: + /* + * These indicate an invalid address + * and really should never be seen. + * A CAM_PATH_INVALID could be + * used here but I want to run + * down retries. Do a CAM_BUSY + * since the host might be having issues. + */ + ccb->ccb_h.status |= CAM_BUSY; + break; + case SRB_STATUS_TIMEOUT: + case SRB_STATUS_COMMAND_TIMEOUT: + /* The backend has timed this out */ + ccb->ccb_h.status |= CAM_BUSY; + break; + /* Some old pSCSI errors below */ + case SRB_STATUS_SELECTION_TIMEOUT: + case SRB_STATUS_MESSAGE_REJECTED: + case SRB_STATUS_PARITY_ERROR: + case SRB_STATUS_NO_HBA: + case SRB_STATUS_DATA_OVERRUN: + case SRB_STATUS_UNEXPECTED_BUS_FREE: + case SRB_STATUS_PHASE_SEQUENCE_FAILURE: + /* + * Old pSCSI responses, should never get. + * If we do treat as a CAM_UNREC_HBA_ERROR + * which will be retried + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + case SRB_STATUS_BUS_RESET: + ccb->ccb_h.status |= CAM_SCSI_BUS_RESET; + break; + case SRB_STATUS_BAD_SRB_BLOCK_LENGTH: + /* + * The request block is malformed and + * I doubt it is from the guest. Just retry. + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + /* Not used statuses just retry */ + case SRB_STATUS_REQUEST_FLUSHED: + case SRB_STATUS_BAD_FUNCTION: + case SRB_STATUS_NOT_POWERED: + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + break; + case SRB_STATUS_INVALID_LUN: + /* + * Don't log an EMS for this response since + * there is no device at this LUN. This is a + * normal and expected response when a device + * is detached. + */ + ccb->ccb_h.status |= CAM_DEV_NOT_THERE; + log_error = false; + break; + case SRB_STATUS_ERROR_RECOVERY: + case SRB_STATUS_LINK_DOWN: + /* + * I don't ever expect these from + * the host but if we ever get + * retry after a delay + */ + ccb->ccb_h.status |= CAM_BUSY; + break; + default: + /* + * An undefined response assert on + * on debug builds else retry + */ + ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; + KASSERT(srb_status <= SRB_STATUS_LINK_DOWN, + ("storvsc: %s, unexpected srb_status of 0x%x", + __func__, srb_status)); + break; + } + if (log_error) { + xpt_print(ccb->ccb_h.path, "The hypervisor's I/O adapter " + "driver received an unexpected response code 0x%x " + "for operation: %s. If this continues to occur, " + "report the condition to your hypervisor vendor so " + "they can rectify the issue.\n", srb_status, + scsi_op_desc(cmd->opcode, NULL)); } - ccb->ccb_h.status |= CAM_DEV_NOT_THERE; } else { ccb->ccb_h.status |= CAM_REQ_CMP; } diff --git a/sys/dev/hyperv/storvsc/hv_vstorage.h b/sys/dev/hyperv/storvsc/hv_vstorage.h index 866623c22e7a..f1d4c1dfd2e2 100644 --- a/sys/dev/hyperv/storvsc/hv_vstorage.h +++ b/sys/dev/hyperv/storvsc/hv_vstorage.h @@ -241,12 +241,34 @@ struct vstor_packet { /** * SRB (SCSI Request Block) Status Codes */ -#define SRB_STATUS_PENDING 0x00 -#define SRB_STATUS_SUCCESS 0x01 -#define SRB_STATUS_ABORTED 0x02 -#define SRB_STATUS_ERROR 0x04 -#define SRB_STATUS_DATA_OVERRUN 0x12 -#define SRB_STATUS_INVALID_LUN 0x20 +#define SRB_STATUS_PENDING 0x00 +#define SRB_STATUS_SUCCESS 0x01 +#define SRB_STATUS_ABORTED 0x02 +#define SRB_STATUS_ABORT_FAILED 0x03 +#define SRB_STATUS_ERROR 0x04 +#define SRB_STATUS_BUSY 0x05 +#define SRB_STATUS_INVALID_REQUEST 0x06 +#define SRB_STATUS_INVALID_PATH_ID 0x07 +#define SRB_STATUS_NO_DEVICE 0x08 +#define SRB_STATUS_TIMEOUT 0x09 +#define SRB_STATUS_SELECTION_TIMEOUT 0x0A +#define SRB_STATUS_COMMAND_TIMEOUT 0x0B +#define SRB_STATUS_MESSAGE_REJECTED 0x0D +#define SRB_STATUS_BUS_RESET 0x0E +#define SRB_STATUS_PARITY_ERROR 0x0F +#define SRB_STATUS_REQUEST_SENSE_FAILED 0x10 +#define SRB_STATUS_NO_HBA 0x11 +#define SRB_STATUS_DATA_OVERRUN 0x12 +#define SRB_STATUS_UNEXPECTED_BUS_FREE 0x13 +#define SRB_STATUS_PHASE_SEQUENCE_FAILURE 0x14 +#define SRB_STATUS_BAD_SRB_BLOCK_LENGTH 0x15 +#define SRB_STATUS_REQUEST_FLUSHED 0x16 +#define SRB_STATUS_INVALID_LUN 0x20 +#define SRB_STATUS_INVALID_TARGET_ID 0x21 +#define SRB_STATUS_BAD_FUNCTION 0x22 +#define SRB_STATUS_ERROR_RECOVERY 0x23 +#define SRB_STATUS_NOT_POWERED 0x24 +#define SRB_STATUS_LINK_DOWN 0x25 /** * SRB Status Masks (can be combined with above status codes) */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202103150511.12F5BKQM073065>