Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 29 Jun 2020 00:31:24 +0000 (UTC)
From:      Chuck Tuffli <chuck@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r362748 - head/usr.sbin/bhyve
Message-ID:  <202006290031.05T0VOlI047065@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: chuck
Date: Mon Jun 29 00:31:24 2020
New Revision: 362748
URL: https://svnweb.freebsd.org/changeset/base/362748

Log:
  bhyve: add locks around NVMe queue accesses
  
  The NVMe code attempted to ensure thread safety through a combination of
  using atomics and a "busy" flag. But this approach leads to unavoidable
  race conditions.
  
  Fix is to use per-queue mutex locks to ensure thread safety within the
  queue processing code. While in the neighborhood, move all the queue
  initialization code to a common function.
  
  Tested by:	Jason Tubnor
  MFC after:	2 weeks
  Differential Revision: https://reviews.freebsd.org/D19841

Modified:
  head/usr.sbin/bhyve/pci_nvme.c

Modified: head/usr.sbin/bhyve/pci_nvme.c
==============================================================================
--- head/usr.sbin/bhyve/pci_nvme.c	Mon Jun 29 00:31:20 2020	(r362747)
+++ head/usr.sbin/bhyve/pci_nvme.c	Mon Jun 29 00:31:24 2020	(r362748)
@@ -153,21 +153,21 @@ enum nvme_copy_dir {
 
 struct nvme_completion_queue {
 	struct nvme_completion *qbase;
+	pthread_mutex_t	mtx;
 	uint32_t	size;
 	uint16_t	tail; /* nvme progress */
 	uint16_t	head; /* guest progress */
 	uint16_t	intr_vec;
 	uint32_t	intr_en;
-	pthread_mutex_t	mtx;
 };
 
 struct nvme_submission_queue {
 	struct nvme_command *qbase;
+	pthread_mutex_t	mtx;
 	uint32_t	size;
 	uint16_t	head; /* nvme progress */
 	uint16_t	tail; /* guest progress */
 	uint16_t	cqid; /* completion queue id */
-	int		busy; /* queue is being processed */
 	int		qpriority;
 };
 
@@ -339,7 +339,63 @@ pci_nvme_toggle_phase(uint16_t *status, int prev)
 		*status |= NVME_STATUS_P;
 }
 
+/*
+ * Initialize the requested number or IO Submission and Completion Queues.
+ * Admin queues are allocated implicitly.
+ */
 static void
+pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
+{
+	uint32_t i;
+
+	/*
+	 * Allocate and initialize the Submission Queues
+	 */
+	if (nsq > NVME_QUEUES) {
+		WPRINTF("%s: clamping number of SQ from %u to %u",
+					__func__, nsq, NVME_QUEUES);
+		nsq = NVME_QUEUES;
+	}
+
+	sc->num_squeues = nsq;
+
+	sc->submit_queues = calloc(sc->num_squeues + 1,
+				sizeof(struct nvme_submission_queue));
+	if (sc->submit_queues == NULL) {
+		WPRINTF("%s: SQ allocation failed", __func__);
+		sc->num_squeues = 0;
+	} else {
+		struct nvme_submission_queue *sq = sc->submit_queues;
+
+		for (i = 0; i < sc->num_squeues; i++)
+			pthread_mutex_init(&sq[i].mtx, NULL);
+	}
+
+	/*
+	 * Allocate and initialize the Completion Queues
+	 */
+	if (ncq > NVME_QUEUES) {
+		WPRINTF("%s: clamping number of CQ from %u to %u",
+					__func__, ncq, NVME_QUEUES);
+		ncq = NVME_QUEUES;
+	}
+
+	sc->num_cqueues = ncq;
+
+	sc->compl_queues = calloc(sc->num_cqueues + 1,
+				sizeof(struct nvme_completion_queue));
+	if (sc->compl_queues == NULL) {
+		WPRINTF("%s: CQ allocation failed", __func__);
+		sc->num_cqueues = 0;
+	} else {
+		struct nvme_completion_queue *cq = sc->compl_queues;
+
+		for (i = 0; i < sc->num_cqueues; i++)
+			pthread_mutex_init(&cq[i].mtx, NULL);
+	}
+}
+
+static void
 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
 {
 	struct nvme_controller_data *cd = &sc->ctrldata;
@@ -498,6 +554,8 @@ pci_nvme_init_logpages(struct pci_nvme_softc *sc)
 static void
 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
 {
+	uint32_t i;
+
 	DPRINTF("%s", __func__);
 
 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
@@ -511,44 +569,23 @@ pci_nvme_reset_locked(struct pci_nvme_softc *sc)
 	sc->regs.cc = 0;
 	sc->regs.csts = 0;
 
-	sc->num_cqueues = sc->num_squeues = sc->max_queues;
-	if (sc->submit_queues != NULL) {
-		for (int i = 0; i < sc->num_squeues + 1; i++) {
-			/*
-			 * The Admin Submission Queue is at index 0.
-			 * It must not be changed at reset otherwise the
-			 * emulation will be out of sync with the guest.
-			 */
-			if (i != 0) {
-				sc->submit_queues[i].qbase = NULL;
-				sc->submit_queues[i].size = 0;
-				sc->submit_queues[i].cqid = 0;
-			}
-			sc->submit_queues[i].tail = 0;
-			sc->submit_queues[i].head = 0;
-			sc->submit_queues[i].busy = 0;
-		}
-	} else
-		sc->submit_queues = calloc(sc->num_squeues + 1,
-		                        sizeof(struct nvme_submission_queue));
+	assert(sc->submit_queues != NULL);
 
-	if (sc->compl_queues != NULL) {
-		for (int i = 0; i < sc->num_cqueues + 1; i++) {
-			/* See Admin Submission Queue note above */
-			if (i != 0) {
-				sc->compl_queues[i].qbase = NULL;
-				sc->compl_queues[i].size = 0;
-			}
+	for (i = 0; i < sc->num_squeues + 1; i++) {
+		sc->submit_queues[i].qbase = NULL;
+		sc->submit_queues[i].size = 0;
+		sc->submit_queues[i].cqid = 0;
+		sc->submit_queues[i].tail = 0;
+		sc->submit_queues[i].head = 0;
+	}
 
-			sc->compl_queues[i].tail = 0;
-			sc->compl_queues[i].head = 0;
-		}
-	} else {
-		sc->compl_queues = calloc(sc->num_cqueues + 1,
-		                        sizeof(struct nvme_completion_queue));
+	assert(sc->compl_queues != NULL);
 
-		for (int i = 0; i < sc->num_cqueues + 1; i++)
-			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
+	for (i = 0; i < sc->num_cqueues + 1; i++) {
+		sc->compl_queues[i].qbase = NULL;
+		sc->compl_queues[i].size = 0;
+		sc->compl_queues[i].tail = 0;
+		sc->compl_queues[i].head = 0;
 	}
 }
 
@@ -1092,14 +1129,9 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, u
 	sq = &sc->submit_queues[0];
 	cq = &sc->compl_queues[0];
 
-	sqhead = atomic_load_acq_short(&sq->head);
+	pthread_mutex_lock(&sq->mtx);
 
-	if (atomic_testandset_int(&sq->busy, 1)) {
-		DPRINTF("%s SQ busy, head %u, tail %u",
-		        __func__, sqhead, sq->tail);
-		return;
-	}
-
+	sqhead = sq->head;
 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
 	
 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
@@ -1162,6 +1194,8 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, u
 			struct nvme_completion *cp;
 			int phase;
 
+			pthread_mutex_lock(&cq->mtx);
+
 			cp = &(cq->qbase)[cq->tail];
 			cp->cdw0 = compl.cdw0;
 			cp->sqid = 0;
@@ -1173,16 +1207,18 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, u
 			pci_nvme_toggle_phase(&cp->status, phase);
 
 			cq->tail = (cq->tail + 1) % cq->size;
+
+			pthread_mutex_unlock(&cq->mtx);
 		}
 	}
 
 	DPRINTF("setting sqhead %u", sqhead);
-	atomic_store_short(&sq->head, sqhead);
-	atomic_store_int(&sq->busy, 0);
+	sq->head = sqhead;
 
 	if (cq->head != cq->tail)
 		pci_generate_msix(sc->nsc_pi, 0);
 
+	pthread_mutex_unlock(&sq->mtx);
 }
 
 static int
@@ -1272,7 +1308,7 @@ pci_nvme_append_iov_req(struct pci_nvme_softc *sc, str
 static void
 pci_nvme_set_completion(struct pci_nvme_softc *sc,
 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
-	uint32_t cdw0, uint16_t status, int ignore_busy)
+	uint32_t cdw0, uint16_t status)
 {
 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
 	struct nvme_completion *compl;
@@ -1290,7 +1326,7 @@ pci_nvme_set_completion(struct pci_nvme_softc *sc,
 
 	compl->cdw0 = cdw0;
 	compl->sqid = sqid;
-	compl->sqhd = atomic_load_acq_short(&sq->head);
+	compl->sqhd = sq->head;
 	compl->cid = cid;
 
 	// toggle phase
@@ -1375,7 +1411,7 @@ pci_nvme_io_done(struct blockif_req *br, int err)
 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
 	pci_nvme_status_genc(&status, code);
 
-	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
+	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
 	pci_nvme_release_ioreq(req->sc, req);
 }
 
@@ -1575,7 +1611,7 @@ pci_nvme_dealloc_sm(struct blockif_req *br, int err)
 
 	if (done) {
 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
-		    req->cid, 0, status, 0);
+		    req->cid, 0, status);
 		pci_nvme_release_ioreq(sc, req);
 	}
 }
@@ -1677,13 +1713,9 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint
 	/* handle all submissions up to sq->tail index */
 	sq = &sc->submit_queues[idx];
 
-	if (atomic_testandset_int(&sq->busy, 1)) {
-		DPRINTF("%s sqid %u busy", __func__, idx);
-		return;
-	}
+	pthread_mutex_lock(&sq->mtx);
 
-	sqhead = atomic_load_acq_short(&sq->head);
-
+	sqhead = sq->head;
 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
 	         idx, sqhead, sq->tail, sq->qbase);
 
@@ -1750,14 +1782,15 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint
 complete:
 		if (!pending) {
 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
-			    status, 1);
+			    status);
 			if (req != NULL)
 				pci_nvme_release_ioreq(sc, req);
 		}
 	}
 
-	atomic_store_short(&sq->head, sqhead);
-	atomic_store_int(&sq->busy, 0);
+	sq->head = sqhead;
+
+	pthread_mutex_unlock(&sq->mtx);
 }
 
 static void
@@ -1768,6 +1801,13 @@ pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci
 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
 
 	if (is_sq) {
+		if (idx > sc->num_squeues) {
+			WPRINTF("%s queue index %lu overflow from "
+			         "guest (max %u)",
+			         __func__, idx, sc->num_squeues);
+			return;
+		}
+
 		atomic_store_short(&sc->submit_queues[idx].tail,
 		                   (uint16_t)value);
 
@@ -1791,7 +1831,8 @@ pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci
 			return;
 		}
 
-		sc->compl_queues[idx].head = (uint16_t)value;
+		atomic_store_short(&sc->compl_queues[idx].head,
+				(uint16_t)value);
 	}
 }
 
@@ -2236,7 +2277,7 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *p
 	pthread_mutex_init(&sc->mtx, NULL);
 	sem_init(&sc->iosemlock, 0, sc->ioslots);
 
-	pci_nvme_reset(sc);
+	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
 	/*
 	 * Controller data depends on Namespace data so initialize Namespace
 	 * data first.
@@ -2244,6 +2285,8 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *p
 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
 	pci_nvme_init_ctrldata(sc);
 	pci_nvme_init_logpages(sc);
+
+	pci_nvme_reset(sc);
 
 	pci_lintr_request(pi);
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202006290031.05T0VOlI047065>