Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 14 Mar 2018 16:44:50 +0000 (UTC)
From:      Warner Losh <imp@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r330932 - in head/sys: cam/nvme dev/nvme
Message-ID:  <201803141644.w2EGioti046140@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: imp
Date: Wed Mar 14 16:44:50 2018
New Revision: 330932
URL: https://svnweb.freebsd.org/changeset/base/330932

Log:
  Implement trim collapsing in nda
  
  When multiple trims are in the queue, collapse them as much as
  possible. At present, this usually results in only a few trims being
  collapsed together, but more work on that will make it possible to do
  hundreds (up to some configurable max).
  
  Sponsored by: Netflix

Modified:
  head/sys/cam/nvme/nvme_da.c
  head/sys/dev/nvme/nvme.h

Modified: head/sys/cam/nvme/nvme_da.c
==============================================================================
--- head/sys/cam/nvme/nvme_da.c	Wed Mar 14 16:44:16 2018	(r330931)
+++ head/sys/cam/nvme/nvme_da.c	Wed Mar 14 16:44:50 2018	(r330932)
@@ -93,12 +93,10 @@ typedef enum {
 } nda_ccb_state;
 
 /* Offsets into our private area for storing information */
-#define ccb_state	ppriv_field0
-#define ccb_bp		ppriv_ptr1
+#define ccb_state	ccb_h.ppriv_field0
+#define ccb_bp		ccb_h.ppriv_ptr1	/* For NDA_CCB_BUFFER_IO */
+#define ccb_trim	ccb_h.ppriv_ptr1	/* For NDA_CCB_TRIM */
 
-struct trim_request {
-	TAILQ_HEAD(, bio) bps;
-};
 struct nda_softc {
 	struct   cam_iosched_softc *cam_iosched;
 	int			outstanding_cmds;	/* Number of active commands */
@@ -107,12 +105,13 @@ struct nda_softc {
 	nda_flags		flags;
 	nda_quirks		quirks;
 	int			unmappedio;
+	quad_t			deletes;
+	quad_t			dsm_req;
 	uint32_t		nsid;			/* Namespace ID for this nda device */
 	struct disk		*disk;
 	struct task		sysctl_task;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
-	struct trim_request	trim_req;
 #ifdef CAM_IO_STATS
 	struct sysctl_ctx_list	sysctl_stats_ctx;
 	struct sysctl_oid	*sysctl_stats_tree;
@@ -122,6 +121,14 @@ struct nda_softc {
 #endif
 };
 
+struct nda_trim_request {
+	union {
+		struct nvme_dsm_range dsm;
+		uint8_t		data[NVME_MAX_DSM_TRIM];
+	};
+	TAILQ_HEAD(, bio) bps;
+};
+
 /* Need quirk table */
 
 static	disk_strategy_t	ndastrategy;
@@ -150,11 +157,14 @@ static void		ndasuspend(void *arg);
 #ifndef	NDA_DEFAULT_RETRY
 #define	NDA_DEFAULT_RETRY	4
 #endif
+#ifndef NDA_MAX_TRIM_ENTRIES
+#define NDA_MAX_TRIM_ENTRIES 256	/* Number of DSM trims to use, max 256 */
+#endif
 
-
 //static int nda_retry_count = NDA_DEFAULT_RETRY;
 static int nda_send_ordered = NDA_DEFAULT_SEND_ORDERED;
 static int nda_default_timeout = NDA_DEFAULT_TIMEOUT;
+static int nda_max_trim_entries = NDA_MAX_TRIM_ENTRIES;
 
 /*
  * All NVMe media is non-rotational, so all nvme device instances
@@ -361,6 +371,9 @@ ndastrategy(struct bio *bp)
 		return;
 	}
 	
+	if (bp->bio_cmd == BIO_DELETE)
+		softc->deletes++;
+
 	/*
 	 * Place it in the queue of disk activities for this disk
 	 */
@@ -401,7 +414,7 @@ ndadump(void *arg, void *virtual, vm_offset_t physical
 	memset(&nvmeio, 0, sizeof(nvmeio));
 	if (length > 0) {
 		xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
-		nvmeio.ccb_h.ccb_state = NDA_CCB_DUMP;
+		nvmeio.ccb_state = NDA_CCB_DUMP;
 		nda_nvme_write(softc, &nvmeio, virtual, lba, length, count);
 		error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error,
 		    0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
@@ -414,7 +427,7 @@ ndadump(void *arg, void *virtual, vm_offset_t physical
 	/* Flush */
 	xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
 
-	nvmeio.ccb_h.ccb_state = NDA_CCB_DUMP;
+	nvmeio.ccb_state = NDA_CCB_DUMP;
 	nda_nvme_flush(softc, &nvmeio);
 	error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error,
 	    0, SF_NO_RECOVERY | SF_NO_RETRY, NULL);
@@ -610,6 +623,14 @@ ndasysctlinit(void *context, int pending)
 		OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE,
 		&softc->unmappedio, 0, "Unmapped I/O leaf");
 
+	SYSCTL_ADD_QUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "deletes", CTLFLAG_RD | CTLFLAG_MPSAFE,
+		&softc->deletes, "Number of BIO_DELETE requests");
+
+	SYSCTL_ADD_QUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+		OID_AUTO, "dsm_req", CTLFLAG_RD | CTLFLAG_MPSAFE,
+		&softc->dsm_req, "Number of DSM requests sent to SIM");
+
 	SYSCTL_ADD_INT(&softc->sysctl_ctx,
 		       SYSCTL_CHILDREN(softc->sysctl_tree),
 		       OID_AUTO,
@@ -902,24 +923,42 @@ ndastart(struct cam_periph *periph, union ccb *start_c
 		}
 		case BIO_DELETE:
 		{
-			struct nvme_dsm_range *dsm_range;
+			struct nvme_dsm_range *dsm_range, *dsm_end;
+			struct nda_trim_request *trim;
+			struct bio *bp1;
+			int ents;
 
-			dsm_range =
-			    malloc(sizeof(*dsm_range), M_NVMEDA, M_ZERO | M_NOWAIT);
-			if (dsm_range == NULL) {
+			trim = malloc(sizeof(*trim), M_NVMEDA, M_ZERO | M_NOWAIT);
+			if (trim == NULL) {
 				biofinish(bp, NULL, ENOMEM);
 				xpt_release_ccb(start_ccb);
 				ndaschedule(periph);
 				return;
 			}
-			dsm_range->length =
-			    htole32(bp->bio_bcount / softc->disk->d_sectorsize);
-			dsm_range->starting_lba =
-			    htole64(bp->bio_offset / softc->disk->d_sectorsize);
-			bp->bio_driver2 = dsm_range;
-			nda_nvme_trim(softc, &start_ccb->nvmeio, dsm_range, 1);
-			start_ccb->ccb_h.ccb_state = NDA_CCB_TRIM;
-			start_ccb->ccb_h.flags |= CAM_UNLOCKED;
+			TAILQ_INIT(&trim->bps);
+			bp1 = bp;
+			ents = sizeof(trim->data) / sizeof(struct nvme_dsm_range);
+			ents = min(ents, nda_max_trim_entries);
+			dsm_range = &trim->dsm;
+			dsm_end = dsm_range + ents;
+			do {
+				TAILQ_INSERT_TAIL(&trim->bps, bp1, bio_queue);
+				dsm_range->length =
+				    htole32(bp1->bio_bcount / softc->disk->d_sectorsize);
+				dsm_range->starting_lba =
+				    htole32(bp1->bio_offset / softc->disk->d_sectorsize);
+				dsm_range++;
+				if (dsm_range >= dsm_end)
+					break;
+				bp1 = cam_iosched_next_trim(softc->cam_iosched);
+				/* XXX -- Could collapse adjacent ranges, but we don't for now */
+				/* XXX -- Could limit based on total payload size */
+			} while (bp1 != NULL);
+			start_ccb->ccb_trim = trim;
+			softc->dsm_req++;
+			nda_nvme_trim(softc, &start_ccb->nvmeio, &trim->dsm,
+			    dsm_range - &trim->dsm);
+			start_ccb->ccb_state = NDA_CCB_TRIM;
 			/*
 			 * Note: We can have multiple TRIMs in flight, so we don't call
 			 * cam_iosched_submit_trim(softc->cam_iosched);
@@ -932,10 +971,10 @@ ndastart(struct cam_periph *periph, union ccb *start_c
 			nda_nvme_flush(softc, nvmeio);
 			break;
 		}
-		start_ccb->ccb_h.ccb_state = NDA_CCB_BUFFER_IO;
-		start_ccb->ccb_h.flags |= CAM_UNLOCKED;
+		start_ccb->ccb_state = NDA_CCB_BUFFER_IO;
+		start_ccb->ccb_bp = bp;
 out:
-		start_ccb->ccb_h.ccb_bp = bp;
+		start_ccb->ccb_h.flags |= CAM_UNLOCKED;
 		softc->outstanding_cmds++;
 		softc->refcount++;
 		cam_periph_unlock(periph);
@@ -963,16 +1002,14 @@ ndadone(struct cam_periph *periph, union ccb *done_ccb
 
 	CAM_DEBUG(path, CAM_DEBUG_TRACE, ("ndadone\n"));
 
-	state = nvmeio->ccb_h.ccb_state & NDA_CCB_TYPE_MASK;
+	state = nvmeio->ccb_state & NDA_CCB_TYPE_MASK;
 	switch (state) {
 	case NDA_CCB_BUFFER_IO:
 	case NDA_CCB_TRIM:
 	{
-		struct bio *bp;
 		int error;
 
 		cam_periph_lock(periph);
-		bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
 		if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 			error = ndaerror(done_ccb, 0, 0);
 			if (error == ERESTART) {
@@ -991,59 +1028,68 @@ ndadone(struct cam_periph *periph, union ccb *done_ccb
 				panic("REQ_CMP with QFRZN");
 			error = 0;
 		}
-		bp->bio_error = error;
-		if (error != 0) {
-			bp->bio_resid = bp->bio_bcount;
-			bp->bio_flags |= BIO_ERROR;
-		} else {
-			bp->bio_resid = 0;
-		}
-		if (state == NDA_CCB_TRIM)
-			free(bp->bio_driver2, M_NVMEDA);
-		softc->outstanding_cmds--;
+		if (state == NDA_CCB_BUFFER_IO) {
+			struct bio *bp;
 
-		/*
-		 * We need to call cam_iosched before we call biodone so that we
-		 * don't measure any activity that happens in the completion
-		 * routine, which in the case of sendfile can be quite
-		 * extensive.
-		 */
-		cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
-		xpt_release_ccb(done_ccb);
-		if (state == NDA_CCB_TRIM) {
-#ifdef notyet
+			bp = (struct bio *)done_ccb->ccb_bp;
+			bp->bio_error = error;
+			if (error != 0) {
+				bp->bio_resid = bp->bio_bcount;
+				bp->bio_flags |= BIO_ERROR;
+			} else {
+				bp->bio_resid = 0;
+			}
+			softc->outstanding_cmds--;
+
+			/*
+			 * We need to call cam_iosched before we call biodone so that we
+			 * don't measure any activity that happens in the completion
+			 * routine, which in the case of sendfile can be quite
+			 * extensive.
+			 */
+			cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
+			xpt_release_ccb(done_ccb);
+			ndaschedule(periph);
+			cam_periph_unlock(periph);
+			biodone(bp);
+		} else { /* state == NDA_CCB_TRIM */
+			struct nda_trim_request *trim;
+			struct bio *bp1, *bp2;
 			TAILQ_HEAD(, bio) queue;
-			struct bio *bp1;
 
+			trim = nvmeio->ccb_trim;
 			TAILQ_INIT(&queue);
-			TAILQ_CONCAT(&queue, &softc->trim_req.bps, bio_queue);
-#endif
+			TAILQ_CONCAT(&queue, &trim->bps, bio_queue);
+			free(trim, M_NVMEDA);
+
 			/*
 			 * Since we can have multiple trims in flight, we don't
 			 * need to call this here.
 			 * cam_iosched_trim_done(softc->cam_iosched);
 			 */
+			/*
+			 * The the I/O scheduler that we're finishing the I/O
+			 * so we can keep book. The first one we pass in the CCB
+			 * which has the timing information. The rest we pass in NULL
+			 * so we can keep proper counts.
+			 */
+			bp1 = TAILQ_FIRST(&queue);
+			cam_iosched_bio_complete(softc->cam_iosched, bp1, done_ccb);
+			xpt_release_ccb(done_ccb);
 			ndaschedule(periph);
 			cam_periph_unlock(periph);
-#ifdef notyet
-/* Not yet collapsing several BIO_DELETE requests into one TRIM */
-			while ((bp1 = TAILQ_FIRST(&queue)) != NULL) {
-				TAILQ_REMOVE(&queue, bp1, bio_queue);
-				bp1->bio_error = error;
+			while ((bp2 = TAILQ_FIRST(&queue)) != NULL) {
+				TAILQ_REMOVE(&queue, bp2, bio_queue);
+				bp2->bio_error = error;
 				if (error != 0) {
-					bp1->bio_flags |= BIO_ERROR;
-					bp1->bio_resid = bp1->bio_bcount;
+					bp2->bio_flags |= BIO_ERROR;
+					bp2->bio_resid = bp1->bio_bcount;
 				} else
-					bp1->bio_resid = 0;
-				biodone(bp1);
+					bp2->bio_resid = 0;
+				if (bp1 != bp2)
+					cam_iosched_bio_complete(softc->cam_iosched, bp2, NULL);
+				biodone(bp2);
 			}
-#else
-			biodone(bp);
-#endif
-		} else {
-			ndaschedule(periph);
-			cam_periph_unlock(periph);
-			biodone(bp);
 		}
 		return;
 	}

Modified: head/sys/dev/nvme/nvme.h
==============================================================================
--- head/sys/dev/nvme/nvme.h	Wed Mar 14 16:44:16 2018	(r330931)
+++ head/sys/dev/nvme/nvme.h	Wed Mar 14 16:44:50 2018	(r330932)
@@ -484,6 +484,9 @@ struct nvme_dsm_range {
 	uint64_t starting_lba;
 } __packed;
 
+/* Largest DSM Trim that can be done */
+#define NVME_MAX_DSM_TRIM		4096
+
 _Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_ranage");
 
 /* status code types */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201803141644.w2EGioti046140>