Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 19 Aug 2018 16:56:43 +0000 (UTC)
From:      Kirk McKusick <mckusick@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r338056 - head/sys/ufs/ffs
Message-ID:  <201808191656.w7JGuhCQ031146@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mckusick
Date: Sun Aug 19 16:56:42 2018
New Revision: 338056
URL: https://svnweb.freebsd.org/changeset/base/338056

Log:
  Add consolidation of TRIM / BIO_DELETE commands to the UFS/FFS filesystem.
  
  When deleting files on filesystems that are stored on flash-memory
  (solid-state) disk drives, the filesystem notifies the underlying
  disk of the blocks that it is no longer using. The notification
  allows the drive to avoid saving these blocks when it needs to
  flash (zero out) one of its flash pages. These notifications of
  no-longer-being-used blocks are referred to as TRIM notifications.
  In FreeBSD these TRIM notifications are sent from the filesystem
  to the drive using the BIO_DELETE command.
  
  Until now, the filesystem would send a separate message to the drive
  for each block of the file that was deleted. Each Gigabyte of file
  size resulted in over 3000 TRIM messages being sent to the drive.
  This burst of messages can overwhelm the drive's task queue causing
  multiple second delays for read and write requests.
  
  This implementation collects runs of contiguous blocks in the file
  and then consolidates them into a single BIO_DELETE command to the
  drive. The BIO_DELETE command describes the run of blocks as a
  single large block being deleted. Each Gigabyte of file size can
  result in as few as two BIO_DELETE commands and typically results
  in fewer than ten.  Though these larger BIO_DELETE commands take longer to
  run, they do not clog the drive task queue, so read and write
  commands can intersperse effectively with them.
  
  Though this new feature has been thoroughly reviewed and tested, it
  is being added disabled by default so as to minimize the possibility
  of disrupting the upcoming 12.0 release. It can be enabled by running
  ``sysctl vfs.ffs.dotrimcons=1''. Users are encouraged to test it.
  If no problems arise, we will consider requesting that it be enabled
  by default for 12.0.
  
  Reviewed by:  kib
  Tested by:    Peter Holm
  Sponsored by: Netflix

Modified:
  head/sys/ufs/ffs/ffs_alloc.c

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c	Sun Aug 19 16:14:59 2018	(r338055)
+++ head/sys/ufs/ffs/ffs_alloc.c	Sun Aug 19 16:56:42 2018	(r338056)
@@ -484,6 +484,10 @@ static int doreallocblks = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0,
 "enable block reallocation");
 
+static int dotrimcons = 1;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RW, &dotrimcons, 0,
+"enable BIO_DELETE / TRIM consolodation");
+
 static int maxclustersearch = 10;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch,
 0, "max number of cylinder group to search for contigous blocks");
@@ -2301,51 +2305,193 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
 
 /*
  * Structures and routines associated with trim management.
+ *
+ * The following requests are passed to trim_lookup to indicate
+ * the actions that should be taken.
  */
+#define	NEW	1	/* if found, error else allocate and hash it */
+#define	OLD	2	/* if not found, error, else return it */
+#define	REPLACE	3	/* if not found, error else unhash and reallocate it */
+#define	DONE	4	/* if not found, error else unhash and return it */
+#define	SINGLE	5	/* don't look up, just allocate it and don't hash it */
+
 MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
 
 #define	TRIMLIST_HASH(ump, key) \
 	(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
 
-static void	ffs_blkfree_trim_completed(struct buf *);
-static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
+/*
+ * These structures describe each of the block free requests aggregated
+ * together to make up a trim request.
+ */
+struct trim_blkreq {
+	TAILQ_ENTRY(trim_blkreq) blkreqlist;
+	ufs2_daddr_t bno;
+	long size;
+	struct workhead *pdephd;
+	struct workhead dephd;
+};
 
+/*
+ * Description of a trim request.
+ */
 struct ffs_blkfree_trim_params {
+	TAILQ_HEAD(, trim_blkreq) blklist;
+	LIST_ENTRY(ffs_blkfree_trim_params) hashlist;
 	struct task task;
 	struct ufsmount *ump;
 	struct vnode *devvp;
+	ino_t inum;
 	ufs2_daddr_t bno;
 	long size;
-	ino_t inum;
-	struct workhead *pdephd;
-	struct workhead dephd;
+	long key;
 };
 
+static void	ffs_blkfree_trim_completed(struct buf *);
+static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
+static struct	ffs_blkfree_trim_params *trim_lookup(struct ufsmount *,
+		    struct vnode *, ufs2_daddr_t, long, ino_t, u_long, int);
+static void	ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *);
+
+/*
+ * Called on trim completion to start a task to free the associated block(s).
+ */
 static void
+ffs_blkfree_trim_completed(bp)
+	struct buf *bp;
+{
+	struct ffs_blkfree_trim_params *tp;
+
+	tp = bp->b_fsprivate1;
+	free(bp, M_TRIM);
+	TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
+	taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
+}
+
+/*
+ * Trim completion task that free associated block(s).
+ */
+static void
 ffs_blkfree_trim_task(ctx, pending)
 	void *ctx;
 	int pending;
 {
 	struct ffs_blkfree_trim_params *tp;
+	struct trim_blkreq *blkelm;
+	struct ufsmount *ump;
 
 	tp = ctx;
-	ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size,
-	    tp->inum, tp->pdephd);
-	vn_finished_secondary_write(UFSTOVFS(tp->ump));
-	atomic_add_int(&tp->ump->um_trim_inflight, -1);
+	ump = tp->ump;
+	while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) {
+		ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno,
+		    blkelm->size, tp->inum, blkelm->pdephd);
+		TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist);
+		free(blkelm, M_TRIM);
+	}
+	vn_finished_secondary_write(UFSTOVFS(ump));
+	UFS_LOCK(ump);
+	ump->um_trim_inflight -= 1;
+	ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size);
+	UFS_UNLOCK(ump);
 	free(tp, M_TRIM);
 }
 
-static void
-ffs_blkfree_trim_completed(bp)
-	struct buf *bp;
+/*
+ * Lookup a trim request by inode number.
+ * Allocate if requested (NEW, REPLACE, SINGLE).
+ */
+static struct ffs_blkfree_trim_params *
+trim_lookup(ump, devvp, bno, size, inum, key, alloctype)
+	struct ufsmount *ump;
+	struct vnode *devvp;
+	ufs2_daddr_t bno;
+	long size;
+	ino_t inum;
+	u_long key;
+	int alloctype;
 {
+	struct trimlist_hashhead *tphashhead;
+	struct ffs_blkfree_trim_params *tp, *ntp;
+
+	ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
+	if (alloctype != SINGLE) {
+		KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key"));
+		UFS_LOCK(ump);
+		tphashhead = TRIMLIST_HASH(ump, key);
+		LIST_FOREACH(tp, tphashhead, hashlist)
+			if (key == tp->key)
+				break;
+	}
+	switch (alloctype) {
+	case NEW:
+		KASSERT(tp == NULL, ("trim_lookup: found trim"));
+		break;
+	case OLD:
+		KASSERT(tp != NULL,
+		    ("trim_lookup: missing call to ffs_blkrelease_start()"));
+		UFS_UNLOCK(ump);
+		free(ntp, M_TRIM);
+		return (tp);
+	case REPLACE:
+		KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim"));
+		LIST_REMOVE(tp, hashlist);
+		/* tp will be freed by caller */
+		break;
+	case DONE:
+		KASSERT(tp != NULL, ("trim_lookup: missing DONE trim"));
+		LIST_REMOVE(tp, hashlist);
+		UFS_UNLOCK(ump);
+		free(ntp, M_TRIM);
+		return (tp);
+	}
+	TAILQ_INIT(&ntp->blklist);
+	ntp->ump = ump;
+	ntp->devvp = devvp;
+	ntp->bno = bno;
+	ntp->size = size;
+	ntp->inum = inum;
+	ntp->key = key;
+	if (alloctype != SINGLE) {
+		LIST_INSERT_HEAD(tphashhead, ntp, hashlist);
+		UFS_UNLOCK(ump);
+	}
+	return (ntp);
+}
+
+/*
+ * Dispatch a trim request.
+ */
+static void
+ffs_blkfree_sendtrim(tp)
 	struct ffs_blkfree_trim_params *tp;
+{
+	struct ufsmount *ump;
+	struct mount *mp;
+	struct buf *bp;
 
-	tp = bp->b_fsprivate1;
-	free(bp, M_TRIM);
-	TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
-	taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
+	/*
+	 * Postpone the set of the free bit in the cg bitmap until the
+	 * BIO_DELETE is completed.  Otherwise, due to disk queue
+	 * reordering, TRIM might be issued after we reuse the block
+	 * and write some new data into it.
+	 */
+	ump = tp->ump;
+	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
+	bp->b_iocmd = BIO_DELETE;
+	bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno));
+	bp->b_iodone = ffs_blkfree_trim_completed;
+	bp->b_bcount = tp->size;
+	bp->b_fsprivate1 = tp;
+	UFS_LOCK(ump);
+	ump->um_trim_total += 1;
+	ump->um_trim_inflight += 1;
+	ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size);
+	ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size);
+	UFS_UNLOCK(ump);
+
+	mp = UFSTOVFS(ump);
+	vn_start_secondary_write(NULL, &mp, 0);
+	g_vfs_strategy(ump->um_bo, bp);
 }
 
 /*
@@ -2360,11 +2506,12 @@ ffs_blkrelease_start(ump, devvp, inum)
 	static u_long masterkey;
 	u_long key;
 
-	if ((ump->um_flags & UM_CANDELETE) == 0)
+	if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0)
 		return (SINGLETON_KEY);
 	do {
 		key = atomic_fetchadd_long(&masterkey, 1);
 	} while (key < FIRST_VALID_KEY);
+	(void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW);
 	return (key);
 }
 
@@ -2376,10 +2523,32 @@ ffs_blkrelease_finish(ump, key)
 	struct ufsmount *ump;
 	u_long key;
 {
+	struct ffs_blkfree_trim_params *tp;
 
-	return;
+	if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0)
+		return;
+	/*
+	 * We are done with sending blocks using this key. Look up the key
+	 * using the DONE alloctype (in tp) to request that it be unhashed
+	 * as we will not be adding to it. If the key has never been used,
+	 * tp->size will be zero, so we can just free tp. Otherwise the call
+	 * to ffs_blkfree_sendtrim(tp) causes the block range described by
+	 * tp to be issued (and then tp to be freed).
+	 */
+	tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE);
+	if (tp->size == 0)
+		free(tp, M_TRIM);
+	else
+		ffs_blkfree_sendtrim(tp);
 }
 
+/*
+ * Setup to free a block or fragment.
+ *
+ * Check for snapshots that might want to claim the block.
+ * If trims are requested, prepare a trim request. Attempt to
+ * aggregate consecutive blocks into a single trim request.
+ */
 void
 ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key)
 	struct ufsmount *ump;
@@ -2392,9 +2561,8 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 	struct workhead *dephd;
 	u_long key;
 {
-	struct mount *mp;
-	struct buf *bp;
-	struct ffs_blkfree_trim_params *tp;
+	struct ffs_blkfree_trim_params *tp, *ntp;
+	struct trim_blkreq *blkelm;
 
 	/*
 	 * Check to see if a snapshot wants to claim the block.
@@ -2416,37 +2584,72 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
 		return;
 	}
-
+	blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK);
+	blkelm->bno = bno;
+	blkelm->size = size;
+	if (dephd == NULL) {
+		blkelm->pdephd = NULL;
+	} else {
+		LIST_INIT(&blkelm->dephd);
+		LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list);
+		blkelm->pdephd = &blkelm->dephd;
+	}
+	if (key == SINGLETON_KEY) {
+		/*
+		 * Just a single non-contiguous piece. Use the SINGLE
+		 * alloctype to return a trim request that will not be
+		 * hashed for future lookup.
+		 */
+		tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE);
+		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
+		ffs_blkfree_sendtrim(tp);
+		return;
+	}
 	/*
-	 * Postpone the set of the free bit in the cg bitmap until the
-	 * BIO_DELETE is completed.  Otherwise, due to disk queue
-	 * reordering, TRIM might be issued after we reuse the block
-	 * and write some new data into it.
+	 * The callers of this function are not tracking whether or not
+	 * the blocks are contiguous. They are just saying that they
+	 * are freeing a set of blocks. It is this code that determines
+	 * the pieces of that range that are actually contiguous.
+	 *
+	 * Calling ffs_blkrelease_start() will have created an entry
+	 * that we will use.
 	 */
-	atomic_add_int(&ump->um_trim_inflight, 1);
-	tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
-	tp->ump = ump;
-	tp->devvp = devvp;
-	tp->bno = bno;
-	tp->size = size;
-	tp->inum = inum;
-	if (dephd != NULL) {
-		LIST_INIT(&tp->dephd);
-		LIST_SWAP(dephd, &tp->dephd, worklist, wk_list);
-		tp->pdephd = &tp->dephd;
-	} else
-		tp->pdephd = NULL;
-
-	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
-	bp->b_iocmd = BIO_DELETE;
-	bp->b_iooffset = dbtob(fsbtodb(fs, bno));
-	bp->b_iodone = ffs_blkfree_trim_completed;
-	bp->b_bcount = size;
-	bp->b_fsprivate1 = tp;
-
-	mp = UFSTOVFS(ump);
-	vn_start_secondary_write(NULL, &mp, 0);
-	g_vfs_strategy(ump->um_bo, bp);
+	tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD);
+	if (tp->size == 0) {
+		/*
+		 * First block of a potential range, set block and size
+		 * for the trim block.
+		 */
+		tp->bno = bno;
+		tp->size = size;
+		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
+		return;
+	}
+	/*
+	 * If this block is a continuation of the range (either
+	 * follows at the end or preceeds in the front) then we
+	 * add it to the front or back of the list and return.
+	 *
+	 * If it is not a continuation of the trim that we were
+	 * building, using the REPLACE alloctype, we request that
+	 * the old trim request (still in tp) be unhashed and a
+	 * new range started (in ntp). The ffs_blkfree_sendtrim(tp)
+	 * call causes the block range described by tp to be issued
+	 * (and then tp to be freed).
+	 */
+	if (bno + numfrags(fs, size) == tp->bno) {
+		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
+		tp->bno = bno;
+		tp->size += size;
+		return;
+	} else if (bno == tp->bno + numfrags(fs, tp->size)) {
+		TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist);
+		tp->size += size;
+		return;
+	}
+	ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE);
+	TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist);
+	ffs_blkfree_sendtrim(tp);
 }
 
 #ifdef INVARIANTS



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808191656.w7JGuhCQ031146>