Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 18 Aug 2018 22:21:59 +0000 (UTC)
From:      Kirk McKusick <mckusick@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r338031 - in head/sys/ufs: ffs ufs
Message-ID:  <201808182221.w7IMLxBV059347@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mckusick
Date: Sat Aug 18 22:21:59 2018
New Revision: 338031
URL: https://svnweb.freebsd.org/changeset/base/338031

Log:
  Replace the TRIM consolidation framework originally added in -r337396
  driven by problems found with the algorithms being tested for TRIM
  consolidation.
  
  Reported by:  Peter Holm
  Suggested by: kib
  Reviewed by:  kib
  Sponsored by: Netflix

Modified:
  head/sys/ufs/ffs/ffs_alloc.c
  head/sys/ufs/ffs/ffs_balloc.c
  head/sys/ufs/ffs/ffs_extern.h
  head/sys/ufs/ffs/ffs_inode.c
  head/sys/ufs/ffs/ffs_snapshot.c
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ffs/ffs_vfsops.c
  head/sys/ufs/ffs/softdep.h
  head/sys/ufs/ufs/ufsmount.h

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/ffs_alloc.c	Sat Aug 18 22:21:59 2018	(r338031)
@@ -110,8 +110,6 @@ static ufs2_daddr_t
 static void	ffs_blkfree_cg(struct ufsmount *, struct fs *,
 		    struct vnode *, ufs2_daddr_t, long, ino_t,
 		    struct workhead *);
-static void	ffs_blkfree_trim_completed(struct buf *);
-static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
 #ifdef INVARIANTS
 static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
@@ -395,8 +393,24 @@ retry:
 	if (bno > 0) {
 		bp->b_blkno = fsbtodb(fs, bno);
 		if (!DOINGSOFTDEP(vp))
+			/*
+			 * The usual case is that a smaller fragment that
+			 * was just allocated has been replaced with a bigger
+			 * fragment or a full-size block. If it is marked as
+			 * B_DELWRI, the current contents have not been written
+			 * to disk. It is possible that the block was written
+			 * earlier, but very uncommon. If the block has never
+			 * been written, there is no need to send a BIO_DELETE
+			 * for it when it is freed. The gain from avoiding the
+			 * TRIMs for the common case of unwritten blocks far
+			 * exceeds the cost of the write amplification for the
+			 * uncommon case of failing to send a TRIM for a block
+			 * that had been written.
+			 */
 			ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
-			    ip->i_number, vp->v_type, NULL);
+			    ip->i_number, vp->v_type, NULL,
+			    (bp->b_flags & B_DELWRI) != 0 ?
+			    NOTRIM_KEY : SINGLETON_KEY);
 		delta = btodb(nsize - osize);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
 		if (flags & IO_EXT)
@@ -521,7 +535,7 @@ ffs_reallocblks_ufs1(ap)
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
-	struct buf *sbp, *ebp;
+	struct buf *sbp, *ebp, *bp;
 	ufs1_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
@@ -730,14 +744,30 @@ ffs_reallocblks_ufs1(ap)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
+			/*
+			 * The usual case is that a set of N-contiguous blocks
+			 * that was just allocated has been replaced with a
+			 * set of N+1-contiguous blocks. If they are marked as
+			 * B_DELWRI, the current contents have not been written
+			 * to disk. It is possible that the blocks were written
+			 * earlier, but very uncommon. If the blocks have never
+			 * been written, there is no need to send a BIO_DELETE
+			 * for them when they are freed. The gain from avoiding
+			 * the TRIMs for the common case of unwritten blocks
+			 * far exceeds the cost of the write amplification for
+			 * the uncommon case of failing to send a TRIM for the
+			 * blocks that had been written.
+			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
-			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+			    dbtofsb(fs, bp->b_blkno),
+			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+			    (bp->b_flags & B_DELWRI) != 0 ?
+			    NOTRIM_KEY : SINGLETON_KEY);
+		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-		if (!ffs_checkblk(ip,
-		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -771,7 +801,7 @@ ffs_reallocblks_ufs2(ap)
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
-	struct buf *sbp, *ebp;
+	struct buf *sbp, *ebp, *bp;
 	ufs2_daddr_t *bap, *sbap, *ebap;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
@@ -978,14 +1008,30 @@ ffs_reallocblks_ufs2(ap)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+		bp = buflist->bs_children[i];
 		if (!DOINGSOFTDEP(vp))
+			/*
+			 * The usual case is that a set of N-contiguous blocks
+			 * that was just allocated has been replaced with a
+			 * set of N+1-contiguous blocks. If they are marked as
+			 * B_DELWRI, the current contents have not been written
+			 * to disk. It is possible that the blocks were written
+			 * earlier, but very uncommon. If the blocks have never
+			 * been written, there is no need to send a BIO_DELETE
+			 * for them when they are freed. The gain from avoiding
+			 * the TRIMs for the common case of unwritten blocks
+			 * far exceeds the cost of the write amplification for
+			 * the uncommon case of failing to send a TRIM for the
+			 * blocks that had been written.
+			 */
 			ffs_blkfree(ump, fs, ump->um_devvp,
-			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+			    dbtofsb(fs, bp->b_blkno),
+			    fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+			    (bp->b_flags & B_DELWRI) != 0 ?
+			    NOTRIM_KEY : SINGLETON_KEY);
+		bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-		if (!ffs_checkblk(ip,
-		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+		if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -1823,8 +1869,7 @@ gotit:
 	/* XXX Fixme. */
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
-		    size, 0);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
 	UFS_LOCK(ump);
 	return (blkno);
 }
@@ -2254,6 +2299,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
 	bdwrite(bp);
 }
 
+/*
+ * Structures and routines associated with trim management.
+ */
+MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
+
+#define	TRIMLIST_HASH(ump, key) \
+	(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
+
+static void	ffs_blkfree_trim_completed(struct buf *);
+static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
+
 struct ffs_blkfree_trim_params {
 	struct task task;
 	struct ufsmount *ump;
@@ -2277,7 +2333,7 @@ ffs_blkfree_trim_task(ctx, pending)
 	    tp->inum, tp->pdephd);
 	vn_finished_secondary_write(UFSTOVFS(tp->ump));
 	atomic_add_int(&tp->ump->um_trim_inflight, -1);
-	free(tp, M_TEMP);
+	free(tp, M_TRIM);
 }
 
 static void
@@ -2287,14 +2343,46 @@ ffs_blkfree_trim_completed(bp)
 	struct ffs_blkfree_trim_params *tp;
 
 	tp = bp->b_fsprivate1;
-	free(bp, M_TEMP);
+	free(bp, M_TRIM);
 	TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
 	taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
 }
 
+/*
+ * Allocate a new key to use to identify a range of blocks.
+ */
+u_long
+ffs_blkrelease_start(ump, devvp, inum)
+	struct ufsmount *ump;
+	struct vnode *devvp;
+	ino_t inum;
+{
+	static u_long masterkey;
+	u_long key;
+
+	if ((ump->um_flags & UM_CANDELETE) == 0)
+		return (SINGLETON_KEY);
+	do {
+		key = atomic_fetchadd_long(&masterkey, 1);
+	} while (key < FIRST_VALID_KEY);
+	return (key);
+}
+
+/*
+ * Deallocate a key that has been used to identify a range of blocks.
+ */
 void
-ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
+ffs_blkrelease_finish(ump, key)
 	struct ufsmount *ump;
+	u_long key;
+{
+
+	return;
+}
+
+void
+ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key)
+	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
@@ -2302,6 +2390,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 	ino_t inum;
 	enum vtype vtype;
 	struct workhead *dephd;
+	u_long key;
 {
 	struct mount *mp;
 	struct buf *bp;
@@ -2319,10 +2408,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 		return;
 	}
 	/*
-	 * Nothing to delay if TRIM is disabled, or the operation is
-	 * performed on the snapshot.
+	 * Nothing to delay if TRIM is not required for this block or TRIM
+	 * is disabled or the operation is performed on a snapshot.
 	 */
-	if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) {
+	if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) ||
+	    devvp->v_type == VREG) {
 		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
 		return;
 	}
@@ -2334,7 +2424,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 	 * and write some new data into it.
 	 */
 	atomic_add_int(&ump->um_trim_inflight, 1);
-	tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
+	tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
 	tp->ump = ump;
 	tp->devvp = devvp;
 	tp->bno = bno;
@@ -2347,7 +2437,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
 	} else
 		tp->pdephd = NULL;
 
-	bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO);
+	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
 	bp->b_iocmd = BIO_DELETE;
 	bp->b_iooffset = dbtob(fsbtodb(fs, bno));
 	bp->b_iodone = ffs_blkfree_trim_completed;
@@ -2822,6 +2912,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 	struct fs *fs;
 	ufs2_daddr_t blkno;
 	long blkcnt, blksize;
+	u_long key;
 	struct file *fp, *vfp;
 	cap_rights_t rights;
 	int filetype, error;
@@ -2956,15 +3047,18 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 		blkno = cmd.value;
 		blkcnt = cmd.size;
 		blksize = fs->fs_frag - (blkno % fs->fs_frag);
+		key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
 		while (blkcnt > 0) {
-			if (blksize > blkcnt)
+			if (blkcnt < blksize)
 				blksize = blkcnt;
 			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
-			    blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
+			    blksize * fs->fs_fsize, UFS_ROOTINO, 
+			    VDIR, NULL, key);
 			blkno += blksize;
 			blkcnt -= blksize;
 			blksize = fs->fs_frag;
 		}
+		ffs_blkrelease_finish(ump, key);
 		break;
 
 	/*

Modified: head/sys/ufs/ffs/ffs_balloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_balloc.c	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/ffs_balloc.c	Sat Aug 18 22:21:59 2018	(r338031)
@@ -553,7 +553,7 @@ fail:
 		lbns_remfree++;
 #endif
 		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number, vp->v_type, NULL);
+		    ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
 	}
 	return (error);
 }
@@ -1147,7 +1147,7 @@ fail:
 		lbns_remfree++;
 #endif
 		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-		    ip->i_number, vp->v_type, NULL);
+		    ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
 	}
 	return (error);
 }

Modified: head/sys/ufs/ffs/ffs_extern.h
==============================================================================
--- head/sys/ufs/ffs/ffs_extern.h	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/ffs_extern.h	Sat Aug 18 22:21:59 2018	(r338031)
@@ -63,9 +63,11 @@ int	ffs_balloc_ufs2(struct vnode *a_vp, off_t a_starto
             struct ucred *a_cred, int a_flags, struct buf **a_bpp);
 int	ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
 void	ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
-	    ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *);
+	    ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, u_long);
 ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
 ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
+void	ffs_blkrelease_finish(struct ufsmount *, u_long);
+u_long	ffs_blkrelease_start(struct ufsmount *, struct vnode *, ino_t);
 int	ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
 void	ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
 void	ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
@@ -111,10 +113,26 @@ vfs_vget_t ffs_vget;
 int	ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int);
 void	process_deferred_inactive(struct mount *mp);
 
+/*
+ * Flags to ffs_vgetf
+ */
 #define	FFSV_FORCEINSMQ	0x0001
 
+/*
+ * Flags to ffs_reload
+ */
 #define	FFSR_FORCE	0x0001
 #define	FFSR_UNSUSPEND	0x0002
+
+/*
+ * Definitions for TRIM interface
+ *
+ * Special keys and recommended hash table size
+ */
+#define	NOTRIM_KEY	1	/* never written, so don't call trim for it */
+#define	SINGLETON_KEY	2	/* only block being freed, so trim it now */
+#define	FIRST_VALID_KEY	3	/* first valid key describing a block range */
+#define	MAXTRIMIO	1024	/* maximum expected outstanding trim requests */
 
 extern struct vop_vector ffs_vnodeops1;
 extern struct vop_vector ffs_fifoops1;

Modified: head/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- head/sys/ufs/ffs/ffs_inode.c	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/ffs_inode.c	Sat Aug 18 22:21:59 2018	(r338031)
@@ -197,6 +197,7 @@ ffs_truncate(vp, length, flags, cred)
 	int needextclean, extblocks;
 	int offset, size, level, nblocks;
 	int i, error, allerror, indiroff, waitforupdate;
+	u_long key;
 	off_t osize;
 
 	ip = VTOI(vp);
@@ -275,7 +276,7 @@ ffs_truncate(vp, length, flags, cred)
 					continue;
 				ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i],
 				    sblksize(fs, osize, i), ip->i_number,
-				    vp->v_type, NULL);
+				    vp->v_type, NULL, SINGLETON_KEY);
 			}
 		}
 	}
@@ -523,7 +524,7 @@ ffs_truncate(vp, length, flags, cred)
 				DIP_SET(ip, i_ib[level], 0);
 				ffs_blkfree(ump, fs, ump->um_devvp, bn,
 				    fs->fs_bsize, ip->i_number,
-				    vp->v_type, NULL);
+				    vp->v_type, NULL, SINGLETON_KEY);
 				blocksreleased += nblocks;
 			}
 		}
@@ -534,6 +535,7 @@ ffs_truncate(vp, length, flags, cred)
 	/*
 	 * All whole direct blocks or frags.
 	 */
+	key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number);
 	for (i = UFS_NDADDR - 1; i > lastblock; i--) {
 		long bsize;
 
@@ -543,9 +545,10 @@ ffs_truncate(vp, length, flags, cred)
 		DIP_SET(ip, i_db[i], 0);
 		bsize = blksize(fs, ip, i);
 		ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number,
-		    vp->v_type, NULL);
+		    vp->v_type, NULL, key);
 		blocksreleased += btodb(bsize);
 	}
+	ffs_blkrelease_finish(ump, key);
 	if (lastblock < 0)
 		goto done;
 
@@ -575,7 +578,8 @@ ffs_truncate(vp, length, flags, cred)
 			 */
 			bn += numfrags(fs, newspace);
 			ffs_blkfree(ump, fs, ump->um_devvp, bn,
-			   oldspace - newspace, ip->i_number, vp->v_type, NULL);
+			   oldspace - newspace, ip->i_number, vp->v_type,
+			   NULL, SINGLETON_KEY);
 			blocksreleased += btodb(oldspace - newspace);
 		}
 	}
@@ -634,8 +638,10 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 {
 	struct buf *bp;
 	struct fs *fs;
+	struct ufsmount *ump;
 	struct vnode *vp;
 	caddr_t copy = NULL;
+	u_long key;
 	int i, nblocks, error = 0, allerror = 0;
 	ufs2_daddr_t nb, nlbn, last;
 	ufs2_daddr_t blkcount, factor, blocksreleased = 0;
@@ -644,6 +650,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 #define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i])
 
 	fs = ITOFS(ip);
+	ump = ITOUMP(ip);
 
 	/*
 	 * Calculate index in current block of last
@@ -719,6 +726,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 	/*
 	 * Recursively free totally unused blocks.
 	 */
+	key = ffs_blkrelease_start(ump, ITODEVVP(ip), ip->i_number);
 	for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
 	    i--, nlbn += factor) {
 		nb = BAP(ip, i);
@@ -730,10 +738,11 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 				allerror = error;
 			blocksreleased += blkcount;
 		}
-		ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize,
-		    ip->i_number, vp->v_type, NULL);
+		ffs_blkfree(ump, fs, ITODEVVP(ip), nb, fs->fs_bsize,
+		    ip->i_number, vp->v_type, NULL, key);
 		blocksreleased += nblocks;
 	}
+	ffs_blkrelease_finish(ump, key);
 
 	/*
 	 * Recursively free last partial block.

Modified: head/sys/ufs/ffs/ffs_snapshot.c
==============================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/ffs_snapshot.c	Sat Aug 18 22:21:59 2018	(r338031)
@@ -583,7 +583,7 @@ loop:
 			if (len != 0 && len < fs->fs_bsize) {
 				ffs_blkfree(ump, copy_fs, vp,
 				    DIP(xp, i_db[loc]), len, xp->i_number,
-				    xvp->v_type, NULL);
+				    xvp->v_type, NULL, SINGLETON_KEY);
 				blkno = DIP(xp, i_db[loc]);
 				DIP_SET(xp, i_db[loc], 0);
 			}
@@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expung
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
 		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-		    vp->v_type, NULL);
+		    vp->v_type, NULL, SINGLETON_KEY);
 	}
 	return (0);
 }
@@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expung
 		if (blkno == BLK_SNAP)
 			blkno = blkstofrags(fs, lblkno);
 		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-		    vp->v_type, NULL);
+		    vp->v_type, NULL, SINGLETON_KEY);
 	}
 	return (0);
 }

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/ffs_softdep.c	Sat Aug 18 22:21:59 2018	(r338031)
@@ -869,7 +869,7 @@ static	void cancel_allocdirect(struct allocdirectlst *
 	    struct allocdirect *, struct freeblks *);
 static	int check_inode_unwritten(struct inodedep *);
 static	int free_inodedep(struct inodedep *);
-static	void freework_freeblock(struct freework *);
+static	void freework_freeblock(struct freework *, u_long);
 static	void freework_enqueue(struct freework *);
 static	int handle_workitem_freeblocks(struct freeblks *, int);
 static	int handle_complete_freeblocks(struct freeblks *, int);
@@ -884,7 +884,7 @@ static	struct allocindir *newallocindir(struct inode *
 	    ufs2_daddr_t, ufs_lbn_t);
 static	void handle_workitem_freefrag(struct freefrag *);
 static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
-	    ufs_lbn_t);
+	    ufs_lbn_t, u_long);
 static	void allocdirect_merge(struct allocdirectlst *,
 	    struct allocdirect *, struct allocdirect *);
 static	struct freefrag *allocindir_merge(struct allocindir *,
@@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno,
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
 	if (oldblkno && oldblkno != newblkno)
-		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+		/*
+		 * The usual case is that a smaller fragment that
+		 * was just allocated has been replaced with a bigger
+		 * fragment or a full-size block. If it is marked as
+		 * B_DELWRI, the current contents have not been written
+		 * to disk. It is possible that the block was written
+		 * earlier, but very uncommon. If the block has never
+		 * been written, there is no need to send a BIO_DELETE
+		 * for it when it is freed. The gain from avoiding the
+		 * TRIMs for the common case of unwritten blocks far
+		 * exceeds the cost of the write amplification for the
+		 * uncommon case of failing to send a TRIM for a block
+		 * that had been written.
+		 */
+		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
 	else
 		freefrag = NULL;
 
@@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn)
  * Allocate a new freefrag structure.
  */
 static struct freefrag *
-newfreefrag(ip, blkno, size, lbn)
+newfreefrag(ip, blkno, size, lbn, key)
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	long size;
 	ufs_lbn_t lbn;
+	u_long key;
 {
 	struct freefrag *freefrag;
 	struct ufsmount *ump;
@@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn)
 	freefrag->ff_vtype = ITOV(ip)->v_type;
 	freefrag->ff_blkno = blkno;
 	freefrag->ff_fragsize = size;
+	freefrag->ff_key = key;
 
 	if (MOUNTEDSUJ(UFSTOVFS(ump))) {
 		freefrag->ff_jdep = (struct worklist *)
@@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag)
 	}
 	FREE_LOCK(ump);
 	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
-	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
+	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
+	   &wkhd, freefrag->ff_key);
 	ACQUIRE_LOCK(ump);
 	WORKITEM_FREE(freefrag, D_FREEFRAG);
 	FREE_LOCK(ump);
@@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, ne
 
 	lbn = bp->b_lblkno;
 	if (oldblkno && oldblkno != newblkno)
-		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+		/*
+		 * The usual case is that a smaller fragment that
+		 * was just allocated has been replaced with a bigger
+		 * fragment or a full-size block. If it is marked as
+		 * B_DELWRI, the current contents have not been written
+		 * to disk. It is possible that the block was written
+		 * earlier, but very uncommon. If the block has never
+		 * been written, there is no need to send a BIO_DELETE
+		 * for it when it is freed. The gain from avoiding the
+		 * TRIMs for the common case of unwritten blocks far
+		 * exceeds the cost of the write amplification for the
+		 * uncommon case of failing to send a TRIM for a block
+		 * that had been written.
+		 */
+		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
 	else
 		freefrag = NULL;
 
@@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
 	struct jnewblk *jnewblk;
 
 	if (oldblkno)
-		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
+		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
+		    SINGLETON_KEY);
 	else
 		freefrag = NULL;
 	ACQUIRE_LOCK(ITOUMP(ip));
@@ -7724,8 +7758,9 @@ free_inodedep(inodedep)
  * in memory immediately.
  */
 static void
-freework_freeblock(freework)
+freework_freeblock(freework, key)
 	struct freework *freework;
+	u_long key;
 {
 	struct freeblks *freeblks;
 	struct jnewblk *jnewblk;
@@ -7779,10 +7814,10 @@ freework_freeblock(freework)
 	FREE_LOCK(ump);
 	freeblks_free(ump, freeblks, btodb(bsize));
 	CTR4(KTR_SUJ,
-	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
+	    "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
 	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
-	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
+	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
 	ACQUIRE_LOCK(ump);
 	/*
 	 * The jnewblk will be discarded and the bits in the map never
@@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework)
 		return;
 	}
 	if (freework->fw_off == NINDIR(fs)) {
-		freework_freeblock(freework);
+		freework_freeblock(freework, SINGLETON_KEY);
 		return;
 	}
 	freework->fw_state |= INPROGRESS;
@@ -7894,10 +7929,12 @@ handle_workitem_freeblocks(freeblks, flags)
 	struct allocindir *aip;
 	struct ufsmount *ump;
 	struct worklist *wk;
+	u_long key;
 
 	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
 	    ("handle_workitem_freeblocks: Journal entries not written."));
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
 	ACQUIRE_LOCK(ump);
 	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
 		WORKLIST_REMOVE(wk);
@@ -7935,7 +7972,7 @@ handle_workitem_freeblocks(freeblks, flags)
 			if (freework->fw_lbn <= -UFS_NDADDR)
 				handle_workitem_indirblk(freework);
 			else
-				freework_freeblock(freework);
+				freework_freeblock(freework, key);
 			continue;
 		default:
 			panic("handle_workitem_freeblocks: Unknown type %s",
@@ -7948,6 +7985,7 @@ handle_workitem_freeblocks(freeblks, flags)
 		freeblks = NULL;
 	}
 	FREE_LOCK(ump);
+	ffs_blkrelease_finish(ump, key);
 	if (freeblks)
 		return handle_complete_freeblocks(freeblks, flags);
 	return (0);
@@ -8080,13 +8118,9 @@ indir_trunc(freework, dbn, lbn)
 	ufs1_daddr_t *bap1;
 	ufs2_daddr_t nb, nnb, *bap2;
 	ufs_lbn_t lbnadd, nlbn;
-	int i, nblocks, ufs1fmt;
-	int freedblocks;
-	int goingaway;
-	int freedeps;
-	int needj;
-	int level;
-	int cnt;
+	u_long key;
+	int nblocks, ufs1fmt, freedblocks;
+	int goingaway, freedeps, needj, level, cnt, i;
 
 	freeblks = freework->fw_freeblks;
 	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
@@ -8180,6 +8214,7 @@ indir_trunc(freework, dbn, lbn)
 	 * arranges for the current level to be freed when subordinates
 	 * are free when journaling.
 	 */
+	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
 	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
 		if (i != NINDIR(fs) - 1) {
 			if (ufs1fmt)
@@ -8215,13 +8250,14 @@ indir_trunc(freework, dbn, lbn)
 				freedeps++;
 			}
 			CTR3(KTR_SUJ,
-			    "indir_trunc: ino %d blkno %jd size %ld",
+			    "indir_trunc: ino %jd blkno %jd size %d",
 			    freeblks->fb_inum, nb, fs->fs_bsize);
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
 			    fs->fs_bsize, freeblks->fb_inum,
-			    freeblks->fb_vtype, &wkhd);
+			    freeblks->fb_vtype, &wkhd, key);
 		}
 	}
+	ffs_blkrelease_finish(ump, key);
 	if (goingaway) {
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
@@ -8244,7 +8280,7 @@ indir_trunc(freework, dbn, lbn)
 		if (level == 0)
 			freeblks->fb_cgwait += freedeps;
 		if (freework->fw_ref == 0)
-			freework_freeblock(freework);
+			freework_freeblock(freework, SINGLETON_KEY);
 		FREE_LOCK(ump);
 		return;
 	}
@@ -8253,10 +8289,10 @@ indir_trunc(freework, dbn, lbn)
 	 */
 	dbn = dbtofsb(fs, dbn);
 	CTR3(KTR_SUJ,
-	    "indir_trunc 2: ino %d blkno %jd size %ld",
+	    "indir_trunc 2: ino %jd blkno %jd size %d",
 	    freeblks->fb_inum, dbn, fs->fs_bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
-	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
+	    freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
 	/* Non SUJ softdep does single-threaded truncations. */
 	if (freework->fw_blkno == dbn) {
 		freework->fw_state |= ALLCOMPLETE;

Modified: head/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/ffs_vfsops.c	Sat Aug 18 22:21:59 2018	(r338031)
@@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td)
 			    taskqueue_thread_enqueue, &ump->um_trim_tq);
 			taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
 			    "%s trim", mp->mnt_stat.f_mntonname);
+			ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
+			    &ump->um_trimlisthashsize);
 		}
 	}
 
@@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags)
 			pause("ufsutr", hz);
 		taskqueue_drain_all(ump->um_trim_tq);
 		taskqueue_free(ump->um_trim_tq);
+		free (ump->um_trimhash, M_TRIM);
 	}
 	g_topology_lock();
 	if (ump->um_fsckpid > 0) {

Modified: head/sys/ufs/ffs/softdep.h
==============================================================================
--- head/sys/ufs/ffs/softdep.h	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ffs/softdep.h	Sat Aug 18 22:21:59 2018	(r338031)
@@ -557,6 +557,7 @@ struct freefrag {
 	long	ff_fragsize;		/* size of fragment being deleted */
 	ino_t	ff_inum;		/* owning inode number */
 	enum	vtype ff_vtype;		/* owning inode's file type */
+	int	ff_key;			/* trim key when deleted */
 };
 
 /*

Modified: head/sys/ufs/ufs/ufsmount.h
==============================================================================
--- head/sys/ufs/ufs/ufsmount.h	Sat Aug 18 22:07:48 2018	(r338030)
+++ head/sys/ufs/ufs/ufsmount.h	Sat Aug 18 22:21:59 2018	(r338031)
@@ -47,6 +47,7 @@ struct ufs_args {
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_UFSMNT);
+MALLOC_DECLARE(M_TRIM);
 #endif
 
 struct buf;
@@ -63,6 +64,7 @@ struct inodedep;
 
 TAILQ_HEAD(inodedeplst, inodedep);
 LIST_HEAD(bmsafemaphd, bmsafemap);
+LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params);
 
 /*
  * This structure describes the UFS specific mount structure data.
@@ -70,7 +72,6 @@ LIST_HEAD(bmsafemaphd, bmsafemap);
  * UFS (UFS1, UFS2, etc).
  *
  * Lock reference:
- *	a - atomic operations
  *	c - set at allocation then constant until freed
  *	i - ufsmount interlock (UFS_LOCK / UFS_UNLOCK)
  *	q - associated quota file is locked
@@ -99,8 +100,13 @@ struct ufsmount {
 	char	um_qflags[MAXQUOTAS];		/* (i) quota specific flags */
 	int64_t	um_savedmaxfilesize;		/* (c) track maxfilesize */
 	u_int	um_flags;			/* (i) filesystem flags */
-	u_int	um_trim_inflight;		/* (a) outstanding trim count */
+	u_int	um_trim_inflight;		/* (i) outstanding trim count */
+	u_int	um_trim_inflight_blks;		/* (i) outstanding trim blks */
+	u_long	um_trim_total;			/* (i) total trim count */
+	u_long	um_trim_total_blks;		/* (i) total trim block count */
 	struct	taskqueue *um_trim_tq;		/* (c) trim request queue */
+	struct	trimlist_hashhead *um_trimhash;	/* (i) trimlist hash table */
+	u_long	um_trimlisthashsize;		/* (i) trim hash table size-1 */
 						/* (c) - below function ptrs */
 	int	(*um_balloc)(struct vnode *, off_t, int, struct ucred *,
 		    int, struct buf **);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808182221.w7IMLxBV059347>