Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 7 Jan 2021 22:58:44 GMT
From:      Kirk McKusick <mckusick@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 5cc52631b3b8 - main - Rewrite the disk I/O management system in fsck_ffs(8). Other than making fsck_ffs(8) run faster, there should be no functional change.
Message-ID:  <202101072258.107MwiZb028474@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by mckusick:

URL: https://cgit.FreeBSD.org/src/commit/?id=5cc52631b3b88dfc36d8049dc8bece8573c5f9af

commit 5cc52631b3b88dfc36d8049dc8bece8573c5f9af
Author:     Kirk McKusick <mckusick@FreeBSD.org>
AuthorDate: 2021-01-07 01:37:08 +0000
Commit:     Kirk McKusick <mckusick@FreeBSD.org>
CommitDate: 2021-01-07 23:03:15 +0000

    Rewrite the disk I/O management system in fsck_ffs(8). Other than
    making fsck_ffs(8) run faster, there should be no functional change.
    
    The original fsck_ffs(8) had its own disk I/O management system.
    When gjournal(8) was added to FreeBSD 7, code was added to fsck_ffs(8)
    to do the necessary gjournal rollback. Rather than use the existing
    fsck_ffs(8) disk I/O system, it wrote its own from scratch. Similarly
    when journalled soft updates were added in FreeBSD 9, code was added
    to fsck_ffs(8) to do the necessary journal rollback. And once again,
    rather than using either of the existing fsck_ffs(8) disk I/O
    systems, it wrote its own from scratch. Lastly the fsdb(8) utility
    uses the fsck_ffs(8) disk I/O management system. In preparation for
    making the changes necessary to enable snapshots to be taken when
    using journalled soft updates, it was necessary to have a single
    disk I/O system used by all the various subsystems in fsck_ffs(8).
    
    This commit merges the functionality required by all the different
    subsystems into a single disk I/O system that supports all of their
    needs. In so doing it picks up optimizations from each of them
    with the result that each of the subsystems does fewer reads and
    writes than it did with its own customized I/O system. It also
    greatly simplifies making changes to fsck_ffs(8) since everything
    goes through a single place. For example the ginode() function
    fetches an inode from the disk. When inode check hashes were added,
    they previously had to be checked in the code implementing inode
    fetch in each of the three different disk I/O systems. Now they
    need only be checked in ginode().
    
    Tested by:    Peter Holm
    Sponsored by: Netflix
---
 sbin/fsck_ffs/dir.c      | 139 ++++---
 sbin/fsck_ffs/ea.c       |   1 +
 sbin/fsck_ffs/fsck.h     |  81 +++--
 sbin/fsck_ffs/fsutil.c   | 424 +++++++++++++++++-----
 sbin/fsck_ffs/gjournal.c | 411 ++-------------------
 sbin/fsck_ffs/globs.c    |   9 +-
 sbin/fsck_ffs/inode.c    | 368 ++++++++++++++-----
 sbin/fsck_ffs/main.c     |  16 +-
 sbin/fsck_ffs/pass1.c    |  61 ++--
 sbin/fsck_ffs/pass1b.c   |  14 +-
 sbin/fsck_ffs/pass2.c    |  35 +-
 sbin/fsck_ffs/pass3.c    |   7 +-
 sbin/fsck_ffs/pass4.c    |  11 +-
 sbin/fsck_ffs/setup.c    |  18 +-
 sbin/fsck_ffs/suj.c      | 914 ++++++++++++++---------------------------------
 sbin/fsdb/fsdb.c         |  89 ++---
 sbin/fsdb/fsdb.h         |   1 +
 17 files changed, 1215 insertions(+), 1384 deletions(-)

diff --git a/sbin/fsck_ffs/dir.c b/sbin/fsck_ffs/dir.c
index a86d65a9f183..e88d1650ce5a 100644
--- a/sbin/fsck_ffs/dir.c
+++ b/sbin/fsck_ffs/dir.c
@@ -62,7 +62,7 @@ static struct	dirtemplate dirhead = {
 
 static int chgino(struct inodesc *);
 static int dircheck(struct inodesc *, struct bufarea *, struct direct *);
-static int expanddir(union dinode *dp, char *name);
+static int expanddir(struct inode *ip, char *name);
 static void freedir(ino_t ino, ino_t parent);
 static struct direct *fsck_readdir(struct inodesc *);
 static struct bufarea *getdirblk(ufs2_daddr_t blkno, long size);
@@ -126,6 +126,8 @@ dirscan(struct inodesc *idesc)
 		idesc->id_dirp = (struct direct *)dbuf;
 		if ((n = (*idesc->id_func)(idesc)) & ALTERED) {
 			bp = getdirblk(idesc->id_blkno, blksiz);
+			if (bp->b_errs != 0)
+				return (STOP);
 			memmove(bp->b_un.b_buf + idesc->id_loc - dsize, dbuf,
 			    (size_t)dsize);
 			dirty(bp);
@@ -155,6 +157,8 @@ fsck_readdir(struct inodesc *idesc)
 	if (idesc->id_filesize <= 0 || idesc->id_loc >= blksiz)
 		return (NULL);
 	bp = getdirblk(idesc->id_blkno, blksiz);
+	if (bp->b_errs != 0)
+		return (NULL);
 	dp = (struct direct *)(bp->b_un.b_buf + idesc->id_loc);
 	/*
 	 * Only need to check current entry if it is the first in the
@@ -330,6 +334,7 @@ direrror(ino_t ino, const char *errmesg)
 void
 fileerror(ino_t cwd, ino_t ino, const char *errmesg)
 {
+	struct inode ip;
 	union dinode *dp;
 	char pathbuf[MAXPATHLEN + 1];
 
@@ -338,8 +343,9 @@ fileerror(ino_t cwd, ino_t ino, const char *errmesg)
 		pfatal("out-of-range inode number %ju", (uintmax_t)ino);
 		return;
 	}
-	dp = ginode(ino);
-	prtinode(ino, dp);
+	ginode(ino, &ip);
+	dp = ip.i_dp;
+	prtinode(&ip);
 	printf("\n");
 	getpathname(pathbuf, cwd, ino);
 	if (ftypeok(dp))
@@ -348,15 +354,18 @@ fileerror(ino_t cwd, ino_t ino, const char *errmesg)
 		    pathbuf);
 	else
 		pfatal("NAME=%s\n", pathbuf);
+	irelse(&ip);
 }
 
 void
 adjust(struct inodesc *idesc, int lcnt)
 {
+	struct inode ip;
 	union dinode *dp;
 	int saveresolved;
 
-	dp = ginode(idesc->id_number);
+	ginode(idesc->id_number, &ip);
+	dp = ip.i_dp;
 	if (DIP(dp, di_nlink) == lcnt) {
 		/*
 		 * If we have not hit any unresolved problems, are running
@@ -365,6 +374,7 @@ adjust(struct inodesc *idesc, int lcnt)
 		 */
 		if (resolved && (preen || bkgrdflag) && usedsoftdep) {
 			clri(idesc, "UNREF", 1);
+			irelse(&ip);
 			return;
 		} else {
 			/*
@@ -377,19 +387,19 @@ adjust(struct inodesc *idesc, int lcnt)
 			if (linkup(idesc->id_number, (ino_t)0, NULL) == 0) {
 				resolved = saveresolved;
 				clri(idesc, "UNREF", 0);
+				irelse(&ip);
 				return;
 			}
 			/*
 			 * Account for the new reference created by linkup().
 			 */
-			dp = ginode(idesc->id_number);
 			lcnt--;
 		}
 	}
 	if (lcnt != 0) {
 		pwarn("LINK COUNT %s", (lfdir == idesc->id_number) ? lfname :
 			((DIP(dp, di_mode) & IFMT) == IFDIR ? "DIR" : "FILE"));
-		prtinode(idesc->id_number, dp);
+		prtinode(&ip);
 		printf(" COUNT %d SHOULD BE %d",
 			DIP(dp, di_nlink), DIP(dp, di_nlink) - lcnt);
 		if (preen || usedsoftdep) {
@@ -403,7 +413,7 @@ adjust(struct inodesc *idesc, int lcnt)
 		if (preen || reply("ADJUST") == 1) {
 			if (bkgrdflag == 0) {
 				DIP_SET(dp, di_nlink, DIP(dp, di_nlink) - lcnt);
-				inodirty(dp);
+				inodirty(&ip);
 			} else {
 				cmd.value = idesc->id_number;
 				cmd.size = -lcnt;
@@ -417,6 +427,7 @@ adjust(struct inodesc *idesc, int lcnt)
 			}
 		}
 	}
+	irelse(&ip);
 }
 
 static int
@@ -460,6 +471,7 @@ chgino(struct inodesc *idesc)
 int
 linkup(ino_t orphan, ino_t parentdir, char *name)
 {
+	struct inode ip;
 	union dinode *dp;
 	int lostdir;
 	ino_t oldlfdir;
@@ -467,29 +479,32 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
 	char tempname[BUFSIZ];
 
 	memset(&idesc, 0, sizeof(struct inodesc));
-	dp = ginode(orphan);
+	ginode(orphan, &ip);
+	dp = ip.i_dp;
 	lostdir = (DIP(dp, di_mode) & IFMT) == IFDIR;
 	pwarn("UNREF %s ", lostdir ? "DIR" : "FILE");
-	prtinode(orphan, dp);
+	prtinode(&ip);
 	printf("\n");
-	if (preen && DIP(dp, di_size) == 0)
+	if (preen && DIP(dp, di_size) == 0) {
+		irelse(&ip);
 		return (0);
+	}
+	irelse(&ip);
 	if (cursnapshot != 0) {
 		pfatal("FILE LINKUP IN SNAPSHOT");
 		return (0);
 	}
 	if (preen)
 		printf(" (RECONNECTED)\n");
-	else
-		if (reply("RECONNECT") == 0)
-			return (0);
+	else if (reply("RECONNECT") == 0)
+		return (0);
 	if (lfdir == 0) {
-		dp = ginode(UFS_ROOTINO);
+		ginode(UFS_ROOTINO, &ip);
 		idesc.id_name = strdup(lfname);
 		idesc.id_type = DATA;
 		idesc.id_func = findino;
 		idesc.id_number = UFS_ROOTINO;
-		if ((ckinode(dp, &idesc) & FOUND) != 0) {
+		if ((ckinode(ip.i_dp, &idesc) & FOUND) != 0) {
 			lfdir = idesc.id_parent;
 		} else {
 			pwarn("NO lost+found DIRECTORY");
@@ -510,42 +525,52 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
 				}
 			}
 		}
+		irelse(&ip);
 		if (lfdir == 0) {
 			pfatal("SORRY. CANNOT CREATE lost+found DIRECTORY");
 			printf("\n\n");
 			return (0);
 		}
 	}
-	dp = ginode(lfdir);
+	ginode(lfdir, &ip);
+	dp = ip.i_dp;
 	if ((DIP(dp, di_mode) & IFMT) != IFDIR) {
 		pfatal("lost+found IS NOT A DIRECTORY");
-		if (reply("REALLOCATE") == 0)
+		if (reply("REALLOCATE") == 0) {
+			irelse(&ip);
 			return (0);
+		}
 		oldlfdir = lfdir;
 		if ((lfdir = allocdir(UFS_ROOTINO, (ino_t)0, lfmode)) == 0) {
 			pfatal("SORRY. CANNOT CREATE lost+found DIRECTORY\n\n");
+			irelse(&ip);
 			return (0);
 		}
 		if ((changeino(UFS_ROOTINO, lfname, lfdir) & ALTERED) == 0) {
 			pfatal("SORRY. CANNOT CREATE lost+found DIRECTORY\n\n");
+			irelse(&ip);
 			return (0);
 		}
-		inodirty(dp);
-		idesc.id_type = ADDR;
+		idesc.id_type = inoinfo(oldlfdir)->ino_idtype;
 		idesc.id_func = freeblock;
 		idesc.id_number = oldlfdir;
 		adjust(&idesc, inoinfo(oldlfdir)->ino_linkcnt + 1);
 		inoinfo(oldlfdir)->ino_linkcnt = 0;
-		dp = ginode(lfdir);
+		inodirty(&ip);
+		irelse(&ip);
+		ginode(lfdir, &ip);
+		dp = ip.i_dp;
 	}
 	if (inoinfo(lfdir)->ino_state != DFOUND) {
 		pfatal("SORRY. NO lost+found DIRECTORY\n\n");
+		irelse(&ip);
 		return (0);
 	}
 	(void)lftempname(tempname, orphan);
 	if (makeentry(lfdir, orphan, (name ? name : tempname)) == 0) {
 		pfatal("SORRY. NO SPACE IN lost+found DIRECTORY");
 		printf("\n\n");
+		irelse(&ip);
 		return (0);
 	}
 	inoinfo(orphan)->ino_linkcnt--;
@@ -553,9 +578,8 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
 		if ((changeino(orphan, "..", lfdir) & ALTERED) == 0 &&
 		    parentdir != (ino_t)-1)
 			(void)makeentry(orphan, lfdir, "..");
-		dp = ginode(lfdir);
 		DIP_SET(dp, di_nlink, DIP(dp, di_nlink) + 1);
-		inodirty(dp);
+		inodirty(&ip);
 		inoinfo(lfdir)->ino_linkcnt++;
 		pwarn("DIR I=%lu CONNECTED. ", (u_long)orphan);
 		if (parentdir != (ino_t)-1) {
@@ -572,6 +596,7 @@ linkup(ino_t orphan, ino_t parentdir, char *name)
 		if (preen == 0)
 			printf("\n");
 	}
+	irelse(&ip);
 	return (1);
 }
 
@@ -582,6 +607,8 @@ int
 changeino(ino_t dir, const char *name, ino_t newnum)
 {
 	struct inodesc idesc;
+	struct inode ip;
+	int error;
 
 	memset(&idesc, 0, sizeof(struct inodesc));
 	idesc.id_type = DATA;
@@ -590,7 +617,10 @@ changeino(ino_t dir, const char *name, ino_t newnum)
 	idesc.id_fix = DONTKNOW;
 	idesc.id_name = strdup(name);
 	idesc.id_parent = newnum;	/* new value for name */
-	return (ckinode(ginode(dir), &idesc));
+	ginode(dir, &ip);
+	error = ckinode(ip.i_dp, &idesc);
+	irelse(&ip);
+	return (error);
 }
 
 /*
@@ -599,8 +629,10 @@ changeino(ino_t dir, const char *name, ino_t newnum)
 int
 makeentry(ino_t parent, ino_t ino, const char *name)
 {
+	struct inode ip;
 	union dinode *dp;
 	struct inodesc idesc;
+	int retval;
 	char pathbuf[MAXPATHLEN + 1];
 
 	if (parent < UFS_ROOTINO || parent >= maxino ||
@@ -613,30 +645,37 @@ makeentry(ino_t parent, ino_t ino, const char *name)
 	idesc.id_parent = ino;	/* this is the inode to enter */
 	idesc.id_fix = DONTKNOW;
 	idesc.id_name = strdup(name);
-	dp = ginode(parent);
+	ginode(parent, &ip);
+	dp = ip.i_dp;
 	if (DIP(dp, di_size) % DIRBLKSIZ) {
 		DIP_SET(dp, di_size, roundup(DIP(dp, di_size), DIRBLKSIZ));
-		inodirty(dp);
+		inodirty(&ip);
 	}
-	if ((ckinode(dp, &idesc) & ALTERED) != 0)
+	if ((ckinode(dp, &idesc) & ALTERED) != 0) {
+		irelse(&ip);
 		return (1);
+	}
 	getpathname(pathbuf, parent, parent);
-	dp = ginode(parent);
-	if (expanddir(dp, pathbuf) == 0)
+	if (expanddir(&ip, pathbuf) == 0) {
+		irelse(&ip);
 		return (0);
-	return (ckinode(dp, &idesc) & ALTERED);
+	}
+	retval = ckinode(dp, &idesc) & ALTERED;
+	irelse(&ip);
+	return (retval);
 }
 
 /*
  * Attempt to expand the size of a directory
  */
 static int
-expanddir(union dinode *dp, char *name)
+expanddir(struct inode *ip, char *name)
 {
 	ufs2_daddr_t lastlbn, oldblk, newblk, indirblk;
 	size_t filesize, lastlbnsize;
 	struct bufarea *bp, *nbp;
 	struct inodesc idesc;
+	union dinode *dp;
 	int indiralloced;
 	char *cp;
 
@@ -645,6 +684,7 @@ expanddir(union dinode *dp, char *name)
 	pwarn("NO SPACE LEFT IN %s", name);
 	if (!preen && reply("EXPAND") == 0)
 		return (0);
+	dp = ip->i_dp;
 	filesize = DIP(dp, di_size);
 	lastlbn = lblkno(&sblock, filesize);
 	/*
@@ -671,7 +711,7 @@ expanddir(union dinode *dp, char *name)
 		DIP_SET(dp, di_size, filesize + sblock.fs_bsize - lastlbnsize);
 		DIP_SET(dp, di_blocks, DIP(dp, di_blocks) +
 		    btodb(sblock.fs_bsize - lastlbnsize));
-		inodirty(dp);
+		inodirty(ip);
 		memmove(nbp->b_un.b_buf, bp->b_un.b_buf, lastlbnsize);
 		memset(&nbp->b_un.b_buf[lastlbnsize], 0,
 		    sblock.fs_bsize - lastlbnsize);
@@ -680,10 +720,12 @@ expanddir(union dinode *dp, char *name)
 		     cp += DIRBLKSIZ)
 			memmove(cp, &emptydir, sizeof emptydir);
 		dirty(nbp);
-		nbp->b_flags &= ~B_INUSE;
+		brelse(nbp);
 		idesc.id_blkno = oldblk;
 		idesc.id_numfrags = numfrags(&sblock, lastlbnsize);
 		(void)freeblock(&idesc);
+		if (preen)
+			printf(" (EXPANDED)\n");
 		return (1);
 	}
 	if ((newblk = allocblk(sblock.fs_frag)) == 0)
@@ -719,18 +761,18 @@ expanddir(union dinode *dp, char *name)
 		}
 		IBLK_SET(nbp, lastlbn - UFS_NDADDR, newblk);
 		dirty(nbp);
-		nbp->b_flags &= ~B_INUSE;
+		brelse(nbp);
 	}
 	DIP_SET(dp, di_size, filesize + sblock.fs_bsize);
 	DIP_SET(dp, di_blocks, DIP(dp, di_blocks) + btodb(sblock.fs_bsize));
-	inodirty(dp);
+	inodirty(ip);
 	if (preen)
 		printf(" (EXPANDED)\n");
 	return (1);
 bad:
 	pfatal(" (EXPANSION FAILED)\n");
 	if (nbp != NULL)
-		nbp->b_flags &= ~B_INUSE;
+		brelse(nbp);
 	if (newblk != 0) {
 		idesc.id_blkno = newblk;
 		idesc.id_numfrags = sblock.fs_frag;
@@ -752,6 +794,7 @@ allocdir(ino_t parent, ino_t request, int mode)
 {
 	ino_t ino;
 	char *cp;
+	struct inode ip;
 	union dinode *dp;
 	struct bufarea *bp;
 	struct inoinfo *inp;
@@ -761,10 +804,12 @@ allocdir(ino_t parent, ino_t request, int mode)
 	dirp = &dirhead;
 	dirp->dot_ino = ino;
 	dirp->dotdot_ino = parent;
-	dp = ginode(ino);
+	ginode(ino, &ip);
+	dp = ip.i_dp;
 	bp = getdirblk(DIP(dp, di_db[0]), sblock.fs_fsize);
 	if (bp->b_errs) {
 		freeino(ino);
+		irelse(&ip);
 		return (0);
 	}
 	memmove(bp->b_un.b_buf, dirp, sizeof(struct dirtemplate));
@@ -774,14 +819,16 @@ allocdir(ino_t parent, ino_t request, int mode)
 		memmove(cp, &emptydir, sizeof emptydir);
 	dirty(bp);
 	DIP_SET(dp, di_nlink, 2);
-	inodirty(dp);
+	inodirty(&ip);
 	if (ino == UFS_ROOTINO) {
 		inoinfo(ino)->ino_linkcnt = DIP(dp, di_nlink);
 		cacheino(dp, ino);
+		irelse(&ip);
 		return(ino);
 	}
 	if (!INO_IS_DVALID(parent)) {
 		freeino(ino);
+		irelse(&ip);
 		return (0);
 	}
 	cacheino(dp, ino);
@@ -793,9 +840,12 @@ allocdir(ino_t parent, ino_t request, int mode)
 		inoinfo(ino)->ino_linkcnt = DIP(dp, di_nlink);
 		inoinfo(parent)->ino_linkcnt++;
 	}
-	dp = ginode(parent);
+	irelse(&ip);
+	ginode(parent, &ip);
+	dp = ip.i_dp;
 	DIP_SET(dp, di_nlink, DIP(dp, di_nlink) + 1);
-	inodirty(dp);
+	inodirty(&ip);
+	irelse(&ip);
 	return (ino);
 }
 
@@ -805,12 +855,15 @@ allocdir(ino_t parent, ino_t request, int mode)
 static void
 freedir(ino_t ino, ino_t parent)
 {
+	struct inode ip;
 	union dinode *dp;
 
 	if (ino != parent) {
-		dp = ginode(parent);
+		ginode(parent, &ip);
+		dp = ip.i_dp;
 		DIP_SET(dp, di_nlink, DIP(dp, di_nlink) - 1);
-		inodirty(dp);
+		inodirty(&ip);
+		irelse(&ip);
 	}
 	freeino(ino);
 }
@@ -847,8 +900,8 @@ static struct bufarea *
 getdirblk(ufs2_daddr_t blkno, long size)
 {
 
-	if (pdirbp != NULL)
-		pdirbp->b_flags &= ~B_INUSE;
+	if (pdirbp != NULL && pdirbp->b_errs == 0)
+		brelse(pdirbp);
 	pdirbp = getdatablk(blkno, size, BT_DIRDATA);
 	return (pdirbp);
 }
diff --git a/sbin/fsck_ffs/ea.c b/sbin/fsck_ffs/ea.c
index 29e5f46d7651..7cf20196dfae 100644
--- a/sbin/fsck_ffs/ea.c
+++ b/sbin/fsck_ffs/ea.c
@@ -82,6 +82,7 @@ eascan(struct inodesc *idesc, struct ufs2_dinode *dp)
 		if ((n & 31) == 31)
 			printf("\n");
 	}
+	brelse(bp);
 	return (STOP);
 #endif
 }
diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h
index a471d1979438..676350b75767 100644
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@@ -73,8 +73,7 @@
 
 #define	MAXDUP		10	/* limit on dup blks (per inode) */
 #define	MAXBAD		10	/* limit on bad blks (per inode) */
-#define	MINBUFS		10	/* minimum number of buffers required */
-#define	MAXBUFS		40	/* maximum space to allocate to buffers */
+#define	MINBUFS		100	/* minimum number of buffers required */
 #define	INOBUFSIZE	64*1024	/* size of buffer to read inodes in pass1 */
 #define	ZEROBUFSIZE	(dev_bsize * 128) /* size of zero buffer used by -Z */
 
@@ -101,9 +100,10 @@ union dinode {
  * have its link count adjusted by the value remaining in ino_linkcnt.
  */
 struct inostat {
-	char	ino_state;	/* state of inode, see below */
-	char	ino_type;	/* type of inode */
-	short	ino_linkcnt;	/* number of links not found */
+	u_char	ino_state;	/* state of inode, see below */
+	u_char	ino_type:4;	/* type of inode */
+	u_char	ino_idtype:4;	/* idesc id_type, SNAP or ADDR */
+	u_short	ino_linkcnt;	/* number of links not found */
 };
 /*
  * Inode states.
@@ -132,16 +132,35 @@ extern struct inostatlist {
 	struct inostat *il_stat;/* inostat info for this cylinder group */
 } *inostathead;
 
+/*
+ * Structure to reference a dinode.
+ */
+struct inode {
+	struct bufarea *i_bp;	/* buffer containing the dinode */
+	union dinode *i_dp;	/* pointer to dinode in buffer */
+	ino_t i_number;		/* inode number */
+};
+
+/*
+ * Size of hash tables
+ */
+#define	HASHSIZE	2048
+#define	HASH(x)		((x * 2654435761) & (HASHSIZE - 1))
+
 /*
  * buffer cache structure.
  */
 struct bufarea {
-	TAILQ_ENTRY(bufarea) b_list;		/* buffer list */
+	TAILQ_ENTRY(bufarea) b_list;		/* LRU buffer queue */
+	LIST_ENTRY(bufarea) b_hash;		/* hash list */
 	ufs2_daddr_t b_bno;			/* disk block number */
 	int b_size;				/* size of I/O */
 	int b_errs;				/* I/O error */
 	int b_flags;				/* B_ flags below */
 	int b_type;				/* BT_ type below */
+	int b_refcnt;				/* ref count of users */
+	int b_index;				/* for BT_LEVEL, ptr index */
+						/* for BT_INODES, first inum */
 	union {
 		char *b_buf;			/* buffer space */
 		ufs1_daddr_t *b_indir1;		/* UFS1 indirect block */
@@ -151,7 +170,6 @@ struct bufarea {
 		struct ufs1_dinode *b_dinode1;	/* UFS1 inode block */
 		struct ufs2_dinode *b_dinode2;	/* UFS2 inode block */
 	} b_un;
-	char b_dirty;
 };
 
 #define	IBLK(bp, i) \
@@ -168,7 +186,7 @@ struct bufarea {
 /*
  * Buffer flags
  */
-#define	B_INUSE 	0x00000001	/* Buffer is in use */
+#define	B_DIRTY 	0x00000001	/* Buffer is dirty */
 /*
  * Type of data in buffer
  */
@@ -182,7 +200,8 @@ struct bufarea {
 #define	BT_INODES 	 7	/* Buffer holds inodes */
 #define	BT_DIRDATA 	 8	/* Buffer holds directory data */
 #define	BT_DATA	 	 9	/* Buffer holds user data */
-#define BT_NUMBUFTYPES	10
+#define	BT_EMPTY 	10	/* Buffer allocated but not filled */
+#define BT_NUMBUFTYPES	11
 #define BT_NAMES {			\
 	"unknown",			\
 	"Superblock",			\
@@ -193,27 +212,33 @@ struct bufarea {
 	"External Attribute",		\
 	"Inode Block",			\
 	"Directory Contents",		\
-	"User Data" }
+	"User Data",			\
+	"Allocated but not filled" }
+extern char *buftype[];
+#define BT_BUFTYPE(type) \
+	type < BT_NUMBUFTYPES ? buftype[type] : buftype[BT_UNKNOWN]
 extern long readcnt[BT_NUMBUFTYPES];
 extern long totalreadcnt[BT_NUMBUFTYPES];
 extern struct timespec readtime[BT_NUMBUFTYPES];
 extern struct timespec totalreadtime[BT_NUMBUFTYPES];
 extern struct timespec startprog;
 
+extern struct bufarea *icachebp;	/* inode cache buffer */
 extern struct bufarea sblk;		/* file system superblock */
 extern struct bufarea *pdirbp;		/* current directory contents */
-extern struct bufarea *pbp;		/* current inode block */
+extern int sujrecovery;			/* 1 => doing check using the journal */
 
 #define	dirty(bp) do { \
 	if (fswritefd < 0) \
 		pfatal("SETTING DIRTY FLAG IN READ_ONLY MODE\n"); \
 	else \
-		(bp)->b_dirty = 1; \
+		(bp)->b_flags |= B_DIRTY; \
 } while (0)
 #define	initbarea(bp, type) do { \
-	(bp)->b_dirty = 0; \
 	(bp)->b_bno = (ufs2_daddr_t)-1; \
 	(bp)->b_flags = 0; \
+	(bp)->b_refcnt = 0; \
+	(bp)->b_index = 0; \
 	(bp)->b_type = type; \
 } while (0)
 
@@ -227,6 +252,8 @@ struct inodesc {
 	enum fixstate id_fix;	/* policy on fixing errors */
 	int (*id_func)(struct inodesc *);
 				/* function to be applied to blocks of inode */
+	struct bufarea *id_bp;	/* ckinode: buffer with indirect pointers */
+	union dinode *id_dp;	/* ckinode: dinode being traversed */
 	ino_t id_number;	/* inode number described */
 	ino_t id_parent;	/* for DATA nodes, their parent */
 	ufs_lbn_t id_lbn;	/* logical block number of current block */
@@ -239,7 +266,7 @@ struct inodesc {
 	int id_loc;		/* for DATA nodes, current location in dir */
 	struct direct *id_dirp;	/* for DATA nodes, ptr to current entry */
 	char *id_name;		/* for DATA nodes, name to find or enter */
-	char id_type;		/* type of descriptor, DATA or ADDR */
+	char id_type;		/* type of descriptor, DATA, ADDR, or SNAP */
 };
 /* file types */
 #define	DATA	1	/* a directory */
@@ -332,7 +359,6 @@ extern char	skipclean;		/* skip clean file systems if preening */
 extern int	fsmodified;		/* 1 => write done to file system */
 extern int	fsreadfd;		/* file descriptor for reading file system */
 extern int	fswritefd;		/* file descriptor for writing file system */
-extern struct	uufsd disk;		/* libufs user-ufs disk structure */
 extern int	surrender;		/* Give up if reads fail */
 extern int	wantrestart;		/* Restart fsck on early termination */
 
@@ -352,12 +378,11 @@ extern volatile sig_atomic_t	got_sigalarm;	/* received a SIGALRM */
 
 #define	clearinode(dp) \
 	if (sblock.fs_magic == FS_UFS1_MAGIC) { \
-		(dp)->dp1 = ufs1_zino; \
+		(dp)->dp1 = zino.dp1; \
 	} else { \
-		(dp)->dp2 = ufs2_zino; \
+		(dp)->dp2 = zino.dp2; \
 	}
-extern struct	ufs1_dinode ufs1_zino;
-extern struct	ufs2_dinode ufs2_zino;
+extern union dinode zino;
 
 #define	setbmap(blkno)	setbit(blockmap, blkno)
 #define	testbmap(blkno)	isset(blockmap, blkno)
@@ -408,6 +433,7 @@ struct fstab;
 
 
 void		adjust(struct inodesc *, int lcnt);
+void		alarmhandler(int sig);
 ufs2_daddr_t	allocblk(long frags);
 ino_t		allocdir(ino_t parent, ino_t request, int mode);
 ino_t		allocino(ino_t request, int type);
@@ -418,12 +444,14 @@ void		bufinit(void);
 void		blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size);
 void		blerase(int fd, ufs2_daddr_t blk, long size);
 void		blzero(int fd, ufs2_daddr_t blk, long size);
+void		brelse(struct bufarea *);
 void		cacheino(union dinode *dp, ino_t inumber);
 void		catch(int);
 void		catchquit(int);
 void		cgdirty(struct bufarea *);
+struct bufarea *cglookup(int cg);
 int		changeino(ino_t dir, const char *name, ino_t newnum);
-int		check_cgmagic(int cg, struct bufarea *cgbp);
+int		check_cgmagic(int cg, struct bufarea *cgbp, int requestrebuild);
 int		chkrange(ufs2_daddr_t blk, int cnt);
 void		ckfini(int markclean);
 int		ckinode(union dinode *dp, struct inodesc *);
@@ -444,16 +472,17 @@ void		freeinodebuf(void);
 void		fsutilinit(void);
 int		ftypeok(union dinode *dp);
 void		getblk(struct bufarea *bp, ufs2_daddr_t blk, long size);
-struct bufarea *cglookup(int cg);
 struct bufarea *getdatablk(ufs2_daddr_t blkno, long size, int type);
 struct inoinfo *getinoinfo(ino_t inumber);
 union dinode   *getnextinode(ino_t inumber, int rebuildcg);
 void		getpathname(char *namebuf, ino_t curdir, ino_t ino);
-union dinode   *ginode(ino_t inumber);
+void		ginode(ino_t, struct inode *);
 void		infohandler(int sig);
-void		alarmhandler(int sig);
+void		irelse(struct inode *);
+ufs2_daddr_t	ino_blkatoff(union dinode *, ino_t, ufs_lbn_t, int *,
+		    struct bufarea **);
 void		inocleanup(void);
-void		inodirty(union dinode *);
+void		inodirty(struct inode *);
 struct inostat *inoinfo(ino_t inum);
 void		IOstats(char *what);
 int		linkup(ino_t orphan, ino_t parentdir, char *name);
@@ -468,13 +497,13 @@ void		pass4(void);
 void		pass5(void);
 void		pfatal(const char *fmt, ...) __printflike(1, 2);
 void		propagate(void);
-void		prtinode(ino_t ino, union dinode *dp);
+void		prtinode(struct inode *);
 void		pwarn(const char *fmt, ...) __printflike(1, 2);
 int		readsb(int listerr);
 int		reply(const char *question);
 void		rwerror(const char *mesg, ufs2_daddr_t blk);
 void		sblock_init(void);
-void		setinodebuf(ino_t);
+void		setinodebuf(int, ino_t);
 int		setup(char *dev);
 void		gjournal_check(const char *filesys);
 int		suj_check(const char *filesys);
diff --git a/sbin/fsck_ffs/fsutil.c b/sbin/fsck_ffs/fsutil.c
index 11d2ebd598fd..64c4701d9b7f 100644
--- a/sbin/fsck_ffs/fsutil.c
+++ b/sbin/fsck_ffs/fsutil.c
@@ -64,9 +64,14 @@ __FBSDID("$FreeBSD$");
 
 #include "fsck.h"
 
+int		sujrecovery = 0;
+
+static struct bufarea *allocbuf(const char *);
+static void cg_write(struct bufarea *);
 static void slowio_start(void);
 static void slowio_end(void);
 static void printIOstats(void);
+static void prtbuf(const char *, struct bufarea *);
 
 static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */
 static struct timespec startpass, finishpass;
@@ -74,12 +79,16 @@ struct timeval slowio_starttime;
 int slowio_delay_usec = 10000;	/* Initial IO delay for background fsck */
 int slowio_pollcnt;
 static struct bufarea cgblk;	/* backup buffer for cylinder group blocks */
-static TAILQ_HEAD(buflist, bufarea) bufhead;	/* head of buffer cache list */
+static TAILQ_HEAD(bufqueue, bufarea) bufqueuehd; /* head of buffer cache LRU */
+static LIST_HEAD(bufhash, bufarea) bufhashhd[HASHSIZE]; /* buffer hash list */
 static int numbufs;				/* size of buffer cache */
-static char *buftype[BT_NUMBUFTYPES] = BT_NAMES;
+static int cachelookups;			/* number of cache lookups */
+static int cachereads;				/* number of cache reads */
 static struct bufarea *cgbufs;	/* header for cylinder group cache */
 static int flushtries;		/* number of tries to reclaim memory */
 
+char *buftype[BT_NUMBUFTYPES] = BT_NAMES;
+
 void
 fsutilinit(void)
 {
@@ -89,11 +98,6 @@ fsutilinit(void)
 	bzero(&slowio_starttime, sizeof(struct timeval));
 	slowio_delay_usec = 10000;
 	slowio_pollcnt = 0;
-	bzero(&cgblk, sizeof(struct bufarea));
-	TAILQ_INIT(&bufhead);
-	numbufs = 0;
-	/* buftype ? */
-	cgbufs = NULL;
 	flushtries = 0;
 }
 
@@ -181,33 +185,19 @@ inoinfo(ino_t inum)
 void
 bufinit(void)
 {
-	struct bufarea *bp;
-	long bufcnt, i;
-	char *bufp;
+	int i;
 
-	pbp = pdirbp = (struct bufarea *)0;
-	bufp = Malloc((unsigned int)sblock.fs_bsize);
-	if (bufp == NULL)
-		errx(EEXIT, "cannot allocate buffer pool");
-	cgblk.b_un.b_buf = bufp;
+	pdirbp = (struct bufarea *)0;
+	bzero(&cgblk, sizeof(struct bufarea));
+	cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize);
+	if (cgblk.b_un.b_buf == NULL)
+		errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize);
 	initbarea(&cgblk, BT_CYLGRP);
-	TAILQ_INIT(&bufhead);
-	bufcnt = MAXBUFS;
-	if (bufcnt < MINBUFS)
-		bufcnt = MINBUFS;
-	for (i = 0; i < bufcnt; i++) {
-		bp = (struct bufarea *)Malloc(sizeof(struct bufarea));
-		bufp = Malloc((unsigned int)sblock.fs_bsize);
-		if (bp == NULL || bufp == NULL) {
-			if (i >= MINBUFS)
-				break;
-			errx(EEXIT, "cannot allocate buffer pool");
-		}
-		bp->b_un.b_buf = bufp;
-		TAILQ_INSERT_HEAD(&bufhead, bp, b_list);
-		initbarea(bp, BT_UNKNOWN);
-	}
-	numbufs = i;	/* save number of buffers */
+	cgbufs = NULL;
+	numbufs = cachelookups = cachereads = 0;
+	TAILQ_INIT(&bufqueuehd);
+	for (i = 0; i < HASHSIZE; i++)
+		LIST_INIT(&bufhashhd[i]);
 	for (i = 0; i < BT_NUMBUFTYPES; i++) {
 		readtime[i].tv_sec = totalreadtime[i].tv_sec = 0;
 		readtime[i].tv_nsec = totalreadtime[i].tv_nsec = 0;
@@ -215,6 +205,25 @@ bufinit(void)
 	}
 }
 
+static struct bufarea *
+allocbuf(const char *failreason)
+{
+	struct bufarea *bp;
+	char *bufp;
+
+	bp = (struct bufarea *)Malloc(sizeof(struct bufarea));
+	bufp = Malloc((unsigned int)sblock.fs_bsize);
+	if (bp == NULL || bufp == NULL) {
+		errx(EEXIT, "%s", failreason);
+		/* NOTREACHED */
+	}
+	numbufs++;
+	bp->b_un.b_buf = bufp;
+	TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
+	initbarea(bp, BT_UNKNOWN);
+	return (bp);
+}
+
 /*
  * Manage cylinder group buffers.
  *
@@ -230,18 +239,22 @@ cglookup(int cg)
 	struct bufarea *cgbp;
 	struct cg *cgp;
 
+	if ((unsigned) cg >= sblock.fs_ncg)
+		errx(EEXIT, "cglookup: out of range cylinder group %d", cg);
 	if (cgbufs == NULL) {
 		cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea));
 		if (cgbufs == NULL)
-			errx(EEXIT, "cannot allocate cylinder group buffers");
+			errx(EEXIT, "Cannot allocate cylinder group buffers");
 	}
 	cgbp = &cgbufs[cg];
 	if (cgbp->b_un.b_cg != NULL)
 		return (cgbp);
 	cgp = NULL;
 	if (flushtries == 0)
-		cgp = malloc((unsigned int)sblock.fs_cgsize);
+		cgp = Malloc((unsigned int)sblock.fs_cgsize);
 	if (cgp == NULL) {
+		if (sujrecovery)
+			errx(EEXIT,"Ran out of memory during journal recovery");
 		getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
 		return (&cgblk);
 	}
@@ -278,7 +291,7 @@ flushentry(void)
 {
 	struct bufarea *cgbp;
 
-	if (flushtries == sblock.fs_ncg || cgbufs == NULL)
+	if (sujrecovery || flushtries == sblock.fs_ncg || cgbufs == NULL)
 		return (0);
 	cgbp = &cgbufs[flushtries++];
 	if (cgbp->b_un.b_cg == NULL)
@@ -296,25 +309,84 @@ struct bufarea *
 getdatablk(ufs2_daddr_t blkno, long size, int type)
 {
 	struct bufarea *bp;
+	struct bufhash *bhdp;
 
-	TAILQ_FOREACH(bp, &bufhead, b_list)
-		if (bp->b_bno == fsbtodb(&sblock, blkno))
+	cachelookups++;
+	/* If out of range, return empty buffer with b_errs == -1 */
+	if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize)) {
+		blkno = -1;
+		type = BT_EMPTY;
+	}
+	bhdp = &bufhashhd[HASH(blkno)];
+	LIST_FOREACH(bp, bhdp, b_hash)
+		if (bp->b_bno == fsbtodb(&sblock, blkno)) {
+			if (debug && bp->b_size != size) {
+				prtbuf("getdatablk: size mismatch", bp);
+				pfatal("getdatablk: b_size %d != size %ld\n",
+				    bp->b_size, size);
+			}
 			goto foundit;
-	TAILQ_FOREACH_REVERSE(bp, &bufhead, buflist, b_list)
-		if ((bp->b_flags & B_INUSE) == 0)
-			break;
-	if (bp == NULL)
-		errx(EEXIT, "deadlocked buffer pool");
+		}
+	/*
+	 * Move a long-term busy buffer back to the front of the LRU so we
+	 * do not endlessly inspect it for recycling.
+	 */
+	bp = TAILQ_LAST(&bufqueuehd, bufqueue);
+	if (bp != NULL && bp->b_refcnt != 0) {
+		TAILQ_REMOVE(&bufqueuehd, bp, b_list);
+		TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
+	}
+	/*
+	 * Allocate up to the minimum number of buffers before
+	 * considering recycling any of them.
+	 */
+	if (size > sblock.fs_bsize)
+		errx(EEXIT, "Excessive buffer size %ld > %d\n", size,
+		    sblock.fs_bsize);
+	if (numbufs < MINBUFS) {
+		bp = allocbuf("cannot create minimal buffer pool");
+	} else if (sujrecovery) {
+		/*
+		 * SUJ recovery does not want anything written until it 
+		 * has successfully completed (so it can fail back to
+		 * full fsck). Thus, we can only recycle clean buffers.
+		 */
+		TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list)
+			if ((bp->b_flags & B_DIRTY) == 0 && bp->b_refcnt == 0)
+				break;
+		if (bp == NULL)
+			bp = allocbuf("Ran out of memory during "
+			    "journal recovery");
+		else
+			LIST_REMOVE(bp, b_hash);
+	} else {
+		/*
+		 * Recycle oldest non-busy buffer.
+		 */
+		TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list)
+			if (bp->b_refcnt == 0)
+				break;
*** 4026 LINES SKIPPED ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202101072258.107MwiZb028474>