Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 25 Jan 2010 20:42:28 -1000 (HST)
From:      Jeff Roberson <jroberson@jroberson.net>
To:        Jeff Roberson <jeff@FreeBSD.org>
Cc:        svn-src-projects@freebsd.org, src-committers@freebsd.org
Subject:   Re: svn commit: r203012 - in projects/suj/head: lib/libufs sbin/fsck_ffs sbin/mount sbin/tunefs sys/sys sys/ufs/ffs sys/ufs/ufs
Message-ID:  <alpine.BSF.2.00.1001252041490.1027@desktop>
In-Reply-To: <201001260636.o0Q6aAwh005669@svn.freebsd.org>
References:  <201001260636.o0Q6aAwh005669@svn.freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help
I forgot to mention: this change breaks backwards compatibility with earlier
SUJ releases.  I think this is the last fs.h revision I will make for some
time to come.

Jeff

On Tue, 26 Jan 2010, Jeff Roberson wrote:

> Author: jeff
> Date: Tue Jan 26 06:36:10 2010
> New Revision: 203012
> URL: http://svn.freebsd.org/changeset/base/203012
>
> Log:
>   - Move the softdep journal inode into the namespace at /.sujournal.  This
>     requires quite a lot of code as tunefs needs to be able to create
>     directory entries in ROOTINO.  However this is much cleaner from a
>     compat standpoint.  The inode is marked IMMUTABLE and only readable by
>     root.  Eventually the kernel will prevent clearing of the IMMUTABLE bit.
>   - Fix a nasty link count bug involving changedirectory_offset().  When
>     a link may exist at more than one location depending on when the
>     directory block was written we create duplicate addref records.  When
>     an add and a remove are detected at the same offset the remove is
>     discarded based on the assumption that it cancels the link in the add.
>     A legitimate remove may collide with one of these alternate offset adds
>     that are created by fsck and be discarded even though it removed a real
>     link.  To resolve this the lineage of the addref must be established
>     to determine whether the remove refers to an alternate address or not.
>     Any offset which is not up-to-date with respect to the offset in the
>     move record is considered alternate and will not discard a remove.
>   - Use clear_remove() when we begin to exhaust dependencies to prevent
>     excessive looping in request_cleanup().  This should probably
>     also be done in softdep_fsync().  Only workloads which delete
>     incredible numbers of files within the same directory would be
>     affected.  stress2 can generate over 100,000 outstanding removes on
>     my test machine.
>
> Modified:
>  projects/suj/head/lib/libufs/cgroup.c
>  projects/suj/head/lib/libufs/libufs.h
>  projects/suj/head/sbin/fsck_ffs/pass4.c
>  projects/suj/head/sbin/fsck_ffs/suj.c
>  projects/suj/head/sbin/mount/mount.c
>  projects/suj/head/sbin/tunefs/tunefs.c
>  projects/suj/head/sys/sys/mount.h
>  projects/suj/head/sys/ufs/ffs/ffs_alloc.c
>  projects/suj/head/sys/ufs/ffs/ffs_softdep.c
>  projects/suj/head/sys/ufs/ffs/ffs_vfsops.c
>  projects/suj/head/sys/ufs/ffs/fs.h
>  projects/suj/head/sys/ufs/ufs/inode.h
>
> Modified: projects/suj/head/lib/libufs/cgroup.c
> ==============================================================================
> --- projects/suj/head/lib/libufs/cgroup.c	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/lib/libufs/cgroup.c	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -71,6 +71,67 @@ gotit:
> 	return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, bno));
> }
>
> +int
> +cgbfree(struct uufsd *disk, ufs2_daddr_t bno, long size)
> +{
> +	u_int8_t *blksfree;
> +	struct fs *fs;
> +	struct cg *cgp;
> +	ufs1_daddr_t fragno, cgbno;
> +	int i, cg, blk, frags, bbase;
> +
> +	fs = &disk->d_fs;
> +	cg = dtog(fs, bno);
> +	if (cgread1(disk, cg) != 1)
> +		return (-1);
> +	cgp = &disk->d_cg;
> +	cgbno = dtogd(fs, bno);
> +	blksfree = cg_blksfree(cgp);
> +	if (size == fs->fs_bsize) {
> +		fragno = fragstoblks(fs, cgbno);
> +		ffs_setblock(fs, blksfree, fragno);
> +		ffs_clusteracct(fs, cgp, fragno, 1);
> +		cgp->cg_cs.cs_nbfree++;
> +		fs->fs_cstotal.cs_nbfree++;
> +		fs->fs_cs(fs, cg).cs_nbfree++;
> +	} else {
> +		bbase = cgbno - fragnum(fs, cgbno);
> +		/*
> +		 * decrement the counts associated with the old frags
> +		 */
> +		blk = blkmap(fs, blksfree, bbase);
> +		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
> +		/*
> +		 * deallocate the fragment
> +		 */
> +		frags = numfrags(fs, size);
> +		for (i = 0; i < frags; i++)
> +			setbit(blksfree, cgbno + i);
> +		cgp->cg_cs.cs_nffree += i;
> +		fs->fs_cstotal.cs_nffree += i;
> +		fs->fs_cs(fs, cg).cs_nffree += i;
> +		/*
> +		 * add back in counts associated with the new frags
> +		 */
> +		blk = blkmap(fs, blksfree, bbase);
> +		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
> +		/*
> +		 * if a complete block has been reassembled, account for it
> +		 */
> +		fragno = fragstoblks(fs, bbase);
> +		if (ffs_isblock(fs, blksfree, fragno)) {
> +			cgp->cg_cs.cs_nffree -= fs->fs_frag;
> +			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
> +			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
> +			ffs_clusteracct(fs, cgp, fragno, 1);
> +			cgp->cg_cs.cs_nbfree++;
> +			fs->fs_cstotal.cs_nbfree++;
> +			fs->fs_cs(fs, cg).cs_nbfree++;
> +		}
> +	}
> +	return cgwrite(disk);
> +}
> +
> ino_t
> cgialloc(struct uufsd *disk)
> {
>
> Modified: projects/suj/head/lib/libufs/libufs.h
> ==============================================================================
> --- projects/suj/head/lib/libufs/libufs.h	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/lib/libufs/libufs.h	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -111,6 +111,7 @@ int berase(struct uufsd *, ufs2_daddr_t,
>  * cgroup.c
>  */
> ufs2_daddr_t cgballoc(struct uufsd *);
> +int cgbfree(struct uufsd *, ufs2_daddr_t, long);
> ino_t cgialloc(struct uufsd *);
> int cgread(struct uufsd *);
> int cgread1(struct uufsd *, int);
>
> Modified: projects/suj/head/sbin/fsck_ffs/pass4.c
> ==============================================================================
> --- projects/suj/head/sbin/fsck_ffs/pass4.c	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/sbin/fsck_ffs/pass4.c	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -72,9 +72,6 @@ pass4(void)
> 		for (i = 0; i < inostathead[cg].il_numalloced; i++, inumber++) {
> 			if (inumber < ROOTINO)
> 				continue;
> -			if (sblock.fs_flags & FS_SUJ &&
> -			    inumber == sblock.fs_sujournal)
> -				continue;
> 			idesc.id_number = inumber;
> 			switch (inoinfo(inumber)->ino_state) {
>
>
> Modified: projects/suj/head/sbin/fsck_ffs/suj.c
> ==============================================================================
> --- projects/suj/head/sbin/fsck_ffs/suj.c	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/sbin/fsck_ffs/suj.c	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
> #include <stdlib.h>
> #include <stdint.h>
> #include <libufs.h>
> +#include <string.h>
> #include <strings.h>
> #include <err.h>
> #include <assert.h>
> @@ -63,6 +64,7 @@ struct suj_seg {
> struct suj_rec {
> 	TAILQ_ENTRY(suj_rec) sr_next;
> 	union jrec	*sr_rec;
> +	int		sr_alt;	/* Is alternate address? */
> };
> TAILQ_HEAD(srechd, suj_rec);
>
> @@ -127,6 +129,7 @@ TAILQ_HEAD(seghd, suj_seg) allsegs;
> uint64_t oldseq;
> static struct uufsd *disk = NULL;
> static struct fs *fs = NULL;
> +ino_t sujino;
>
> /*
>  * Summary statistics.
> @@ -191,8 +194,7 @@ closedisk(const char *devnam)
> 		fs->fs_cstotal.cs_nifree += cgsum->cs_nifree;
> 		fs->fs_cstotal.cs_ndir += cgsum->cs_ndir;
> 	}
> -	/* XXX Don't set clean for now, we don't trust the journal. */
> -	/* fs->fs_clean = 1; */
> +	fs->fs_clean = 1;
> 	fs->fs_time = time(NULL);
> 	fs->fs_mtime = time(NULL);
> 	if (sbwrite(disk, 0) == -1)
> @@ -1823,6 +1825,7 @@ ino_append(union jrec *rec)
> 	sino->si_hasrecs = 1;
> 	srec = errmalloc(sizeof(*srec));
> 	srec->sr_rec = rec;
> +	srec->sr_alt = 0;
> 	TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next);
> }
>
> @@ -1844,9 +1847,10 @@ ino_build_ref(struct suj_ino *sino, stru
>
> 	refrec = (struct jrefrec *)srec->sr_rec;
> 	if (debug)
> -		printf("ino_build: op %d, ino %d, nlink %d, parent %d, diroff %jd\n",
> -		    refrec->jr_op, refrec->jr_ino, refrec->jr_nlink, refrec->jr_parent,
> -		    refrec->jr_diroff);
> +		printf("ino_build: op %d, ino %d, nlink %d, "
> +		    "parent %d, diroff %jd\n",
> +		    refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
> +		    refrec->jr_parent, refrec->jr_diroff);
>
> 	/*
> 	 * Search for a mvrec that matches this offset.  Whether it's an add
> @@ -1871,16 +1875,19 @@ ino_build_ref(struct suj_ino *sino, stru
> 				rrn = errmalloc(sizeof(*refrec));
> 				*rrn = *refrec;
> 				rrn->jr_op = JOP_ADDREF;
> +				rrn->jr_diroff = mvrec->jm_oldoff;
> 				srn = errmalloc(sizeof(*srec));
> +				srn->sr_alt = 1;
> 				srn->sr_rec = (union jrec *)rrn;
> 				ino_build_ref(sino, srn);
> -				refrec->jr_diroff = mvrec->jm_oldoff;
> 			}
> 		}
> 	}
> 	/*
> 	 * We walk backwards so that adds and removes are evaluated in the
> -	 * correct order.
> +	 * correct order.  If a primary record conflicts with an alt keep
> +	 * the primary and discard the alt.  We must track this to keep
> +	 * the correct number of removes in the list.
> 	 */
> 	for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn;
> 	    srn = TAILQ_PREV(srn, srechd, sr_next)) {
> @@ -1890,7 +1897,17 @@ ino_build_ref(struct suj_ino *sino, stru
> 			continue;
> 		if (debug)
> 			printf("Discarding dup.\n");
> -		rrn->jr_mode = refrec->jr_mode;
> +		if (srn->sr_alt == 0) {
> +			rrn->jr_mode = refrec->jr_mode;
> +			return;
> +		}
> +		/*
> +		 * Replace the record in place with the old nlink in case
> +		 * we replace the head of the list.  Abandon srec as a dup.
> +		 */
> +		refrec->jr_nlink = rrn->jr_nlink;
> +		srn->sr_rec = srec->sr_rec;
> +		srn->sr_alt = srec->sr_alt;
> 		return;
> 	}
> 	TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next);
> @@ -1930,9 +1947,12 @@ ino_move_ref(struct suj_ino *sino, struc
> 		/*
> 		 * When an entry is moved we don't know whether the write
> 		 * to move has completed yet.  To resolve this we create
> -		 * a new add dependency in the new location as if it were added
> -		 * twice.  Only one will succeed.
> +		 * a new add dependency in the new location as if it were
> +		 * added twice.  Only one will succeed.  Consider the
> +		 * new offset the primary location for the inode and the
> +		 * old offset the alt.
> 		 */
> +		srn->sr_alt = 1;
> 		refrec = errmalloc(sizeof(*refrec));
> 		refrec->jr_op = JOP_ADDREF;
> 		refrec->jr_ino = mvrec->jm_ino;
> @@ -1941,12 +1961,14 @@ ino_move_ref(struct suj_ino *sino, struc
> 		refrec->jr_mode = rrn->jr_mode;
> 		refrec->jr_nlink = rrn->jr_nlink;
> 		srn = errmalloc(sizeof(*srn));
> +		srn->sr_alt = 0;
> 		srn->sr_rec = (union jrec *)refrec;
> 		ino_build_ref(sino, srn);
> 		break;
> 	}
> 	/*
> -	 * Add this mvrec to the queue of pending mvs.
> +	 * Add this mvrec to the queue of pending mvs, possibly collapsing
> +	 * it with a prior move for the same inode and offset.
> 	 */
> 	for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn;
> 	    srn = TAILQ_PREV(srn, srechd, sr_next)) {
> @@ -2195,19 +2217,25 @@ suj_verifyino(union dinode *ip)
>
> 	if (DIP(ip, di_nlink) != 1) {
> 		printf("Invalid link count %d for journal inode %d\n",
> -		    DIP(ip, di_nlink), fs->fs_sujournal);
> +		    DIP(ip, di_nlink), sujino);
> +		return (-1);
> +	}
> +
> +	if (DIP(ip, di_flags) != (SF_IMMUTABLE | SF_NOUNLINK)) {
> +		printf("Invalid flags 0x%X for journal inode %d\n",
> +		    DIP(ip, di_flags), sujino);
> 		return (-1);
> 	}
>
> -	if (DIP(ip, di_mode) != IFREG) {
> -		printf("Invalid mode %d for journal inode %d\n",
> -		    DIP(ip, di_mode), fs->fs_sujournal);
> +	if (DIP(ip, di_mode) != (IFREG | IREAD)) {
> +		printf("Invalid mode %o for journal inode %d\n",
> +		    DIP(ip, di_mode), sujino);
> 		return (-1);
> 	}
>
> 	if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) {
> 		printf("Invalid size %jd for journal inode %d\n",
> -		    DIP(ip, di_size), fs->fs_sujournal);
> +		    DIP(ip, di_size), sujino);
> 		return (-1);
> 	}
>
> @@ -2447,20 +2475,60 @@ restart:
> }
>
> /*
> + * Search a directory block for the SUJ_FILE.
> + */
> +static void
> +suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
> +{
> +	char block[MAXBSIZE];
> +	struct direct *dp;
> +	int bytes;
> +	int off;
> +
> +	if (sujino)
> +		return;
> +	bytes = lfragtosize(fs, frags);
> +	if (bread(disk, fsbtodb(fs, blk), block, bytes) <= 0)
> +		err(1, "Failed to read ROOTINO directory block %jd", blk);
> +	for (off = 0; off < bytes; off += dp->d_reclen) {
> +		dp = (struct direct *)&block[off];
> +		if (dp->d_reclen == 0)
> +			break;
> +		if (dp->d_ino == 0)
> +			continue;
> +		if (dp->d_namlen != strlen(SUJ_FILE))
> +			continue;
> +		if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
> +			continue;
> +		sujino = dp->d_ino;
> +		return;
> +	}
> +}
> +
> +/*
>  * Orchestrate the verification of a filesystem via the softupdates journal.
>  */
> int
> suj_check(const char *filesys)
> {
> 	union dinode *jip;
> +	union dinode *ip;
> 	uint64_t blocks;
>
> 	opendisk(filesys);
> 	TAILQ_INIT(&allsegs);
> 	/*
> +	 * Find the journal inode.
> +	 */
> +	ip = ino_read(ROOTINO);
> +	sujino = 0;
> +	ino_visit(ip, ROOTINO, suj_find, 0);
> +	if (sujino == 0)
> +		errx(1, "Journal inode removed.  Use tunefs to re-create.");
> +	/*
> 	 * Fetch the journal inode and verify it.
> 	 */
> -	jip = ino_read(fs->fs_sujournal);
> +	jip = ino_read(sujino);
> 	printf("** SU+J Recovering %s\n", filesys);
> 	if (suj_verifyino(jip) != 0)
> 		return (-1);
> @@ -2469,11 +2537,11 @@ suj_check(const char *filesys)
> 	 * available journal blocks in with suj_read().
> 	 */
> 	printf("** Reading %jd byte journal from inode %d.\n",
> -	    DIP(jip, di_size), fs->fs_sujournal);
> +	    DIP(jip, di_size), sujino);
> 	suj_jblocks = jblocks_create();
> -	blocks = ino_visit(jip, fs->fs_sujournal, suj_add_block, 0);
> +	blocks = ino_visit(jip, sujino, suj_add_block, 0);
> 	if (blocks != numfrags(fs, DIP(jip, di_size)))
> -		errx(1, "Sparse journal inode %d.\n", fs->fs_sujournal);
> +		errx(1, "Sparse journal inode %d.\n", sujino);
> 	suj_read();
> 	jblocks_destroy(suj_jblocks);
> 	suj_jblocks = NULL;
>
> Modified: projects/suj/head/sbin/mount/mount.c
> ==============================================================================
> --- projects/suj/head/sbin/mount/mount.c	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/sbin/mount/mount.c	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -113,7 +113,6 @@ static struct opt {
> 	{ MNT_ACLS,		"acls" },
> 	{ MNT_NFS4ACLS,		"nfsv4acls" },
> 	{ MNT_GJOURNAL,		"gjournal" },
> -	{ MNT_SUJ,		"journal" }, /* always soft-updates, journal */
> 	{ 0, NULL }
> };
>
>
> Modified: projects/suj/head/sbin/tunefs/tunefs.c
> ==============================================================================
> --- projects/suj/head/sbin/tunefs/tunefs.c	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/sbin/tunefs/tunefs.c	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
> #include <ufs/ufs/ufsmount.h>
> #include <ufs/ufs/dinode.h>
> #include <ufs/ffs/fs.h>
> +#include <ufs/ufs/dir.h>
>
> #include <ctype.h>
> #include <err.h>
> @@ -74,6 +75,7 @@ struct uufsd disk;
> void usage(void);
> void printfs(void);
> int journal_alloc(int64_t size);
> +void journal_clear(void);
> void sbdirty(void);
>
> int
> @@ -355,11 +357,11 @@ main(int argc, char *argv[])
> 			if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) {
> 				warnx("%s remains unchanged as disabled", name);
> 			} else {
> -				sbdirty();
> +				journal_clear();
>  				sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ);
> -				sblock.fs_sujournal = 0;
> 				sblock.fs_sujfree = 0;
> - 				warnx("%s cleared", name);
> + 				warnx("%s cleared, "
> +				    "remove .sujournal to reclaim space", name);
> 			}
>  		}
> 	}
> @@ -523,11 +525,9 @@ journal_balloc(void)
> {
> 	ufs2_daddr_t blk;
> 	struct cg *cgp;
> -	struct fs *fs;
> 	int valid;
>
> 	cgp = &disk.d_cg;
> -	fs = &disk.d_fs;
> 	for (;;) {
> 		blk = cgballoc(&disk);
> 		if (blk > 0)
> @@ -553,13 +553,231 @@ journal_balloc(void)
> 		warnx("Failed to find sufficient free blocks for the journal");
> 		return -1;
> 	}
> -	if (bwrite(&disk, fsbtodb(fs, blk), clrbuf, fs->fs_bsize) <= 0) {
> +	if (bwrite(&disk, fsbtodb(&sblock, blk), clrbuf,
> +	    sblock.fs_bsize) <= 0) {
> 		warn("Failed to initialize new block");
> 		return -1;
> 	}
> 	return (blk);
> }
>
> +/*
> + * Search a directory block for the SUJ_FILE.
> + */
> +static ino_t
> +dir_search(ufs2_daddr_t blk, int bytes)
> +{
> +	char block[MAXBSIZE];
> +	struct direct *dp;
> +	int off;
> +
> +	if (bread(&disk, fsbtodb(&sblock, blk), block, bytes) <= 0) {
> +		warn("Failed to read dir block");
> +		return (-1);
> +	}
> +	for (off = 0; off < bytes; off += dp->d_reclen) {
> +		dp = (struct direct *)&block[off];
> +		if (dp->d_reclen == 0)
> +			break;
> +		if (dp->d_ino == 0)
> +			continue;
> +		if (dp->d_namlen != strlen(SUJ_FILE))
> +			continue;
> +		if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
> +			continue;
> +		return (dp->d_ino);
> +	}
> +
> +	return (0);
> +}
> +
> +/*
> + * Search in the ROOTINO for the SUJ_FILE.  If it exists we can not enable
> + * journaling.
> + */
> +static ino_t
> +journal_findfile(void)
> +{
> +	struct ufs1_dinode *dp1;
> +	struct ufs2_dinode *dp2;
> +	int mode;
> +	void *ip;
> +	int i;
> +
> +	if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
> +		warn("Failed to get root inode");
> +		return (-1);
> +	}
> +	dp2 = ip;
> +	dp1 = ip;
> +	if (sblock.fs_magic == FS_UFS1_MAGIC) {
> +		if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
> +			warnx("ROOTINO extends beyond direct blocks.");
> +			return (-1);
> +		}
> +		for (i = 0; i < NDADDR; i++) {
> +			if (dp1->di_db[i] == 0)
> +				break;
> +			if (dir_search(dp1->di_db[i],
> +			    sblksize(&sblock, (off_t)dp1->di_size, i)) != 0)
> +				return (-1);
> +		}
> +	} else {
> +		if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
> +			warnx("ROOTINO extends beyond direct blocks.");
> +			return (-1);
> +		}
> +		for (i = 0; i < NDADDR; i++) {
> +			if (dp2->di_db[i] == 0)
> +				break;
> +			if (dir_search(dp2->di_db[i],
> +			    sblksize(&sblock, (off_t)dp2->di_size, i)) != 0)
> +				return (-1);
> +		}
> +	}
> +
> +	return (0);
> +}
> +
> +/*
> + * Insert the journal at inode 'ino' into directory blk 'blk' at the first
> + * free offset of 'off'.  DIRBLKSIZ blocks after off are initialized as
> + * empty.
> + */
> +static int
> +dir_insert(ufs2_daddr_t blk, off_t off, ino_t ino)
> +{
> +	struct direct *dp;
> +	char block[MAXBSIZE];
> +
> +	if (bread(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
> +		warn("Failed to read dir block");
> +		return (-1);
> +	}
> +	bzero(&block[off], sblock.fs_bsize - off);
> +	dp = (struct direct *)&block[off];
> +	dp->d_ino = ino;
> +	dp->d_reclen = DIRBLKSIZ;
> +	dp->d_type = DT_REG;
> +	dp->d_namlen = strlen(SUJ_FILE);
> +	bcopy(SUJ_FILE, &dp->d_name, strlen(SUJ_FILE));
> +	off += DIRBLKSIZ;
> +	for (; off < sblock.fs_bsize; off += DIRBLKSIZ) {
> +		dp = (struct direct *)&block[off];
> +		dp->d_ino = 0;
> +		dp->d_reclen = DIRBLKSIZ;
> +		dp->d_type = DT_UNKNOWN;
> +	}
> +	if (bwrite(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
> +		warn("Failed to write dir block");
> +		return (-1);
> +	}
> +	return (0);
> +}
> +
> +/*
> + * Extend a directory block in 'blk' by copying it to a full size block
> + * and inserting the SUJ_FILE entry for the new journal inode into it.
> + */
> +static int
> +dir_extend(ufs2_daddr_t blk, ufs2_daddr_t nblk, off_t size, ino_t ino)
> +{
> +	char block[MAXBSIZE];
> +
> +	if (bread(&disk, fsbtodb(&sblock, blk), block, size) <= 0) {
> +		warn("Failed to read dir block");
> +		return (-1);
> +	}
> +	if (bwrite(&disk, fsbtodb(&sblock, nblk), block, size) <= 0) {
> +		warn("Failed to write dir block");
> +		return (-1);
> +	}
> +
> +	return dir_insert(nblk, size, ino);
> +}
> +
> +/*
> + * Insert the journal file into the ROOTINO directory.  We always extend the
> + * last frag.
> + */
> +static int
> +journal_insertfile(ino_t ino)
> +{
> +	struct ufs1_dinode *dp1;
> +	struct ufs2_dinode *dp2;
> +	void *ip;
> +	ufs2_daddr_t nblk;
> +	ufs2_daddr_t blk;
> +	ufs_lbn_t lbn;
> +	int size;
> +	int mode;
> +	int off;
> +
> +	if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
> +		warn("Failed to get root inode");
> +		sbdirty();
> +		return (-1);
> +	}
> +	dp2 = ip;
> +	dp1 = ip;
> +	blk = 0;
> +	size = 0;
> +	nblk = journal_balloc();
> +	if (nblk <= 0)
> +		return (-1);
> +	/*
> +	 * For simplicity's sake we always extend the ROOTINO into a new
> +	 * directory block rather than searching for space and inserting
> +	 * into an existing block.  However, if the rootino has frags
> +	 * we have to free them and extend the block.
> +	 */
> +	if (sblock.fs_magic == FS_UFS1_MAGIC) {
> +		lbn = lblkno(&sblock, dp1->di_size);
> +		off = blkoff(&sblock, dp1->di_size);
> +		blk = dp1->di_db[lbn];
> +		size = sblksize(&sblock, (off_t)dp1->di_size, lbn);
> +	} else {
> +		lbn = lblkno(&sblock, dp2->di_size);
> +		off = blkoff(&sblock, dp2->di_size);
> +		blk = dp2->di_db[lbn];
> +		size = sblksize(&sblock, (off_t)dp2->di_size, lbn);
> +	}
> +	if (off != 0) {
> +		if (dir_extend(blk, nblk, off, ino) == -1)
> +			return (-1);
> +	} else {
> +		blk = 0;
> +		if (dir_insert(nblk, 0, ino) == -1)
> +			return (-1);
> +	}
> +	if (sblock.fs_magic == FS_UFS1_MAGIC) {
> +		dp1->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
> +		dp1->di_db[lbn] = nblk;
> +		dp1->di_size = lblktosize(&sblock, lbn+1);
> +	} else {
> +		dp2->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
> +		dp2->di_db[lbn] = nblk;
> +		dp2->di_size = lblktosize(&sblock, lbn+1);
> +	}
> +	if (putino(&disk) < 0) {
> +		warn("Failed to write root inode");
> +		return (-1);
> +	}
> +	if (cgwrite(&disk) < 0) {
> +		warn("Failed to write updated cg");
> +		sbdirty();
> +		return (-1);
> +	}
> +	if (blk) {
> +		if (cgbfree(&disk, blk, size) < 0) {
> +			warn("Failed to write cg");
> +			return (-1);
> +		}
> +	}
> +
> +	return (0);
> +}
> +
> static int
> indir_fill(ufs2_daddr_t blk, int level, int *resid)
> {
> @@ -567,22 +785,20 @@ indir_fill(ufs2_daddr_t blk, int level,
> 	ufs1_daddr_t *bap1;
> 	ufs2_daddr_t *bap2;
> 	ufs2_daddr_t nblk;
> -	struct fs *fs;
> 	int ncnt;
> 	int cnt;
> 	int i;
>
> -	fs = &disk.d_fs;
> 	bzero(indirbuf, sizeof(indirbuf));
> 	bap1 = (ufs1_daddr_t *)indirbuf;
> 	bap2 = (void *)bap1;
> 	cnt = 0;
> -	for (i = 0; i < NINDIR(fs) && *resid != 0; i++) {
> +	for (i = 0; i < NINDIR(&sblock) && *resid != 0; i++) {
> 		nblk = journal_balloc();
> 		if (nblk <= 0)
> 			return (-1);
> 		cnt++;
> -		if (fs->fs_magic == FS_UFS1_MAGIC)
> +		if (sblock.fs_magic == FS_UFS1_MAGIC)
> 			*bap1++ = nblk;
> 		else
> 			*bap2++ = nblk;
> @@ -594,13 +810,47 @@ indir_fill(ufs2_daddr_t blk, int level,
> 		} else
> 			(*resid)--;
> 	}
> -	if (bwrite(&disk, fsbtodb(fs, blk), indirbuf, fs->fs_bsize) <= 0) {
> +	if (bwrite(&disk, fsbtodb(&sblock, blk), indirbuf,
> +	    sblock.fs_bsize) <= 0) {
> 		warn("Failed to write indirect");
> 		return (-1);
> 	}
> 	return (cnt);
> }
>
> +/*
> + * Clear the flag bits so the journal can be removed.
> + */
> +void
> +journal_clear(void)
> +{
> +	struct ufs1_dinode *dp1;
> +	struct ufs2_dinode *dp2;
> +	ino_t ino;
> +	int mode;
> +	void *ip;
> +
> +	ino = journal_findfile();
> +	if (ino <= 0) {
> +		warnx("Journal file does not exist");
> +		return;
> +	}
> +	if (getino(&disk, &ip, ino, &mode) != 0) {
> +		warn("Failed to get journal inode");
> +		return;
> +	}
> +	dp2 = ip;
> +	dp1 = ip;
> +	if (sblock.fs_magic == FS_UFS1_MAGIC)
> +		dp1->di_flags = 0;
> +	else
> +		dp2->di_flags = 0;
> +	if (putino(&disk) < 0) {
> +		warn("Failed to write journal inode");
> +		return;
> +	}
> +}
> +
> int
> journal_alloc(int64_t size)
> {
> @@ -609,32 +859,39 @@ journal_alloc(int64_t size)
> 	ufs2_daddr_t blk;
> 	void *ip;
> 	struct cg *cgp;
> -	struct fs *fs;
> 	int resid;
> 	ino_t ino;
> 	int blks;
> 	int mode;
> 	int i;
>
> -	fs = &disk.d_fs;
> 	cgp = &disk.d_cg;
> 	ino = 0;
>
> 	/*
> +	 * If the journal file exists we can't allocate it.
> +	 */
> +	ino = journal_findfile();
> +	if (ino > 0)
> +		warnx("Journal file %s already exists, please remove.",
> +		    SUJ_FILE);
> +	if (ino != 0)
> +		return (-1);
> +	/*
> 	 * If the user didn't supply a size pick one based on the filesystem
> 	 * size constrained with hardcoded MIN and MAX values.  We opt for
> 	 * 1/1024th of the filesystem up to MAX but not exceeding one CG and
> 	 * not less than the MIN.
> 	 */
> 	if (size == 0) {
> -		size = (fs->fs_size * fs->fs_bsize) / 1024;
> +		size = (sblock.fs_size * sblock.fs_bsize) / 1024;
> 		size = MIN(SUJ_MAX, size);
> -		if (size / fs->fs_fsize > fs->fs_fpg)
> -			size = fs->fs_fpg * fs->fs_fsize;
> +		if (size / sblock.fs_fsize > sblock.fs_fpg)
> +			size = sblock.fs_fpg * sblock.fs_fsize;
> 		size = MAX(SUJ_MIN, size);
> 	}
> -	resid = blocks = size / fs->fs_bsize;
> -	if (fs->fs_cstotal.cs_nbfree < blocks) {
> +	resid = blocks = size / sblock.fs_bsize;
> +	if (sblock.fs_cstotal.cs_nbfree < blocks) {
> 		warn("Insufficient free space for %jd byte journal", size);
> 		return (-1);
> 	}
> @@ -647,9 +904,9 @@ journal_alloc(int64_t size)
> 			continue;
> 		/*
> 		 * Try to minimize fragmentation by requiring at least a
> -		 * 1/8th of the blocks be present in each cg we use.
> +		 * 1/16th of the blocks be present in each cg we use.
> 		 */
> -		if (cgp->cg_cs.cs_nbfree < blocks / 8)
> +		if (cgp->cg_cs.cs_nbfree < blocks / 16)
> 			continue;
> 		ino = cgialloc(&disk);
> 		if (ino <= 0)
> @@ -668,22 +925,24 @@ journal_alloc(int64_t size)
> 		 */
> 		dp2 = ip;
> 		dp1 = ip;
> -		if (fs->fs_magic == FS_UFS1_MAGIC) {
> +		if (sblock.fs_magic == FS_UFS1_MAGIC) {
> 			bzero(dp1, sizeof(*dp1));
> 			dp1->di_size = size;
> -			dp1->di_mode = IFREG;
> +			dp1->di_mode = IFREG | IREAD;
> 			dp1->di_nlink = 1;
> +			dp1->di_flags = SF_IMMUTABLE | SF_NOUNLINK;
> 		} else {
> 			bzero(dp2, sizeof(*dp2));
> 			dp2->di_size = size;
> -			dp2->di_mode = IFREG;
> +			dp2->di_mode = IFREG | IREAD;
> 			dp2->di_nlink = 1;
> +			dp2->di_flags = SF_IMMUTABLE | SF_NOUNLINK;
> 		}
> 		for (i = 0; i < NDADDR && resid; i++, resid--) {
> 			blk = journal_balloc();
> 			if (blk <= 0)
> 				goto out;
> -			if (fs->fs_magic == FS_UFS1_MAGIC) {
> +			if (sblock.fs_magic == FS_UFS1_MAGIC) {
> 				dp1->di_db[i] = blk;
> 				dp1->di_blocks++;
> 			} else {
> @@ -700,7 +959,7 @@ journal_alloc(int64_t size)
> 				sbdirty();
> 				goto out;
> 			}
> -			if (fs->fs_magic == FS_UFS1_MAGIC) {
> +			if (sblock.fs_magic == FS_UFS1_MAGIC) {
> 				dp1->di_ib[i] = blk;
> 				dp1->di_blocks += blks;
> 			} else {
> @@ -708,10 +967,10 @@ journal_alloc(int64_t size)
> 				dp2->di_blocks += blks;
> 			}
> 		}
> -		if (fs->fs_magic == FS_UFS1_MAGIC)
> -			dp1->di_blocks *= fs->fs_bsize / disk.d_bsize;
> +		if (sblock.fs_magic == FS_UFS1_MAGIC)
> +			dp1->di_blocks *= sblock.fs_bsize / disk.d_bsize;
> 		else
> -			dp2->di_blocks *= fs->fs_bsize / disk.d_bsize;
> +			dp2->di_blocks *= sblock.fs_bsize / disk.d_bsize;
> 		if (putino(&disk) < 0) {
> 			warn("Failed to write inode");
> 			sbdirty();
> @@ -722,8 +981,11 @@ journal_alloc(int64_t size)
> 			sbdirty();
> 			return (-1);
> 		}
> -		fs->fs_sujournal = ino;
> -		fs->fs_sujfree = 0;
> +		if (journal_insertfile(ino) < 0) {
> +			sbdirty();
> +			return (-1);
> +		}
> +		sblock.fs_sujfree = 0;
> 		return (0);
> 	}
> 	warnx("Insufficient contiguous free space for the journal.");
>
> Modified: projects/suj/head/sys/sys/mount.h
> ==============================================================================
> --- projects/suj/head/sys/sys/mount.h	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/sys/sys/mount.h	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -240,7 +240,6 @@ void          __mnt_vnode_markerfree(str
> #define	MNT_NOCLUSTERR	0x40000000	/* disable cluster read */
> #define	MNT_NOCLUSTERW	0x80000000	/* disable cluster write */
> #define	MNT_NFS4ACLS	0x00000010
> -#define	MNT_SUJ		0x00000080	/* softdep journaling */
>
> /*
>  * NFS export related mount flags.
> @@ -277,7 +276,7 @@ void          __mnt_vnode_markerfree(str
> 			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	| \
> 			MNT_IGNORE	| MNT_EXPUBLIC	| MNT_NOSYMFOLLOW | \
> 			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS	| \
> -			MNT_NFS4ACLS	| MNT_SUJ)
> +			MNT_NFS4ACLS)
>
> /* Mask of flags that can be updated. */
> #define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| \
> @@ -326,6 +325,7 @@ void          __mnt_vnode_markerfree(str
> #define	MNTK_REFEXPIRE	0x00000020	/* refcount expiring is happening */
> #define MNTK_EXTENDED_SHARED	0x00000040 /* Allow shared locking for more ops */
> #define	MNTK_SHARED_WRITES	0x00000080 /* Allow shared locking for writes */
> +#define	MNTK_SUJ	0x00000100	/* Softdep journaling enabled */
> #define MNTK_UNMOUNT	0x01000000	/* unmount in progress */
> #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
> #define	MNTK_SUSPEND	0x08000000	/* request write suspension */
>
> Modified: projects/suj/head/sys/ufs/ffs/ffs_alloc.c
> ==============================================================================
> --- projects/suj/head/sys/ufs/ffs/ffs_alloc.c	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/sys/ufs/ffs/ffs_alloc.c	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -1851,6 +1851,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, i
> 	ino_t inum;
> 	struct workhead *dephd;
> {
> +	struct mount *mp;
> 	struct cg *cgp;
> 	struct buf *bp;
> 	ufs1_daddr_t fragno, cgbno;
> @@ -1965,7 +1966,8 @@ ffs_blkfree(ump, fs, devvp, bno, size, i
> 	fs->fs_fmod = 1;
> 	ACTIVECLEAR(fs, cg);
> 	UFS_UNLOCK(ump);
> -	if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP)
> +	mp = UFSTOVFS(ump);
> +	if (mp->mnt_flag & MNT_SOFTDEP)
> 		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
> 		    numfrags(fs, size), dephd);
> 	bdwrite(bp);
>
> Modified: projects/suj/head/sys/ufs/ffs/ffs_softdep.c
> ==============================================================================
> --- projects/suj/head/sys/ufs/ffs/ffs_softdep.c	Tue Jan 26 05:17:03 2010	(r203011)
> +++ projects/suj/head/sys/ufs/ffs/ffs_softdep.c	Tue Jan 26 06:36:10 2010	(r203012)
> @@ -1902,7 +1902,7 @@ softdep_unmount(mp)
> 	struct mount *mp;
> {
>
> -	if (mp->mnt_flag & MNT_SUJ)
> +	if (mp->mnt_kern_flag & MNTK_SUJ)
> 		journal_unmount(mp);
> }
>
> @@ -2044,16 +2044,36 @@ journal_mount(mp, fs, cred)
> 	struct fs *fs;
> 	struct ucred *cred;
> {
> +	struct componentname cnp;
> 	struct jblocks *jblocks;
> +	struct vnode *dvp;
> 	struct vnode *vp;
> 	struct inode *ip;
> 	ufs2_daddr_t blkno;
> +	ino_t sujournal;
> 	int bcount;
> 	int error;
> 	int i;
>
> -	mp->mnt_flag |= MNT_SUJ;
> -	error = VFS_VGET(mp, fs->fs_sujournal, LK_EXCLUSIVE, &vp);
> +	mp->mnt_kern_flag |= MNTK_SUJ;
> +	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
> +	if (error)
> +		return (error);
> +	bzero(&cnp, sizeof(cnp));
> +	cnp.cn_nameiop = LOOKUP;
> +	cnp.cn_flags = ISLASTCN;
> +	cnp.cn_thread = curthread;
> +	cnp.cn_cred = curthread->td_ucred;
> +	cnp.cn_pnbuf = SUJ_FILE;
> +	cnp.cn_nameptr = SUJ_FILE;
> +	cnp.cn_namelen = strlen(SUJ_FILE);
> +	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
> +	vput(dvp);
> +	if (error != 0) {
> +		printf("Failed to find journal.  Use tunefs to create one\n");
> +		return (error);
> +	}
> +	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, &vp);
> 	if (error)
> 		return (error);
> 	ip = VTOI(vp);
> @@ -2075,9 +2095,18 @@ journal_mount(mp, fs, cred)
> 	}
> 	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
> 	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
> -	DIP_SET(ip, i_modrev, fs->fs_mtime);
> -	ip->i_flags |= IN_MODIFIED;
> -	ffs_update(vp, 1);
> +	/*
> +	 * Only validate the journal contents if the filesystem is clean,
> +	 * otherwise we write the logs but they'll never be used.  If the
> +	 * filesystem was still dirty when we mounted it the journal is
> +	 * invalid and a new journal can only be valid if it starts from a
> +	 * clean mount.
> +	 */
> +	if (fs->fs_clean) {
> +		DIP_SET(ip, i_modrev, fs->fs_mtime);
> +		ip->i_flags |= IN_MODIFIED;
> +		ffs_update(vp, 1);
> +	}
> 	VFSTOUFS(mp)->softdep_jblocks = jblocks;
> out:
> 	vput(vp);
> @@ -2159,6 +2188,11 @@ remove_from_journal(wk)
> 	ump->softdep_on_journal -= 1;
> }
>
> +/*
> + * Check for journal space as well as dependency limits so the prelink
> + * code can throttle both journaled and non-journaled filesystems.
> + * Threshold is 0 for low and 1 for min.
> + */
> static int
> journal_space(ump, thresh)
> 	struct ufsmount *ump;
> @@ -2167,7 +2201,20 @@ journal_space(ump, thresh)
> 	struct jblocks *jblocks;
> 	int avail;
>
> +	/*
> +	 * We use a tighter restriction here to prevent request_cleanup()
> +	 * running in threads from running into locks we currently hold.
> +	 */
> +	if (num_inodedep > (max_softdeps / 10) * 9)
> +		return (0);
> +
> 	jblocks = ump->softdep_jblocks;
> +	if (jblocks == NULL)
> +		return (1);
> +	if (thresh)
> +		thresh = jblocks->jb_min;
> +	else
> +		thresh = jblocks->jb_low;
> 	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
> 	avail = jblocks->jb_free - avail;
>
> @@ -2210,15 +2257,13 @@ softdep_prealloc(vp, waitok)
> 	struct vnode *vp;
> 	int waitok;
> {
> -	struct jblocks *jblocks;
> 	struct ufsmount *ump;
>
> 	if (DOINGSUJ(vp) == 0)
> 		return (0);
> 	ump = VFSTOUFS(vp->v_mount);
> -	jblocks = ump->softdep_jblocks;
> 	ACQUIRE_LOCK(&lk);
> -	if (journal_space(ump, jblocks->jb_low)) {
> +	if (journal_space(ump, 0)) {
> 		FREE_LOCK(&lk);
> 		return (0);
> 	}
> @@ -2233,9 +2278,9 @@ softdep_prealloc(vp, waitok)
> 	ffs_syncvnode(vp, waitok);
> 	ACQUIRE_LOCK(&lk);
> 	process_removes(vp);
> -	if (journal_space(ump, jblocks->jb_low) == 0) {
> +	if (journal_space(ump, 0) == 0) {
> 		softdep_speedup();
> -		if (journal_space(ump, jblocks->jb_min) == 0)
> +		if (journal_space(ump, 1) == 0)
> 			journal_suspend(ump);
> 	}
> 	FREE_LOCK(&lk);
> @@ -2243,18 +2288,22 @@ softdep_prealloc(vp, waitok)
> 	return (0);
>
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
>



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?alpine.BSF.2.00.1001252041490.1027>