Skip site navigation (1) | Skip section navigation (2)
Date:      Wed, 14 Nov 2012 06:37:43 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r243018 - head/sys/ufs/ffs
Message-ID:  <201211140637.qAE6bhpp016910@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jeff
Date: Wed Nov 14 06:37:43 2012
New Revision: 243018
URL: http://svnweb.freebsd.org/changeset/base/243018

Log:
   - Fix a truncation bug with softdep journaling that could leak blocks on
     crash.  When truncating a file that never made it to disk, we use the
     canceled allocation dependencies to hold the journal records until
     the truncation completes.  Previously, allocdirect dependencies on
     the id_bufwait list were not considered, and their journal space
     could expire before the bitmaps were written.  Cancel them and attach
     them to the freeblks, as we do for other allocdirects.
   - Add KTR traces that were used to debug this problem.
   - When adding jsegdeps, always use jwork_insert() so we don't have more
     than one jsegdep on a given jwork list.
  
  Sponsored by:	EMC / Isilon Storage Division

Modified:
  head/sys/ufs/ffs/ffs_softdep.c

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	Wed Nov 14 06:31:47 2012	(r243017)
+++ head/sys/ufs/ffs/ffs_softdep.c	Wed Nov 14 06:37:43 2012	(r243018)
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/buf.h>
 #include <sys/kdb.h>
 #include <sys/kthread.h>
+#include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -92,6 +93,8 @@ __FBSDID("$FreeBSD$");
 
 #include <ddb/ddb.h>
 
+#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
+
 #ifndef SOFTUPDATES
 
 int
@@ -770,6 +773,34 @@ struct pagedep_hashhead;
 struct bmsafemap_hashhead;
 
 /*
+ * Private journaling structures.
+ */
+struct jblocks {
+	struct jseglst	jb_segs;	/* TAILQ of current segments. */
+	struct jseg	*jb_writeseg;	/* Next write to complete. */
+	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
+	struct jextent	*jb_extent;	/* Extent array. */
+	uint64_t	jb_nextseq;	/* Next sequence number. */
+	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
+	uint8_t		jb_needseg;	/* Need a forced segment. */
+	uint8_t		jb_suspended;	/* Did journal suspend writes? */
+	int		jb_avail;	/* Available extents. */
+	int		jb_used;	/* Last used extent. */
+	int		jb_head;	/* Allocator head. */
+	int		jb_off;		/* Allocator extent offset. */
+	int		jb_blocks;	/* Total disk blocks covered. */
+	int		jb_free;	/* Total disk blocks free. */
+	int		jb_min;		/* Minimum free space. */
+	int		jb_low;		/* Low on space. */
+	int		jb_age;		/* Insertion time of oldest rec. */
+};
+
+struct jextent {
+	ufs2_daddr_t	je_daddr;	/* Disk block address. */
+	int		je_blocks;	/* Disk block count. */
+};
+
+/*
  * Internal function prototypes.
  */
 static	void softdep_error(char *, int);
@@ -2268,19 +2299,15 @@ static void
 indirblk_insert(freework)
 	struct freework *freework;
 {
-	struct freeblks *freeblks;
-	struct jsegdep *jsegdep;
-	struct worklist *wk;
+	struct jblocks *jblocks;
+	struct jseg *jseg;
 
-	freeblks = freework->fw_freeblks;
-	LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list)
-		if (wk->wk_type == D_JSEGDEP)
-			break;
-	if (wk == NULL)
+	jblocks = VFSTOUFS(freework->fw_list.wk_mp)->softdep_jblocks;
+	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
+	if (jseg == NULL)
 		return;
 	
-	jsegdep = WK_JSEGDEP(wk);
-	LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs, freework, fw_segs);
+	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
 	TAILQ_INSERT_HEAD(INDIR_HASH(freework->fw_list.wk_mp,
 	    freework->fw_blkno), freework, fw_next);
 	freework->fw_state &= ~DEPCOMPLETE;
@@ -2433,31 +2460,6 @@ softdep_unmount(mp)
 	journal_unmount(mp);
 }
 
-struct jblocks {
-	struct jseglst	jb_segs;	/* TAILQ of current segments. */
-	struct jseg	*jb_writeseg;	/* Next write to complete. */
-	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
-	struct jextent	*jb_extent;	/* Extent array. */
-	uint64_t	jb_nextseq;	/* Next sequence number. */
-	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
-	uint8_t		jb_needseg;	/* Need a forced segment. */
-	uint8_t		jb_suspended;	/* Did journal suspend writes? */
-	int		jb_avail;	/* Available extents. */
-	int		jb_used;	/* Last used extent. */
-	int		jb_head;	/* Allocator head. */
-	int		jb_off;		/* Allocator extent offset. */
-	int		jb_blocks;	/* Total disk blocks covered. */
-	int		jb_free;	/* Total disk blocks free. */
-	int		jb_min;		/* Minimum free space. */
-	int		jb_low;		/* Low on space. */
-	int		jb_age;		/* Insertion time of oldest rec. */
-};
-
-struct jextent {
-	ufs2_daddr_t	je_daddr;	/* Disk block address. */
-	int		je_blocks;	/* Disk block count. */
-};
-
 static struct jblocks *
 jblocks_create(void)
 {
@@ -3663,7 +3665,7 @@ handle_written_jnewblk(jnewblk)
 		 */
 		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
 		freefrag->ff_jdep = NULL;
-		WORKLIST_INSERT(&freefrag->ff_jwork, &jsegdep->jd_list);
+		jwork_insert(&freefrag->ff_jwork, jsegdep);
 		break;
 	case D_FREEWORK:
 		/*
@@ -3671,8 +3673,7 @@ handle_written_jnewblk(jnewblk)
 		 */
 		freework = WK_FREEWORK(jnewblk->jn_dep);
 		freework->fw_jnewblk = NULL;
-		WORKLIST_INSERT(&freework->fw_freeblks->fb_jwork,
-		    &jsegdep->jd_list);
+		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
 		break;
 	default:
 		panic("handle_written_jnewblk: Unknown type %d.",
@@ -3702,6 +3703,7 @@ cancel_jfreefrag(jfreefrag)
 	jfreefrag->fr_freefrag = NULL;
 	free_jfreefrag(jfreefrag);
 	freefrag->ff_state |= DEPCOMPLETE;
+	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
 }
 
 /*
@@ -3765,7 +3767,7 @@ handle_written_jblkdep(jblkdep)
 	jblkdep->jb_jsegdep = NULL;
 	freeblks = jblkdep->jb_freeblks;
 	LIST_REMOVE(jblkdep, jb_deps);
-	WORKLIST_INSERT(&freeblks->fb_jwork, &jsegdep->jd_list);
+	jwork_insert(&freeblks->fb_jwork, jsegdep);
 	/*
 	 * If the freeblks is all journaled, we can add it to the worklist.
 	 */
@@ -3968,6 +3970,7 @@ cancel_jfreeblk(freeblks, blkno)
 	}
 	if (jblkdep == NULL)
 		return;
+	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
 	free_jsegdep(jblkdep->jb_jsegdep);
 	LIST_REMOVE(jblkdep, jb_deps);
 	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
@@ -4208,6 +4211,7 @@ cancel_jnewblk(jnewblk, wkhd)
 {
 	struct jsegdep *jsegdep;
 
+	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
 	jsegdep = jnewblk->jn_jsegdep;
 	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
 		panic("cancel_jnewblk: Invalid state");
@@ -4899,6 +4903,10 @@ softdep_setup_blkmapdep(bp, mp, newblkno
 		}
 #endif
 	}
+
+	CTR3(KTR_SUJ,
+	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
+	    newblkno, frags, oldfrags);
 	ACQUIRE_LOCK(&lk);
 	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
 		panic("softdep_setup_blkmapdep: found block");
@@ -5060,6 +5068,10 @@ softdep_setup_allocdirect(ip, off, newbl
 	else
 		freefrag = NULL;
 
+	CTR6(KTR_SUJ,
+	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
+	    "off %jd newsize %ld oldsize %d",
+	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
 	ACQUIRE_LOCK(&lk);
 	if (off >= NDADDR) {
 		if (lbn > 0)
@@ -5338,6 +5350,8 @@ newfreefrag(ip, blkno, size, lbn)
 	struct freefrag *freefrag;
 	struct fs *fs;
 
+	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
+	    ip->i_number, blkno, size, lbn);
 	fs = ip->i_fs;
 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 		panic("newfreefrag: frag size");
@@ -5373,6 +5387,9 @@ handle_workitem_freefrag(freefrag)
 	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
 	struct workhead wkhd;
 
+	CTR3(KTR_SUJ,
+	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
+	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
 	/*
 	 * It would be illegal to add new completion items to the
 	 * freefrag after it was schedule to be done so it must be
@@ -5591,6 +5608,9 @@ softdep_setup_allocindir_page(ip, lbn, b
 	if (lbn != nbp->b_lblkno)
 		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
 		    lbn, bp->b_lblkno);
+	CTR4(KTR_SUJ,
+	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
+	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 	mp = UFSTOVFS(ip->i_ump);
 	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
@@ -5629,6 +5649,9 @@ softdep_setup_allocindir_meta(nbp, ip, b
 	ufs_lbn_t lbn;
 	int dflags;
 
+	CTR3(KTR_SUJ,
+	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
+	    ip->i_number, newblkno, ptrno);
 	lbn = nbp->b_lblkno;
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
@@ -6233,6 +6256,7 @@ softdep_journal_freeblocks(ip, cred, len
 	int flags;		/* IO_EXT and/or IO_NORMAL */
 {
 	struct freeblks *freeblks, *fbn;
+	struct worklist *wk, *wkn;
 	struct inodedep *inodedep;
 	struct jblkdep *jblkdep;
 	struct allocdirect *adp, *adpn;
@@ -6267,6 +6291,8 @@ softdep_journal_freeblocks(ip, cred, len
 	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
 	    length == 0)
 		needj = 0;
+	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
+	    ip->i_number, length, needj);
 	FREE_LOCK(&lk);
 	/*
 	 * Calculate the lbn that we are truncating to.  This results in -1
@@ -6420,6 +6446,21 @@ softdep_journal_freeblocks(ip, cred, len
 			cancel_allocdirect(&inodedep->id_extupdt, adp,
 			    freeblks);
 	/*
+	 * Scan the bufwait list for newblock dependencies that will never
+	 * make it to disk.
+	 */
+	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
+		if (wk->wk_type != D_ALLOCDIRECT)
+			continue;
+		adp = WK_ALLOCDIRECT(wk);
+		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
+		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
+			cancel_jfreeblk(freeblks, adp->ad_newblkno);
+			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
+			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
+		}
+	}
+	/*
 	 * Add journal work.
 	 */
 	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
@@ -6558,6 +6599,8 @@ softdep_setup_freeblocks(ip, length, fla
 	ufs_lbn_t tmpval;
 	ufs_lbn_t lbn;
 
+	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
+	    ip->i_number, length);
 	fs = ip->i_fs;
 	mp = UFSTOVFS(ip->i_ump);
 	if (length != 0)
@@ -7083,6 +7126,8 @@ cancel_newblk(newblk, wk, wkhd)
 {
 	struct jnewblk *jnewblk;
 
+	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
+	    
 	newblk->nb_state |= GOINGAWAY;
 	/*
 	 * Previously we traversed the completedhd on each indirdep
@@ -7451,6 +7496,9 @@ freework_freeblock(freework)
 	}
 	FREE_LOCK(&lk);
 	freeblks_free(ump, freeblks, btodb(bsize));
+	CTR4(KTR_SUJ,
+	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
+	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
 	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
 	ACQUIRE_LOCK(&lk);
@@ -7884,6 +7932,9 @@ indir_trunc(freework, dbn, lbn)
 				    &freedep->fd_list);
 				freedeps++;
 			}
+			CTR3(KTR_SUJ,
+			    "indir_trunc: ino %d blkno %jd size %ld",
+			    freeblks->fb_inum, nb, fs->fs_bsize);
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
 			    fs->fs_bsize, freeblks->fb_inum,
 			    freeblks->fb_vtype, &wkhd);
@@ -7919,6 +7970,9 @@ indir_trunc(freework, dbn, lbn)
 	 * If we're not journaling we can free the indirect now.
 	 */
 	dbn = dbtofsb(fs, dbn);
+	CTR3(KTR_SUJ,
+	    "indir_trunc 2: ino %d blkno %jd size %ld",
+	    freeblks->fb_inum, dbn, fs->fs_bsize);
 	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
 	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
 	/* Non SUJ softdep does single-threaded truncations. */
@@ -10356,6 +10410,10 @@ softdep_setup_blkfree(mp, bp, blkno, fra
 	int i;
 #endif
 
+	CTR3(KTR_SUJ,
+	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
+	    blkno, frags, wkhd);
+
 	ACQUIRE_LOCK(&lk);
 	/* Lookup the bmsafemap so we track when it is dirty. */
 	fs = VFSTOUFS(mp)->um_fs;
@@ -10367,6 +10425,9 @@ softdep_setup_blkfree(mp, bp, blkno, fra
 	 */
 	if (wkhd) {
 		while ((wk = LIST_FIRST(wkhd)) != NULL) {
+			CTR2(KTR_SUJ,
+			    "softdep_setup_blkfree: blkno %jd wk type %d",
+			    blkno, wk->wk_type);
 			WORKLIST_REMOVE(wk);
 			if (wk->wk_type != D_JNEWBLK) {
 				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201211140637.qAE6bhpp016910>