Date:      Sat, 9 Apr 2011 17:59:16 -1000 (HST)
From:      Jeff Roberson <jroberson@jroberson.net>
To:        Jeff Roberson <jeff@FreeBSD.org>
Cc:        svn-src-head@freebsd.org, svn-src-all@freebsd.org, src-committers@freebsd.org
Subject:   Re: svn commit: r220511 - head/sys/ufs/ffs
Message-ID:  <alpine.BSF.2.00.1104091756440.1480@desktop>
In-Reply-To: <201104100349.p3A3nsAd034153@svn.freebsd.org>
References:  <201104100349.p3A3nsAd034153@svn.freebsd.org>

On Sun, 10 Apr 2011, Jeff Roberson wrote:

> Author: jeff
> Date: Sun Apr 10 03:49:53 2011
> New Revision: 220511
> URL: http://svn.freebsd.org/changeset/base/220511
>
> Log:
>  Fix a long standing SUJ performance problem:

This brought my dbench performance to within 10-15% of softupdates on
a real disk, depending on concurrency.  There are cases where it
outperforms softupdates as well.  Over time I can eliminate the extra
blocking I/O waits that cause the remaining degradation on this test.
For now I'm going to focus on the mksnap bug that has been reported
in several forms.

Thanks,
Jeff

>
>   - Keep a hash of indirect blocks that have recently been freed and are
>     still referenced in the journal.
>   - Lookup blocks in this hash before forcing a new block write to wait on
>     the journal entry to hit the disk.  This is only necessary to avoid
>     confusion between old identities as indirects and new identities as
>     file blocks.
>   - Don't free jseg structures until the journal has written a record that
>     invalidates it.  This keeps the indirect block information around for
>     as long as is required to be safe.
>   - Force an empty journal block write when required to flush out stale
>     journal data that is simply waiting for the oldest valid sequence
>     number to advance beyond it.
>
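
The first two items in the log describe the new cache.  Below is a
minimal userland sketch of the idea -- hypothetical names and types,
not the kernel code, which chains struct freework off a hash of
workheads: recently freed indirect blocks are remembered, and a
writer re-allocating a block only has to wait on the journal when its
block number is found here.

#include <stdint.h>
#include <stdlib.h>

#define	NBUCKET	128			/* power of two, as with hashinit() */

struct freed_indir {
	struct freed_indir *next;
	uint64_t blkno;			/* freed indirect's block number */
};

static struct freed_indir *tbl[NBUCKET];

/* Remember a freed indirect until its journal record is invalidated. */
static void
indir_insert(uint64_t blkno)
{
	struct freed_indir *fi;

	fi = malloc(sizeof(*fi));
	fi->blkno = blkno;
	fi->next = tbl[blkno & (NBUCKET - 1)];
	tbl[blkno & (NBUCKET - 1)] = fi;
}

/*
 * Called before writing a newly allocated block: a hit means the
 * block recently had an identity as an indirect and the write must
 * wait for the journal; a miss means the write may proceed at once.
 */
static int
indir_lookup_remove(uint64_t blkno)
{
	struct freed_indir **fip, *fi;

	for (fip = &tbl[blkno & (NBUCKET - 1)]; (fi = *fip) != NULL;
	    fip = &fi->next)
		if (fi->blkno == blkno) {
			*fip = fi->next;
			free(fi);
			return (1);	/* wait on the journal */
		}
	return (0);			/* no old identity; no wait */
}

int
main(void)
{
	indir_insert(1234);
	return (indir_lookup_remove(1234) ? 0 : 1);
}
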
> Modified:
>  head/sys/ufs/ffs/ffs_softdep.c
>  head/sys/ufs/ffs/softdep.h
>
> Modified: head/sys/ufs/ffs/ffs_softdep.c
> ==============================================================================
> --- head/sys/ufs/ffs/ffs_softdep.c	Sun Apr 10 01:54:42 2011	(r220510)
> +++ head/sys/ufs/ffs/ffs_softdep.c	Sun Apr 10 03:49:53 2011	(r220511)
> @@ -753,8 +753,7 @@ static	void handle_written_jnewblk(struc
> static	void handle_written_jfreeblk(struct jfreeblk *);
> static	void handle_written_jfreefrag(struct jfreefrag *);
> static	void complete_jseg(struct jseg *);
> -static	void jseg_write(struct ufsmount *ump, struct jblocks *, struct jseg *,
> -	    uint8_t *);
> +static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
> static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
> static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
> static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
> @@ -769,6 +768,7 @@ static	void handle_allocdirect_partdone(
> static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
> 	    struct workhead *);
> static	void indirdep_complete(struct indirdep *);
> +static	int indirblk_inseg(struct mount *, ufs2_daddr_t);
> static	void handle_allocindir_partdone(struct allocindir *);
> static	void initiate_write_filepage(struct pagedep *, struct buf *);
> static	void initiate_write_indirdep(struct indirdep*, struct buf *);
> @@ -802,7 +802,9 @@ static	void free_newdirblk(struct newdir
> static	void free_jremref(struct jremref *);
> static	void free_jaddref(struct jaddref *);
> static	void free_jsegdep(struct jsegdep *);
> -static	void free_jseg(struct jseg *);
> +static	void free_jsegs(struct jblocks *);
> +static	void rele_jseg(struct jseg *);
> +static	void free_jseg(struct jseg *, struct jblocks *);
> static	void free_jnewblk(struct jnewblk *);
> static	void free_jfreeblk(struct jfreeblk *);
> static	void free_jfreefrag(struct jfreefrag *);
> @@ -872,7 +874,7 @@ static	int journal_unsuspend(struct ufsm
> static	void softdep_prelink(struct vnode *, struct vnode *);
> static	void add_to_journal(struct worklist *);
> static	void remove_from_journal(struct worklist *);
> -static	void softdep_process_journal(struct mount *, int);
> +static	void softdep_process_journal(struct mount *, struct worklist *, int);
> static	struct jremref *newjremref(struct dirrem *, struct inode *,
> 	    struct inode *ip, off_t, nlink_t);
> static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
> @@ -1376,7 +1378,7 @@ softdep_process_worklist(mp, full)
> 	ump = VFSTOUFS(mp);
> 	ACQUIRE_LOCK(&lk);
> 	starttime = time_second;
> -	softdep_process_journal(mp, full?MNT_WAIT:0);
> +	softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
> 	while (ump->softdep_on_worklist > 0) {
> 		if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
> 			break;
> @@ -1999,6 +2001,37 @@ newblk_lookup(mp, newblkno, flags, newbl
> }
>
> /*
> + * Structures and routines associated with indir caching.
> + */
> +struct workhead *indir_hashtbl;
> +u_long	indir_hash;		/* size of hash table - 1 */
> +#define	INDIR_HASH(mp, blkno) \
> +	(&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
> +
> +static int
> +indirblk_inseg(mp, blkno)
> +	struct mount *mp;
> +	ufs2_daddr_t blkno;
> +{
> +	struct freework *freework;
> +	struct workhead *wkhd;
> +	struct worklist *wk;
> +
> +	wkhd = INDIR_HASH(mp, blkno);
> +	LIST_FOREACH(wk, wkhd, wk_list) {
> +		freework = WK_FREEWORK(wk);
> +		if (freework->fw_blkno == blkno &&
> +		    freework->fw_list.wk_mp == mp) {
> +			LIST_REMOVE(freework, fw_next);
> +			WORKLIST_REMOVE(&freework->fw_list);
> +			WORKITEM_FREE(freework, D_FREEWORK);
> +			return (1);
> +		}
> +	}
> +	return (0);
> +}
> +
> +/*
>  * Executed during filesystem system initialization before
>  * mounting any filesystems.
>  */
> @@ -2012,6 +2045,7 @@ softdep_initialize()
> 	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
> 	newblk_hashtbl = hashinit(desiredvnodes / 5,  M_NEWBLK, &newblk_hash);
> 	bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
> +	indir_hashtbl = hashinit(desiredvnodes / 10, M_FREEWORK, &indir_hash);
>
> 	/* initialise bioops hack */
> 	bioops.io_start = softdep_disk_io_initiation;
> @@ -2120,9 +2154,12 @@ softdep_unmount(mp)
> struct jblocks {
> 	struct jseglst	jb_segs;	/* TAILQ of current segments. */
> 	struct jseg	*jb_writeseg;	/* Next write to complete. */
> +	struct jseg	*jb_oldestseg;	/* Oldest segment with valid entries. */
> 	struct jextent	*jb_extent;	/* Extent array. */
> 	uint64_t	jb_nextseq;	/* Next sequence number. */
> -	uint64_t	jb_oldestseq;	/* Oldest active sequence number. */
> +	uint64_t	jb_oldestwrseq;	/* Oldest written sequence number. */
> +	uint8_t		jb_needseg;	/* Need a forced segment. */
> +	uint8_t		jb_suspended;	/* Did journal suspend writes? */
> 	int		jb_avail;	/* Available extents. */
> 	int		jb_used;	/* Last used extent. */
> 	int		jb_head;	/* Allocator head. */
> @@ -2132,7 +2169,6 @@ struct jblocks {
> 	int		jb_min;		/* Minimum free space. */
> 	int		jb_low;		/* Low on space. */
> 	int		jb_age;		/* Insertion time of oldest rec. */
> -	int		jb_suspended;	/* Did journal suspend writes? */
> };
>
> struct jextent {
> @@ -2575,9 +2611,8 @@ softdep_prelink(dvp, vp)
> }
>
> static void
> -jseg_write(ump, jblocks, jseg, data)
> +jseg_write(ump, jseg, data)
> 	struct ufsmount *ump;
> -	struct jblocks *jblocks;
> 	struct jseg *jseg;
> 	uint8_t *data;
> {
> @@ -2585,7 +2620,7 @@ jseg_write(ump, jblocks, jseg, data)
>
> 	rec = (struct jsegrec *)data;
> 	rec->jsr_seq = jseg->js_seq;
> -	rec->jsr_oldest = jblocks->jb_oldestseq;
> +	rec->jsr_oldest = jseg->js_oldseq;
> 	rec->jsr_cnt = jseg->js_cnt;
> 	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
> 	rec->jsr_crc = 0;
> @@ -2722,8 +2757,9 @@ jtrunc_write(jtrunc, jseg, data)
>  * Flush some journal records to disk.
>  */
> static void
> -softdep_process_journal(mp, flags)
> +softdep_process_journal(mp, needwk, flags)
> 	struct mount *mp;
> +	struct worklist *needwk;
> 	int flags;
> {
> 	struct jblocks *jblocks;
> @@ -2755,17 +2791,23 @@ softdep_process_journal(mp, flags)
> 	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
> 	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
> 	segwritten = 0;
> -	while ((cnt = ump->softdep_on_journal) != 0) {
> +	for (;;) {
> +		cnt = ump->softdep_on_journal;
> 		/*
> -		 * Create a new segment to hold as many as 'cnt' journal
> -		 * entries and add them to the segment.  Notice cnt is
> -		 * off by one to account for the space required by the
> -		 * jsegrec.  If we don't have a full block to log skip it
> -		 * unless we haven't written anything.
> +		 * Criteria for writing a segment:
> +		 * 1) We have a full block.
> +		 * 2) We're called from jwait() and haven't found the
> +		 *    journal item yet.
> +		 * 3) Always write if needseg is set.
> +		 * 4) If we are called from process_worklist and have
> +		 *    not yet written anything we write a partial block
> +		 *    to enforce a 1 second maximum latency on journal
> +		 *    entries.
> 		 */
> -		cnt++;
> -		if (cnt < jrecmax && segwritten)
> +		if (cnt < (jrecmax - 1) && needwk == NULL &&
> +		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
> 			break;
> +		cnt++;
> 		/*
> 		 * Verify some free journal space.  softdep_prealloc() should
> 	 	 * guarantee that we don't run out so this is indicative of
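
Restated as a single predicate -- a paraphrase of the numbered
comment above with hypothetical names, not code from the tree -- the
loop now writes a segment when any of the four criteria holds:

#include <stddef.h>

/*
 * Hypothetical helper mirroring the break test above; the loop
 * continues (writes a segment) exactly when this returns 1.
 */
static int
should_write_segment(int cnt, int jrecmax, const void *needwk,
    int needseg, int segwritten)
{
	if (cnt >= jrecmax - 1)		/* 1) a full block of records */
		return (1);
	if (needwk != NULL)		/* 2) jwait() item not yet found */
		return (1);
	if (needseg)			/* 3) a forced segment was requested */
		return (1);
	if (!segwritten && cnt != 0)	/* 4) bound journal entry latency */
		return (1);
	return (0);
}

int
main(void)
{
	/* A half-full block with nothing written yet still goes out. */
	return (should_write_segment(3, 16, NULL, 0, 0) ? 0 : 1);
}
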
> @@ -2783,6 +2825,7 @@ softdep_process_journal(mp, flags)
> 		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
> 		workitem_alloc(&jseg->js_list, D_JSEG, mp);
> 		LIST_INIT(&jseg->js_entries);
> +		LIST_INIT(&jseg->js_indirs);
> 		jseg->js_state = ATTACHED;
> 		jseg->js_jblocks = jblocks;
> 		bp = geteblk(fs->fs_bsize, 0);
> @@ -2794,7 +2837,8 @@ softdep_process_journal(mp, flags)
> 		 * the caller will loop if the entry it cares about is
> 		 * not written.
> 		 */
> -		if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
> +		cnt = ump->softdep_on_journal;
> +		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
> 			bp->b_flags |= B_INVAL | B_NOCACHE;
> 			WORKITEM_FREE(jseg, D_JSEG);
> 			FREE_LOCK(&lk);
> @@ -2806,8 +2850,9 @@ softdep_process_journal(mp, flags)
> 		 * Calculate the disk block size required for the available
> 		 * records rounded to the min size.
> 		 */
> -		cnt = ump->softdep_on_journal;
> -		if (cnt < jrecmax)
> +		if (cnt == 0)
> +			size = devbsize;
> +		else if (cnt < jrecmax)
> 			size = howmany(cnt, jrecmin) * devbsize;
> 		else
> 			size = fs->fs_bsize;
> @@ -2827,15 +2872,15 @@ softdep_process_journal(mp, flags)
> 		 * Initialize our jseg with cnt records.  Assign the next
> 		 * sequence number to it and link it in-order.
> 		 */
> -		cnt = MIN(ump->softdep_on_journal,
> -		    (size / devbsize) * jrecmin);
> +		cnt = MIN(cnt, (size / devbsize) * jrecmin);
> 		jseg->js_buf = bp;
> 		jseg->js_cnt = cnt;
> 		jseg->js_refs = cnt + 1;	/* Self ref. */
> 		jseg->js_size = size;
> 		jseg->js_seq = jblocks->jb_nextseq++;
> -		if (TAILQ_EMPTY(&jblocks->jb_segs))
> -			jblocks->jb_oldestseq = jseg->js_seq;
> +		if (jblocks->jb_oldestseg == NULL)
> +			jblocks->jb_oldestseg = jseg;
> +		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
> 		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
> 		if (jblocks->jb_writeseg == NULL)
> 			jblocks->jb_writeseg = jseg;
> @@ -2846,12 +2891,16 @@ softdep_process_journal(mp, flags)
> 		off = 0;
> 		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
> 		    != NULL) {
> +			if (cnt == 0)
> +				break;
> 			/* Place a segment header on every device block. */
> 			if ((off % devbsize) == 0) {
> -				jseg_write(ump, jblocks, jseg, data);
> +				jseg_write(ump, jseg, data);
> 				off += JREC_SIZE;
> 				data = bp->b_data + off;
> 			}
> +			if (wk == needwk)
> +				needwk = NULL;
> 			remove_from_journal(wk);
> 			wk->wk_state |= IOSTARTED;
> 			WORKLIST_INSERT(&jseg->js_entries, wk);
> @@ -2882,23 +2931,28 @@ softdep_process_journal(mp, flags)
> 				    TYPENAME(wk->wk_type));
> 				/* NOTREACHED */
> 			}
> -			if (--cnt == 0)
> -				break;
> 			off += JREC_SIZE;
> 			data = bp->b_data + off;
> +			cnt--;
> 		}
> 		/*
> 		 * Write this one buffer and continue.
> 		 */
> +		segwritten = 1;
> +		jblocks->jb_needseg = 0;
> 		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
> 		FREE_LOCK(&lk);
> 		BO_LOCK(bp->b_bufobj);
> 		bgetvp(ump->um_devvp, bp);
> 		BO_UNLOCK(bp->b_bufobj);
> -		if (flags == MNT_NOWAIT)
> -			bawrite(bp);
> -		else
> +		/*
> +		 * We only do the blocking wait once we find the journal
> +		 * entry we're looking for.
> +		 */
> +		if (needwk == NULL && flags & MNT_WAIT)
> 			bwrite(bp);
> +		else
> +			bawrite(bp);
> 		ACQUIRE_LOCK(&lk);
> 	}
> 	/*
> @@ -2949,7 +3003,7 @@ complete_jseg(jseg)
> 			break;
> 		case D_JMVREF:
> 			/* No jsegdep here. */
> -			free_jseg(jseg);
> +			rele_jseg(jseg);
> 			jmvref = WK_JMVREF(wk);
> 			LIST_REMOVE(jmvref, jm_deps);
> 			free_pagedep(jmvref->jm_pagedep);
> @@ -2977,7 +3031,7 @@ complete_jseg(jseg)
> 			wakeup(wk);
> 	}
> 	/* Release the self reference so the structure may be freed. */
> -	free_jseg(jseg);
> +	rele_jseg(jseg);
> }
>
> /*
> @@ -3009,11 +3063,16 @@ handle_written_jseg(jseg, bp)
> 		return;
> 	/* Iterate through available jsegs processing their entries. */
> 	do {
> +		jblocks->jb_oldestwrseq = jseg->js_oldseq;
> 		jsegn = TAILQ_NEXT(jseg, js_next);
> 		complete_jseg(jseg);
> 		jseg = jsegn;
> 	} while (jseg && jseg->js_state & DEPCOMPLETE);
> 	jblocks->jb_writeseg = jseg;
> +	/*
> +	 * Attempt to free jsegs now that oldestwrseq may have advanced.
> +	 */
> +	free_jsegs(jblocks);
> }
>
> static inline struct jsegdep *
> @@ -3682,6 +3741,8 @@ cancel_jnewblk(jnewblk, wkhd)
> 	struct jsegdep *jsegdep;
>
> 	jsegdep = jnewblk->jn_jsegdep;
> +	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
> +		panic("cancel_jnewblk: Invalid state");
> 	jnewblk->jn_jsegdep  = NULL;
> 	jnewblk->jn_dep = NULL;
> 	jnewblk->jn_state |= GOINGAWAY;
> @@ -3709,34 +3770,97 @@ free_jfreeblk(jfreeblk)
> }
>
> /*
> - * Release one reference to a jseg and free it if the count reaches 0.  This
> - * should eventually reclaim journal space as well.
> + * Free a single jseg once it is no longer referenced in memory or on
> + * disk.  Reclaim journal blocks and dependencies waiting for the segment
> + * to disappear.
>  */
> static void
> -free_jseg(jseg)
> +free_jseg(jseg, jblocks)
> 	struct jseg *jseg;
> +	struct jblocks *jblocks;
> {
> +	struct freework *freework;
> +
> +	/*
> +	 * Free freework structures that were lingering to indicate freed
> +	 * indirect blocks that forced journal write ordering on reallocate.
> +	 */
> +	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) {
> +		LIST_REMOVE(freework, fw_next);
> +		WORKLIST_REMOVE(&freework->fw_list);
> +		WORKITEM_FREE(freework, D_FREEWORK);
> +	}
> +	if (jblocks->jb_oldestseg == jseg)
> +		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
> +	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
> +	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
> +	KASSERT(LIST_EMPTY(&jseg->js_entries),
> +	    ("free_jseg: Freed jseg has valid entries."));
> +	WORKITEM_FREE(jseg, D_JSEG);
> +}
> +
> +/*
> + * Free all jsegs that meet the criteria for being reclaimed and update
> + * oldestseg.
> + */
> +static void
> +free_jsegs(jblocks)
> 	struct jblocks *jblocks;
> +{
> +	struct jseg *jseg;
>
> -	KASSERT(jseg->js_refs > 0,
> -	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
> -	if (--jseg->js_refs != 0)
> -		return;
> 	/*
> 	 * Free only those jsegs which have none allocated before them to
> 	 * preserve the journal space ordering.
> 	 */
> -	jblocks = jseg->js_jblocks;
> 	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
> -		jblocks->jb_oldestseq = jseg->js_seq;
> -		if (jseg->js_refs != 0)
> +		/*
> +		 * Only reclaim space when nothing depends on this journal
> +		 * set and another set has written that it is no longer
> +		 * valid.
> +		 */
> +		if (jseg->js_refs != 0) {
> +			jblocks->jb_oldestseg = jseg;
> +			return;
> +		}
> +		if (!LIST_EMPTY(&jseg->js_indirs) &&
> +		    jseg->js_seq >= jblocks->jb_oldestwrseq)
> 			break;
> -		TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
> -		jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
> -		KASSERT(LIST_EMPTY(&jseg->js_entries),
> -		    ("free_jseg: Freed jseg has valid entries."));
> -		WORKITEM_FREE(jseg, D_JSEG);
> +		free_jseg(jseg, jblocks);
> 	}
> +	/*
> +	 * If we exited the loop above we still must discover the
> +	 * oldest valid segment.
> +	 */
> +	if (jseg)
> +		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
> +		     jseg = TAILQ_NEXT(jseg, js_next))
> +			if (jseg->js_refs != 0)
> +				break;
> +	jblocks->jb_oldestseg = jseg;
> +	/*
> +	 * The journal has no valid records but some jsegs may still be
> +	 * waiting on oldestwrseq to advance.  We force a small record
> +	 * out to permit these lingering records to be reclaimed.
> +	 */
> +	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
> +		jblocks->jb_needseg = 1;
> +}
> +
> +/*
> + * Release one reference to a jseg and free it if the count reaches 0.  This
> + * should eventually reclaim journal space as well.
> + */
> +static void
> +rele_jseg(jseg)
> +	struct jseg *jseg;
> +{
> +
> +	KASSERT(jseg->js_refs > 0,
> +	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
> +	if (--jseg->js_refs != 0)
> +		return;
> +	free_jsegs(jseg->js_jblocks);
> }
>
> /*
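
The reclamation rule in free_jsegs() is the subtle part.  Here is a
simplified standalone model with made-up types; the real code
additionally rescans for the first referenced segment before deciding
whether to force a write:

#include <stdint.h>
#include <stdlib.h>

struct seg {
	struct seg *next;	/* js_next analogue, oldest first */
	uint64_t seq;		/* js_seq analogue */
	int refs;		/* in-memory references, as js_refs */
	int pins_indirs;	/* non-empty js_indirs analogue */
};

/*
 * Free segments strictly in order.  A referenced segment is a hard
 * barrier.  A segment that still pins freed-indirect records may only
 * go once a later, written segment has recorded an oldest-valid
 * sequence beyond it (oldestwrseq); if the remaining segments are
 * merely waiting for that, request an empty segment write so the
 * sequence can advance.
 */
static struct seg *
reclaim(struct seg *head, uint64_t oldestwrseq, int *force_empty)
{
	struct seg *s;

	while ((s = head) != NULL) {
		if (s->refs != 0)		/* barrier; nothing to force */
			return (head);
		if (s->pins_indirs && s->seq >= oldestwrseq)
			break;			/* awaiting invalidation */
		head = s->next;
		free(s);			/* and reclaim journal space */
	}
	if (head != NULL)
		*force_empty = 1;
	return (head);
}

int
main(void)
{
	int force = 0;

	(void)reclaim(NULL, 0, &force);
	return (force);
}
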
> @@ -3748,7 +3872,7 @@ free_jsegdep(jsegdep)
> {
>
> 	if (jsegdep->jd_seg)
> -		free_jseg(jsegdep->jd_seg);
> +		rele_jseg(jsegdep->jd_seg);
> 	WORKITEM_FREE(jsegdep, D_JSEGDEP);
> }
>
> @@ -3769,7 +3893,7 @@ jwait(wk)
> 	 * this point.  The caller may call back in and re-issue the request.
> 	 */
> 	if ((wk->wk_state & IOSTARTED) == 0) {
> -		softdep_process_journal(wk->wk_mp, MNT_WAIT);
> +		softdep_process_journal(wk->wk_mp, wk, MNT_WAIT);
> 		return;
> 	}
> 	wk->wk_state |= IOWAITING;
> @@ -6004,7 +6128,9 @@ freework_freeblock(freework)
> 	LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
> 	jnewblk = freework->fw_jnewblk;
> 	if (jnewblk != NULL) {
> -		cancel_jnewblk(jnewblk, &wkhd);
> +		/* Could've already been canceled in indir_trunc(). */
> +		if ((jnewblk->jn_state & GOINGAWAY) == 0)
> +			cancel_jnewblk(jnewblk, &wkhd);
> 		needj = 0;
> 	} else if (needj)
> 		WORKLIST_INSERT(&wkhd, &freework->fw_list);
> @@ -6068,16 +6194,40 @@ handle_written_freework(freework)
> {
> 	struct freeblks *freeblks;
> 	struct freework *parent;
> +	struct jsegdep *jsegdep;
> +	struct worklist *wk;
> +	int needj;
>
> +	needj = 0;
> 	freeblks = freework->fw_freeblks;
> 	parent = freework->fw_parent;
> +	/*
> +	 * SUJ needs to wait for the segment referencing freed indirect
> +	 * blocks to expire so that we know the checker will not confuse
> +	 * a re-allocated indirect block with its old contents.
> +	 */
> +	if (freework->fw_lbn <= -NDADDR &&
> +	    freework->fw_list.wk_mp->mnt_kern_flag & MNTK_SUJ) {
> +		LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list)
> +			if (wk->wk_type == D_JSEGDEP)
> +				break;
> +		if (wk) {
> +			jsegdep = WK_JSEGDEP(wk);
> +			LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs,
> +			    freework, fw_next);
> +			WORKLIST_INSERT(INDIR_HASH(freework->fw_list.wk_mp,
> +			    freework->fw_blkno), &freework->fw_list);
> +			needj = 1;
> +		}
> +	}
> 	if (parent) {
> 		if (--parent->fw_ref != 0)
> 			parent = NULL;
> 		freeblks = NULL;
> 	} else if (--freeblks->fb_ref != 0)
> 		freeblks = NULL;
> -	WORKITEM_FREE(freework, D_FREEWORK);
> +	if (needj == 0)
> +		WORKITEM_FREE(freework, D_FREEWORK);
> 	/*
> 	 * Don't delay these block frees or it takes an intolerable amount
> 	 * of time to process truncates and free their journal entries.
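
The hunk above is where a freed indirect's freework is parked instead
of destroyed.  A sketch of the two exit paths, with hypothetical
types -- the real record sits on the owning jseg's js_indirs list and
in the indir hash simultaneously:

#include <stdlib.h>

struct parked {
	struct parked *next;
};

struct segment {
	struct parked *indirs;	/* js_indirs analogue */
};

/*
 * A parked record leaves in one of two ways: a re-allocating writer
 * finds it through the hash (the indirblk_inseg() path shown
 * earlier), or the segment that journaled the free is itself
 * reclaimed and drains its list, as free_jseg() now does.  Only the
 * second path is modelled here.
 */
static void
segment_expire(struct segment *seg)
{
	struct parked *p;

	while ((p = seg->indirs) != NULL) {
		seg->indirs = p->next;
		/* The real code also unhooks it from the indir hash. */
		free(p);
	}
}

int
main(void)
{
	struct segment seg = { NULL };

	segment_expire(&seg);
	return (0);
}
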
> @@ -6251,6 +6401,10 @@ indir_trunc(freework, dbn, lbn)
> 		LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
> 		LIST_FOREACH_SAFE(jnewblk, &indirdep->ir_jnewblkhd,
> 		    jn_indirdeps, jnewblkn) {
> +			/*
> +			 * XXX This cancel may cause some lengthy delay
> +			 * before the record is reclaimed below.
> +			 */
> 			LIST_REMOVE(jnewblk, jn_indirdeps);
> 			cancel_jnewblk(jnewblk, &wkhd);
> 		}
> @@ -8165,13 +8319,15 @@ softdep_disk_io_initiation(bp)
> 		case D_ALLOCINDIR:
> 			/*
> 			 * We have to wait for the jnewblk to be journaled
> -			 * before we can write to a block otherwise the
> -			 * contents may be confused with an earlier file
> +			 * before we can write to a block if the contents
> +			 * may be confused with an earlier file's indirect
> 			 * at recovery time.  Handle the marker as described
> 			 * above.
> 			 */
> 			newblk = WK_NEWBLK(wk);
> -			if (newblk->nb_jnewblk != NULL) {
> +			if (newblk->nb_jnewblk != NULL &&
> +			    indirblk_inseg(newblk->nb_list.wk_mp,
> +			    newblk->nb_newblkno)) {
> 				LIST_REMOVE(&marker, wk_list);
> 				LIST_INSERT_BEFORE(wk, &marker, wk_list);
> 				stat_jwait_newblk++;
>
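The net effect of that last hunk, as a standalone sketch with made-up
helpers: previously any write whose block had an unwritten jnewblk
stalled; now it stalls only when the block also appears in the
freed-indirect hash, i.e. when recovery could actually mistake the
new contents for an old indirect.

#include <stdio.h>

/* Made-up stand-ins for the real dependency checks. */
static int journal_entry_pending(long blkno) { (void)blkno; return (1); }
static int was_recently_indirect(long blkno) { return (blkno == 42); }

static int
must_wait_before_write(long blkno)
{
	/* Before r220511, journal_entry_pending() alone forced the wait. */
	return (journal_entry_pending(blkno) &&
	    was_recently_indirect(blkno));
}

int
main(void)
{
	printf("blk 41: wait=%d\n", must_wait_before_write(41));	/* 0 */
	printf("blk 42: wait=%d\n", must_wait_before_write(42));	/* 1 */
	return (0);
}
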
> Modified: head/sys/ufs/ffs/softdep.h
> ==============================================================================
> --- head/sys/ufs/ffs/softdep.h	Sun Apr 10 01:54:42 2011	(r220510)
> +++ head/sys/ufs/ffs/softdep.h	Sun Apr 10 03:49:53 2011	(r220511)
> @@ -538,7 +538,7 @@ struct freeblks {
> struct freework {
> 	struct	worklist fw_list;		/* Delayed worklist. */
> #	define	fw_state fw_list.wk_state
> -	LIST_ENTRY(freework) fw_next;		/* Queue for freeblk list. */
> +	LIST_ENTRY(freework) fw_next;		/* For seg journal list. */
> 	struct	jnewblk  *fw_jnewblk;		/* Journal entry to cancel. */
> 	struct	freeblks *fw_freeblks;		/* Root of operation. */
> 	struct	freework *fw_parent;		/* Parent indirect. */
> @@ -888,10 +888,12 @@ struct jseg {
> 	struct	worklist js_list;	/* b_deps link for journal */
> #	define	js_state js_list.wk_state
> 	struct	workhead js_entries;	/* Entries awaiting write */
> +	LIST_HEAD(, freework) js_indirs;/* List of indirects in this seg. */
> 	TAILQ_ENTRY(jseg) js_next;	/* List of all unfinished segments. */
> 	struct	jblocks *js_jblocks;	/* Back pointer to block/seg list */
> 	struct	buf *js_buf;		/* Buffer while unwritten */
> 	uint64_t js_seq;		/* Journal record sequence number. */
> +	uint64_t js_oldseq;		/* Oldest valid sequence number. */
> 	int	js_size;		/* Size of journal record in bytes. */
> 	int	js_cnt;			/* Total items allocated. */
> 	int	js_refs;		/* Count of js_entries items. */
>