From owner-svn-src-head@FreeBSD.ORG Thu Nov 8 01:41:05 2012 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52]) by hub.freebsd.org (Postfix) with ESMTP id 269F66C8; Thu, 8 Nov 2012 01:41:05 +0000 (UTC) (envelope-from jeff@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) by mx1.freebsd.org (Postfix) with ESMTP id 0778A8FC0A; Thu, 8 Nov 2012 01:41:05 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.5/8.14.5) with ESMTP id qA81f45x050502; Thu, 8 Nov 2012 01:41:04 GMT (envelope-from jeff@svn.freebsd.org) Received: (from jeff@localhost) by svn.freebsd.org (8.14.5/8.14.5/Submit) id qA81f4ZU050500; Thu, 8 Nov 2012 01:41:04 GMT (envelope-from jeff@svn.freebsd.org) Message-Id: <201211080141.qA81f4ZU050500@svn.freebsd.org> From: Jeff Roberson Date: Thu, 8 Nov 2012 01:41:04 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r242734 - head/sys/ufs/ffs X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 08 Nov 2012 01:41:05 -0000 Author: jeff Date: Thu Nov 8 01:41:04 2012 New Revision: 242734 URL: http://svnweb.freebsd.org/changeset/base/242734 Log: - Implement BIO_FLUSH support around journal entries. This will not 100% solve power loss problems with dishonest write caches. However, it should improve the situation and force a full fsck when it is unable to resolve with the journal. - Resolve a case where the journal could wrap in an unsafe way causing us to prematurely lose journal entries in very specific scenarios. 
Discussed with: mckusick MFC after: 1 month Modified: head/sys/ufs/ffs/ffs_softdep.c Modified: head/sys/ufs/ffs/ffs_softdep.c ============================================================================== --- head/sys/ufs/ffs/ffs_softdep.c Thu Nov 8 01:38:30 2012 (r242733) +++ head/sys/ufs/ffs/ffs_softdep.c Thu Nov 8 01:41:04 2012 (r242734) @@ -88,6 +88,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #ifndef SOFTUPDATES @@ -802,6 +804,7 @@ static void handle_written_jnewblk(struc static void handle_written_jblkdep(struct jblkdep *); static void handle_written_jfreefrag(struct jfreefrag *); static void complete_jseg(struct jseg *); +static void complete_jsegs(struct jseg *); static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); static void jremref_write(struct jremref *, struct jseg *, uint8_t *); @@ -1227,6 +1230,7 @@ static struct callout softdep_callout; static int req_pending; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ static int req_clear_remove; /* syncer process flush some freeblks */ +static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ /* * runtime statistics @@ -1310,6 +1314,8 @@ SYSCTL_INT(_debug_softdep, OID_AUTO, cle &stat_cleanup_retries, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, &stat_cleanup_failures, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW, + &softdep_flushcache, 0, ""); SYSCTL_DECL(_vfs_ffs); @@ -3078,6 +3084,67 @@ softdep_flushjournal(mp) FREE_LOCK(&lk); } +static void softdep_synchronize_completed(struct bio *); +static void softdep_synchronize(struct bio *, struct ufsmount *, void *); + +static void +softdep_synchronize_completed(bp) + struct bio *bp; +{ + struct jseg *oldest; + struct jseg *jseg; + + /* + * caller1 marks the last segment written before we issued the + * synchronize cache. 
+ */ + jseg = bp->bio_caller1; + oldest = NULL; + ACQUIRE_LOCK(&lk); + /* + * Mark all the journal entries waiting on the synchronize cache + * as completed so they may continue on. + */ + while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) { + jseg->js_state |= COMPLETE; + oldest = jseg; + jseg = TAILQ_PREV(jseg, jseglst, js_next); + } + /* + * Restart deferred journal entry processing from the oldest + * completed jseg. + */ + if (oldest) + complete_jsegs(oldest); + + FREE_LOCK(&lk); + g_destroy_bio(bp); +} + +/* + * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering + * barriers. The journal must be written prior to any blocks that depend + * on it and the journal can not be released until the blocks have been + * written. This code handles both barriers simultaneously. + */ +static void +softdep_synchronize(bp, ump, caller1) + struct bio *bp; + struct ufsmount *ump; + void *caller1; +{ + + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = ump->um_cp->provider->mediasize; + bp->bio_length = 0; + bp->bio_done = softdep_synchronize_completed; + bp->bio_caller1 = caller1; + g_io_request(bp, + (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private); +} + /* * Flush some journal records to disk. */ @@ -3092,8 +3159,10 @@ softdep_process_journal(mp, needwk, flag struct worklist *wk; struct jseg *jseg; struct buf *bp; + struct bio *bio; uint8_t *data; struct fs *fs; + int shouldflush; int segwritten; int jrecmin; /* Minimum records per block. */ int jrecmax; /* Maximum records per block. 
*/ @@ -3104,6 +3173,9 @@ softdep_process_journal(mp, needwk, flag if (MOUNTEDSUJ(mp) == 0) return; + shouldflush = softdep_flushcache; + bio = NULL; + jseg = NULL; ump = VFSTOUFS(mp); fs = ump->um_fs; jblocks = ump->softdep_jblocks; @@ -3152,6 +3224,10 @@ softdep_process_journal(mp, needwk, flag LIST_INIT(&jseg->js_entries); LIST_INIT(&jseg->js_indirs); jseg->js_state = ATTACHED; + if (shouldflush == 0) + jseg->js_state |= COMPLETE; + else if (bio == NULL) + bio = g_alloc_bio(); jseg->js_jblocks = jblocks; bp = geteblk(fs->fs_bsize, 0); ACQUIRE_LOCK(&lk); @@ -3284,6 +3360,17 @@ softdep_process_journal(mp, needwk, flag ACQUIRE_LOCK(&lk); } /* + * If we wrote a segment issue a synchronize cache so the journal + * is reflected on disk before the data is written. Since reclaiming + * journal space also requires writing a journal record this + * process also enforces a barrier before reclamation. + */ + if (segwritten && shouldflush) { + softdep_synchronize(bio, ump, + TAILQ_LAST(&jblocks->jb_segs, jseglst)); + } else if (bio) + g_destroy_bio(bio); + /* * If we've suspended the filesystem because we ran out of journal * space either try to sync it here to make some progress or * unsuspend it if we already have. @@ -3366,25 +3453,17 @@ complete_jseg(jseg) } /* - * Mark a jseg as DEPCOMPLETE and throw away the buffer. Handle jseg - * completions in order only. + * Determine which jsegs are ready for completion processing. Waits for + * synchronize cache to complete as well as forcing in-order completion + * of journal entries. */ static void -handle_written_jseg(jseg, bp) +complete_jsegs(jseg) struct jseg *jseg; - struct buf *bp; { struct jblocks *jblocks; struct jseg *jsegn; - if (jseg->js_refs == 0) - panic("handle_written_jseg: No self-reference on %p", jseg); - jseg->js_state |= DEPCOMPLETE; - /* - * We'll never need this buffer again, set flags so it will be - * discarded. 
- */ - bp->b_flags |= B_INVAL | B_NOCACHE; jblocks = jseg->js_jblocks; /* * Don't allow out of order completions. If this isn't the first @@ -3393,12 +3472,12 @@ handle_written_jseg(jseg, bp) if (jseg != jblocks->jb_writeseg) return; /* Iterate through available jsegs processing their entries. */ - do { + while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) { jblocks->jb_oldestwrseq = jseg->js_oldseq; jsegn = TAILQ_NEXT(jseg, js_next); complete_jseg(jseg); jseg = jsegn; - } while (jseg && jseg->js_state & DEPCOMPLETE); + } jblocks->jb_writeseg = jseg; /* * Attempt to free jsegs now that oldestwrseq may have advanced. @@ -3406,6 +3485,27 @@ handle_written_jseg(jseg, bp) free_jsegs(jblocks); } +/* + * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle + * the final completions. + */ +static void +handle_written_jseg(jseg, bp) + struct jseg *jseg; + struct buf *bp; +{ + + if (jseg->js_refs == 0) + panic("handle_written_jseg: No self-reference on %p", jseg); + jseg->js_state |= DEPCOMPLETE; + /* + * We'll never need this buffer again, set flags so it will be + * discarded. + */ + bp->b_flags |= B_INVAL | B_NOCACHE; + complete_jsegs(jseg); +} + static inline struct jsegdep * inoref_jseg(inoref) struct inoref *inoref; @@ -4191,8 +4291,13 @@ free_jsegs(jblocks) jblocks->jb_oldestseg = jseg; return; } - if (!LIST_EMPTY(&jseg->js_indirs) && - jseg->js_seq >= jblocks->jb_oldestwrseq) + if (jseg->js_seq > jblocks->jb_oldestwrseq) + break; + /* + * We can free jsegs that didn't write entries when + * oldestwrseq == js_seq. + */ + if (jseg->js_cnt != 0) break; free_jseg(jseg, jblocks); }