Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 17 Oct 2016 21:44:41 +0000 (UTC)
From:      Kirk McKusick <mckusick@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r307533 - in stable/11: share/man/man3 sys/sys sys/ufs/ffs
Message-ID:  <201610172144.u9HLif26041731@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mckusick
Date: Mon Oct 17 21:44:41 2016
New Revision: 307533
URL: https://svnweb.freebsd.org/changeset/base/307533

Log:
  MFC r304230:
  Add two new macros, SLIST_CONCAT and LIST_CONCAT.
  
  MFC r304239:
  Bug 211013 reports that a write error to a UFS filesystem running
  with softupdates panics the kernel.
  
  PR: 211013

Modified:
  stable/11/share/man/man3/queue.3
  stable/11/sys/sys/queue.h
  stable/11/sys/ufs/ffs/ffs_softdep.c
  stable/11/sys/ufs/ffs/softdep.h
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/share/man/man3/queue.3
==============================================================================
--- stable/11/share/man/man3/queue.3	Mon Oct 17 21:35:13 2016	(r307532)
+++ stable/11/share/man/man3/queue.3	Mon Oct 17 21:44:41 2016	(r307533)
@@ -28,12 +28,13 @@
 .\"	@(#)queue.3	8.2 (Berkeley) 1/24/94
 .\" $FreeBSD$
 .\"
-.Dd June 24, 2015
+.Dd August 15, 2016
 .Dt QUEUE 3
 .Os
 .Sh NAME
 .Nm SLIST_CLASS_ENTRY ,
 .Nm SLIST_CLASS_HEAD ,
+.Nm SLIST_CONCAT ,
 .Nm SLIST_EMPTY ,
 .Nm SLIST_ENTRY ,
 .Nm SLIST_FIRST ,
@@ -75,6 +76,7 @@
 .Nm STAILQ_SWAP ,
 .Nm LIST_CLASS_ENTRY ,
 .Nm LIST_CLASS_HEAD ,
+.Nm LIST_CONCAT ,
 .Nm LIST_EMPTY ,
 .Nm LIST_ENTRY ,
 .Nm LIST_FIRST ,
@@ -125,6 +127,7 @@ lists and tail queues
 .\"
 .Fn SLIST_CLASS_ENTRY "CLASSTYPE"
 .Fn SLIST_CLASS_HEAD "HEADNAME" "CLASSTYPE"
+.Fn SLIST_CONCAT "SLIST_HEAD *head1" "SLIST_HEAD *head2" "TYPE" "SLIST_ENTRY NAME"
 .Fn SLIST_EMPTY "SLIST_HEAD *head"
 .Fn SLIST_ENTRY "TYPE"
 .Fn SLIST_FIRST "SLIST_HEAD *head"
@@ -168,6 +171,7 @@ lists and tail queues
 .\"
 .Fn LIST_CLASS_ENTRY "CLASSTYPE"
 .Fn LIST_CLASS_HEAD "HEADNAME" "CLASSTYPE"
+.Fn LIST_CONCAT "LIST_HEAD *head1" "LIST_HEAD *head2" "TYPE" "LIST_ENTRY NAME"
 .Fn LIST_EMPTY "LIST_HEAD *head"
 .Fn LIST_ENTRY "TYPE"
 .Fn LIST_FIRST "LIST_HEAD *head"
@@ -249,6 +253,8 @@ Singly-linked lists add the following fu
 .Bl -enum -compact -offset indent
 .It
 O(n) removal of any entry in the list.
+.It
+O(n) concatenation of two lists.
 .El
 .Pp
 Singly-linked tail queues add the following functionality:
@@ -296,6 +302,8 @@ Linked lists are the simplest of the dou
 They add the following functionality over the above:
 .Bl -enum -compact -offset indent
 .It
+O(n) concatenation of two lists.
+.It
 They may be traversed backwards.
 .El
 However:
@@ -401,6 +409,19 @@ evaluates to an initializer for the list
 .Fa head .
 .Pp
 The macro
+.Nm SLIST_CONCAT
+concatenates the list headed by
+.Fa head2
+onto the end of the one headed by
+.Fa head1
+removing all entries from the former.
+Use of this macro should be avoided as it traverses the entirety of the
+.Fa head1
+list.
+A singly-linked tail queue should be used if this macro is needed in
+high-usage code paths or to operate on long lists.
+.Pp
+The macro
 .Nm SLIST_EMPTY
 evaluates to true if there are no elements in the list.
 .Pp
@@ -508,6 +529,9 @@ The macro
 removes the element
 .Fa elm
 from the list.
+Use of this macro should be avoided as it traverses the entire list.
+A doubly-linked list should be used if this macro is needed in
+high-usage code paths or to operate on long lists.
 .Pp
 The macro
 .Nm SLIST_SWAP
@@ -724,6 +748,9 @@ The macro
 removes the element
 .Fa elm
 from the tail queue.
+Use of this macro should be avoided as it traverses the entire list.
+A doubly-linked tail queue should be used if this macro is needed in
+high-usage code paths or to operate on long tail queues.
 .Pp
 The macro
 .Nm STAILQ_SWAP
@@ -823,6 +850,19 @@ evaluates to an initializer for the list
 .Fa head .
 .Pp
 The macro
+.Nm LIST_CONCAT
+concatenates the list headed by
+.Fa head2
+onto the end of the one headed by
+.Fa head1
+removing all entries from the former.
+Use of this macro should be avoided as it traverses the entirety of the
+.Fa head1
+list.
+A tail queue should be used if this macro is needed in
+high-usage code paths or to operate on long lists.
+.Pp
+The macro
 .Nm LIST_EMPTY
 evaluates to true if there are no elements in the list.
 .Pp

Modified: stable/11/sys/sys/queue.h
==============================================================================
--- stable/11/sys/sys/queue.h	Mon Oct 17 21:35:13 2016	(r307532)
+++ stable/11/sys/sys/queue.h	Mon Oct 17 21:44:41 2016	(r307533)
@@ -76,6 +76,10 @@
  *
  * For details on the use of these macros, see the queue(3) manual page.
  *
+ * Below is a summary of implemented functions where:
+ *  +  means the macro is available
+ *  -  means the macro is not available
+ *  s  means the macro is available but is slow (runs in O(n) time)
  *
  *				SLIST	LIST	STAILQ	TAILQ
  * _HEAD			+	+	+	+
@@ -101,10 +105,10 @@
  * _INSERT_BEFORE		-	+	-	+
  * _INSERT_AFTER		+	+	+	+
  * _INSERT_TAIL			-	-	+	+
- * _CONCAT			-	-	+	+
+ * _CONCAT			s	s	+	+
  * _REMOVE_AFTER		+	-	+	-
  * _REMOVE_HEAD			+	-	+	-
- * _REMOVE			+	+	+	+
+ * _REMOVE			s	+	s	+
  * _SWAP			+	+	+	+
  *
  */
@@ -183,6 +187,19 @@ struct {								\
 /*
  * Singly-linked List functions.
  */
+#define SLIST_CONCAT(head1, head2, type, field) do {			\
+	QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head1);		\
+	if (curelm == NULL) {						\
+		if ((SLIST_FIRST(head1) = SLIST_FIRST(head2)) != NULL)	\
+			SLIST_INIT(head2);				\
+	} else if (SLIST_FIRST(head2) != NULL) {			\
+		while (SLIST_NEXT(curelm, field) != NULL)		\
+			curelm = SLIST_NEXT(curelm, field);		\
+		SLIST_NEXT(curelm, field) = SLIST_FIRST(head2);		\
+		SLIST_INIT(head2);					\
+	}								\
+} while (0)
+
 #define	SLIST_EMPTY(head)	((head)->slh_first == NULL)
 
 #define	SLIST_FIRST(head)	((head)->slh_first)
@@ -447,6 +464,23 @@ struct {								\
 #define	QMD_LIST_CHECK_PREV(elm, field)
 #endif /* (_KERNEL && INVARIANTS) */
 
+#define LIST_CONCAT(head1, head2, type, field) do {			      \
+	QUEUE_TYPEOF(type) *curelm = LIST_FIRST(head1);			      \
+	if (curelm == NULL) {						      \
+		if ((LIST_FIRST(head1) = LIST_FIRST(head2)) != NULL) {	      \
+			LIST_FIRST(head2)->field.le_prev =		      \
+			    &LIST_FIRST((head1));			      \
+			LIST_INIT(head2);				      \
+		}							      \
+	} else if (LIST_FIRST(head2) != NULL) {				      \
+		while (LIST_NEXT(curelm, field) != NULL)		      \
+			curelm = LIST_NEXT(curelm, field);		      \
+		LIST_NEXT(curelm, field) = LIST_FIRST(head2);		      \
+		LIST_FIRST(head2)->field.le_prev = &LIST_NEXT(curelm, field); \
+		LIST_INIT(head2);					      \
+	}								      \
+} while (0)
+
 #define	LIST_EMPTY(head)	((head)->lh_first == NULL)
 
 #define	LIST_FIRST(head)	((head)->lh_first)

Modified: stable/11/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- stable/11/sys/ufs/ffs/ffs_softdep.c	Mon Oct 17 21:35:13 2016	(r307532)
+++ stable/11/sys/ufs/ffs/ffs_softdep.c	Mon Oct 17 21:44:41 2016	(r307533)
@@ -752,16 +752,16 @@ static	int flush_newblk_dep(struct vnode
 static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
 static	int flush_deplist(struct allocdirectlst *, int, int *);
 static	int sync_cgs(struct mount *, int);
-static	int handle_written_filepage(struct pagedep *, struct buf *);
+static	int handle_written_filepage(struct pagedep *, struct buf *, int);
 static	int handle_written_sbdep(struct sbdep *, struct buf *);
 static	void initiate_write_sbdep(struct sbdep *);
 static	void diradd_inode_written(struct diradd *, struct inodedep *);
 static	int handle_written_indirdep(struct indirdep *, struct buf *,
-	    struct buf**);
-static	int handle_written_inodeblock(struct inodedep *, struct buf *);
+	    struct buf**, int);
+static	int handle_written_inodeblock(struct inodedep *, struct buf *, int);
 static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
 	    uint8_t *);
-static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
+static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
 static	void handle_written_jaddref(struct jaddref *);
 static	void handle_written_jremref(struct jremref *);
 static	void handle_written_jseg(struct jseg *, struct buf *);
@@ -10903,6 +10903,10 @@ initiate_write_bmsafemap(bmsafemap, bp)
 	struct fs *fs;
 	ino_t ino;
 
+	/*
+	 * If this is a background write, we did this at the time that
+	 * the copy was made, so do not need to do it again.
+	 */
 	if (bmsafemap->sm_state & IOSTARTED)
 		return;
 	bmsafemap->sm_state |= IOSTARTED;
@@ -10976,10 +10980,39 @@ softdep_disk_write_complete(bp)
 
 	/*
 	 * If an error occurred while doing the write, then the data
-	 * has not hit the disk and the dependencies cannot be unrolled.
+	 * has not hit the disk and the dependencies cannot be processed.
+	 * But we do have to go through and roll forward any dependencies
+	 * that were rolled back before the disk write.
 	 */
-	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
+	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
+		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
+			switch (wk->wk_type) {
+
+			case D_PAGEDEP:
+				handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
+				continue;
+
+			case D_INODEDEP:
+				handle_written_inodeblock(WK_INODEDEP(wk),
+				    bp, 0);
+				continue;
+
+			case D_BMSAFEMAP:
+				handle_written_bmsafemap(WK_BMSAFEMAP(wk),
+				    bp, 0);
+				continue;
+
+			case D_INDIRDEP:
+				handle_written_indirdep(WK_INDIRDEP(wk),
+				    bp, &sbp, 0);
+				continue;
+			default:
+				/* nothing to roll forward */
+				continue;
+			}
+		}
 		return;
+	}
 	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
 		return;
 	ump = VFSTOUFS(wk->wk_mp);
@@ -10999,17 +11032,20 @@ softdep_disk_write_complete(bp)
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
-			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
+			if (handle_written_filepage(WK_PAGEDEP(wk), bp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_INODEDEP:
-			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
+			if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_BMSAFEMAP:
-			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
+			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
@@ -11028,7 +11064,8 @@ softdep_disk_write_complete(bp)
 			continue;
 
 		case D_INDIRDEP:
-			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
+			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
+			    WRITESUCCEEDED))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
@@ -11328,12 +11365,17 @@ handle_bufwait(inodedep, refhd)
  * Called from within softdep_disk_write_complete above to restore
  * in-memory inode block contents to their most up-to-date state. Note
  * that this routine is always called from interrupt level with further
- * splbio interrupts blocked.
+ * interrupts from this device blocked.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
  */
 static int 
-handle_written_inodeblock(inodedep, bp)
+handle_written_inodeblock(inodedep, bp, flags)
 	struct inodedep *inodedep;
 	struct buf *bp;		/* buffer containing the inode block */
+	int flags;
 {
 	struct freefile *freefile;
 	struct allocdirect *adp, *nextadp;
@@ -11363,7 +11405,8 @@ handle_written_inodeblock(inodedep, bp)
 	/*
 	 * Leave this inodeblock dirty until it's in the list.
 	 */
-	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
+	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
+	    (flags & WRITESUCCEEDED)) {
 		struct inodedep *inon;
 
 		inon = TAILQ_NEXT(inodedep, id_unlinked);
@@ -11402,7 +11445,8 @@ handle_written_inodeblock(inodedep, bp)
 			goto bufwait;
 		return (1);
 	}
-	inodedep->id_state |= COMPLETE;
+	if (flags & WRITESUCCEEDED)
+		inodedep->id_state |= COMPLETE;
 	/*
 	 * Roll forward anything that had to be rolled back before 
 	 * the inode could be updated.
@@ -11517,6 +11561,13 @@ handle_written_inodeblock(inodedep, bp)
 		bdirty(bp);
 bufwait:
 	/*
+	 * If the write did not succeed, we have done all the roll-forward
+	 * operations, but we cannot take the actions that will allow its
+	 * dependencies to be processed.
+	 */
+	if ((flags & WRITESUCCEEDED) == 0)
+		return (hadchanges);
+	/*
 	 * Process any allocdirects that completed during the update.
 	 */
 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
@@ -11573,11 +11624,20 @@ bufwait:
 	return (hadchanges);
 }
 
+/*
+ * Perform needed roll-forwards and kick off any dependencies that
+ * can now be processed.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
+ */
 static int
-handle_written_indirdep(indirdep, bp, bpp)
+handle_written_indirdep(indirdep, bp, bpp, flags)
 	struct indirdep *indirdep;
 	struct buf *bp;
 	struct buf **bpp;
+	int flags;
 {
 	struct allocindir *aip;
 	struct buf *sbp;
@@ -11602,6 +11662,16 @@ handle_written_indirdep(indirdep, bp, bp
 	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
 	indirdep->ir_state |= ATTACHED;
 	/*
+	 * If the write did not succeed, we have done all the roll-forward
+	 * operations, but we cannot take the actions that will allow its
+	 * dependencies to be processed.
+	 */
+	if ((flags & WRITESUCCEEDED) == 0) {
+		stat_indir_blk_ptrs++;
+		bdirty(bp);
+		return (1);
+	}
+	/*
 	 * Move allocindirs with written pointers to the completehd if
 	 * the indirdep's pointer is not yet written.  Otherwise
 	 * free them here.
@@ -11755,11 +11825,16 @@ jnewblk_rollforward(jnewblk, fs, cgp, bl
  * Complete a write to a bmsafemap structure.  Roll forward any bitmap
  * changes if it's not a background write.  Set all written dependencies 
  * to DEPCOMPLETE and free the structure if possible.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
  */
 static int
-handle_written_bmsafemap(bmsafemap, bp)
+handle_written_bmsafemap(bmsafemap, bp, flags)
 	struct bmsafemap *bmsafemap;
 	struct buf *bp;
+	int flags;
 {
 	struct newblk *newblk;
 	struct inodedep *inodedep;
@@ -11775,15 +11850,20 @@ handle_written_bmsafemap(bmsafemap, bp)
 	int chgs;
 
 	if ((bmsafemap->sm_state & IOSTARTED) == 0)
-		panic("initiate_write_bmsafemap: Not started\n");
+		panic("handle_written_bmsafemap: Not started\n");
 	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
 	chgs = 0;
 	bmsafemap->sm_state &= ~IOSTARTED;
 	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
 	/*
-	 * Release journal work that was waiting on the write.
+	 * If write was successful, release journal work that was waiting
+	 * on the write. Otherwise move the work back.
 	 */
-	handle_jwork(&bmsafemap->sm_freewr);
+	if (flags & WRITESUCCEEDED)
+		handle_jwork(&bmsafemap->sm_freewr);
+	else
+		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
+		    worklist, wk_list);
 
 	/*
 	 * Restore unwritten inode allocation pending jaddref writes.
@@ -11833,6 +11913,20 @@ handle_written_bmsafemap(bmsafemap, bp)
 			free_jnewblk(jnewblk);
 		}
 	}
+	/*
+	 * If the write did not succeed, we have done all the roll-forward
+	 * operations, but we cannot take the actions that will allow its
+	 * dependencies to be processed.
+	 */
+	if ((flags & WRITESUCCEEDED) == 0) {
+		LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
+		    newblk, nb_deps);
+		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
+		    worklist, wk_list);
+		if (foreground)
+			bdirty(bp);
+		return (1);
+	}
 	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
 		newblk->nb_state |= DEPCOMPLETE;
 		newblk->nb_state &= ~ONDEPLIST;
@@ -11936,12 +12030,17 @@ free_pagedep(pagedep)
  * A write operation was just completed. Removed inodes can
  * now be freed and associated block pointers may be committed.
  * Note that this routine is always called from interrupt level
- * with further splbio interrupts blocked.
+ * with further interrupts from this device blocked.
+ *
+ * If the write did not succeed, we will do all the roll-forward
+ * operations, but we will not take the actions that will allow its
+ * dependencies to be processed.
  */
 static int 
-handle_written_filepage(pagedep, bp)
+handle_written_filepage(pagedep, bp, flags)
 	struct pagedep *pagedep;
 	struct buf *bp;		/* buffer containing the written page */
+	int flags;
 {
 	struct dirrem *dirrem;
 	struct diradd *dap, *nextdap;
@@ -11951,6 +12050,8 @@ handle_written_filepage(pagedep, bp)
 	if ((pagedep->pd_state & IOSTARTED) == 0)
 		panic("handle_written_filepage: not started");
 	pagedep->pd_state &= ~IOSTARTED;
+	if ((flags & WRITESUCCEEDED) == 0)
+		goto rollforward;
 	/*
 	 * Process any directory removals that have been committed.
 	 */
@@ -11970,6 +12071,7 @@ handle_written_filepage(pagedep, bp)
 	if ((pagedep->pd_state & NEWBLOCK) == 0)
 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 			free_diradd(dap, NULL);
+rollforward:
 	/*
 	 * Uncommitted directory entries must be restored.
 	 */
@@ -12002,7 +12104,7 @@ handle_written_filepage(pagedep, bp)
 	 * marked dirty so that its will eventually get written back in
 	 * its correct form.
 	 */
-	if (chgs) {
+	if (chgs || (flags & WRITESUCCEEDED) == 0) {
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_dir_entry++;
 		bdirty(bp);

Modified: stable/11/sys/ufs/ffs/softdep.h
==============================================================================
--- stable/11/sys/ufs/ffs/softdep.h	Mon Oct 17 21:35:13 2016	(r307532)
+++ stable/11/sys/ufs/ffs/softdep.h	Mon Oct 17 21:44:41 2016	(r307533)
@@ -140,6 +140,7 @@
 #define	UNLINKPREV	0x100000 /* inodedep is pointed at in the unlink list */
 #define	UNLINKONLIST	0x200000 /* inodedep is in the unlinked list on disk */
 #define	UNLINKLINKS	(UNLINKNEXT | UNLINKPREV)
+#define	WRITESUCCEEDED	0x400000 /* the disk write completed successfully */
 
 #define	ALLCOMPLETE	(ATTACHED | COMPLETE | DEPCOMPLETE)
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201610172144.u9HLif26041731>