Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 16 Dec 2015 19:27:20 +0000 (UTC)
From:      Alan Somers <asomers@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r292351 - in projects/zfsd/head/tests/sys/cddl/zfs: include tests/redundancy
Message-ID:  <201512161927.tBGJRKeT045376@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: asomers
Date: Wed Dec 16 19:27:20 2015
New Revision: 292351
URL: https://svnweb.freebsd.org/changeset/base/292351

Log:
  Fix several redundancy test reliability & debuggability issues.
  
  tests/sys/cddl/zfs/include/libtest.kshlib:
  	- Add a generic wait_for mechanism that takes a timeout, dt, and a
  	  command+args to run until it returns true.
  	- Add an is_pool_state <pool> <state> command.
  
  tests/sys/cddl/zfs/tests/redundancy/redundancy.kshlib:
  	- cleanup: Always log the verbose status of the pool.  This is
  	  cleaner than doing it in umpteen different error cases, and in any
  	  case we don't usually bother generating output for success.
  	- sync_pool: Don't check the pool state.  Make this the
  	  responsibility of the several callers, which need to check for
  	  different states.
  	- damage_devs: In the damage-all-even-labels case, repeat the effort
  	  to damage every vdev until it goes UNAVAIL.  Previously, sometimes
  	  the pool would sync its labels to the device after the damage had
  	  been done but before the pool found the damage itself.
  	- clear_errors: Wait for the pool to become healthy rather than
  	  requiring the 'zpool clear' to have this effect; resilvering will
  	  not necessarily complete immediately; it's issued async from
  	  clear.
  	- remove_devs: After removing the device files, wait for each of
  	  them to become UNAVAIL in the pool config.
  
  Submitted by:	Will
  Sponsored by:	Spectra Logic Corp

Modified:
  projects/zfsd/head/tests/sys/cddl/zfs/include/libtest.kshlib
  projects/zfsd/head/tests/sys/cddl/zfs/tests/redundancy/redundancy.kshlib

Modified: projects/zfsd/head/tests/sys/cddl/zfs/include/libtest.kshlib
==============================================================================
--- projects/zfsd/head/tests/sys/cddl/zfs/include/libtest.kshlib	Wed Dec 16 19:23:10 2015	(r292350)
+++ projects/zfsd/head/tests/sys/cddl/zfs/include/libtest.kshlib	Wed Dec 16 19:27:20 2015	(r292351)
@@ -1297,6 +1297,41 @@ function reexport_pool
 }
 
 #
+# Wait for something to return true, checked by the caller.
+#
+function wait_for_checked # timeout dt <method> [args...]
+{
+	typeset timeout=$1
+	typeset dt=$2
+	shift; shift
+	typeset -i start=$(date '+%s')
+	typeset -i endtime
+
+	((endtime = start + timeout))
+	while :; do
+		$*
+		[ $? -eq 0 ] && return
+		curtime=$(date '+%s')
+		[ $curtime -gt $endtime ] && return 1
+		sleep $dt
+	done
+	return 0
+}
+
+#
+# Wait for something to return true.
+#
+function wait_for # timeout dt <method> [args...]
+{
+	typeset timeout=$1
+	typeset dt=$2
+	shift; shift
+
+	wait_for_checked $timeout $dt $* || \
+		log_fail "ERROR: Timed out waiting for: $*"
+}
+
+#
 # Verify a given disk is online or offline
 #
 # Return 0 is pool/disk matches expected state, 1 otherwise
@@ -1330,6 +1365,26 @@ function wait_for_state_exit
 }
 
 #
+# Wait for a given disk to enter a state
+#
+function wait_for_state_enter
+{
+	typeset -i timeout=$1
+	typeset pool=$2
+	typeset disk=$3
+	typeset state=$4
+
+	log_note "Waiting up to $timeout seconds for $disk to become $state ..."
+	for ((; $timeout > 0; timeout=$timeout-1)); do
+		check_state $pool "$disk" "$state"
+		[ $? -eq 0 ] && return
+		$SLEEP 1
+	done
+	log_must $ZPOOL status $pool
+	log_fail "ERROR: Disk $disk not marked as $state in $pool"
+}
+
+#
 # Get the mountpoint of snapshot
 # as its mountpoint
 #
@@ -1615,6 +1670,12 @@ function is_pool_scrub_stopped #pool
 	return $?
 }
 
+function is_pool_state # pool state
+{
+	check_pool_status "$1" "state" "$2"
+	return $?
+}
+
 #
 # Erase the partition tables and destroy any zfs labels
 # 

Modified: projects/zfsd/head/tests/sys/cddl/zfs/tests/redundancy/redundancy.kshlib
==============================================================================
--- projects/zfsd/head/tests/sys/cddl/zfs/tests/redundancy/redundancy.kshlib	Wed Dec 16 19:23:10 2015	(r292350)
+++ projects/zfsd/head/tests/sys/cddl/zfs/tests/redundancy/redundancy.kshlib	Wed Dec 16 19:27:20 2015	(r292351)
@@ -30,6 +30,8 @@
 
 function cleanup
 {
+	# Log the status of the pool to assist in debugging failures.
+	poolexists $TESTPOOL && $ZPOOL status -v $TESTPOOL
 	destroy_pool $TESTPOOL
 	typeset dir
 	for dir in $TESTDIR $BASEDIR; do
@@ -184,6 +186,7 @@ function sync_pool #pool
 	log_must $SLEEP 2
 	# Flush all the pool data.
 	typeset -i ret
+
 	# If the OS has detected corruption on the pool, it will have
 	# automatically initiated a scrub.  In that case, our "zpool scrub"
 	# command will fail.  So we ignore its exit status and just check that
@@ -191,12 +194,6 @@ function sync_pool #pool
 	$ZPOOL scrub $pool >/dev/null 2>&1
 	is_pool_scrubbing $pool || is_pool_scrubbed $pool || \
 		log_fail "$ZPOOL scrub $pool failed." 
-
-	# The pool has been damaged; the sync should notice this fact.
-	log_note "Waiting for pool to sync..."
-	while ! is_pool_scrubbed $pool || is_pool_resilvered $pool; do
-		log_must $SLEEP 2
-	done
 }
 
 #
@@ -214,10 +211,28 @@ function replace_missing_devs
 	for vdev in $@; do
 		log_must $MKFILE $DEV_SIZE $vdev
 		log_must $ZPOOL replace -f $pool $vdev $vdev
-		while ! is_pool_resilvered $pool; do
-			log_must $SLEEP 2
-		done
+		wait_for 20 2 is_pool_resilvered $pool
+	done
+}
+
+#
+# Damage the labels of the specified devices.  Returns 0 if all such devices
+# are UNAVAIL, 1 otherwise.
+#
+function damage_dev_labels # pool <vdev> [vdev ...]
+{
+	typeset pool=$1
+	typeset -i ret=0
+	shift
+
+	for vdev in $*; do
+		check_state $pool $vdev UNAVAIL && continue
+		log_must $MKFILE $DEV_SIZE $vdev
+		ret=1
 	done
+	[ $ret -eq 0 ] && return $ret
+	sync_pool $pool
+	return $ret
 }
 
 #
@@ -243,14 +258,16 @@ function damage_devs
 		for dev in $vdevs; do 
 			bs_count=$($LS -l $dev | $AWK '{print $5}')
 			(( bs_count = bs_count/1024 - 512 ))
-			$DD if=/dev/zero of=$dev seek=512 bs=1024 \
-				count=$bs_count conv=notrunc >/dev/null 2>&1
+			log_must $DD if=/dev/zero of=$dev seek=512 bs=1024 \
+				count=$bs_count conv=notrunc
 		done	
+		sync_pool $pool
 	else
-		log_must $MKFILE $DEV_SIZE $vdevs
+		# The pool can be syncing, thus fixing its labels.  So we
+		# have to keep trying until all the devices go UNAVAIL.
+		wait_for 20 2 damage_dev_labels $pool $vdevs
 	fi
 
-	sync_pool $pool
 	log_note "Pool $pool vdevs $vdevs damage completed."
 }
 
@@ -264,12 +281,10 @@ function clear_errors
 	typeset pool=$1
 
 	log_must $ZPOOL clear $pool
+	# The pool may need to resilver (issued async by 'zpool clear'),
+	# so give it a chance to do so.
+	wait_for 30 2 is_healthy $pool 
 
-	if ! is_healthy $pool ; then
-		$ZPOOL status -x $pool
-		log_note "$pool should be healthy."
-		return 1
-	fi
 	if ! is_data_valid $pool ; then
 		$ZPOOL status -x $pool
 		log_note "Data should be valid in $pool."
@@ -296,6 +311,9 @@ function remove_devs
 	log_must $RM -f $vdevs
 
 	sync_pool $pool
+	for vdev in $vdevs; do
+		wait_for 20 2 check_state $pool $vdev UNAVAIL
+	done
 }
 
 #



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201512161927.tBGJRKeT045376>