Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 29 Jul 2014 07:08:33 +0000 (UTC)
From:      Xin LI <delphij@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r269223 - vendor-sys/illumos/dist/common/avl vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys vendor/illumo...
Message-ID:  <201407290708.s6T78XTR000184@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: delphij
Date: Tue Jul 29 07:08:32 2014
New Revision: 269223
URL: http://svnweb.freebsd.org/changeset/base/269223

Log:
  4873 zvol unmap calls can take a very long time for larger datasets
  Reviewed by: George Wilson <george@delphix.com>
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
  Reviewed by: Basil Crow <basil.crow@delphix.com>
  Reviewed by: Dan McDonald <danmcd@omniti.com>
  Approved by: Robert Mustacchi <rm@joyent.com>
  
  illumos/illumos-gate@0f6d88aded0d165f5954688a9b13bac76c38da84

Modified:
  vendor-sys/illumos/dist/common/avl/avl.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h
  vendor-sys/illumos/dist/uts/common/sys/avl.h

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/common/avl/avl.c

Modified: vendor-sys/illumos/dist/common/avl/avl.c
==============================================================================
--- vendor-sys/illumos/dist/common/avl/avl.c	Tue Jul 29 06:57:13 2014	(r269222)
+++ vendor-sys/illumos/dist/common/avl/avl.c	Tue Jul 29 07:08:32 2014	(r269223)
@@ -24,6 +24,10 @@
  */
 
 /*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+/*
  * AVL - generic AVL tree implementation for kernel use
  *
  * A complete description of AVL trees can be found in many CS textbooks.
@@ -85,6 +89,12 @@
  *	  is a modified "avl_node_t *".  The bottom bit (normally 0 for a
  *	  pointer) is set to indicate if that the new node has a value greater
  *	  than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
  */
 
 #include <sys/types.h>
@@ -864,6 +874,24 @@ avl_update(avl_tree_t *t, void *obj)
 	return (B_FALSE);
 }
 
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+	avl_node_t *temp_node;
+	ulong_t temp_numnodes;
+
+	ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+	ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+	ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+
+	temp_node = tree1->avl_root;
+	temp_numnodes = tree1->avl_numnodes;
+	tree1->avl_root = tree2->avl_root;
+	tree1->avl_numnodes = tree2->avl_numnodes;
+	tree2->avl_root = temp_node;
+	tree2->avl_numnodes = temp_numnodes;
+}
+
 /*
  * initialize a new AVL tree
  */

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Tue Jul 29 06:57:13 2014	(r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Tue Jul 29 07:08:32 2014	(r269223)
@@ -69,6 +69,9 @@ dbuf_cons(void *vdb, void *unused, int k
 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
 	refcount_create(&db->db_holds);
+
+	db->db_creation = gethrtime();
+
 	return (0);
 }
 
@@ -330,7 +333,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
-		    !list_is_empty(&dn->dn_dbufs));
+		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
@@ -803,18 +806,30 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
  * receive; see comment below for details.
  */
 void
-dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
 {
-	dmu_buf_impl_t *db, *db_next;
+	dmu_buf_impl_t *db, *db_next, db_search;
 	uint64_t txg = tx->tx_txg;
+	avl_index_t where;
 
-	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
-		end = dn->dn_maxblkid;
-	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
+		end_blkid = dn->dn_maxblkid;
+	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+	db_search.db_level = 0;
+	db_search.db_blkid = start_blkid;
+	db_search.db_creation = 0;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
-	if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
+	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
 		/* There can't be any dbufs in this range; no need to search. */
+#ifdef DEBUG
+		db = avl_find(&dn->dn_dbufs, &db_search, &where);
+		ASSERT3P(db, ==, NULL);
+		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+		ASSERT(db == NULL || db->db_level > 0);
+#endif
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return;
 	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
@@ -828,14 +843,18 @@ dbuf_free_range(dnode_t *dn, uint64_t st
 		atomic_inc_64(&zfs_free_range_recv_miss);
 	}
 
-	for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
-		db_next = list_next(&dn->dn_dbufs, db);
+	db = avl_find(&dn->dn_dbufs, &db_search, &where);
+	ASSERT3P(db, ==, NULL);
+	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+	for (; db != NULL; db = db_next) {
+		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-		if (db->db_level != 0)
-			continue;
-		if (db->db_blkid < start || db->db_blkid > end)
-			continue;
+		if (db->db_level != 0 || db->db_blkid > end_blkid) {
+			break;
+		}
+		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
@@ -1585,7 +1604,7 @@ dbuf_clear(dmu_buf_impl_t *db)
 	dn = DB_DNODE(db);
 	dndb = dn->dn_dbuf;
 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
-		list_remove(&dn->dn_dbufs, db);
+		avl_remove(&dn->dn_dbufs, db);
 		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
 		membar_producer();
 		DB_DNODE_EXIT(db);
@@ -1748,7 +1767,7 @@ dbuf_create(dnode_t *dn, uint8_t level, 
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
-	list_insert_head(&dn->dn_dbufs, db);
+	avl_add(&dn->dn_dbufs, db);
 	if (db->db_level == 0 && db->db_blkid >=
 	    dn->dn_unlisted_l0_blkid)
 		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
@@ -1807,7 +1826,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 			DB_DNODE_ENTER(db);
 			dn = DB_DNODE(db);
 			mutex_enter(&dn->dn_dbufs_mtx);
-			list_remove(&dn->dn_dbufs, db);
+			avl_remove(&dn->dn_dbufs, db);
 			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
 			mutex_exit(&dn->dn_dbufs_mtx);
 			DB_DNODE_EXIT(db);
@@ -1825,7 +1844,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	db->db_parent = NULL;
 	db->db_buf = NULL;
 
-	ASSERT(!list_link_active(&db->db_link));
 	ASSERT(db->db.db_data == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Tue Jul 29 06:57:13 2014	(r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Tue Jul 29 07:08:32 2014	(r269223)
@@ -59,6 +59,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT
 
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+	const dmu_buf_impl_t *d1 = x1;
+	const dmu_buf_impl_t *d2 = x2;
+
+	if (d1->db_level < d2->db_level) {
+		return (-1);
+	} else if (d1->db_level > d2->db_level) {
+		return (1);
+	}
+
+	if (d1->db_blkid < d2->db_blkid) {
+		return (-1);
+	} else if (d1->db_blkid > d2->db_blkid) {
+		return (1);
+	}
+
+	/*
+	 * If a dbuf is being evicted while dn_dbufs_mutex is not held, we set
+	 * the db_state to DB_EVICTING but do not remove it from dn_dbufs. If
+	 * another thread creates a dbuf of the same blkid before the dbuf is
+	 * removed from dn_dbufs, we can reach a state where there are two
+	 * dbufs of the same blkid and level in db_dbufs. To maintain the avl
+	 * invariant that there cannot be duplicate items, we distinguish
+	 * between these two dbufs based on the time they were created.
+	 */
+	if (d1->db_creation < d2->db_creation) {
+		return (-1);
+	} else if (d1->db_creation > d2->db_creation) {
+		return (1);
+	} else {
+		ASSERT3P(d1, ==, d2);
+		return (0);
+	}
+}
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -113,7 +150,7 @@ dnode_cons(void *arg, void *unused, int 
 
 	dn->dn_dbufs_count = 0;
 	dn->dn_unlisted_l0_blkid = 0;
-	list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	dn->dn_moved = 0;
@@ -166,7 +203,7 @@ dnode_dest(void *arg, void *unused)
 
 	ASSERT0(dn->dn_dbufs_count);
 	ASSERT0(dn->dn_unlisted_l0_blkid);
-	list_destroy(&dn->dn_dbufs);
+	avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -502,7 +539,7 @@ dnode_allocate(dnode_t *dn, dmu_object_t
 	ASSERT0(dn->dn_assigned_txg);
 	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT(avl_is_empty(&dn->dn_dbufs));
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT0(dn->dn_next_nblkptr[i]);
@@ -687,8 +724,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *n
 	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 	ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 	refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-	ASSERT(list_is_empty(&ndn->dn_dbufs));
-	list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+	ASSERT(avl_is_empty(&ndn->dn_dbufs));
+	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
 	ndn->dn_dbufs_count = odn->dn_dbufs_count;
 	ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
 	ndn->dn_bonus = odn->dn_bonus;
@@ -722,7 +759,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *n
 	 */
 	odn->dn_dbuf = NULL;
 	odn->dn_handle = NULL;
-	list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 	odn->dn_dbufs_count = 0;
 	odn->dn_unlisted_l0_blkid = 0;
@@ -1231,7 +1268,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx
 		return;
 	}
 
-	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+	ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+	    !avl_is_empty(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
 	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
 	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1304,7 +1342,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-	dmu_buf_impl_t *db, *db_next;
+	dmu_buf_impl_t *db;
 	int err;
 
 	if (size == 0)
@@ -1327,9 +1365,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
 		goto fail;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
-	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-		db_next = list_next(&dn->dn_dbufs, db);
-
+	for (db = avl_first(&dn->dn_dbufs); db != NULL;
+	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
 		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
 		    db->db_blkid != DMU_SPILL_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c	Tue Jul 29 06:57:13 2014	(r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c	Tue Jul 29 07:08:32 2014	(r269223)
@@ -400,16 +400,13 @@ dnode_evict_dbufs(dnode_t *dn)
 	int pass = 0;
 
 	do {
-		dmu_buf_impl_t *db, marker;
+		dmu_buf_impl_t *db, *db_next;
 		int evicting = FALSE;
 
 		progress = FALSE;
 		mutex_enter(&dn->dn_dbufs_mtx);
-		list_insert_tail(&dn->dn_dbufs, &marker);
-		db = list_head(&dn->dn_dbufs);
-		for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
-			list_remove(&dn->dn_dbufs, db);
-			list_insert_tail(&dn->dn_dbufs, db);
+		for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+			db_next = AVL_NEXT(&dn->dn_dbufs, db);
 #ifdef	DEBUG
 			DB_DNODE_ENTER(db);
 			ASSERT3P(DB_DNODE(db), ==, dn);
@@ -429,7 +426,6 @@ dnode_evict_dbufs(dnode_t *dn)
 			}
 
 		}
-		list_remove(&dn->dn_dbufs, &marker);
 		/*
 		 * NB: we need to drop dn_dbufs_mtx between passes so
 		 * that any DB_EVICTING dbufs can make progress.
@@ -500,7 +496,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t
 
 	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 	dnode_evict_dbufs(dn);
-	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT(avl_is_empty(&dn->dn_dbufs));
 	ASSERT3P(dn->dn_bonus, ==, NULL);
 
 	/*

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 29 06:57:13 2014	(r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 29 07:08:32 2014	(r269223)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -213,11 +213,14 @@ typedef struct dmu_buf_impl {
 	/* pointer to most recent dirty record for this buffer */
 	dbuf_dirty_record_t *db_last_dirty;
 
+	/* Creation time of dbuf (see comment in dbuf_compare). */
+	hrtime_t db_creation;
+
 	/*
 	 * Our link on the owner dnodes's dn_dbufs list.
 	 * Protected by its dn_dbufs_mtx.
 	 */
-	list_node_t db_link;
+	avl_node_t db_link;
 
 	/* Data which is unique to data (leaf) blocks: */
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h	Tue Jul 29 06:57:13 2014	(r269222)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h	Tue Jul 29 07:08:32 2014	(r269223)
@@ -211,7 +211,7 @@ typedef struct dnode {
 	refcount_t dn_holds;
 
 	kmutex_t dn_dbufs_mtx;
-	list_t dn_dbufs;		/* descendent dbufs */
+	avl_tree_t dn_dbufs;		/* descendent dbufs */
 
 	/* protected by dn_struct_rwlock */
 	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */

Modified: vendor-sys/illumos/dist/uts/common/sys/avl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/sys/avl.h	Tue Jul 29 06:57:13 2014	(r269222)
+++ vendor-sys/illumos/dist/uts/common/sys/avl.h	Tue Jul 29 07:08:32 2014	(r269223)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 #ifndef	_AVL_H
 #define	_AVL_H
 
@@ -260,6 +264,11 @@ extern boolean_t avl_update_lt(avl_tree_
 extern boolean_t avl_update_gt(avl_tree_t *, void *);
 
 /*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
+/*
  * Return the number of nodes in the tree
  */
 extern ulong_t avl_numnodes(avl_tree_t *tree);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201407290708.s6T78XTR000184>