Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 31 Jul 2018 00:58:22 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r336959 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Message-ID:  <201807310058.w6V0wMZw010410@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Tue Jul 31 00:58:21 2018
New Revision: 336959
URL: https://svnweb.freebsd.org/changeset/base/336959

Log:
  MFV r336958: 9337 zfs get all is slow due to uncached metadata
  
  This project's goal is to make read-heavy channel programs and zfs(1m)
  administrative commands faster by caching all the metadata that they will
  need in the dbuf layer. This will prevent the data from being evicted, so
  that any future call to i.e. zfs get all won't have to go to disk (very
  much).
  
  illumos/illumos-gate@adb52d9262f45a04318fc6e188fe2b7f59d989a5
  
  Reviewed by: Prakash Surya <prakash.surya@delphix.com>
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Reviewed by: Thomas Caputi <tcaputi@datto.com>
  Approved by: Richard Lowe <richlowe@richlowe.net>
  Author:     Matthew Ahrens <mahrens@delphix.com>

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Tue Jul 31 00:56:41 2018	(r336958)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Tue Jul 31 00:58:21 2018	(r336959)
@@ -49,6 +49,7 @@
 #include <sys/abd.h>
 #include <sys/vdev.h>
 #include <sys/cityhash.h>
+#include <sys/spa_impl.h>
 
 uint_t zfs_dbuf_evict_key;
 
@@ -74,24 +75,58 @@ static kcondvar_t dbuf_evict_cv;
 static boolean_t dbuf_evict_thread_exit;
 
 /*
- * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs are added to the dbuf cache once the last hold is released. If a
- * dbuf is later accessed and still exists in the dbuf cache, then it will
- * be removed from the cache and later re-added to the head of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ *    that represent the metadata that describes filesystems/snapshots/
+ *    bookmarks/properties/etc. We only evict from this cache when we export a
+ *    pool, to short-circuit as much I/O as possible for all administrative
+ *    commands that need the metadata. There is no eviction policy for this
+ *    cache, because we try to only include types in it which would occupy a
+ *    very small amount of space per object but create a large impact on the
+ *    performance of these commands. Instead, after it reaches a maximum size
+ *    (which should only happen on very small memory systems with a very large
+ *    number of filesystem objects), we stop taking new dbufs into the
+ *    metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that
+ *    are not currently held but have been recently released. These dbufs
+ *    are not eligible for arc eviction until they are aged out of the cache.
+ *    Dbufs that are aged out of the cache will be immediately destroyed and
+ *    become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
  */
-static multilist_t *dbuf_cache;
-static refcount_t dbuf_cache_size;
-uint64_t dbuf_cache_max_bytes = 0;
+typedef struct dbuf_cache {
+	multilist_t *cache;
+	refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
-/* Set the default size of the dbuf cache to log2 fraction of arc size. */
+/* Size limits for the caches */
+uint64_t dbuf_cache_max_bytes = 0;
+uint64_t dbuf_metadata_cache_max_bytes = 0;
+/* Set the default sizes of the caches to log2 fraction of arc size */
 int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
 
 /*
- * The dbuf cache uses a three-stage eviction policy:
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+uint64_t dbuf_metadata_cache_overflow;
+
+/*
+ * The LRU dbuf cache uses a three-stage eviction policy:
  *	- A low water marker designates when the dbuf eviction thread
  *	should stop evicting from the dbuf cache.
  *	- When we reach the maximum size (aka mid water mark), we
@@ -404,6 +439,41 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
 }
 
 /*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+	DB_DNODE_ENTER(db);
+	dmu_object_type_t type = DB_DNODE(db)->dn_type;
+	DB_DNODE_EXIT(db);
+
+	/* Check if this dbuf is one of the types we care about */
+	if (DMU_OT_IS_METADATA_CACHED(type)) {
+		/* If we hit this, then we set something up wrong in dmu_ot */
+		ASSERT(DMU_OT_IS_METADATA(type));
+
+		/*
+		 * Sanity check for small-memory systems: don't allocate too
+		 * much memory for this purpose.
+		 */
+		if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+		    dbuf_metadata_cache_max_bytes) {
+			dbuf_metadata_cache_overflow++;
+			DTRACE_PROBE1(dbuf__metadata__cache__overflow,
+			    dmu_buf_impl_t *, db);
+			return (B_FALSE);
+		}
+
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
  * This function *must* return indices evenly distributed between all
  * sublists of the multilist. This is needed due to how the dbuf eviction
  * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
@@ -438,7 +508,7 @@ dbuf_cache_above_hiwater(void)
 	uint64_t dbuf_cache_hiwater_bytes =
 	    (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
 
-	return (refcount_count(&dbuf_cache_size) >
+	return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
 	    dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
 }
 
@@ -448,7 +518,7 @@ dbuf_cache_above_lowater(void)
 	uint64_t dbuf_cache_lowater_bytes =
 	    (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
 
-	return (refcount_count(&dbuf_cache_size) >
+	return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
 	    dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
 }
 
@@ -458,8 +528,9 @@ dbuf_cache_above_lowater(void)
 static void
 dbuf_evict_one(void)
 {
-	int idx = multilist_get_random_index(dbuf_cache);
-	multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
+	int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+	multilist_sublist_t *mls = multilist_sublist_lock(
+	    dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
@@ -482,8 +553,10 @@ dbuf_evict_one(void)
 	if (db != NULL) {
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		(void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
 		    db->db.db_size, db);
+		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 	} else {
 		multilist_sublist_unlock(mls);
@@ -570,7 +643,8 @@ dbuf_evict_notify(void)
 	 * because it's OK to occasionally make the wrong decision here,
 	 * and grabbing the lock results in massive lock contention.
 	 */
-	if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+	if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+	    dbuf_cache_max_bytes) {
 		if (dbuf_cache_above_hiwater())
 			dbuf_evict_one();
 		cv_signal(&dbuf_evict_cv);
@@ -610,15 +684,21 @@ retry:
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
 	/*
-	 * Setup the parameters for the dbuf cache. We set the size of the
-	 * dbuf cache to 1/32nd (default) of the size of the ARC. If the value
-	 * has been set in /etc/system and it's not greater than the size of
-	 * the ARC, then we honor that value.
+	 * Setup the parameters for the dbuf caches. We set the sizes of the
+	 * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
+	 * of the size of the ARC, respectively. If the values are set in
+	 * /etc/system and they're not greater than the size of the ARC, then
+	 * we honor that value.
 	 */
 	if (dbuf_cache_max_bytes == 0 ||
 	    dbuf_cache_max_bytes >= arc_max_bytes())  {
 		dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
 	}
+	if (dbuf_metadata_cache_max_bytes == 0 ||
+	    dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
+		dbuf_metadata_cache_max_bytes =
+		    arc_max_bytes() >> dbuf_metadata_cache_shift;
+	}
 
 	/*
 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
@@ -626,10 +706,13 @@ retry:
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
 
-	dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
-	    offsetof(dmu_buf_impl_t, db_cache_link),
-	    dbuf_cache_multilist_index_func);
-	refcount_create(&dbuf_cache_size);
+	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+		dbuf_caches[dcs].cache =
+		    multilist_create(sizeof (dmu_buf_impl_t),
+		    offsetof(dmu_buf_impl_t, db_cache_link),
+		    dbuf_cache_multilist_index_func);
+		refcount_create(&dbuf_caches[dcs].size);
+	}
 
 	tsd_create(&zfs_dbuf_evict_key, NULL);
 	dbuf_evict_thread_exit = B_FALSE;
@@ -663,8 +746,10 @@ dbuf_fini(void)
 	mutex_destroy(&dbuf_evict_lock);
 	cv_destroy(&dbuf_evict_cv);
 
-	refcount_destroy(&dbuf_cache_size);
-	multilist_destroy(dbuf_cache);
+	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+		refcount_destroy(&dbuf_caches[dcs].size);
+		multilist_destroy(dbuf_caches[dcs].cache);
+	}
 }
 
 /*
@@ -2051,9 +2136,15 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	dbuf_clear_data(db);
 
 	if (multilist_link_active(&db->db_cache_link)) {
-		multilist_remove(dbuf_cache, db);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+		(void) refcount_remove_many(
+		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
+
+		db->db_caching_status = DB_NO_CACHE;
 	}
 
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@@ -2107,6 +2198,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
+	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 
 	kmem_cache_free(dbuf_kmem_cache, db);
@@ -2245,6 +2337,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
+		db->db_caching_status = DB_NO_CACHE;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 		return (db);
@@ -2277,6 +2370,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid
 	avl_add(&dn->dn_dbufs, db);
 
 	db->db_state = DB_UNCACHED;
+	db->db_caching_status = DB_NO_CACHE;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 
@@ -2619,9 +2713,15 @@ top:
 
 	if (multilist_link_active(&db->db_cache_link)) {
 		ASSERT(refcount_is_zero(&db->db_holds));
-		multilist_remove(dbuf_cache, db);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+		(void) refcount_remove_many(
+		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
+
+		db->db_caching_status = DB_NO_CACHE;
 	}
 	(void) refcount_add(&db->db_holds, tag);
 	DBUF_VERIFY(db);
@@ -2838,12 +2938,22 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 			    db->db_pending_evict) {
 				dbuf_destroy(db);
 			} else if (!multilist_link_active(&db->db_cache_link)) {
-				multilist_insert(dbuf_cache, db);
-				(void) refcount_add_many(&dbuf_cache_size,
+				ASSERT3U(db->db_caching_status, ==,
+				    DB_NO_CACHE);
+
+				dbuf_cached_state_t dcs =
+				    dbuf_include_in_metadata_cache(db) ?
+				    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+				db->db_caching_status = dcs;
+
+				multilist_insert(dbuf_caches[dcs].cache, db);
+				(void) refcount_add_many(&dbuf_caches[dcs].size,
 				    db->db.db_size, db);
 				mutex_exit(&db->db_mtx);
 
-				dbuf_evict_notify();
+				if (db->db_caching_status == DB_DBUF_CACHE) {
+					dbuf_evict_notify();
+				}
 			}
 
 			if (do_arc_evict)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Tue Jul 31 00:56:41 2018	(r336958)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Tue Jul 31 00:58:21 2018	(r336959)
@@ -79,60 +79,60 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_per
 int zfs_object_remap_one_indirect_delay_ticks = 0;
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
-	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
-	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
-	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
-	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
-	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
-	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
-	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
-	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
-	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
-	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
-	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
-	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
-	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
-	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
-	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
-	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
-	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
-	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "unallocated"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "object directory"		},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "object array"		},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "packed nvlist"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "packed nvlist size"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj"			},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj header"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map header"	},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "ZIL intent log"		},
+	{ DMU_BSWAP_DNODE,  TRUE,  FALSE,  "DMU dnode"			},
+	{ DMU_BSWAP_OBJSET, TRUE,  TRUE,   "DMU objset"			},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL directory"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL directory child map"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset snap map"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL props"			},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL dataset"		},
+	{ DMU_BSWAP_ZNODE,  TRUE,  FALSE,  "ZFS znode"			},
+	{ DMU_BSWAP_OLDACL, TRUE,  FALSE,  "ZFS V0 ACL"			},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "ZFS plain file"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS directory"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS master node"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS delete queue"		},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "zvol object"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "zvol prop"			},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "other uint8[]"		},
+	{ DMU_BSWAP_UINT64, FALSE, FALSE,  "other uint64[]"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "other ZAP"			},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "persistent error log"	},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "SPA history"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA history offsets"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "Pool properties"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL permissions"		},
+	{ DMU_BSWAP_ACL,    TRUE,  FALSE,  "ZFS ACL"			},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "ZFS SYSACL"			},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "FUID table"			},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "FUID table size"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset next clones"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan work queue"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group used"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group quota"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "snapshot refcount tags"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT ZAP algorithm"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT statistics"		},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "System attributes"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA master node"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr registration"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr layouts"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan translations"		},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "deduplicated block"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL deadlist map"		},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL deadlist map hdr"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dir clones"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj subobj"		}
 };
 
 const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Tue Jul 31 00:56:41 2018	(r336958)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Tue Jul 31 00:58:21 2018	(r336959)
@@ -498,6 +498,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
+	/*
+	 * These properties will be filled in by the logic in zfs_get_zplprop()
+	 * when they are queried for the first time.
+	 */
+	os->os_version = OBJSET_PROP_UNINITIALIZED;
+	os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+	os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+	os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
 
 	if (ds == NULL || !ds->ds_is_snapshot)
 		os->os_zil_header = os->os_phys->os_zil_header;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 31 00:56:41 2018	(r336958)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Tue Jul 31 00:58:21 2018	(r336959)
@@ -83,6 +83,13 @@ typedef enum dbuf_states {
 	DB_EVICTING
 } dbuf_states_t;
 
+typedef enum dbuf_cached_state {
+	DB_NO_CACHE = -1,
+	DB_DBUF_CACHE,
+	DB_DBUF_METADATA_CACHE,
+	DB_CACHE_MAX
+} dbuf_cached_state_t;
+
 struct dnode;
 struct dmu_tx;
 
@@ -229,10 +236,11 @@ typedef struct dmu_buf_impl {
 	 */
 	avl_node_t db_link;
 
-	/*
-	 * Link in dbuf_cache.
-	 */
+	/* Link in dbuf_cache or dbuf_metadata_cache */
 	multilist_node_t db_cache_link;
+
+	/* Tells us which dbuf cache this dbuf is in, if any */
+	dbuf_cached_state_t db_caching_status;
 
 	/* Data which is unique to data (leaf) blocks: */
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Tue Jul 31 00:56:41 2018	(r336958)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Tue Jul 31 00:58:21 2018	(r336959)
@@ -109,7 +109,8 @@ typedef enum dmu_object_byteswap {
 /*
  * Defines a uint8_t object type. Object types specify if the data
  * in the object is metadata (boolean) and how to byteswap the data
- * (dmu_object_byteswap_t).
+ * (dmu_object_byteswap_t). All of the types created by this method
+ * are cached in the dbuf metadata cache.
  */
 #define	DMU_OT(byteswap, metadata) \
 	(DMU_OT_NEWTYPE | \
@@ -124,6 +125,9 @@ typedef enum dmu_object_byteswap {
 	((ot) & DMU_OT_METADATA) : \
 	dmu_ot[(ot)].ot_metadata)
 
+#define	DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
+
 /*
  * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
  * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
@@ -810,6 +814,7 @@ typedef void arc_byteswap_func_t(void *buf, size_t siz
 typedef struct dmu_object_type_info {
 	dmu_object_byteswap_t	ot_byteswap;
 	boolean_t		ot_metadata;
+	boolean_t		ot_dbuf_metadata_cache;
 	char			*ot_name;
 } dmu_object_type_info_t;
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	Tue Jul 31 00:56:41 2018	(r336958)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	Tue Jul 31 00:58:21 2018	(r336959)
@@ -39,6 +39,7 @@
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sa.h>
+#include <sys/zfs_ioctl.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -69,6 +70,7 @@ typedef struct objset_phys {
 	dnode_phys_t os_groupused_dnode;
 } objset_phys_t;
 
+#define	OBJSET_PROP_UNINITIALIZED	((uint64_t)-1)
 struct objset {
 	/* Immutable: */
 	struct dsl_dataset *os_dsl_dataset;
@@ -100,6 +102,16 @@ struct objset {
 	zfs_sync_type_t os_sync;
 	zfs_redundant_metadata_type_t os_redundant_metadata;
 	int os_recordsize;
+	/*
+	 * The next four values are used as a cache of whatever's on disk, and
+	 * are initialized the first time these properties are queried. Before
+	 * being initialized with their real values, their values are
+	 * OBJSET_PROP_UNINITIALIZED.
+	 */
+	uint64_t os_version;
+	uint64_t os_normalization;
+	uint64_t os_utf8only;
+	uint64_t os_casesensitivity;
 
 	/*
 	 * Pointer is constant; the blkptr it points to is protected by

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	Tue Jul 31 00:56:41 2018	(r336958)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	Tue Jul 31 00:58:21 2018	(r336959)
@@ -2630,6 +2630,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
+	os->os_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
 
@@ -2642,17 +2643,47 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
-	const char *pname;
-	int error = ENOENT;
+	uint64_t *cached_copy = NULL;
 
 	/*
-	 * Look up the file system's value for the property.  For the
-	 * version property, we look up a slightly different string.
+	 * Figure out where in the objset_t the cached copy would live, if it
+	 * is available for the requested property.
 	 */
-	if (prop == ZFS_PROP_VERSION)
+	if (os != NULL) {
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			cached_copy = &os->os_version;
+			break;
+		case ZFS_PROP_NORMALIZE:
+			cached_copy = &os->os_normalization;
+			break;
+		case ZFS_PROP_UTF8ONLY:
+			cached_copy = &os->os_utf8only;
+			break;
+		case ZFS_PROP_CASE:
+			cached_copy = &os->os_casesensitivity;
+			break;
+		default:
+			break;
+		}
+	}
+	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+		*value = *cached_copy;
+		return (0);
+	}
+
+	/*
+	 * If the property wasn't cached, look up the file system's value for
+	 * the property. For the version property, we look up a slightly
+	 * different string.
+	 */
+	const char *pname;
+	int error = ENOENT;
+	if (prop == ZFS_PROP_VERSION) {
 		pname = ZPL_VERSION_STR;
-	else
+	} else {
 		pname = zfs_prop_to_name(prop);
+	}
 
 	if (os != NULL) {
 		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
@@ -2677,6 +2708,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_
 		}
 		error = 0;
 	}
+
+	/*
+	 * If one of the methods for getting the property value above worked,
+	 * copy it into the objset_t's cache.
+	 */
+	if (error == 0 && cached_copy != NULL) {
+		*cached_copy = *value;
+	}
+
 	return (error);
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201807310058.w6V0wMZw010410>