Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 14 Oct 2016 12:03:04 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r307318 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
Message-ID:  <201610141203.u9EC34mk074056@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Fri Oct 14 12:03:04 2016
New Revision: 307318
URL: https://svnweb.freebsd.org/changeset/base/307318

Log:
  MFV r307314:
  6988 spa_sync() spends half its time in dmu_objset_do_userquota_updates
  
  Using a benchmark which creates 2 million files in one TXG, I observe
  that the thread running spa_sync() is on CPU almost the entire time we
  are syncing, and therefore can be a performance bottleneck. About 50% of
  the time in spa_sync() is in dmu_objset_do_userquota_updates().
  
  The problem is that dmu_objset_do_userquota_updates() calls
  zap_increment_int(DMU_USERUSED_OBJECT) once for every file that was
  modified (or created). In this benchmark, all the files are owned by the
  same user/group, so all 2 million calls to zap_increment_int() are
  modifying the same entry in the zap. The same issue exists for the
  DMU_GROUPUSED_OBJECT.
  
  We should keep an in-memory map from user to space delta while we are
  syncing, and when we finish, iterate over the in-memory map and modify
  the ZAP once per entry. This reduces the number of calls to
  zap_increment_int() from "number of objects modified" to "number of
  owners/groups of modified files".
  
  This reduced the time spent in spa_sync() in the file create benchmark
  by ~33%, from 11 seconds to 7 seconds.
  
  Closes #107
  
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Reviewed by: Steve Gonczi <steve.gonczi@delphix.com>
  Reviewed by: Ned Bass <bass6@llnl.gov>
  Reviewed by: Jinshan Xiong <jinshan.xiong@intel.com>
  Author: Matthew Ahrens <mahrens@delphix.com>
  
  openzfs/openzfs@5fc46359c569369d87728ca09f8705cdff6cc8e2

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Fri Oct 14 12:01:33 2016	(r307317)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Fri Oct 14 12:03:04 2016	(r307318)
@@ -1207,18 +1207,83 @@ dmu_objset_userused_enabled(objset_t *os
 	    DMU_USERUSED_DNODE(os) != NULL);
 }
 
+typedef struct userquota_node {
+	uint64_t uqn_id;
+	int64_t uqn_delta;
+	avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+	avl_tree_t uqc_user_deltas;
+	avl_tree_t uqc_group_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+	const userquota_node_t *luqn = l;
+	const userquota_node_t *ruqn = r;
+
+	if (luqn->uqn_id < ruqn->uqn_id)
+		return (-1);
+	if (luqn->uqn_id > ruqn->uqn_id)
+		return (1);
+	return (0);
+}
+
 static void
-do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
-    uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
+{
+	void *cookie;
+	userquota_node_t *uqn;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_user_deltas);
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_group_deltas);
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
+{
+	userquota_node_t search = { .uqn_id = id };
+	avl_index_t idx;
+
+	userquota_node_t *uqn = avl_find(avl, &search, &idx);
+	if (uqn == NULL) {
+		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+		uqn->uqn_id = id;
+		avl_insert(avl, uqn, idx);
+	}
+	uqn->uqn_delta += delta;
+}
+
+static void
+do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
+    uint64_t user, uint64_t group, boolean_t subtract)
 {
 	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
 		int64_t delta = DNODE_SIZE + used;
 		if (subtract)
 			delta = -delta;
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
-		    user, delta, tx));
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
-		    group, delta, tx));
+
+		userquota_update_cache(&cache->uqc_user_deltas, user, delta);
+		userquota_update_cache(&cache->uqc_group_deltas, group, delta);
 	}
 }
 
@@ -1227,9 +1292,15 @@ dmu_objset_do_userquota_updates(objset_t
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
+	userquota_cache_t cache = { 0 };
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
+	avl_create(&cache.uqc_user_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+	avl_create(&cache.uqc_group_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+
 	while (dn = list_head(list)) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
@@ -1239,32 +1310,26 @@ dmu_objset_do_userquota_updates(objset_t
 
 		/* Allocate the user/groupused objects if necessary. */
 		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
-			VERIFY(0 == zap_create_claim(os,
+			VERIFY0(zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
-			VERIFY(0 == zap_create_claim(os,
+			VERIFY0(zap_create_claim(os,
 			    DMU_GROUPUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 
-		/*
-		 * We intentionally modify the zap object even if the
-		 * net delta is zero.  Otherwise
-		 * the block of the zap obj could be shared between
-		 * datasets but need to be different between them after
-		 * a bprewrite.
-		 */
-
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		if (flags & DN_ID_OLD_EXIST)  {
-			do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
-			    dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+			do_userquota_update(&cache,
+			    dn->dn_oldused, dn->dn_oldflags,
+			    dn->dn_olduid, dn->dn_oldgid, B_TRUE);
 		}
 		if (flags & DN_ID_NEW_EXIST) {
-			do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+			do_userquota_update(&cache,
+			    DN_USED_BYTES(dn->dn_phys),
 			    dn->dn_phys->dn_flags,  dn->dn_newuid,
-			    dn->dn_newgid, B_FALSE, tx);
+			    dn->dn_newgid, B_FALSE);
 		}
 
 		mutex_enter(&dn->dn_mtx);
@@ -1285,6 +1350,7 @@ dmu_objset_do_userquota_updates(objset_t
 		list_remove(list, dn);
 		dnode_rele(dn, list);
 	}
+	do_userquota_cacheflush(os, &cache, tx);
 }
 
 /*



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201610141203.u9EC34mk074056>