Date:      Mon, 20 Apr 2015 00:24:33 +0000 (UTC)
From:      Xin LI <delphij@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-user@freebsd.org
Subject:   svn commit: r281757 - in user/delphij/zfs-arc-rebase: cddl/contrib/opensolaris/lib/libzpool/common/sys sys/cddl/compat/opensolaris/sys sys/cddl/contrib/opensolaris/uts/common sys/cddl/contrib/opens...
Message-ID:  <201504200024.t3K0OXhk000726@svn.freebsd.org>

Author: delphij
Date: Mon Apr 20 00:24:32 2015
New Revision: 281757
URL: https://svnweb.freebsd.org/changeset/base/281757

Log:
  MFV r277431,281755:
  
  Reduce ARC lock contention by using finer-grained locking in the ARC
  LRU lists: give each ARC state separate lists with their own locks,
  and rework the ARC reclamation logic.
  
  Illumos issue:
      5497 lock contention on arcs_mtx
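
  For illustration only: below is a minimal userland C sketch of the idea
  behind the new multilist (split each ARC state's LRU list into several
  sublists, each with its own lock, so concurrent inserts and removals
  rarely contend on a single arcs_mtx).  It is not the actual multilist
  API added in multilist.c/multilist.h, and all names in the sketch are
  hypothetical.

  #include <pthread.h>
  #include <stdint.h>
  #include <stdlib.h>

  struct ml_node {
  	struct ml_node	*next;
  	unsigned	 sublist;	/* sublist this node was hashed to */
  };

  struct ml_sketch {
  	unsigned	  num_sublists;	/* e.g. one per CPU */
  	pthread_mutex_t	 *locks;	/* one lock per sublist */
  	struct ml_node	**heads;	/* one list head per sublist */
  };

  static int
  ml_sketch_create(struct ml_sketch *ml, unsigned n)
  {
  	ml->num_sublists = n;
  	ml->locks = calloc(n, sizeof (*ml->locks));
  	ml->heads = calloc(n, sizeof (*ml->heads));
  	if (ml->locks == NULL || ml->heads == NULL)
  		return (-1);
  	for (unsigned i = 0; i < n; i++)
  		pthread_mutex_init(&ml->locks[i], NULL);
  	return (0);
  }

  static void
  ml_sketch_insert(struct ml_sketch *ml, struct ml_node *node)
  {
  	/* spread insertions across sublists; the real code hashes the header */
  	unsigned idx = (unsigned)(((uintptr_t)node >> 6) % ml->num_sublists);

  	pthread_mutex_lock(&ml->locks[idx]);
  	node->sublist = idx;	/* remember which sublist to remove from */
  	node->next = ml->heads[idx];
  	ml->heads[idx] = node;
  	pthread_mutex_unlock(&ml->locks[idx]);
  }

  The per-sublist locks are the point of the last function: two threads
  inserting different headers will usually take different locks, instead
  of serializing on one list-wide mutex.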

Added:
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/multilist.c
     - copied unchanged from r277431, vendor-sys/illumos/dist/uts/common/fs/zfs/multilist.c
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/multilist.h
     - copied unchanged from r277431, vendor-sys/illumos/dist/uts/common/fs/zfs/sys/multilist.h
Modified:
  user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
  user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/proc.h
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
Directory Properties:
  user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/   (props changed)
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/   (props changed)

Modified: user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
==============================================================================
--- user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h	Sun Apr 19 23:55:59 2015	(r281756)
+++ user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h	Mon Apr 20 00:24:32 2015	(r281757)
@@ -545,6 +545,7 @@ extern void delay(clock_t ticks);
 	} while (0);
 
 #define	max_ncpus	64
+#define	boot_ncpus	(sysconf(_SC_NPROCESSORS_ONLN))
 
 #define	minclsyspri	60
 #define	maxclsyspri	99

Modified: user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/proc.h
==============================================================================
--- user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/proc.h	Sun Apr 19 23:55:59 2015	(r281756)
+++ user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/proc.h	Mon Apr 20 00:24:32 2015	(r281757)
@@ -47,6 +47,7 @@
 #define	maxclsyspri	PVM
 #define	max_ncpus	mp_ncpus
 #define	boot_max_ncpus	mp_ncpus
+#define	boot_ncpus	mp_ncpus
 
 #define	TS_RUN	0
 

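The two hunks above provide boot_ncpus in both the userland libzpool
compatibility header (via sysconf(3)) and the kernel opensolaris compat
header (mp_ncpus).  Presumably arc_init() (not shown in this truncated
diff) uses it to default the new zfs_arc_num_sublists_per_state tunable,
introduced later in arc.c, to the number of CPUs.  A minimal sketch of
that kind of use follows; the helper name is made up:

#include <unistd.h>

/* userland fallback, mirroring the zfs_context.h hunk above */
#ifndef boot_ncpus
#define	boot_ncpus	(sysconf(_SC_NPROCESSORS_ONLN))
#endif

/* 0 means "pick a value automatically at arc_init() time" */
int zfs_arc_num_sublists_per_state = 0;

static int
arc_sublists_per_state(void)
{
	long ncpus;

	if (zfs_arc_num_sublists_per_state >= 1)
		return (zfs_arc_num_sublists_per_state);
	ncpus = boot_ncpus;
	return (ncpus > 0 ? (int)ncpus : 1);
}
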
Modified: user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
==============================================================================
--- user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/Makefile.files	Sun Apr 19 23:55:59 2015	(r281756)
+++ user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/Makefile.files	Mon Apr 20 00:24:32 2015	(r281757)
@@ -21,9 +21,9 @@
 
 #
 # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
 # Copyright (c) 2013 by Delphix. All rights reserved.
 # Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+# Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
 #
 #
 # This Makefile defines all file modules for the directory uts/common
@@ -32,6 +32,7 @@
 
 ZFS_COMMON_OBJS +=		\
 	arc.o			\
+	blkptr.o		\
 	bplist.o		\
 	blkptr.o		\
 	bpobj.o			\
@@ -65,6 +66,7 @@ ZFS_COMMON_OBJS +=		\
 	lz4.o			\
 	lzjb.o			\
 	metaslab.o		\
+	multilist.o		\
 	range_tree.o		\
 	refcount.o		\
 	rrwlock.o		\

Modified: user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sun Apr 19 23:55:59 2015	(r281756)
+++ user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Mon Apr 20 00:24:32 2015	(r281757)
@@ -129,6 +129,7 @@
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
+#include <sys/multilist.h>
 #ifdef _KERNEL
 #include <sys/dnlc.h>
 #endif
@@ -149,21 +150,39 @@ int arc_procfd;
 #endif
 #endif /* illumos */
 
-static kmutex_t		arc_reclaim_thr_lock;
-static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
-static uint8_t		arc_thread_exit;
+static kmutex_t		arc_reclaim_lock;
+static kcondvar_t	arc_reclaim_thread_cv;
+static boolean_t	arc_reclaim_thread_exit;
+static kcondvar_t	arc_reclaim_waiters_cv;
+
+static kmutex_t		arc_user_evicts_lock;
+static kcondvar_t	arc_user_evicts_cv;
+static boolean_t	arc_user_evicts_thread_exit;
 
 uint_t arc_reduce_dnlc_percent = 3;
 
 /*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
  */
-int arc_evict_iterations = 100;
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
 
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int		zfs_arc_overflow_shift = 8;
+
 /* shift of arc_c for calculating both min and max arc_p */
 static int		arc_p_min_shift = 4;
 
@@ -316,10 +335,19 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta
  */
 
 typedef struct arc_state {
-	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
-	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
-	uint64_t arcs_size;	/* total amount of data in this state */
-	kmutex_t arcs_mtx;
+	/*
+	 * list of evictable buffers
+	 */
+	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of evictable data in this state
+	 */
+	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of data in this state; this includes: evictable,
+	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+	 */
+	uint64_t arcs_size;
 } arc_state_t;
 
 /* The 6 states: */
@@ -347,8 +375,6 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_allocated;
 	kstat_named_t arcstat_deleted;
-	kstat_named_t arcstat_stolen;
-	kstat_named_t arcstat_recycle_miss;
 	/*
 	 * Number of buffers that could not be evicted because the hash lock
 	 * was held by another thread.  The lock may not necessarily be held
@@ -362,9 +388,15 @@ typedef struct arc_stats {
 	 * not from the spa we're trying to evict from.
 	 */
 	kstat_named_t arcstat_evict_skip;
+	/*
+	 * Number of times arc_evict_state() was unable to evict enough
+	 * buffers to reach it's target amount.
+	 */
+	kstat_named_t arcstat_evict_not_enough;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
 	kstat_named_t arcstat_evict_l2_ineligible;
+	kstat_named_t arcstat_evict_l2_skip;
 	kstat_named_t arcstat_hash_elements;
 	kstat_named_t arcstat_hash_elements_max;
 	kstat_named_t arcstat_hash_collisions;
@@ -515,11 +547,12 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_writes_sent;
 	kstat_named_t arcstat_l2_writes_done;
 	kstat_named_t arcstat_l2_writes_error;
-	kstat_named_t arcstat_l2_writes_hdr_miss;
+	kstat_named_t arcstat_l2_writes_lock_retry;
 	kstat_named_t arcstat_l2_evict_lock_retry;
 	kstat_named_t arcstat_l2_evict_reading;
 	kstat_named_t arcstat_l2_evict_l1cached;
 	kstat_named_t arcstat_l2_free_on_write;
+	kstat_named_t arcstat_l2_cdata_free_on_write;
 	kstat_named_t arcstat_l2_abort_lowmem;
 	kstat_named_t arcstat_l2_cksum_bad;
 	kstat_named_t arcstat_l2_io_error;
@@ -535,7 +568,6 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_write_buffer_iter;
 	kstat_named_t arcstat_l2_write_pios;
 	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
-	kstat_named_t arcstat_l2_write_buffer_list_iter;
 	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 	kstat_named_t arcstat_memory_throttle_count;
 	kstat_named_t arcstat_duplicate_buffers;
@@ -564,13 +596,13 @@ static arc_stats_t arc_stats = {
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "allocated",			KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
-	{ "stolen",			KSTAT_DATA_UINT64 },
-	{ "recycle_miss",		KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
+	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
+	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
@@ -609,11 +641,12 @@ static arc_stats_t arc_stats = {
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
-	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
+	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
+	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
@@ -629,7 +662,6 @@ static arc_stats_t arc_stats = {
 	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
 	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
-	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
 	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
@@ -781,7 +813,7 @@ typedef struct l1arc_buf_hdr {
 
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
-	list_node_t		b_arc_node;
+	multilist_node_t	b_arc_node;
 
 	/* updated atomically */
 	clock_t			b_arc_access;
@@ -852,7 +884,6 @@ sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HAN
 #endif
 
 static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
 static arc_buf_hdr_t arc_eviction_hdr;
 
 #define	GHOST_STATE(state)	\
@@ -1071,8 +1102,7 @@ static uint8_t l2arc_thread_exit;
 
 static void arc_get_data_buf(arc_buf_t *);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static int arc_evict_needed(arc_buf_contents_t);
-static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
+static boolean_t arc_is_overflowing();
 static void arc_buf_watch(arc_buf_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
@@ -1253,6 +1283,7 @@ hdr_full_cons(void *vbuf, void *unused, 
 	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
 	refcount_create(&hdr->b_l1hdr.b_refcnt);
 	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 
 	return (0);
@@ -1297,6 +1328,7 @@ hdr_full_dest(void *vbuf, void *unused)
 	cv_destroy(&hdr->b_l1hdr.b_cv);
 	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
@@ -1333,7 +1365,7 @@ hdr_recl(void *unused)
 	 * which is after we do arc_fini().
 	 */
 	if (!arc_dead)
-		cv_signal(&arc_reclaim_thr_cv);
+		cv_signal(&arc_reclaim_thread_cv);
 }
 
 static void
@@ -1411,18 +1443,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem
 		 * l2c_only even though it's about to change.
 		 */
 		nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify previous threads set to NULL before freeing */
+		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
 	} else {
 		ASSERT(hdr->b_l1hdr.b_buf == NULL);
 		ASSERT0(hdr->b_l1hdr.b_datacnt);
-		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
 		/*
-		 * We might be removing the L1hdr of a buffer which was just
-		 * written out to L2ARC. If such a buffer is compressed then we
-		 * need to free its b_tmp_cdata before destroying the header.
-		 */
-		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
-		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
-			l2arc_release_cdata_buf(hdr);
+		 * If we've reached here, We must have been called from
+		 * arc_evict_hdr(), as such we should have already been
+		 * removed from any ghost list we were previously on
+		 * (which protects us from racing with arc_evict_state),
+		 * thus no locking is needed during this check.
+		 */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+		/*
+		 * A buffer must not be moved into the arc_l2c_only
+		 * state if it's not finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+		 * might try to be accessed, even though it was removed.
+		 */
+		VERIFY(!HDR_L2_WRITING(hdr));
+		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
 		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
 	}
 	/*
@@ -1641,14 +1686,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex
 	    (state != arc_anon)) {
 		/* We don't use the L2-only state list. */
 		if (state != arc_l2c_only) {
+			arc_buf_contents_t type = arc_buf_type(hdr);
 			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
-			list_t *list = &state->arcs_list[arc_buf_type(hdr)];
-			uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+			multilist_t *list = &state->arcs_list[type];
+			uint64_t *size = &state->arcs_lsize[type];
+
+			multilist_remove(list, hdr);
 
-			ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-			mutex_enter(&state->arcs_mtx);
-			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
-			list_remove(list, hdr);
 			if (GHOST_STATE(state)) {
 				ASSERT0(hdr->b_l1hdr.b_datacnt);
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
@@ -1657,7 +1701,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex
 			ASSERT(delta > 0);
 			ASSERT3U(*size, >=, delta);
 			atomic_add_64(size, -delta);
-			mutex_exit(&state->arcs_mtx);
 		}
 		/* remove the prefetch flag if we get a reference */
 		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
@@ -1680,22 +1723,21 @@ remove_reference(arc_buf_hdr_t *hdr, kmu
 	 */
 	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
 	    (state != arc_anon)) {
-		uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+		arc_buf_contents_t type = arc_buf_type(hdr);
+		multilist_t *list = &state->arcs_list[type];
+		uint64_t *size = &state->arcs_lsize[type];
+
+		multilist_insert(list, hdr);
 
-		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-		mutex_enter(&state->arcs_mtx);
-		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
-		list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr);
 		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
 		atomic_add_64(size, hdr->b_size *
 		    hdr->b_l1hdr.b_datacnt);
-		mutex_exit(&state->arcs_mtx);
 	}
 	return (cnt);
 }
 
 /*
- * Move the supplied buffer to the indicated state.  The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  */
 static void
@@ -1739,15 +1781,10 @@ arc_change_state(arc_state_t *new_state,
 	 */
 	if (refcnt == 0) {
 		if (old_state != arc_anon && old_state != arc_l2c_only) {
-			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
 			uint64_t *size = &old_state->arcs_lsize[buftype];
 
-			if (use_mutex)
-				mutex_enter(&old_state->arcs_mtx);
-
 			ASSERT(HDR_HAS_L1HDR(hdr));
-			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
-			list_remove(&old_state->arcs_list[buftype], hdr);
+			multilist_remove(&old_state->arcs_list[buftype], hdr);
 
 			/*
 			 * If prefetching out of the ghost cache,
@@ -1760,12 +1797,8 @@ arc_change_state(arc_state_t *new_state,
 			}
 			ASSERT3U(*size, >=, from_delta);
 			atomic_add_64(size, -from_delta);
-
-			if (use_mutex)
-				mutex_exit(&old_state->arcs_mtx);
 		}
 		if (new_state != arc_anon && new_state != arc_l2c_only) {
-			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
 			uint64_t *size = &new_state->arcs_lsize[buftype];
 
 			/*
@@ -1775,10 +1808,7 @@ arc_change_state(arc_state_t *new_state,
 			 * beforehand.
 			 */
 			ASSERT(HDR_HAS_L1HDR(hdr));
-			if (use_mutex)
-				mutex_enter(&new_state->arcs_mtx);
-
-			list_insert_head(&new_state->arcs_list[buftype], hdr);
+			multilist_insert(&new_state->arcs_list[buftype], hdr);
 
 			/* ghost elements have a ghost size */
 			if (GHOST_STATE(new_state)) {
@@ -1787,9 +1817,6 @@ arc_change_state(arc_state_t *new_state,
 				to_delta = hdr->b_size;
 			}
 			atomic_add_64(size, to_delta);
-
-			if (use_mutex)
-				mutex_exit(&new_state->arcs_mtx);
 		}
 	}
 
@@ -1811,8 +1838,8 @@ arc_change_state(arc_state_t *new_state,
 	 * L2 headers should never be on the L2 state list since they don't
 	 * have L1 headers allocated.
 	 */
-	ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
-	    list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
+	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
 }
 
 void
@@ -1905,6 +1932,7 @@ arc_buf_alloc(spa_t *spa, int32_t size, 
 	hdr->b_l1hdr.b_state = arc_anon;
 	hdr->b_l1hdr.b_arc_access = 0;
 	hdr->b_l1hdr.b_datacnt = 1;
+	hdr->b_l1hdr.b_tmp_cdata = NULL;
 
 	arc_get_data_buf(buf);
 	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -2034,6 +2062,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* ta
 	    data, metadata, hits);
 }
 
+static void
+arc_buf_free_on_write(void *data, size_t size,
+    void (*free_func)(void *, size_t))
+{
+	l2arc_data_free_t *df;
+
+	df = kmem_alloc(sizeof (*df), KM_SLEEP);
+	df->l2df_data = data;
+	df->l2df_size = size;
+	df->l2df_func = free_func;
+	mutex_enter(&l2arc_free_on_write_mtx);
+	list_insert_head(l2arc_free_on_write, df);
+	mutex_exit(&l2arc_free_on_write_mtx);
+}
+
 /*
  * Free the arc data buffer.  If it is an l2arc write in progress,
  * the buffer is placed on l2arc_free_on_write to be freed later.
@@ -2044,26 +2087,74 @@ arc_buf_data_free(arc_buf_t *buf, void (
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (HDR_L2_WRITING(hdr)) {
-		l2arc_data_free_t *df;
-		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
-		df->l2df_data = buf->b_data;
-		df->l2df_size = hdr->b_size;
-		df->l2df_func = free_func;
-		mutex_enter(&l2arc_free_on_write_mtx);
-		list_insert_head(l2arc_free_on_write, df);
-		mutex_exit(&l2arc_free_on_write_mtx);
+		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
 	} else {
 		free_func(buf->b_data, hdr->b_size);
 	}
 }
 
+static void
+arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L2HDR(hdr));
+	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
+
+	/*
+	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
+	 * that doesn't exist, the header is in the arc_l2c_only state,
+	 * and there isn't anything to free (it's already been freed).
+	 */
+	if (!HDR_HAS_L1HDR(hdr))
+		return;
+
+	/*
+	 * The header isn't being written to the l2arc device, thus it
+	 * shouldn't have a b_tmp_cdata to free.
+	 */
+	if (!HDR_L2_WRITING(hdr)) {
+		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+		return;
+	}
+
+	/*
+	 * The header does not have compression enabled. This can be due
+	 * to the buffer not being compressible, or because we're
+	 * freeing the buffer before the second phase of
+	 * l2arc_write_buffer() has started (which does the compression
+	 * step). In either case, b_tmp_cdata does not point to a
+	 * separately compressed buffer, so there's nothing to free (it
+	 * points to the same buffer as the arc_buf_t's b_data field).
+	 */
+	if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+		hdr->b_l1hdr.b_tmp_cdata = NULL;
+		return;
+	}
+
+	/*
+	 * There's nothing to free since the buffer was all zero's and
+	 * compressed to a zero length buffer.
+	 */
+	if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
+		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+		return;
+	}
+
+	ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
+
+	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+	    hdr->b_size, zio_data_buf_free);
+
+	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
+	hdr->b_l1hdr.b_tmp_cdata = NULL;
+}
+
 /*
  * Free up buf->b_data and if 'remove' is set, then pull the
  * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
  */
 static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
+arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
 {
 	arc_buf_t **bufp;
 
@@ -2078,17 +2169,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_
 		arc_buf_unwatch(buf);
 #endif
 
-		if (!recycle) {
-			if (type == ARC_BUFC_METADATA) {
-				arc_buf_data_free(buf, zio_buf_free);
-				arc_space_return(size, ARC_SPACE_META);
-			} else {
-				ASSERT(type == ARC_BUFC_DATA);
-				arc_buf_data_free(buf, zio_data_buf_free);
-				arc_space_return(size, ARC_SPACE_DATA);
-			}
+		if (type == ARC_BUFC_METADATA) {
+			arc_buf_data_free(buf, zio_buf_free);
+			arc_space_return(size, ARC_SPACE_META);
+		} else {
+			ASSERT(type == ARC_BUFC_DATA);
+			arc_buf_data_free(buf, zio_data_buf_free);
+			arc_space_return(size, ARC_SPACE_DATA);
 		}
-		if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
+
+		/* protected by hash lock, if in the hash table */
+		if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
 			uint64_t *cnt = &state->arcs_lsize[type];
 
 			ASSERT(refcount_is_zero(
@@ -2158,6 +2249,12 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 		    hdr->b_size, 0);
 		list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);
 
+		/*
+		 * We don't want to leak the b_tmp_cdata buffer that was
+		 * allocated in l2arc_write_buffers()
+		 */
+		arc_buf_l2_cdata_free(hdr);
+
 		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
 		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
 
@@ -2180,20 +2277,19 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
 
 			if (buf->b_efunc != NULL) {
-				mutex_enter(&arc_eviction_mtx);
+				mutex_enter(&arc_user_evicts_lock);
 				mutex_enter(&buf->b_evict_lock);
 				ASSERT(buf->b_hdr != NULL);
-				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
-				    FALSE);
+				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
 				hdr->b_l1hdr.b_buf = buf->b_next;
 				buf->b_hdr = &arc_eviction_hdr;
 				buf->b_next = arc_eviction_list;
 				arc_eviction_list = buf;
 				mutex_exit(&buf->b_evict_lock);
-				mutex_exit(&arc_eviction_mtx);
+				cv_signal(&arc_user_evicts_cv);
+				mutex_exit(&arc_user_evicts_lock);
 			} else {
-				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
-				    TRUE);
+				arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
 			}
 		}
 #ifdef ZFS_DEBUG
@@ -2206,7 +2302,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 	if (HDR_HAS_L1HDR(hdr)) {
-		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 		kmem_cache_free(hdr_full_cache, hdr);
 	} else {
@@ -2232,7 +2328,7 @@ arc_buf_free(arc_buf_t *buf, void *tag)
 
 		(void) remove_reference(hdr, hash_lock, tag);
 		if (hdr->b_l1hdr.b_datacnt > 1) {
-			arc_buf_destroy(buf, FALSE, TRUE);
+			arc_buf_destroy(buf, TRUE);
 		} else {
 			ASSERT(buf == hdr->b_l1hdr.b_buf);
 			ASSERT(buf->b_efunc == NULL);
@@ -2246,16 +2342,16 @@ arc_buf_free(arc_buf_t *buf, void *tag)
 		 * this buffer unless the write completes before we finish
 		 * decrementing the reference count.
 		 */
-		mutex_enter(&arc_eviction_mtx);
+		mutex_enter(&arc_user_evicts_lock);
 		(void) remove_reference(hdr, NULL, tag);
 		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-		mutex_exit(&arc_eviction_mtx);
+		mutex_exit(&arc_user_evicts_lock);
 		if (destroy_hdr)
 			arc_hdr_destroy(hdr);
 	} else {
 		if (remove_reference(hdr, NULL, tag) > 0)
-			arc_buf_destroy(buf, FALSE, TRUE);
+			arc_buf_destroy(buf, TRUE);
 		else
 			arc_hdr_destroy(hdr);
 	}
@@ -2284,7 +2380,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void*
 	(void) remove_reference(hdr, hash_lock, tag);
 	if (hdr->b_l1hdr.b_datacnt > 1) {
 		if (no_callback)
-			arc_buf_destroy(buf, FALSE, TRUE);
+			arc_buf_destroy(buf, TRUE);
 	} else if (no_callback) {
 		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
 		ASSERT(buf->b_efunc == NULL);
@@ -2345,429 +2441,675 @@ arc_buf_eviction_needed(arc_buf_t *buf)
 }
 
 /*
- * Evict buffers from list until we've removed the specified number of
- * bytes.  Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on it's state prior to entering this
+ * function. The following transitions are possible:
  *
- * This function makes a "best effort".  It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ *    - arc_mru -> arc_mru_ghost
+ *    - arc_mfu -> arc_mfu_ghost
+ *    - arc_mru_ghost -> arc_l2c_only
+ *    - arc_mru_ghost -> deleted
+ *    - arc_mfu_ghost -> arc_l2c_only
+ *    - arc_mfu_ghost -> deleted
  */
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
-    arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
-	arc_state_t *evicted_state;
-	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
-	kmutex_t *hash_lock;
-	boolean_t have_lock;
-	void *stolen = NULL;
-	arc_buf_hdr_t marker = { 0 };
-	int count = 0;
-
-	ASSERT(state == arc_mru || state == arc_mfu);
+	arc_state_t *evicted_state, *state;
+	int64_t bytes_evicted = 0;
 
-	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(HDR_HAS_L1HDR(hdr));
 
-	/*
-	 * The ghost list lock must be acquired first in order to prevent
-	 * a 3 party deadlock:
-	 *
-	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
-	 *    l2ad_mtx in arc_hdr_realloc
-	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
-	 *  - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
-	 *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
-	 *
-	 * This situation is avoided by acquiring the ghost list lock first.
-	 */
-	mutex_enter(&evicted_state->arcs_mtx);
-	mutex_enter(&state->arcs_mtx);
+	state = hdr->b_l1hdr.b_state;
+	if (GHOST_STATE(state)) {
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		ASSERT(hdr->b_l1hdr.b_buf == NULL);
 
-	/*
-	 * Decide which "type" (data vs metadata) to recycle from.
-	 *
-	 * If we are over the metadata limit, recycle from metadata.
-	 * If we are under the metadata minimum, recycle from data.
-	 * Otherwise, recycle from whichever type has the oldest (least
-	 * recently accessed) header.
-	 */
-	if (recycle) {
-		arc_buf_hdr_t *data_hdr =
-		    list_tail(&state->arcs_list[ARC_BUFC_DATA]);
-		arc_buf_hdr_t *metadata_hdr =
-		    list_tail(&state->arcs_list[ARC_BUFC_METADATA]);
-		arc_buf_contents_t realtype;
-
-		if (data_hdr == NULL) {
-			realtype = ARC_BUFC_METADATA;
-		} else if (metadata_hdr == NULL) {
-			realtype = ARC_BUFC_DATA;
-		} else if (arc_meta_used >= arc_meta_limit) {
-			realtype = ARC_BUFC_METADATA;
-		} else if (arc_meta_used <= arc_meta_min) {
-			realtype = ARC_BUFC_DATA;
-		} else if (HDR_HAS_L1HDR(data_hdr) &&
-		    HDR_HAS_L1HDR(metadata_hdr) &&
-		    data_hdr->b_l1hdr.b_arc_access <
-		    metadata_hdr->b_l1hdr.b_arc_access) {
-			realtype = ARC_BUFC_DATA;
-		} else {
-			realtype = ARC_BUFC_METADATA;
+		/*
+		 * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. it's b_tmp_cdata field) during it's write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing it's L1 piece) until the header is
+		 * done being written to the l2arc.
+		 */
+		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+			ARCSTAT_BUMP(arcstat_evict_l2_skip);
+			return (bytes_evicted);
 		}
-		if (realtype != type) {
+
+		ARCSTAT_BUMP(arcstat_deleted);
+		bytes_evicted += hdr->b_size;
+
+		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+		if (HDR_HAS_L2HDR(hdr)) {
 			/*
-			 * If we want to evict from a different list,
-			 * we can not recycle, because DATA vs METADATA
-			 * buffers are segregated into different kmem
-			 * caches (and vmem arenas).
+			 * This buffer is cached on the 2nd Level ARC;
+			 * don't destroy the header.
 			 */
-			type = realtype;
-			recycle = B_FALSE;
+			arc_change_state(arc_l2c_only, hdr, hash_lock);
+			/*
+			 * dropping from L1+L2 cached to L2-only,
+			 * realloc to remove the L1 header.
+			 */
+			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+			    hdr_l2only_cache);
+		} else {
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
 		}
+		return (bytes_evicted);
 	}
 
-	list_t *list = &state->arcs_list[type];
+	ASSERT(state == arc_mru || state == arc_mfu);
+	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
-	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
-		hdr_prev = list_prev(list, hdr);
-		/* prefetch buffers have a minimum lifespan */
-		if (HDR_IO_IN_PROGRESS(hdr) ||
-		    (spa && hdr->b_spa != spa) ||
-		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-		    arc_min_prefetch_lifespan)) {
-			skipped++;
-			continue;
+	/* prefetch buffers have a minimum lifespan */
+	if (HDR_IO_IN_PROGRESS(hdr) ||
+	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+	    arc_min_prefetch_lifespan)) {
+		ARCSTAT_BUMP(arcstat_evict_skip);
+		return (bytes_evicted);
+	}
+
+	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+	ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+	while (hdr->b_l1hdr.b_buf) {
+		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+		if (!mutex_tryenter(&buf->b_evict_lock)) {
+			ARCSTAT_BUMP(arcstat_mutex_miss);
+			break;
 		}
-		/* "lookahead" for better eviction candidate */
-		if (recycle && hdr->b_size != bytes &&
-		    hdr_prev && hdr_prev->b_size == bytes)
-			continue;
+		if (buf->b_data != NULL)
+			bytes_evicted += hdr->b_size;
+		if (buf->b_efunc != NULL) {
+			mutex_enter(&arc_user_evicts_lock);
+			arc_buf_destroy(buf, FALSE);
+			hdr->b_l1hdr.b_buf = buf->b_next;
+			buf->b_hdr = &arc_eviction_hdr;
+			buf->b_next = arc_eviction_list;
+			arc_eviction_list = buf;
+			cv_signal(&arc_user_evicts_cv);
+			mutex_exit(&arc_user_evicts_lock);
+			mutex_exit(&buf->b_evict_lock);
+		} else {
+			mutex_exit(&buf->b_evict_lock);
+			arc_buf_destroy(buf, TRUE);
+		}
+	}
 
-		/* ignore markers */
-		if (hdr->b_spa == 0)
-			continue;
+	if (HDR_HAS_L2HDR(hdr)) {
+		ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
+	} else {
+		if (l2arc_write_eligible(hdr->b_spa, hdr))
+			ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
+		else
+			ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
+	}
+
+	if (hdr->b_l1hdr.b_datacnt == 0) {
+		arc_change_state(evicted_state, hdr, hash_lock);
+		ASSERT(HDR_IN_HASH_TABLE(hdr));
+		hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
+		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+	}
+
+	return (bytes_evicted);
+}
+
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+    uint64_t spa, int64_t bytes)
+{
+	multilist_sublist_t *mls;
+	uint64_t bytes_evicted = 0;
+	arc_buf_hdr_t *hdr;
+	kmutex_t *hash_lock;
+	int evict_count = 0;
+
+	ASSERT3P(marker, !=, NULL);
+	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+	mls = multilist_sublist_lock(ml, idx);
+
+	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+	    hdr = multilist_sublist_prev(mls, marker)) {
+		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+		    (evict_count >= zfs_arc_evict_batch_limit))
+			break;
 
 		/*
-		 * It may take a long time to evict all the bufs requested.
-		 * To avoid blocking all arc activity, periodically drop
-		 * the arcs_mtx and give other threads a chance to run
-		 * before reacquiring the lock.
-		 *
-		 * If we are looking for a buffer to recycle, we are in
-		 * the hot code path, so don't sleep.
+		 * To keep our iteration location, move the marker
+		 * forward. Since we're not holding hdr's hash lock, we
+		 * must be very careful and not remove 'hdr' from the
+		 * sublist. Otherwise, other consumers might mistake the
+		 * 'hdr' as not being on a sublist when they call the
+		 * multilist_link_active() function (they all rely on
+		 * the hash lock protecting concurrent insertions and
+		 * removals). multilist_sublist_move_forward() was
+		 * specifically implemented to ensure this is the case
+		 * (only 'marker' will be removed and re-inserted).
+		 */
+		multilist_sublist_move_forward(mls, marker);
+
+		/*
+		 * The only case where the b_spa field should ever be
+		 * zero, is the marker headers inserted by
+		 * arc_evict_state(). It's possible for multiple threads
+		 * to be calling arc_evict_state() concurrently (e.g.
+		 * dsl_pool_close() and zio_inject_fault()), so we must
+		 * skip any markers we see from these other threads.
 		 */
-		if (!recycle && count++ > arc_evict_iterations) {
-			list_insert_after(list, hdr, &marker);
-			mutex_exit(&state->arcs_mtx);
-			mutex_exit(&evicted_state->arcs_mtx);
-			kpreempt(KPREEMPT_SYNC);
-			mutex_enter(&evicted_state->arcs_mtx);
-			mutex_enter(&state->arcs_mtx);
-			hdr_prev = list_prev(list, &marker);
-			list_remove(list, &marker);
-			count = 0;
+		if (hdr->b_spa == 0)
+			continue;
+
+		/* we're only interested in evicting buffers of a certain spa */
+		if (spa != 0 && hdr->b_spa != spa) {
+			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
 
 		hash_lock = HDR_LOCK(hdr);
-		have_lock = MUTEX_HELD(hash_lock);
-		if (have_lock || mutex_tryenter(hash_lock)) {
-			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
-			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
-			while (hdr->b_l1hdr.b_buf) {
-				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
-				if (!mutex_tryenter(&buf->b_evict_lock)) {
-					missed += 1;
-					break;
-				}
-				if (buf->b_data != NULL) {
-					bytes_evicted += hdr->b_size;
-					if (recycle &&
-					    arc_buf_type(hdr) == type &&
-					    hdr->b_size == bytes &&
-					    !HDR_L2_WRITING(hdr)) {
-						stolen = buf->b_data;
-						recycle = FALSE;
-					}
-				}
-				if (buf->b_efunc != NULL) {
-					mutex_enter(&arc_eviction_mtx);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, FALSE);
-					hdr->b_l1hdr.b_buf = buf->b_next;
-					buf->b_hdr = &arc_eviction_hdr;
-					buf->b_next = arc_eviction_list;
-					arc_eviction_list = buf;
-					mutex_exit(&arc_eviction_mtx);
-					mutex_exit(&buf->b_evict_lock);
-				} else {
-					mutex_exit(&buf->b_evict_lock);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, TRUE);
-				}
-			}
 
-			if (HDR_HAS_L2HDR(hdr)) {
-				ARCSTAT_INCR(arcstat_evict_l2_cached,
-				    hdr->b_size);
-			} else {
-				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
-					ARCSTAT_INCR(arcstat_evict_l2_eligible,
-					    hdr->b_size);
-				} else {
-					ARCSTAT_INCR(
-					    arcstat_evict_l2_ineligible,
-					    hdr->b_size);
-				}
-			}
+		/*
+		 * We aren't calling this function from any code path
+		 * that would already be holding a hash lock, so we're
+		 * asserting on this assumption to be defensive in case
+		 * this ever changes. Without this check, it would be
+		 * possible to incorrectly increment arcstat_mutex_miss
+		 * below (e.g. if the code changed such that we called
+		 * this function with a hash lock held).
+		 */
+		ASSERT(!MUTEX_HELD(hash_lock));

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
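
The comment in arc_evict_state_impl() above describes walking a sublist
with a marker header: the marker keeps the scan position, only the marker
is ever relinked while iterating, and markers inserted by other threads
(recognizable because their b_spa is zero) are skipped.  A minimal sketch
of that pattern follows; the names are hypothetical, and marker
insertion/removal as well as the sublist and hash locking are omitted:

#include <stddef.h>

struct node {
	struct node *prev, *next;
	int	     is_marker;		/* real headers have is_marker == 0 */
};

struct sublist {
	struct node *head, *tail;	/* head = MRU end, tail = LRU end */
};

/*
 * Swap the marker with the node just before it (toward the head) so the
 * scan position advances while only the marker is ever unlinked/relinked.
 */
static void
marker_step(struct sublist *sl, struct node *marker)
{
	struct node *a = marker->prev;	/* the node just examined */

	if (a == NULL)
		return;
	marker->prev = a->prev;
	if (a->prev != NULL)
		a->prev->next = marker;
	else
		sl->head = marker;
	a->next = marker->next;
	if (marker->next != NULL)
		marker->next->prev = a;
	else
		sl->tail = a;
	marker->next = a;
	a->prev = marker;
}

/* Scan from the LRU (tail) end in batches; evict_one() is a placeholder. */
static void
evict_batch(struct sublist *sl, struct node *marker, int batch,
    void (*evict_one)(struct node *))
{
	struct node *n;
	int count = 0;

	for (n = marker->prev; n != NULL && count < batch; n = marker->prev) {
		marker_step(sl, marker);	/* advance past 'n' */
		if (n->is_marker)
			continue;		/* another thread's marker */
		evict_one(n);	/* in ARC this happens under the hash lock */
		count++;
	}
	/* the caller may now drop the sublist lock between batches */
}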


