Date: Thu, 22 Feb 2018 03:54:59 +0000 (UTC) From: Alexander Motin <mav@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r329805 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs Message-ID: <201802220354.w1M3sx6Q014574@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mav Date: Thu Feb 22 03:54:59 2018 New Revision: 329805 URL: https://svnweb.freebsd.org/changeset/base/329805 Log: MFV r329803: 9080 recursive enter of vdev_indirect_rwlock from vdev_indirect_remap() illumos/illumos-gate@bdfded42e66b9fc1395ff2401aa2952f7c44ae34 A scenario came up where a callback executed by vdev_indirect_remap() on a vdev, calls vdev_indirect_remap() on the same vdev and tries to reacquire vdev_indirect_rwlock that was already acquired from the first call to vdev_indirect_remap(). The specific scenario, is that we want to remap a block pointer that is snapshoted but its dataset's remap_deadlist is not cached. So in order to add it we issue a read through a vdev_indirect_remap() on the same vdev, which brings up the aforementioned issue. Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> Author: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c Directory Properties: head/sys/cddl/contrib/opensolaris/ (props changed) Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c Thu Feb 22 03:53:49 2018 (r329804) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c Thu Feb 22 03:54:59 2018 (r329805) @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -854,6 +854,57 @@ rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, } /* + * Given an indirect vdev and an extent on that vdev, it duplicates the + * physical entries of the indirect mapping that correspond to the extent + * to a new array and returns a pointer to it. In addition, copied_entries + * is populated with the number of mapping entries that were duplicated. + * + * Note that the function assumes that the caller holds vdev_indirect_rwlock. + * This ensures that the mapping won't change due to condensing as we + * copy over its contents. + * + * Finally, since we are doing an allocation, it is up to the caller to + * free the array allocated in this function. + */ +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, + uint64_t asize, uint64_t *copied_entries) +{ + vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t entries = 0; + + ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); + + vdev_indirect_mapping_entry_phys_t *first_mapping = + vdev_indirect_mapping_entry_for_offset(vim, offset); + ASSERT3P(first_mapping, !=, NULL); + + vdev_indirect_mapping_entry_phys_t *m = first_mapping; + while (asize > 0) { + uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); + + ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); + ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); + + uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); + uint64_t inner_size = MIN(asize, size - inner_offset); + + offset += inner_size; + asize -= inner_size; + entries++; + m++; + } + + size_t copy_length = entries * sizeof (*first_mapping); + duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); + bcopy(first_mapping, duplicate_mappings, copy_length); + *copied_entries = entries; + + return (duplicate_mappings); +} + +/* * Goes through the relevant indirect mappings until it hits a concrete vdev * and issues the callback. On the way to the concrete vdev, if any other * indirect vdevs are encountered, then the callback will also be called on @@ -893,24 +944,42 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6 for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); rs != NULL; rs = list_remove_head(&stack)) { vdev_t *v = rs->rs_vd; + uint64_t num_entries = 0; + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + ASSERT(rs->rs_asize > 0); + /* - * Note: this can be called from open context - * (eg. zio_read()), so we need the rwlock to prevent - * the mapping from being changed by condensing. + * Note: As this function can be called from open context + * (e.g. zio_read()), we need the following rwlock to + * prevent the mapping from being changed by condensing. + * + * So we grab the lock and we make a copy of the entries + * that are relevant to the extent that we are working on. + * Once that is done, we drop the lock and iterate over + * our copy of the mapping. Once we are done with the with + * the remap segment and we free it, we also free our copy + * of the indirect mapping entries that are relevant to it. + * + * This way we don't need to wait until the function is + * finished with a segment, to condense it. In addition, we + * don't need a recursive rwlock for the case that a call to + * vdev_indirect_remap() needs to call itself (through the + * codepath of its callback) for the same vdev in the middle + * of its execution. */ rw_enter(&v->vdev_indirect_rwlock, RW_READER); vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping; ASSERT3P(vim, !=, NULL); - ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - ASSERT(rs->rs_asize > 0); - vdev_indirect_mapping_entry_phys_t *mapping = - vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset); + vdev_indirect_mapping_duplicate_adjacent_entries(v, + rs->rs_offset, rs->rs_asize, &num_entries); ASSERT3P(mapping, !=, NULL); + ASSERT3U(num_entries, >, 0); + rw_exit(&v->vdev_indirect_rwlock); - while (rs->rs_asize > 0) { + for (uint64_t i = 0; i < num_entries; i++) { /* * Note: the vdev_indirect_mapping can not change * while we are running. It only changes while the @@ -919,20 +988,23 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6 * function is only called for frees, which also only * happen from syncing context. */ + vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; - uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); - uint64_t dst_offset = - DVA_GET_OFFSET(&mapping->vimep_dst); - uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst); + ASSERT3P(m, !=, NULL); + ASSERT3U(rs->rs_asize, >, 0); + uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); + uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); + uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); + ASSERT3U(rs->rs_offset, >=, - DVA_MAPPING_GET_SRC_OFFSET(mapping)); + DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(rs->rs_offset, <, - DVA_MAPPING_GET_SRC_OFFSET(mapping) + size); + DVA_MAPPING_GET_SRC_OFFSET(m) + size); ASSERT3U(dst_vdev, !=, v->vdev_id); uint64_t inner_offset = rs->rs_offset - - DVA_MAPPING_GET_SRC_OFFSET(mapping); + DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(rs->rs_asize, size - inner_offset); @@ -973,10 +1045,10 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6 rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; - mapping++; } + VERIFY0(rs->rs_asize); - rw_exit(&v->vdev_indirect_rwlock); + kmem_free(mapping, num_entries * sizeof (*mapping)); kmem_free(rs, sizeof (remap_segment_t)); } list_destroy(&stack);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201802220354.w1M3sx6Q014574>