Date: Sun, 19 Apr 2026 07:53:08 +0000
From: Martin Matuska <mm@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject: git: d8fbbd371ca1 - main - zfs: merge openzfs/zfs@1644e2ffd
Message-ID: <69e489e4.18a32.645fb2c3@gitrepo.freebsd.org>
The branch main has been updated by mm:

URL: https://cgit.FreeBSD.org/src/commit/?id=d8fbbd371ca11d9ad4b29b9d3a316885a5da0b15

commit d8fbbd371ca11d9ad4b29b9d3a316885a5da0b15
Merge: 1c50cb1d7562 1644e2ffd264
Author:     Martin Matuska <mm@FreeBSD.org>
AuthorDate: 2026-04-18 22:21:01 +0000
Commit:     Martin Matuska <mm@FreeBSD.org>
CommitDate: 2026-04-18 22:22:45 +0000

    zfs: merge openzfs/zfs@1644e2ffd

    Notable upstream pull request merges:
      #18148 d1b0a6982 draid: add failure domains support
      #18167 f203fedde Add zoned_uid property with additive least privilege authorization
      #18191 -multiple FreeBSD: Fix a couple of races involving zvol creation and teardown
      #18213 33ed68fc2 zpool create: report which device caused failure
      #18235 931deb290 Prevent range tree corruption race by updating dnode_sync()
      #18282 b44a3ecf4 zpool: Change zpool offline spares policy
      #18310 -multiple Fix s_active leak in zfsvfs_hold() when z_unmounted is true
      #18351 ce837a28e Bridge speculative and prescient prefetchers
      #18380 fc659bd6d draid: fix import failure after disks replacements
      #18385 16858492e FreeBSD: Implement relatime property
      #18390 a22b3f670 abd: Fix stats asymmetry in case of Direct I/O
      #18399 7b1682a82 Add support for POSIX_FADV_DONTNEED
      #18403 5cb95ad89 fix memleak in spa_errlog.c
      #18405 0752cf067 draid: allow seq resilver reads from degraded vdevs
      #18407 e635d27eb Add ability to set user properties while changing encryption key
      #18414 2abf469be draid: fix cksum errors after rebuild with degraded disks
      #18415 -multiple Fix snapshot automount deadlock during concurrent zfs recv
      #18421 1644e2ffd Fix read corruption after block clone after truncate

    Obtained from:  OpenZFS
    OpenZFS commit: 1644e2ffd2640fa3e2c191ceaf048a5fc8399493
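Of the merges above, #18399 is the one most directly visible to applications: it wires POSIX_FADV_DONTNEED into ZFS so a program can hint that cached file data will not be needed again. The caller-side interface is plain posix_fadvise(2), nothing ZFS-specific. A minimal editorial sketch (not part of this commit):

/*
 * Hint the kernel to drop cached data for a file we have finished
 * reading, e.g. after a one-pass backup scan.  With #18399 merged,
 * ZFS honors the hint by evicting the file's cached blocks.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return (1);
	}
	int fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		perror("open");
		return (1);
	}
	/* ... read the file sequentially ... */
	/* len == 0 means "to end of file", so 0,0 covers the whole file. */
	int error = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	if (error != 0)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(error));
	close(fd);
	return (0);
}

Note that posix_fadvise() returns the error number directly rather than setting errno, which is why the sketch checks its return value instead of calling perror().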
 .../openzfs/.github/workflows/checkstyle.yaml | 7 +-
 sys/contrib/openzfs/.github/workflows/codeql.yml | 2 +-
 .../.github/workflows/scripts/generate-ci-type.py | 33 +-
 .../.github/workflows/scripts/qemu-1-setup.sh | 21 +
 .../.github/workflows/scripts/qemu-2-start.sh | 8 +-
 .../.github/workflows/scripts/qemu-3-deps-vm.sh | 22 +-
 .../.github/workflows/scripts/qemu-4-build-vm.sh | 11 +-
 .../.github/workflows/scripts/qemu-7-prepare.sh | 22 +-
 .../workflows/scripts/qemu-9-summary-page.sh | 4 +-
 sys/contrib/openzfs/.github/workflows/smatch.yml | 4 +-
 sys/contrib/openzfs/.github/workflows/zfs-arm.yml | 40 ++
 .../.github/workflows/zfs-qemu-packages.yml | 23 +-
 sys/contrib/openzfs/.github/workflows/zfs-qemu.yml | 42 +-
 sys/contrib/openzfs/.github/workflows/zloop.yml | 7 +-
 sys/contrib/openzfs/AUTHORS | 1 +
 sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c | 4 +-
 sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c | 146 +++++-
 sys/contrib/openzfs/cmd/zfs/zfs_main.c | 7 +-
 sys/contrib/openzfs/cmd/zinject/translate.c | 35 +-
 sys/contrib/openzfs/cmd/zinject/zinject.c | 4 +-
 sys/contrib/openzfs/cmd/zpool/zpool_main.c | 21 +-
 sys/contrib/openzfs/cmd/zpool/zpool_vdev.c | 234 ++++++++--
 sys/contrib/openzfs/cmd/ztest.c | 11 +-
 sys/contrib/openzfs/config/deb.am | 20 +-
 .../config/kernel-copy-from-user-inatomic.m4 | 30 --
 sys/contrib/openzfs/config/kernel.m4 | 2 -
 .../contrib/debian/openzfs-zfsutils.install | 1 +
 .../contrib/pyzfs/libzfs_core/_constants.py | 4 +
 sys/contrib/openzfs/contrib/pyzfs/setup.py.in | 4 +-
 sys/contrib/openzfs/include/libzfs.h | 2 +
 .../openzfs/include/os/freebsd/spl/sys/zone.h | 73 +++
 .../include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 1 +
 .../include/os/freebsd/zfs/sys/zfs_znode_impl.h | 4 +-
 .../openzfs/include/os/linux/spl/sys/zone.h | 58 +++
 .../include/os/linux/zfs/sys/zfs_vfsops_os.h | 14 +-
 sys/contrib/openzfs/include/sys/dbuf.h | 2 +
 sys/contrib/openzfs/include/sys/dmu.h | 6 +
 sys/contrib/openzfs/include/sys/dmu_zfetch.h | 1 +
 sys/contrib/openzfs/include/sys/dnode.h | 13 +
 sys/contrib/openzfs/include/sys/dsl_crypt.h | 3 +-
 sys/contrib/openzfs/include/sys/fs/zfs.h | 8 +
 sys/contrib/openzfs/include/sys/spa.h | 2 +-
 sys/contrib/openzfs/include/sys/spa_impl.h | 1 +
 sys/contrib/openzfs/include/sys/vdev_draid.h | 7 +-
 sys/contrib/openzfs/include/sys/vdev_raidz_impl.h | 1 +
 sys/contrib/openzfs/include/zfeature_common.h | 1 +
 sys/contrib/openzfs/lib/libzfs/libzfs.abi | 43 +-
 sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c | 52 ++-
 sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c | 10 +-
 sys/contrib/openzfs/lib/libzfs/libzfs_pool.c | 170 +++++--
 sys/contrib/openzfs/lib/libzfs/libzfs_status.c | 43 +-
 sys/contrib/openzfs/lib/libzfs/libzfs_util.c | 6 +
 .../openzfs/lib/libzfs/os/linux/libzfs_pool_os.c | 19 +-
 sys/contrib/openzfs/man/Makefile.am | 16 +-
 sys/contrib/openzfs/man/man1/dbufstat.1 | 233 ++++++++++
 sys/contrib/openzfs/man/man4/zfs.4 | 11 +
 sys/contrib/openzfs/man/man7/vdevprops.7 | 26 +-
 sys/contrib/openzfs/man/man7/zfsprops.7 | 92 ++++
 sys/contrib/openzfs/man/man7/zpool-features.7 | 27 ++
 sys/contrib/openzfs/man/man7/zpoolconcepts.7 | 36 +-
 sys/contrib/openzfs/man/man8/zfs-load-key.8 | 9 +-
 sys/contrib/openzfs/man/man8/zfs-set.8 | 20 +-
 sys/contrib/openzfs/man/man8/zfs-zone.8 | 15 +-
 sys/contrib/openzfs/man/man8/zinject.8 | 10 +-
 sys/contrib/openzfs/man/man8/zpool-create.8 | 35 ++
 sys/contrib/openzfs/man/man8/zpool-list.8 | 4 +-
 sys/contrib/openzfs/man/man8/zpool-offline.8 | 7 +-
 sys/contrib/openzfs/man/man8/zpool-resilver.8 | 3 +
 sys/contrib/openzfs/man/man8/zpool-scrub.8 | 4 +-
 sys/contrib/openzfs/man/man8/zpool-status.8 | 4 +-
 .../openzfs/module/os/freebsd/zfs/zfs_vfsops.c | 8 +
 .../openzfs/module/os/freebsd/zfs/zfs_vnops_os.c | 4 +-
 .../openzfs/module/os/freebsd/zfs/zfs_znode_os.c | 43 ++
 .../openzfs/module/os/freebsd/zfs/zvol_os.c | 84 ++--
 sys/contrib/openzfs/module/os/linux/spl/spl-zone.c | 413 +++++++++++++++--
 .../openzfs/module/os/linux/zfs/spa_misc_os.c | 50 +-
 .../openzfs/module/os/linux/zfs/zfs_ctldir.c | 57 ++-
 .../openzfs/module/os/linux/zfs/zfs_ioctl_os.c | 4 +
 .../openzfs/module/os/linux/zfs/zfs_vfsops.c | 240 +---------
 sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c | 39 +-
 .../openzfs/module/os/linux/zfs/zpl_super.c | 509 ++++++++++++++++++++-
 sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c | 2 +-
 .../openzfs/module/zcommon/zfeature_common.c | 13 +
 sys/contrib/openzfs/module/zcommon/zfs_prop.c | 15 +-
 sys/contrib/openzfs/module/zcommon/zpool_prop.c | 6 +
 sys/contrib/openzfs/module/zfs/abd.c | 3 +-
 sys/contrib/openzfs/module/zfs/dbuf.c | 78 +++-
 sys/contrib/openzfs/module/zfs/ddt_log.c | 3 +-
 sys/contrib/openzfs/module/zfs/dmu.c | 79 ++++
 sys/contrib/openzfs/module/zfs/dmu_zfetch.c | 69 ++-
 sys/contrib/openzfs/module/zfs/dnode.c | 2 +
 sys/contrib/openzfs/module/zfs/dnode_sync.c | 105 +++--
 sys/contrib/openzfs/module/zfs/dsl_crypt.c | 15 +-
 sys/contrib/openzfs/module/zfs/dsl_deleg.c | 13 +-
 sys/contrib/openzfs/module/zfs/spa.c | 58 ++-
 sys/contrib/openzfs/module/zfs/spa_errlog.c | 2 +-
 sys/contrib/openzfs/module/zfs/spa_log_spacemap.c | 8 +-
 sys/contrib/openzfs/module/zfs/space_map.c | 3 +-
 sys/contrib/openzfs/module/zfs/vdev.c | 68 ++-
 sys/contrib/openzfs/module/zfs/vdev_draid.c | 423 +++++++++++++----
 sys/contrib/openzfs/module/zfs/vdev_label.c | 23 +-
 sys/contrib/openzfs/module/zfs/vdev_mirror.c | 13 +-
 sys/contrib/openzfs/module/zfs/vdev_raidz.c | 65 ++-
 sys/contrib/openzfs/module/zfs/zfs_ioctl.c | 318 ++++++++++++-
 sys/contrib/openzfs/module/zfs/zio.c | 6 +-
 sys/contrib/openzfs/module/zfs/zvol.c | 51 ++-
 sys/contrib/openzfs/rpm/generic/zfs.spec.in | 6 +
 sys/contrib/openzfs/scripts/spdxcheck.pl | 1 -
 sys/contrib/openzfs/tests/runfiles/common.run | 40 +-
 sys/contrib/openzfs/tests/runfiles/linux.run | 7 +-
 sys/contrib/openzfs/tests/runfiles/sanity.run | 6 +-
 .../openzfs/tests/test-runner/bin/zts-report.py.in | 6 +-
 sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore | 1 +
 .../openzfs/tests/zfs-tests/cmd/Makefile.am | 2 +
 .../tests/zfs-tests/cmd/clone_after_trunc.c | 117 +++++
 .../openzfs/tests/zfs-tests/include/commands.cfg | 2 +
 .../openzfs/tests/zfs-tests/tests/Makefile.am | 46 ++
 .../block_cloning/block_cloning_after_trunc.ksh | 31 ++
 .../zfs_change-key/zfs_change-key_userprop.ksh | 72 +++
 .../zpool_create/zpool_create_draid_005_pos.ksh | 149 ++++++
 .../zpool_create/zpool_create_errinfo_001_neg.ksh | 103 +++++
 .../functional/cli_root/zpool_get/vdev_get.cfg | 2 +
 .../functional/cli_root/zpool_get/zpool_get.cfg | 1 +
 .../cli_root/zpool_offline/zpool_offline_spare.ksh | 84 ++++
 .../tests/functional/fadvise/fadvise_dontneed.ksh | 63 +++
 .../functional/fault/auto_offline_001_pos.ksh | 5 +-
 .../functional/fault/suspend_draid_fgroups.ksh | 163 +++++++
 .../tests/functional/redundancy/redundancy.kshlib | 65 ++-
 .../redundancy/redundancy_draid_degraded1.ksh | 141 ++++++
 .../redundancy/redundancy_draid_degraded2.ksh | 157 +++++++
 .../redundancy/redundancy_draid_spare4.ksh | 152 ++++++
 .../redundancy/redundancy_draid_width.ksh | 91 ++++
 .../tests/functional/rsend/send_raw_ashift.ksh | 3 -
 .../tests/functional/zoned_uid/cleanup.ksh | 46 ++
 .../zfs-tests/tests/functional/zoned_uid/setup.ksh | 99 ++++
 .../tests/functional/zoned_uid/zoned_uid.cfg | 33 ++
 .../functional/zoned_uid/zoned_uid_001_pos.ksh | 85 ++++
 .../functional/zoned_uid/zoned_uid_002_pos.ksh | 83 ++++
 .../functional/zoned_uid/zoned_uid_003_pos.ksh | 100 ++++
 .../functional/zoned_uid/zoned_uid_004_pos.ksh | 91 ++++
 .../functional/zoned_uid/zoned_uid_005_neg.ksh | 72 +++
 .../functional/zoned_uid/zoned_uid_006_pos.ksh | 109 +++++
 .../functional/zoned_uid/zoned_uid_007_pos.ksh | 110 +++++
 .../functional/zoned_uid/zoned_uid_008_pos.ksh | 128 ++++++
 .../functional/zoned_uid/zoned_uid_009_pos.ksh | 149 ++++++
 .../functional/zoned_uid/zoned_uid_010_pos.ksh | 157 +++++++
 .../functional/zoned_uid/zoned_uid_011_neg.ksh | 153 +++++++
 .../functional/zoned_uid/zoned_uid_012_pos.ksh | 120 +++++
 .../functional/zoned_uid/zoned_uid_013_pos.ksh | 122 +++++
 .../functional/zoned_uid/zoned_uid_014_pos.ksh | 116 +++++
 .../functional/zoned_uid/zoned_uid_015_pos.ksh | 114 +++++
 .../functional/zoned_uid/zoned_uid_016_pos.ksh | 132 ++++++
 .../functional/zoned_uid/zoned_uid_017_neg.ksh | 125 +++++
 .../functional/zoned_uid/zoned_uid_018_pos.ksh | 129 ++++++
 .../functional/zoned_uid/zoned_uid_019_neg.ksh | 141 ++++++
 .../functional/zoned_uid/zoned_uid_020_neg.ksh | 171 +++++++
 .../functional/zoned_uid/zoned_uid_021_neg.ksh | 109 +++++
 .../functional/zoned_uid/zoned_uid_022_neg.ksh | 154 +++++++
 .../functional/zoned_uid/zoned_uid_023_pos.ksh | 131 ++++++
 .../functional/zoned_uid/zoned_uid_024_neg.ksh | 144 ++++++
 .../functional/zoned_uid/zoned_uid_025_pos.ksh | 102 +++++
 .../functional/zoned_uid/zoned_uid_026_pos.ksh | 112 +++++
 .../functional/zoned_uid/zoned_uid_027_pos.ksh | 103 +++++
 .../functional/zoned_uid/zoned_uid_028_neg.ksh | 103 +++++
 .../functional/zoned_uid/zoned_uid_029_neg.ksh | 120 +++++
 .../functional/zoned_uid/zoned_uid_030_pos.ksh | 183 ++++++++
 .../functional/zoned_uid/zoned_uid_031_pos.ksh | 110 +++++
 .../functional/zoned_uid/zoned_uid_common.kshlib | 237 ++++++++++
 sys/modules/zfs/zfs_config.h | 4 +-
 sys/modules/zfs/zfs_gitrev.h | 2 +-
 170 files changed, 9623 insertions(+), 846 deletions(-)
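The new tests/zfs-tests/cmd/clone_after_trunc.c listed above targets #18421 (read corruption after a block clone that follows a truncate). The following is an editorial sketch of that kind of sequence, not the test's actual source; the paths and the 1 MiB length are invented, and it assumes a dataset with block cloning enabled:

/*
 * Truncate a destination file, then clone into it with
 * copy_file_range(2), which ZFS can service via block cloning.
 * Before the #18421 fix, a read following this sequence could
 * return stale data.
 */
#define _GNU_SOURCE	/* copy_file_range() on glibc */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int src = open("/pool/fs/src", O_RDONLY);
	int dst = open("/pool/fs/dst", O_RDWR | O_CREAT, 0644);
	if (src == -1 || dst == -1) {
		perror("open");
		return (1);
	}
	/* Shrink the destination before cloning into it. */
	if (ftruncate(dst, 0) == -1) {
		perror("ftruncate");
		return (1);
	}
	/* NULL offsets use (and advance) the file offsets. */
	if (copy_file_range(src, NULL, dst, NULL, 1 << 20, 0) == -1) {
		perror("copy_file_range");
		return (1);
	}
	/* A subsequent pread() of dst must now see the cloned data. */
	return (0);
}

copy_file_range(2) exists on both Linux and FreeBSD; whether it is satisfied by a block clone or by an ordinary copy is up to the filesystem.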
diff --cc sys/contrib/openzfs/.github/workflows/zfs-arm.yml
index 000000000000,6039e4736c42..6039e4736c42
mode 000000,100644..100644
--- a/sys/contrib/openzfs/.github/workflows/zfs-arm.yml
+++ b/sys/contrib/openzfs/.github/workflows/zfs-arm.yml
diff --cc sys/contrib/openzfs/include/os/freebsd/spl/sys/zone.h
index cfe63946706b,000000000000..12c80b39dfac
mode 100644,000000..100644
--- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/zone.h
+++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/zone.h
@@@ -1,68 -1,0 +1,141 @@@
+// SPDX-License-Identifier: BSD-2-Clause
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_ZONE_H_
+#define _OPENSOLARIS_SYS_ZONE_H_
+
+#include <sys/jail.h>
++#include <sys/errno.h>
+
+/*
+ * Macros to help with zone visibility restrictions.
+ */
+
+#define GLOBAL_ZONEID 0
+
+/*
+ * Is proc in the global zone?
+ */
+#define INGLOBALZONE(proc) (!jailed((proc)->p_ucred))
+
+/*
+ * Attach the given dataset to the given jail.
+ */
+extern int zone_dataset_attach(struct ucred *, const char *, int);
+
+/*
+ * Detach the given dataset from the given jail.
+ */
+extern int zone_dataset_detach(struct ucred *, const char *, int);
+
+/*
+ * Returns true if the named pool/dataset is visible in the current zone.
+ */
+extern int zone_dataset_visible(const char *, int *);
+
+/*
+ * Safely get the hostid of the specified zone (defaults to machine's hostid
+ * if the specified zone doesn't emulate a hostid).  Passing NULL retrieves
+ * the global zone's (i.e., physical system's) hostid.
+ */
+extern uint32_t zone_get_hostid(void *);
+
++/*
++ * Operations that can be authorized via zoned_uid delegation.
++ * Shared with Linux; on FreeBSD these are defined but the check
++ * always returns NOT_APPLICABLE (no user namespace support).
++ */
++typedef enum zone_uid_op {
++	ZONE_OP_CREATE,
++	ZONE_OP_SNAPSHOT,
++	ZONE_OP_CLONE,
++	ZONE_OP_DESTROY,
++	ZONE_OP_RENAME,
++	ZONE_OP_SETPROP
++} zone_uid_op_t;
++
++typedef enum zone_admin_result {
++	ZONE_ADMIN_NOT_APPLICABLE,
++	ZONE_ADMIN_ALLOWED,
++	ZONE_ADMIN_DENIED
++} zone_admin_result_t;
++
++/*
++ * FreeBSD stub: zoned_uid delegation is not applicable (no user namespaces).
++ * Always returns NOT_APPLICABLE so callers fall through to existing
++ * jail-based permission checks.
++ */
++static inline zone_admin_result_t
++zone_dataset_admin_check(const char *dataset, zone_uid_op_t op,
++    const char *aux_dataset)
++{
++	(void) dataset, (void) op, (void) aux_dataset;
++	return (ZONE_ADMIN_NOT_APPLICABLE);
++}
++
++/*
++ * Callback type for looking up the zoned_uid property.
++ */
++typedef uid_t (*zone_get_zoned_uid_fn_t)(const char *dataset,
++    char *root_out, size_t root_size);
++
++/*
++ * FreeBSD stubs: zoned_uid attach/detach require user namespaces,
++ * which FreeBSD does not have.  Return ENXIO (consistent with the
++ * Linux fallback when CONFIG_USER_NS is not defined).
++ */
++static inline int
++zone_dataset_attach_uid(struct ucred *cred, const char *dataset, uid_t uid)
++{
++	(void) cred, (void) dataset, (void) uid;
++	return (ENXIO);
++}
++
++static inline int
++zone_dataset_detach_uid(struct ucred *cred, const char *dataset, uid_t uid)
++{
++	(void) cred, (void) dataset, (void) uid;
++	return (ENXIO);
++}
++
++/*
++ * FreeBSD stubs: no-ops, since zoned_uid delegation requires user namespaces.
++ */
++static inline void
++zone_register_zoned_uid_callback(zone_get_zoned_uid_fn_t fn)
++{
++	(void) fn;
++}
++
++static inline void
++zone_unregister_zoned_uid_callback(void)
++{
++}
++
+#endif	/* !_OPENSOLARIS_SYS_ZONE_H_ */
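The tri-state zone_admin_result_t above implies a specific call-site pattern. A hedged sketch of how a permission check would be expected to consume zone_dataset_admin_check(); legacy_jail_perm_check() is a hypothetical stand-in for whatever jail-based check a real caller already performs, not a function from this merge:

/*
 * Editorial sketch, assuming the zone.h above is in scope.
 * legacy_jail_perm_check() is hypothetical.
 */
static int
dataset_op_permission(const char *dataset, zone_uid_op_t op)
{
	switch (zone_dataset_admin_check(dataset, op, NULL)) {
	case ZONE_ADMIN_ALLOWED:
		return (0);		/* zoned_uid delegation grants the op */
	case ZONE_ADMIN_DENIED:
		return (EPERM);		/* an explicit deny is final */
	case ZONE_ADMIN_NOT_APPLICABLE:
	default:
		/*
		 * The only possible result on FreeBSD, so the existing
		 * jail-based permission checks remain authoritative.
		 */
		return (legacy_jail_perm_check(dataset, op));
	}
}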
diff --cc sys/contrib/openzfs/man/man1/dbufstat.1
index 000000000000,311af5e76a98..311af5e76a98
mode 000000,100644..100644
--- a/sys/contrib/openzfs/man/man1/dbufstat.1
+++ b/sys/contrib/openzfs/man/man1/dbufstat.1
diff --cc sys/contrib/openzfs/module/zfs/vdev.c
index 9def59b06727,000000000000..30639d7f4c7f
mode 100644,000000..100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@@ -1,6866 -1,0 +1,6922 @@@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Datto Inc. All rights reserved.
+ * Copyright (c) 2021, 2025, Klara, Inc.
+ * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2026, Seagate Technology, LLC.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/space_reftree.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/zil.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
+#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
+#include "zfs_prop.h"
+
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class.  The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added).  See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations.  Each one is
+ * tried in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev.  This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL.  If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
+ */
+static uint_t zfs_embedded_slog_min_ms = 64;
+
+/* default target for number of metaslabs per top-level vdev */
+static uint_t zfs_vdev_default_ms_count = 200;
+
+/* minimum number of metaslabs per top-level vdev */
+static uint_t zfs_vdev_min_ms_count = 16;
+
+/* practical upper limit of total metaslabs per top-level vdev */
+static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
+
+/* lower limit for metaslab size (512M) */
+static uint_t zfs_vdev_default_ms_shift = 29;
+
+/* upper limit for metaslab size (16G) */
+static uint_t zfs_vdev_max_ms_shift = 34;
+
+int vdev_validate_skip = B_FALSE;
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+int zfs_vdev_dtl_sm_blksz = (1 << 12);
+
+/*
+ * Rate limit slow IO (delay) events to this many per second.
+ */
+static unsigned int zfs_slow_io_events_per_second = 20;
+
+/*
+ * Rate limit deadman "hung IO" events to this many per second.
+ */
+static unsigned int zfs_deadman_events_per_second = 1;
+
+/*
+ * Rate limit direct write IO verify failures to this many per second.
+ */
+static unsigned int zfs_dio_write_verify_events_per_second = 20;
+
+/*
+ * Rate limit checksum events after this many checksum errors per second.
+ */
+static unsigned int zfs_checksum_events_per_second = 20;
+
+/*
+ * Ignore errors during scrub/resilver.  This allows a resilver upon
+ * import to work around pool errors.
+ */
+static int zfs_scan_ignore_errors = 0;
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+int zfs_vdev_standard_sm_blksz = (1 << 17);
+
+/*
+ * Tunable parameter for debugging or performance analysis.  Setting this
+ * will cause pool corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zfs_nocacheflush = 0;
+
+/*
+ * Maximum and minimum ashift values that can be automatically set based on
+ * vdev's physical ashift (disk's physical sector size).  While ASHIFT_MAX
+ * is higher than the maximum value, it is intentionally limited here to not
+ * excessively impact pool space efficiency.  Higher ashift values may still
+ * be forced by vdev logical ashift or by user via ashift property, but won't
+ * be set automatically as a performance optimization.
+ */
+uint_t zfs_vdev_max_auto_ashift = 14;
+uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+
+/*
+ * VDEV checksum verification for Direct I/O writes.  This is necessary for
+ * Linux, because anonymous pages cannot be placed under write protection
+ * during Direct I/O writes.
+ */
+#if !defined(__FreeBSD__)
+uint_t zfs_vdev_direct_write_verify = 1;
+#else
+uint_t zfs_vdev_direct_write_verify = 0;
+#endif
+
+void
+vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
+{
+	va_list adx;
+	char buf[256];
+
+	va_start(adx, fmt);
+	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
+	va_end(adx);
+
+	if (vd->vdev_path != NULL) {
+		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
+		    vd->vdev_path, buf);
+	} else {
+		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
+		    vd->vdev_ops->vdev_op_type,
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)vd->vdev_guid, buf);
+	}
+}
+
+void
+vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
+{
+	char state[20];
+
+	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
+		zfs_dbgmsg("%*svdev %llu: %s", indent, "",
+		    (u_longlong_t)vd->vdev_id,
+		    vd->vdev_ops->vdev_op_type);
+		return;
+	}
+
+	switch (vd->vdev_state) {
+	case VDEV_STATE_UNKNOWN:
+		(void) snprintf(state, sizeof (state), "unknown");
+		break;
+	case VDEV_STATE_CLOSED:
+		(void) snprintf(state, sizeof (state), "closed");
+		break;
+	case VDEV_STATE_OFFLINE:
+		(void) snprintf(state, sizeof (state), "offline");
+		break;
+	case VDEV_STATE_REMOVED:
+		(void) snprintf(state, sizeof (state), "removed");
+		break;
+	case VDEV_STATE_CANT_OPEN:
+		(void) snprintf(state, sizeof (state), "can't open");
+		break;
+	case VDEV_STATE_FAULTED:
+		(void) snprintf(state, sizeof (state), "faulted");
+		break;
+	case VDEV_STATE_DEGRADED:
+		(void) snprintf(state, sizeof (state), "degraded");
+		break;
+	case VDEV_STATE_HEALTHY:
+		(void) snprintf(state, sizeof (state), "healthy");
+		break;
+	default:
+		(void) snprintf(state, sizeof (state), "<state %u>",
+		    (uint_t)vd->vdev_state);
+	}
+
+	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
+	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
+	    vd->vdev_islog ? " (log)" : "",
+	    (u_longlong_t)vd->vdev_guid,
+	    vd->vdev_path ? vd->vdev_path : "N/A", state);
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++)
+		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
+}
+
+char *
+vdev_rt_name(vdev_t *vd, const char *name)
+{
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}",
+	    spa_name(vd->vdev_spa),
+	    (u_longlong_t)vd->vdev_guid,
+	    name));
+}
+
+static char *
+vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type)
+{
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}",
+	    spa_name(vd->vdev_spa),
+	    (u_longlong_t)vd->vdev_guid,
+	    name,
+	    dtl_type));
+}
+
+/*
+ * Virtual device management.
+ */
+
+static vdev_ops_t *const vdev_ops_table[] = {
+	&vdev_root_ops,
+	&vdev_raidz_ops,
+	&vdev_draid_ops,
+	&vdev_draid_spare_ops,
+	&vdev_mirror_ops,
+	&vdev_replacing_ops,
+	&vdev_spare_ops,
+	&vdev_disk_ops,
+	&vdev_file_ops,
+	&vdev_missing_ops,
+	&vdev_hole_ops,
+	&vdev_indirect_ops,
+	NULL
+};
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+	vdev_ops_t *ops, *const *opspp;
+
+	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+		if (strcmp(ops->vdev_op_type, type) == 0)
+			break;
+
+	return (ops);
+}
+
+/*
+ * Given a vdev and a metaslab class, find which metaslab group we're
+ * interested in.  All vdevs may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group.  For embedded slogs, vdev_log_mg will be non-NULL and
+ * will point to a metaslab group of either embedded_log_class (for normal
+ * vdevs) or special_embedded_log_class (for special vdevs).
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+	if ((mc == spa_embedded_log_class(vd->vdev_spa) ||
+	    mc == spa_special_embedded_log_class(vd->vdev_spa)) &&
+	    vd->vdev_log_mg != NULL)
+		return (vd->vdev_log_mg);
+	else
+		return (vd->vdev_mg);
+}
+
+void
+vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
+    zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
+{
+	(void) vd, (void) remain_rs;
+
+	physical_rs->rs_start = logical_rs->rs_start;
+	physical_rs->rs_end = logical_rs->rs_end;
+}
+
+/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(8).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+		alloc_bias = VDEV_BIAS_LOG;
+	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+		alloc_bias = VDEV_BIAS_SPECIAL;
+	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+		alloc_bias = VDEV_BIAS_DEDUP;
+
+	return (alloc_bias);
+}
+
+uint64_t
+vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift));
+	uint64_t csize, psize = asize;
+	for (int c = 0; c < vd->vdev_children; c++) {
+		csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg);
+		psize = MIN(psize, csize);
+	}
+
+	return (psize);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children.  This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
+	uint64_t csize;
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
+		asize = MAX(asize, csize);
+	}
+
+	return (asize);
+}
+
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+	return (vd->vdev_min_asize);
+}
+
+/*
+ * Get the minimum allocatable size.  We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab.  This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
+ */
+uint64_t
+vdev_get_min_asize(vdev_t *vd)
+{
+	vdev_t *pvd = vd->vdev_parent;
+
+	/*
+	 * If our parent is NULL (inactive spare or cache) or is the root,
+	 * just return our own asize.
+	 */
+	if (pvd == NULL)
+		return (vd->vdev_asize);
+
+	/*
+	 * The top-level vdev just returns the allocatable size rounded
+	 * to the nearest metaslab.
+	 */
+	if (vd == vd->vdev_top)
+		return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
+		    uint64_t));
+
+	return (pvd->vdev_ops->vdev_op_min_asize(pvd));
+}
+
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+	vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_set_min_asize(vd->vdev_child[c]);
+}
+
+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+	uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+	if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+		min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+	return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+	uint64_t nparity = 0;
+
+	if (vd->vdev_ops->vdev_op_nparity != NULL)
+		nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+	return (nparity);
+}
+
+static int
+vdev_prop_get_objid(vdev_t *vd, uint64_t *objid)
+{
+	if (vd->vdev_root_zap != 0) {
+		*objid = vd->vdev_root_zap;
+	} else if (vd->vdev_top_zap != 0) {
+		*objid = vd->vdev_top_zap;
+	} else if (vd->vdev_leaf_zap != 0) {
+		*objid = vd->vdev_leaf_zap;
+	} else {
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+static int
+vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
+{
+	spa_t *spa = vd->vdev_spa;
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t objid;
+	int err;
+
+	if (vdev_prop_get_objid(vd, &objid) != 0)
+		return (EINVAL);
+
+	err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
+	    sizeof (uint64_t), 1, value);
+	if (err == ENOENT)
+		*value = vdev_prop_default_numeric(prop);
+
+	return (err);
+}
+
+static int
+vdev_prop_get_bool(vdev_t *vd, vdev_prop_t prop, boolean_t *bvalue)
+{
+	int err;
+	uint64_t ivalue;
+
+	err = vdev_prop_get_int(vd, prop, &ivalue);
+	*bvalue = ivalue != 0;
+
+	return (err);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+	uint64_t ndisks = 1;
+
+	if (vd->vdev_ops->vdev_op_ndisks != NULL)
+		ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+	return (ndisks);
+}
+
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+	if (vdev < rvd->vdev_children) {
+		ASSERT(rvd->vdev_child[vdev] != NULL);
+		return (rvd->vdev_child[vdev]);
+	}
+
+	return (NULL);
+}
+
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+	vdev_t *mvd;
+
+	if (vd->vdev_guid == guid)
+		return (vd);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+		    NULL)
+			return (mvd);
+
+	return (NULL);
+}
+
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+	int n = 0;
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (1);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+	return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+	int rc;
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	rc = vdev_count_leaves_impl(spa->spa_root_vdev);
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	return (rc);
+}
+
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+	size_t oldsize, newsize;
+	uint64_t id = cvd->vdev_id;
+	vdev_t **newchild;
+
+	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+	ASSERT0P(cvd->vdev_parent);
+
+	cvd->vdev_parent = pvd;
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+	oldsize = pvd->vdev_children * sizeof (vdev_t *);
+	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+	newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+	newchild = kmem_alloc(newsize, KM_SLEEP);
+	if (pvd->vdev_child != NULL) {
+		memcpy(newchild, pvd->vdev_child, oldsize);
+		kmem_free(pvd->vdev_child, oldsize);
+	}
+
+	pvd->vdev_child = newchild;
+	pvd->vdev_child[id] = cvd;
+	pvd->vdev_nonrot &= cvd->vdev_nonrot;
+
+	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
+	ASSERT0P(cvd->vdev_top->vdev_parent->vdev_parent);
+
+	/*
+	 * Walk up all ancestors to update guid sum.
+	 */
+	for (; pvd != NULL; pvd = pvd->vdev_parent)
+		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+	if (cvd->vdev_ops->vdev_op_leaf) {

*** 7530 LINES SKIPPED ***
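For reference, the P2ALIGN_TYPED() call in vdev_get_min_asize() above is plain power-of-two truncation: a top-level vdev's asize is rounded down to a whole number of metaslabs. A standalone illustration with invented values (editorial, not ZFS source):

/*
 * For a power-of-two alignment, P2ALIGN(x, align) == x & -(align),
 * i.e. x & ~(align - 1): truncate x down to an align boundary.
 */
#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ms_shift = 29;				/* 512 MiB metaslabs */
	uint64_t ms_size = 1ULL << ms_shift;
	uint64_t asize = 10ULL * ms_size + 12345;	/* not aligned */

	uint64_t aligned = asize & ~(ms_size - 1);

	printf("asize   %" PRIu64 "\naligned %" PRIu64 " (%" PRIu64
	    " metaslabs)\n", asize, aligned, aligned >> ms_shift);
	return (0);
}

The leftover 12345 bytes past the last metaslab boundary are simply unusable for allocation, which is why devices of slightly different physical sizes can still replace one another.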
