Date: Tue, 26 Feb 2019 08:13:01 -0800 From: Cy Schubert <Cy.Schubert@cschubert.com> To: Baptiste Daroussin <bapt@FreeBSD.org>, src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: Re: svn commit: r344569 - in head/cddl/contrib/opensolaris: cmd/zfs lib/libzfs/common Message-ID: <1F5FB340-1CD2-400F-82D3-8D4949A351A2@cschubert.com> In-Reply-To: <39F76FF0-B40A-43F9-AC77-5D535674A4B8@cschubert.com> References: <201902260818.x1Q8IZGO061175@repo.freebsd.org> <39F76FF0-B40A-43F9-AC77-5D535674A4B8@cschubert.com>
next in thread | previous in thread | raw e-mail | index | archive | help
On February 26, 2019 7:48:27 AM PST, Cy Schubert <Cy=2ESchubert@cschubert= =2Ecom> wrote: >On February 26, 2019 12:18:35 AM PST, Baptiste Daroussin ><bapt@FreeBSD=2Eorg> wrote: >>Author: bapt >>Date: Tue Feb 26 08:18:34 2019 >>New Revision: 344569 >>URL: https://svnweb=2Efreebsd=2Eorg/changeset/base/344569 >> >>Log: >> Implement parallel mounting for ZFS filesystem >> =20 >> It was first implemented on Illumos and then ported to ZoL=2E >> This patch is a port to FreeBSD of the ZoL version=2E >> This patch also includes a fix for a race condition that was amended >> =20 >>With such patch Delphix has seen a huge decrease in latency of the >>mount phase >> (https://github=2Ecom/openzfs/openzfs/commit/a3f0e2b569 for details)= =2E >>With that current change Gandi has measured improvments that are on >par >>with >> those reported by Delphix=2E >> =20 >> Zol commits incorporated: >>https://github=2Ecom/zfsonlinux/zfs/commit/a10d50f999511d304f910852c7825= c70c9c9e303 >>https://github=2Ecom/zfsonlinux/zfs/commit/e63ac16d25fbe991a356489c86d40= 77567dfea21 >> =20 >> Reviewed by: avg, sef >> Approved by: avg, sef >> Obtained from: ZoL >> MFC after: 1 month >> Relnotes: yes >> Sponsored by: Gandi=2Enet >> Differential Revision: https://reviews=2Efreebsd=2Eorg/D19098 >> >>Modified: >> head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec >> head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh >> head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec >> head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh >> head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec >> >>Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec >>=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >>--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec Tue Feb 26 >>06:22:10 
2019 (r344568) >>+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec Tue Feb 26 >>08:18:34 2019 (r344569) >>@@ -5812,8 +5812,13 @@ zfs_do_holds(int argc, char **argv) >>=20 >> #define CHECK_SPINNER 30 >> #define SPINNER_TIME 3 /* seconds */ >>-#define MOUNT_TIME 5 /* seconds */ >>+#define MOUNT_TIME 1 /* seconds */ >>=20 >>+typedef struct get_all_state { >>+ boolean_t ga_verbose; >>+ get_all_cb_t *ga_cbp; >>+} get_all_state_t; >>+ >> static int >> get_one_dataset(zfs_handle_t *zhp, void *data) >> { >>@@ -5821,10 +5826,10 @@ get_one_dataset(zfs_handle_t *zhp, void *data) >> static int spinval =3D 0; >> static int spincheck =3D 0; >> static time_t last_spin_time =3D (time_t)0; >>- get_all_cb_t *cbp =3D data; >>+ get_all_state_t *state =3D data; >> zfs_type_t type =3D zfs_get_type(zhp); >>=20 >>- if (cbp->cb_verbose) { >>+ if (state->ga_verbose) { >> if (--spincheck < 0) { >> time_t now =3D time(NULL); >> if (last_spin_time + SPINNER_TIME < now) { >>@@ -5850,26 +5855,24 @@ get_one_dataset(zfs_handle_t *zhp, void *data) >> zfs_close(zhp); >> return (0); >> } >>- libzfs_add_handle(cbp, zhp); >>- assert(cbp->cb_used <=3D cbp->cb_alloc); >>+ libzfs_add_handle(state->ga_cbp, zhp); >>+ assert(state->ga_cbp->cb_used <=3D state->ga_cbp->cb_alloc); >>=20 >> return (0); >> } >>=20 >> static void >>-get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t >>verbose) >>+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) >> { >>- get_all_cb_t cb =3D { 0 }; >>- cb=2Ecb_verbose =3D verbose; >>- cb=2Ecb_getone =3D get_one_dataset; >>+ get_all_state_t state =3D { >>+ =2Ega_verbose =3D verbose, >>+ =2Ega_cbp =3D cbp >>+ }; >>=20 >> if (verbose) >> set_progress_header(gettext("Reading ZFS config")); >>- (void) zfs_iter_root(g_zfs, get_one_dataset, &cb); >>+ (void) zfs_iter_root(g_zfs, get_one_dataset, &state); >>=20 >>- *dslist =3D cb=2Ecb_handles; >>- *count =3D cb=2Ecb_used; >>- >> if (verbose) >> finish_progress(gettext("done=2E")); >> } >>@@ -5879,9 +5882,20 @@ 
get_all_datasets(zfs_handle_t ***dslist, size_t >>*count >>* similar, we have a common function with an extra parameter to >>determine which >> * mode we are using=2E >> */ >>-#define OP_SHARE 0x1 >>-#define OP_MOUNT 0x2 >>+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; >>=20 >>+typedef struct share_mount_state { >>+ share_mount_op_t sm_op; >>+ boolean_t sm_verbose; >>+ int sm_flags; >>+ char *sm_options; >>+ char *sm_proto; /* only valid for OP_SHARE */ >>+ pthread_mutex_t sm_lock; /* protects the remaining fields */ >>+ uint_t sm_total; /* number of filesystems to process */ >>+ uint_t sm_done; /* number of filesystems processed */ >>+ int sm_status; /* -1 if any of the share/mount operations failed */ >>+} share_mount_state_t; >>+ >> /* >> * Share or mount a dataset=2E >> */ >>@@ -6122,6 +6136,29 @@ report_mount_progress(int current, int total) >> update_progress(info); >> } >>=20 >>+/* >>+ * zfs_foreach_mountpoint() callback that mounts or shares on >>filesystem and >>+ * updates the progress meter >>+ */ >>+static int >>+share_mount_one_cb(zfs_handle_t *zhp, void *arg) >>+{ >>+ share_mount_state_t *sms =3D arg; >>+ int ret; >>+ >>+ ret =3D share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, >>+ B_FALSE, sms->sm_options); >>+ >>+ pthread_mutex_lock(&sms->sm_lock); >>+ if (ret !=3D 0) >>+ sms->sm_status =3D ret; >>+ sms->sm_done++; >>+ if (sms->sm_verbose) >>+ report_mount_progress(sms->sm_done, sms->sm_total); >>+ pthread_mutex_unlock(&sms->sm_lock); >>+ return (ret); >>+} >>+ >> static void >> append_options(char *mntopts, char *newopts) >> { >>@@ -6194,8 +6231,6 @@ share_mount(int op, int argc, char **argv) >>=20 >> /* check number of arguments */ >> if (do_all) { >>- zfs_handle_t **dslist =3D NULL; >>- size_t i, count =3D 0; >> char *protocol =3D NULL; >>=20 >> if (op =3D=3D OP_SHARE && argc > 0) { >>@@ -6216,35 +6251,48 @@ share_mount(int op, int argc, char **argv) >> } >>=20 >> start_progress_timer(); >>- get_all_datasets(&dslist, &count, 
verbose); >>+ get_all_cb_t cb =3D { 0 }; >>+ get_all_datasets(&cb, verbose); >>=20 >>- if (count =3D=3D 0) >>+ if (cb=2Ecb_used =3D=3D 0) { >>+ if (options !=3D NULL) >>+ free(options); >> return (0); >>+ } >>=20 >>- qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp); >> #ifdef illumos >>- sa_init_selective_arg_t sharearg; >>- sharearg=2Ezhandle_arr =3D dslist; >>- sharearg=2Ezhandle_len =3D count; >>- if ((ret =3D zfs_init_libshare_arg(zfs_get_handle(dslist[0]), >>- SA_INIT_SHARE_API_SELECTIVE, &sharearg)) !=3D SA_OK) { >>- (void) fprintf(stderr, >>- gettext("Could not initialize libshare, %d"), ret); >>- return (ret); >>+ if (op =3D=3D OP_SHARE) { >>+ sa_init_selective_arg_t sharearg; >>+ sharearg=2Ezhandle_arr =3D cb=2Ecb_handles; >>+ sharearg=2Ezhandle_len =3D cb=2Ecb_used; >>+ if ((ret =3D zfs_init_libshare_arg(g_zfs, >>+ SA_INIT_SHARE_API_SELECTIVE, &sharearg)) !=3D SA_OK) { >>+ (void) fprintf(stderr, gettext( >>+ "Could not initialize libshare, %d"), ret); >>+ return (ret); >>+ } >> } >> #endif >>+ share_mount_state_t share_mount_state =3D { 0 }; >>+ share_mount_state=2Esm_op =3D op; >>+ share_mount_state=2Esm_verbose =3D verbose; >>+ share_mount_state=2Esm_flags =3D flags; >>+ share_mount_state=2Esm_options =3D options; >>+ share_mount_state=2Esm_proto =3D protocol; >>+ share_mount_state=2Esm_total =3D cb=2Ecb_used; >>+ pthread_mutex_init(&share_mount_state=2Esm_lock, NULL); >>=20 >>- for (i =3D 0; i < count; i++) { >>- if (verbose) >>- report_mount_progress(i, count); >>+ /* >>+ * libshare isn't mt-safe, so only do the operation in parallel >>+ * if we're mounting=2E >>+ */ >>+ zfs_foreach_mountpoint(g_zfs, cb=2Ecb_handles, cb=2Ecb_used, >>+ share_mount_one_cb, &share_mount_state, op =3D=3D OP_MOUNT); >>+ ret =3D share_mount_state=2Esm_status; >>=20 >>- if (share_mount_one(dslist[i], op, flags, protocol, >>- B_FALSE, options) !=3D 0) >>- ret =3D 1; >>- zfs_close(dslist[i]); >>- } >>- >>- free(dslist); >>+ for (int i =3D 0; i < cb=2Ecb_used; i++) >>+ 
zfs_close(cb=2Ecb_handles[i]); >>+ free(cb=2Ecb_handles); >> } else if (argc =3D=3D 0) { >> struct mnttab entry; >>=20 >> >>Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh >>=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh Tue Feb >26 >>06:22:10 2019 (r344568) >>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh Tue Feb >26 >>08:18:34 2019 (r344569) >>@@ -579,12 +579,12 @@ typedef struct get_all_cb { >> zfs_handle_t **cb_handles; >> size_t cb_alloc; >> size_t cb_used; >>- boolean_t cb_verbose; >>- int (*cb_getone)(zfs_handle_t *, void *); >> } get_all_cb_t; >>=20 >>+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, >>size_t, >>+ zfs_iter_f, void*, boolean_t); >>+ >> void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); >>-int libzfs_dataset_cmp(const void *, const void *); >>=20 >> /* >> * Functions to create and destroy datasets=2E >> >>Modified: >>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec >>=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >>--- >>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec Tue >>Feb 26 06:22:10 2019 (r344568) >>+++ >>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec Tue >>Feb 26 08:18:34 2019 (r344569) >>@@ -799,6 +799,7 @@ libzfs_mnttab_cache_compare(const void *arg1, >const >>vo >> void >> libzfs_mnttab_init(libzfs_handle_t *hdl) >> { >>+ pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); >> assert(avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0); >> 
avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, >> sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); >>@@ -839,6 +840,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl) >> free(mtn); >> } >> avl_destroy(&hdl->libzfs_mnttab_cache); >>+ (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); >> } >>=20 >> void >>@@ -853,6 +855,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const >char >>*f >> { >> mnttab_node_t find; >> mnttab_node_t *mtn; >>+ int ret =3D ENOENT; >>=20 >> if (!hdl->libzfs_mnttab_enable) { >> struct mnttab srch =3D { 0 }; >>@@ -868,6 +871,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const >char >>*f >> return (ENOENT); >> } >>=20 >>+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); >> if (avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0) >> libzfs_mnttab_update(hdl); >>=20 >>@@ -875,9 +879,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const >>char *f >> mtn =3D avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); >> if (mtn) { >> *entry =3D mtn->mtn_mt; >>- return (0); >>+ ret =3D 0; >> } >>- return (ENOENT); >>+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); >>+ return (ret); >> } >>=20 >> void >>@@ -886,15 +891,17 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const >>char *sp >> { >> mnttab_node_t *mtn; >>=20 >>- if (avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0) >>- return; >>- mtn =3D zfs_alloc(hdl, sizeof (mnttab_node_t)); >>- mtn->mtn_mt=2Emnt_special =3D zfs_strdup(hdl, special); >>- mtn->mtn_mt=2Emnt_mountp =3D zfs_strdup(hdl, mountp); >>- mtn->mtn_mt=2Emnt_fstype =3D zfs_strdup(hdl, MNTTYPE_ZFS); >>- mtn->mtn_mt=2Emnt_mntopts =3D zfs_strdup(hdl, mntopts); >>- avl_add(&hdl->libzfs_mnttab_cache, mtn); >>-} >>+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); >>+ if (avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0) { >>+ mtn =3D zfs_alloc(hdl, sizeof (mnttab_node_t)); >>+ mtn->mtn_mt=2Emnt_special =3D zfs_strdup(hdl, special); >>+ mtn->mtn_mt=2Emnt_mountp =3D zfs_strdup(hdl, mountp); >>+ 
mtn->mtn_mt=2Emnt_fstype =3D zfs_strdup(hdl, MNTTYPE_ZFS); >>+ mtn->mtn_mt=2Emnt_mntopts =3D zfs_strdup(hdl, mntopts); >>+ avl_add(&hdl->libzfs_mnttab_cache, mtn); >>+ } >>+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); >>+} =09 >>=20 >> void >> libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) >>@@ -902,6 +909,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const >>char=20 >> mnttab_node_t find; >> mnttab_node_t *ret; >>=20 >>+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); >> find=2Emtn_mt=2Emnt_special =3D (char *)fsname; >> if ((ret =3D avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) >> !=3D NULL) { >>@@ -912,6 +920,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const >>char=20 >> free(ret->mtn_mt=2Emnt_mntopts); >> free(ret); >> } >>+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); >> } >>=20 >> int >> >>Modified: >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh >>=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh Tue >>Feb 26 06:22:10 2019 (r344568) >>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh Tue >>Feb 26 08:18:34 2019 (r344569) >>@@ -22,7 +22,7 @@ >> /* >>* Copyright (c) 2005, 2010, Oracle and/or its affiliates=2E All rights >>reserved=2E >> * Copyright (c) 2011 Pawel Jakub Dawidek=2E All rights reserved=2E >>- * Copyright (c) 2011, 2016 by Delphix=2E All rights reserved=2E >>+ * Copyright (c) 2011, 2017 by Delphix=2E All rights reserved=2E >>* Copyright (c) 2013 Martin Matuska <mm@FreeBSD=2Eorg>=2E All rights >>reserved=2E >> */ >>=20 >>@@ -73,6 +73,13 @@ struct libzfs_handle { >> int libzfs_storeerr; /* stuff error messages into buffer */ >> void *libzfs_sharehdl; /* libshare handle */ >> boolean_t 
libzfs_mnttab_enable; >>+ /* >>+ * We need a lock to handle the case where parallel mount >>+ * threads are populating the mnttab cache simultaneously=2E The >>+ * lock only protects the integrity of the avl tree, and does >>+ * not protect the contents of the mnttab entries themselves=2E >>+ */ >>+ pthread_mutex_t libzfs_mnttab_cache_lock; >> avl_tree_t libzfs_mnttab_cache; >> int libzfs_pool_iter; >> libzfs_fru_t **libzfs_fru_hash; >> >>Modified: >>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec >>=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec Tue >>Feb 26 06:22:10 2019 (r344568) >>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec Tue >>Feb 26 08:18:34 2019 (r344569) >>@@ -26,6 +26,7 @@ >> * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail=2Ecom> >> * Copyright 2017 Joyent, Inc=2E >> * Copyright 2017 RackTop Systems=2E >>+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association=2E >> */ >>=20 >> /* >>@@ -34,25 +35,25 @@ >> * they are used by mount and unmount and when changing a >filesystem's >> * mountpoint=2E >> * >>- * zfs_is_mounted() >>- * zfs_mount() >>- * zfs_unmount() >>- * zfs_unmountall() >>+ * zfs_is_mounted() >>+ * zfs_mount() >>+ * zfs_unmount() >>+ * zfs_unmountall() >> * >>* This file also contains the functions used to manage sharing >>filesystems via >> * NFS and iSCSI: >> * >>- * zfs_is_shared() >>- * zfs_share() >>- * zfs_unshare() >>+ * zfs_is_shared() >>+ * zfs_share() >>+ * zfs_unshare() >> * >>- * zfs_is_shared_nfs() >>- * zfs_is_shared_smb() >>- * zfs_share_proto() >>- * zfs_shareall(); >>- * zfs_unshare_nfs() >>- * zfs_unshare_smb() >>- * zfs_unshareall_nfs() >>+ * zfs_is_shared_nfs() >>+ * zfs_is_shared_smb() >>+ * zfs_share_proto() 
>>+ * zfs_shareall(); >>+ * zfs_unshare_nfs() >>+ * zfs_unshare_smb() >>+ * zfs_unshareall_nfs() >> * zfs_unshareall_smb() >> * zfs_unshareall() >> * zfs_unshareall_bypath() >>@@ -60,8 +61,8 @@ >> * The following functions are available for pool consumers, and will >> * mount/unmount and share/unshare all datasets within pool: >> * >>- * zpool_enable_datasets() >>- * zpool_disable_datasets() >>+ * zpool_enable_datasets() >>+ * zpool_disable_datasets() >> */ >>=20 >> #include <dirent=2Eh> >>@@ -83,10 +84,14 @@ >> #include <libzfs=2Eh> >>=20 >> #include "libzfs_impl=2Eh" >>+#include <thread_pool=2Eh> >>=20 >> #include <libshare=2Eh> >> #define MAXISALEN 257 /* based on sysinfo(2) man page */ >>=20 >>+static int mount_tp_nthr =3D 512; /* tpool threads for multi-threaded >>mounting */ >>+ >>+static void zfs_mount_task(void *); >> static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); >> zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, >> zfs_share_proto_t); >>@@ -1134,25 +1139,32 @@ remove_mountpoint(zfs_handle_t *zhp) >> } >> } >>=20 >>+/* >>+ * Add the given zfs handle to the cb_handles array, dynamically >>reallocating >>+ * the array if it is out of space >>+ */ >> void >> libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp) >> { >> if (cbp->cb_alloc =3D=3D cbp->cb_used) { >> size_t newsz; >>- void *ptr; >>+ zfs_handle_t **newhandles; >>=20 >>- newsz =3D cbp->cb_alloc ? cbp->cb_alloc * 2 : 64; >>- ptr =3D zfs_realloc(zhp->zfs_hdl, >>- cbp->cb_handles, cbp->cb_alloc * sizeof (void *), >>- newsz * sizeof (void *)); >>- cbp->cb_handles =3D ptr; >>+ newsz =3D cbp->cb_alloc !=3D 0 ? 
cbp->cb_alloc * 2 : 64; >>+ newhandles =3D zfs_realloc(zhp->zfs_hdl, >>+ cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *), >>+ newsz * sizeof (zfs_handle_t *)); >>+ cbp->cb_handles =3D newhandles; >> cbp->cb_alloc =3D newsz; >> } >> cbp->cb_handles[cbp->cb_used++] =3D zhp; >> } >>=20 >>+/* >>+ * Recursive helper function used during file system enumeration >>+ */ >> static int >>-mount_cb(zfs_handle_t *zhp, void *data) >>+zfs_iter_cb(zfs_handle_t *zhp, void *data) >> { >> get_all_cb_t *cbp =3D data; >>=20 >>@@ -1178,104 +1190,362 @@ mount_cb(zfs_handle_t *zhp, void *data) >> } >>=20 >> libzfs_add_handle(cbp, zhp); >>- if (zfs_iter_filesystems(zhp, mount_cb, cbp) !=3D 0) { >>+ if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) !=3D 0) { >> zfs_close(zhp); >> return (-1); >> } >> return (0); >> } >>=20 >>-int >>-libzfs_dataset_cmp(const void *a, const void *b) >>+/* >>+ * Sort comparator that compares two mountpoint paths=2E We sort these >>paths so >>+ * that subdirectories immediately follow their parents=2E This means >>that we >>+ * effectively treat the '/' character as the lowest value non-nul >>char=2E >>+ * Since filesystems from non-global zones can have the same >>mountpoint >>+ * as other filesystems, the comparator sorts global zone filesystems >>to >>+ * the top of the list=2E This means that the global zone will traverse >>the >>+ * filesystem list in the correct order and can stop when it sees the >>+ * first zoned filesystem=2E In a non-global zone, only the delegated >>+ * filesystems are seen=2E >>+ * >>+ * An example sorted list using this comparator would look like: >>+ * >>+ * /foo >>+ * /foo/bar >>+ * /foo/bar/baz >>+ * /foo/baz >>+ * /foo=2Ebar >>+ * /foo (NGZ1) >>+ * /foo (NGZ2) >>+ * >>+ * The mount code depend on this ordering to deterministically >iterate >>+ * over filesystems in order to spawn parallel mount tasks=2E >>+ */ >>+static int >>+mountpoint_cmp(const void *arga, const void *argb) >> { >>- zfs_handle_t **za =3D 
(zfs_handle_t **)a; >>- zfs_handle_t **zb =3D (zfs_handle_t **)b; >>+ zfs_handle_t *const *zap =3D arga; >>+ zfs_handle_t *za =3D *zap; >>+ zfs_handle_t *const *zbp =3D argb; >>+ zfs_handle_t *zb =3D *zbp; >> char mounta[MAXPATHLEN]; >> char mountb[MAXPATHLEN]; >>+ const char *a =3D mounta; >>+ const char *b =3D mountb; >> boolean_t gota, gotb; >>+ uint64_t zoneda, zonedb; >>=20 >>- if ((gota =3D (zfs_get_type(*za) =3D=3D ZFS_TYPE_FILESYSTEM)) !=3D 0) >>- verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta, >>+ zoneda =3D zfs_prop_get_int(za, ZFS_PROP_ZONED); >>+ zonedb =3D zfs_prop_get_int(zb, ZFS_PROP_ZONED); >>+ if (zoneda && !zonedb) >>+ return (1); >>+ if (!zoneda && zonedb) >>+ return (-1); >>+ gota =3D (zfs_get_type(za) =3D=3D ZFS_TYPE_FILESYSTEM); >>+ if (gota) >>+ verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta, >> sizeof (mounta), NULL, NULL, 0, B_FALSE) =3D=3D 0); >>- if ((gotb =3D (zfs_get_type(*zb) =3D=3D ZFS_TYPE_FILESYSTEM)) !=3D 0) >>- verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb, >>+ gotb =3D (zfs_get_type(zb) =3D=3D ZFS_TYPE_FILESYSTEM); >>+ if (gotb) >>+ verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb, >> sizeof (mountb), NULL, NULL, 0, B_FALSE) =3D=3D 0); >>=20 >>- if (gota && gotb) >>- return (strcmp(mounta, mountb)); >>+ if (gota && gotb) { >>+ while (*a !=3D '\0' && (*a =3D=3D *b)) { >>+ a++; >>+ b++; >>+ } >>+ if (*a =3D=3D *b) >>+ return (0); >>+ if (*a =3D=3D '\0') >>+ return (-1); >>+ if (*b =3D=3D '\0') >>+ return (-1); >>+ if (*a =3D=3D '/') >>+ return (-1); >>+ if (*b =3D=3D '/') >>+ return (-1); >>+ return (*a < *b ? 
-1 : *a > *b); >>+ } >>=20 >> if (gota) >> return (-1); >> if (gotb) >> return (1); >>=20 >>- return (strcmp(zfs_get_name(a), zfs_get_name(b))); >>+ /* >>+ * If neither filesystem has a mountpoint, revert to sorting by >>+ * datset name=2E >>+ */ >>+ return (strcmp(zfs_get_name(za), zfs_get_name(zb))); >> } >>=20 >> /* >>+ * Reutrn true if path2 is a child of path1 >>+ */ >>+static boolean_t >>+libzfs_path_contains(const char *path1, const char *path2) >>+{ >>+ return (strstr(path2, path1) =3D=3D path2 && path2[strlen(path1)] =3D= =3D >>'/'); >>+} >>+ >>+ >>+static int >>+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int >>idx) >>+{ >>+ char parent[ZFS_MAXPROPLEN]; >>+ char child[ZFS_MAXPROPLEN]; >>+ int i; >>+ >>+ verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent, >>+ sizeof (parent), NULL, NULL, 0, B_FALSE) =3D=3D 0); >>+ >>+ for (i =3D idx + 1; i < num_handles; i++) { >>+ verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child, >>+ sizeof (child), NULL, NULL, 0, B_FALSE) =3D=3D 0); >>+ if (!libzfs_path_contains(parent, child)) >>+ break; >>+ } >>+ return (i); >>+} >>+ >>+typedef struct mnt_param { >>+ libzfs_handle_t *mnt_hdl; >>+ tpool_t *mnt_tp; >>+ zfs_handle_t **mnt_zhps; /* filesystems to mount */ >>+ size_t mnt_num_handles; >>+ int mnt_idx; /* Index of selected entry to mount */ >>+ zfs_iter_f mnt_func; >>+ void *mnt_data; >>+} mnt_param_t; >>+ >>+/* >>+ * Allocate and populate the parameter struct for mount function, and >>+ * schedule mounting of the entry selected by idx=2E >>+ */ >>+static void >>+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles, >>+ size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t >>*tp) >>+{ >>+ mnt_param_t *mnt_param =3D zfs_alloc(hdl, sizeof (mnt_param_t)); >>+ >>+ mnt_param->mnt_hdl =3D hdl; >>+ mnt_param->mnt_tp =3D tp; >>+ mnt_param->mnt_zhps =3D handles; >>+ mnt_param->mnt_num_handles =3D num_handles; >>+ mnt_param->mnt_idx =3D idx; >>+ mnt_param->mnt_func =3D 
func; >>+ mnt_param->mnt_data =3D data; >>+ >>+ (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param); >>+} >>+ >>+/* >>+ * This is the structure used to keep state of mounting or sharing >>operations >>+ * during a call to zpool_enable_datasets()=2E >>+ */ >>+typedef struct mount_state { >>+ /* >>+ * ms_mntstatus is set to -1 if any mount fails=2E While multiple >>threads >>+ * could update this variable concurrently, no synchronization is >>+ * needed as it's only ever set to -1=2E >>+ */ >>+ int ms_mntstatus; >>+ int ms_mntflags; >>+ const char *ms_mntopts; >>+} mount_state_t; >>+ >>+static int >>+zfs_mount_one(zfs_handle_t *zhp, void *arg) >>+{ >>+ mount_state_t *ms =3D arg; >>+ int ret =3D 0; >>+ >>+ if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) !=3D 0) >>+ ret =3D ms->ms_mntstatus =3D -1; >>+ return (ret); >>+} >>+ >>+static int >>+zfs_share_one(zfs_handle_t *zhp, void *arg) >>+{ >>+ mount_state_t *ms =3D arg; >>+ int ret =3D 0; >>+ >>+ if (zfs_share(zhp) !=3D 0) >>+ ret =3D ms->ms_mntstatus =3D -1; >>+ return (ret); >>+} >>+ >>+/* >>+ * Thread pool function to mount one file system=2E On completion, it >>finds and >>+ * schedules its children to be mounted=2E This depends on the sorting >>done in >>+ * zfs_foreach_mountpoint()=2E Note that the degenerate case (chain of >>entries >>+ * each descending from the previous) will have no parallelism since >>we always >>+ * have to wait for the parent to finish mounting before we can >>schedule >>+ * its children=2E >>+ */ >>+static void >>+zfs_mount_task(void *arg) >>+{ >>+ mnt_param_t *mp =3D arg; >>+ int idx =3D mp->mnt_idx; >>+ zfs_handle_t **handles =3D mp->mnt_zhps; >>+ size_t num_handles =3D mp->mnt_num_handles; >>+ char mountpoint[ZFS_MAXPROPLEN]; >>+ >>+ verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint, >>+ sizeof (mountpoint), NULL, NULL, 0, B_FALSE) =3D=3D 0); >>+ >>+ if (mp->mnt_func(handles[idx], mp->mnt_data) !=3D 0) >>+ return; >>+ >>+ /* >>+ * We dispatch tasks to mount 
filesystems with mountpoints >underneath >>+ * this one=2E We do this by dispatching the next filesystem with a >>+ * descendant mountpoint of the one we just mounted, then skip all >of >>+ * its descendants, dispatch the next descendant mountpoint, and so >>on=2E >>+ * The non_descendant_idx() function skips over filesystems that are >>+ * descendants of the filesystem we just dispatched=2E >>+ */ >>+ for (int i =3D idx + 1; i < num_handles; >>+ i =3D non_descendant_idx(handles, num_handles, i)) { >>+ char child[ZFS_MAXPROPLEN]; >>+ verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, >>+ child, sizeof (child), NULL, NULL, 0, B_FALSE) =3D=3D 0); >>+ >>+ if (!libzfs_path_contains(mountpoint, child)) >>+ break; /* not a descendant, return */ >>+ zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i, >>+ mp->mnt_func, mp->mnt_data, mp->mnt_tp); >>+ } >>+ free(mp); >>+} >>+ >>+/* >>+ * Issue the func callback for each ZFS handle contained in the >>handles >>+ * array=2E This function is used to mount all datasets, and so this >>function >>+ * guarantees that filesystems for parent mountpoints are called >>before their >>+ * children=2E As such, before issuing any callbacks, we first sort the >>array >>+ * of handles by mountpoint=2E >>+ * >>+ * Callbacks are issued in one of two ways: >>+ * >>+ * 1=2E Sequentially: If the parallel argument is B_FALSE or the >>ZFS_SERIAL_MOUNT >>+ * environment variable is set, then we issue callbacks >>sequentially=2E >>+ * >>+ * 2=2E In parallel: If the parallel argument is B_TRUE and the >>ZFS_SERIAL_MOUNT >>+ * environment variable is not set, then we use a tpool to >dispatch >>threads >>+ * to mount filesystems in parallel=2E This function dispatches >tasks >>to mount >>+ * the filesystems at the top-level mountpoints, and these tasks >in >>turn >>+ * are responsible for recursively mounting filesystems in their >>children >>+ * mountpoints=2E >>+ */ >>+void >>+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, 
>>+ size_t num_handles, zfs_iter_f func, void *data, boolean_t >>parallel) >>+{ >>+ zoneid_t zoneid =3D getzoneid(); >>+ >>+ /* >>+ * The ZFS_SERIAL_MOUNT environment variable is an undocumented >>+ * variable that can be used as a convenience to do a/b comparison >>+ * of serial vs=2E parallel mounting=2E >>+ */ >>+ boolean_t serial_mount =3D !parallel || >>+ (getenv("ZFS_SERIAL_MOUNT") !=3D NULL); >>+ >>+ /* >>+ * Sort the datasets by mountpoint=2E See mountpoint_cmp for details >>+ * of how these are sorted=2E >>+ */ >>+ qsort(handles, num_handles, sizeof (zfs_handle_t *), >mountpoint_cmp); >>+ >>+ if (serial_mount) { >>+ for (int i =3D 0; i < num_handles; i++) { >>+ func(handles[i], data); >>+ } >>+ return; >>+ } >>+ >>+ /* >>+ * Issue the callback function for each dataset using a parallel >>+ * algorithm that uses a thread pool to manage threads=2E >>+ */ >>+ tpool_t *tp =3D tpool_create(1, mount_tp_nthr, 0, NULL); >>+ >>+ /* >>+ * There may be multiple "top level" mountpoints outside of the >>pool's >>+ * root mountpoint, e=2Eg=2E: /foo /bar=2E Dispatch a mount task for e= ach >>of >>+ * these=2E >>+ */ >>+ for (int i =3D 0; i < num_handles; >>+ i =3D non_descendant_idx(handles, num_handles, i)) { >>+ /* >>+ * Since the mountpoints have been sorted so that the zoned >>+ * filesystems are at the end, a zoned filesystem seen from >>+ * the global zone means that we're done=2E >>+ */ >>+ if (zoneid =3D=3D GLOBAL_ZONEID && >>+ zfs_prop_get_int(handles[i], ZFS_PROP_ZONED)) >>+ break; >>+ zfs_dispatch_mount(hdl, handles, num_handles, i, func, data, >>+ tp); >>+ } >>+ >>+ tpool_wait(tp); /* wait for all scheduled mounts to complete */ >>+ tpool_destroy(tp); >>+} >>+ >>+/* >>* Mount and share all datasets within the given pool=2E This assumes >>that no >>- * datasets within the pool are currently mounted=2E Because users can >>create >>- * complicated nested hierarchies of mountpoints, we first gather all >>the >>- * datasets and mountpoints within the pool, and 
sort them by >>mountpoint=2E Once >>- * we have the list of all filesystems, we iterate over them in order >>and mount >>- * and/or share each one=2E >>+ * datasets within the pool are currently mounted=2E >> */ >> #pragma weak zpool_mount_datasets =3D zpool_enable_datasets >> int >>zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int >>flags) >> { >> get_all_cb_t cb =3D { 0 }; >>- libzfs_handle_t *hdl =3D zhp->zpool_hdl; >>+ mount_state_t ms =3D { 0 }; >> zfs_handle_t *zfsp; >>- int i, ret =3D -1; >>- int *good; >>+ int ret =3D 0; >>=20 >>- /* >>- * Gather all non-snap datasets within the pool=2E >>- */ >>- if ((zfsp =3D zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) =3D=3D >>NULL) >>+ if ((zfsp =3D zfs_open(zhp->zpool_hdl, zhp->zpool_name, >>+ ZFS_TYPE_DATASET)) =3D=3D NULL) >> goto out; >>=20 >>- libzfs_add_handle(&cb, zfsp); >>- if (zfs_iter_filesystems(zfsp, mount_cb, &cb) !=3D 0) >>- goto out; >> /* >>- * Sort the datasets by mountpoint=2E >>+ * Gather all non-snapshot datasets within the pool=2E Start by adding >>+ * the root filesystem for this pool to the list, and then iterate >>+ * over all child filesystems=2E >> */ >>- qsort(cb=2Ecb_handles, cb=2Ecb_used, sizeof (void *), >>- libzfs_dataset_cmp); >>+ libzfs_add_handle(&cb, zfsp); >>+ if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) !=3D 0) >>+ goto out; >>=20 >> /* >>- * And mount all the datasets, keeping track of which ones >>- * succeeded or failed=2E >>+ * Mount all filesystems >> */ >>- if ((good =3D zfs_alloc(zhp->zpool_hdl, >>- cb=2Ecb_used * sizeof (int))) =3D=3D NULL) >>- goto out; >>+ ms=2Ems_mntopts =3D mntopts; >>+ ms=2Ems_mntflags =3D flags; >>+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb=2Ecb_handles, cb=2Ecb_used, >>+ zfs_mount_one, &ms, B_TRUE); >>+ if (ms=2Ems_mntstatus !=3D 0) >>+ ret =3D ms=2Ems_mntstatus; >>=20 >>- ret =3D 0; >>- for (i =3D 0; i < cb=2Ecb_used; i++) { >>- if (zfs_mount(cb=2Ecb_handles[i], mntopts, flags) !=3D 0) >>- ret =3D -1; >>- else >>- good[i] 
=3D 1; >>- } >>- >> /* >>- * Then share all the ones that need to be shared=2E This needs >>- * to be a separate pass in order to avoid excessive reloading >>- * of the configuration=2E Good should never be NULL since >>- * zfs_alloc is supposed to exit if memory isn't available=2E >>+ * Share all filesystems that need to be shared=2E This needs to be >>+ * a separate pass because libshare is not mt-safe, and so we need >>+ * to share serially=2E >> */ >>- for (i =3D 0; i < cb=2Ecb_used; i++) { >>- if (good[i] && zfs_share(cb=2Ecb_handles[i]) !=3D 0) >>- ret =3D -1; >>- } >>+ ms=2Ems_mntstatus =3D 0; >>+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb=2Ecb_handles, cb=2Ecb_used, >>+ zfs_share_one, &ms, B_FALSE); >>+ if (ms=2Ems_mntstatus !=3D 0) >>+ ret =3D ms=2Ems_mntstatus; >>=20 >>- free(good); >>- >> out: >>- for (i =3D 0; i < cb=2Ecb_used; i++) >>+ for (int i =3D 0; i < cb=2Ecb_used; i++) >> zfs_close(cb=2Ecb_handles[i]); >> free(cb=2Ecb_handles); >>=20 > >This broke my systems, many filesystems fail to mount causing nullfs >late mounts to fail=2E No details now until tonight=2E > >Suggest we back this out until it is properly tested=2E Nested zfs filesystems seem not to be handled properly or possibly not sup= ported any more=2E This explains my mail gateway also not mounting all file= systems in /home=2E It was odd that dovecot stopped working=2E The symptom of the problem is zfs mount -a no longer mounts all filesystem= s=2E Zfs mount fails saying the filesystem is already mounted=2E The workar= ound is to zfs umount each affected zfs dataset by hand and zfs mount it by= hand=2E Generally this has screwed up sites that have hundreds (in my case 122) zf= s datasets=2E The work around might be to script testing each mount, unmoun= ting and remounting if necessary=2E I'm being sarcastic about creating an rc script to clean this up=2E This n= eeds to be backed out and tested properly before being committed=2E=20 --=20 Pardon the typos and autocorrect, small keyboard 
in use. Cheers, Cy Schubert <Cy.Schubert@cschubert.com> FreeBSD UNIX: <cy@FreeBSD.org> Web: http://www.FreeBSD.org The need of the many outweighs the greed of the few.
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?1F5FB340-1CD2-400F-82D3-8D4949A351A2>