Date: Tue, 26 Feb 2019 07:48:27 -0800 From: Cy Schubert <Cy.Schubert@cschubert.com> To: Baptiste Daroussin <bapt@FreeBSD.org>, src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: Re: svn commit: r344569 - in head/cddl/contrib/opensolaris: cmd/zfs lib/libzfs/common Message-ID: <39F76FF0-B40A-43F9-AC77-5D535674A4B8@cschubert.com> In-Reply-To: <201902260818.x1Q8IZGO061175@repo.freebsd.org> References: <201902260818.x1Q8IZGO061175@repo.freebsd.org>
next in thread | previous in thread | raw e-mail | index | archive | help
On February 26, 2019 12:18:35 AM PST, Baptiste Daroussin <bapt@FreeBSD=2Eor= g> wrote: >Author: bapt >Date: Tue Feb 26 08:18:34 2019 >New Revision: 344569 >URL: https://svnweb=2Efreebsd=2Eorg/changeset/base/344569 > >Log: > Implement parallel mounting for ZFS filesystem > =20 > It was first implemented on Illumos and then ported to ZoL=2E > This patch is a port to FreeBSD of the ZoL version=2E > This patch also includes a fix for a race condition that was amended > =20 >With such patch Delphix has seen a huge decrease in latency of the >mount phase > (https://github=2Ecom/openzfs/openzfs/commit/a3f0e2b569 for details)=2E >With that current change Gandi has measured improvments that are on par >with > those reported by Delphix=2E > =20 > Zol commits incorporated: >https://github=2Ecom/zfsonlinux/zfs/commit/a10d50f999511d304f910852c7825c= 70c9c9e303 >https://github=2Ecom/zfsonlinux/zfs/commit/e63ac16d25fbe991a356489c86d407= 7567dfea21 > =20 > Reviewed by: avg, sef > Approved by: avg, sef > Obtained from: ZoL > MFC after: 1 month > Relnotes: yes > Sponsored by: Gandi=2Enet > Differential Revision: https://reviews=2Efreebsd=2Eorg/D19098 > >Modified: > head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec > head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh > head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec > head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh > head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec > >Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec >=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec Tue Feb 26 >06:22:10 2019 (r344568) >+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main=2Ec Tue Feb 26 >08:18:34 2019 (r344569) >@@ -5812,8 +5812,13 @@ zfs_do_holds(int argc, char **argv) >=20 > #define CHECK_SPINNER 30 > #define SPINNER_TIME 3 /* seconds */ >-#define MOUNT_TIME 5 /* seconds */ >+#define MOUNT_TIME 1 /* seconds */ >=20 >+typedef struct get_all_state { >+ boolean_t ga_verbose; >+ get_all_cb_t *ga_cbp; >+} get_all_state_t; >+ > static int > get_one_dataset(zfs_handle_t *zhp, void *data) > { >@@ -5821,10 +5826,10 @@ get_one_dataset(zfs_handle_t *zhp, void *data) > static int spinval =3D 0; > static int spincheck =3D 0; > static time_t last_spin_time =3D (time_t)0; >- get_all_cb_t *cbp =3D data; >+ get_all_state_t *state =3D data; > zfs_type_t type =3D zfs_get_type(zhp); >=20 >- if (cbp->cb_verbose) { >+ if (state->ga_verbose) { > if (--spincheck < 0) { > time_t now =3D time(NULL); > if (last_spin_time + SPINNER_TIME < now) { >@@ -5850,26 +5855,24 @@ get_one_dataset(zfs_handle_t *zhp, void *data) > zfs_close(zhp); > return (0); > } >- libzfs_add_handle(cbp, zhp); >- assert(cbp->cb_used <=3D cbp->cb_alloc); >+ libzfs_add_handle(state->ga_cbp, zhp); >+ assert(state->ga_cbp->cb_used <=3D state->ga_cbp->cb_alloc); >=20 > return (0); > } >=20 > static void >-get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t >verbose) >+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) > { >- get_all_cb_t cb =3D { 0 }; >- cb=2Ecb_verbose =3D verbose; >- cb=2Ecb_getone =3D get_one_dataset; >+ get_all_state_t state =3D { >+ =2Ega_verbose =3D verbose, >+ =2Ega_cbp =3D cbp >+ }; >=20 > if (verbose) > set_progress_header(gettext("Reading ZFS config")); >- (void) zfs_iter_root(g_zfs, get_one_dataset, &cb); >+ (void) zfs_iter_root(g_zfs, get_one_dataset, &state); >=20 >- *dslist =3D cb=2Ecb_handles; >- *count =3D cb=2Ecb_used; >- > if (verbose) > finish_progress(gettext("done=2E")); > } >@@ -5879,9 +5882,20 @@ get_all_datasets(zfs_handle_t ***dslist, size_t >*count >* similar, we have a common function with an extra parameter to >determine which > * mode we are using=2E > */ >-#define OP_SHARE 0x1 >-#define OP_MOUNT 0x2 >+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; >=20 >+typedef struct share_mount_state { >+ share_mount_op_t sm_op; >+ boolean_t sm_verbose; >+ int sm_flags; >+ char *sm_options; >+ char *sm_proto; /* only valid for OP_SHARE */ >+ pthread_mutex_t sm_lock; /* protects the remaining fields */ >+ uint_t sm_total; /* number of filesystems to process */ >+ uint_t sm_done; /* number of filesystems processed */ >+ int sm_status; /* -1 if any of the share/mount operations failed */ >+} share_mount_state_t; >+ > /* > * Share or mount a dataset=2E > */ >@@ -6122,6 +6136,29 @@ report_mount_progress(int current, int total) > update_progress(info); > } >=20 >+/* >+ * zfs_foreach_mountpoint() callback that mounts or shares on >filesystem and >+ * updates the progress meter >+ */ >+static int >+share_mount_one_cb(zfs_handle_t *zhp, void *arg) >+{ >+ share_mount_state_t *sms =3D arg; >+ int ret; >+ >+ ret =3D share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, >+ B_FALSE, sms->sm_options); >+ >+ pthread_mutex_lock(&sms->sm_lock); >+ if (ret !=3D 0) >+ sms->sm_status =3D ret; >+ sms->sm_done++; >+ if (sms->sm_verbose) >+ report_mount_progress(sms->sm_done, sms->sm_total); >+ pthread_mutex_unlock(&sms->sm_lock); >+ return (ret); >+} >+ > static void > append_options(char *mntopts, char *newopts) > { >@@ -6194,8 +6231,6 @@ share_mount(int op, int argc, char **argv) >=20 > /* check number of arguments */ > if (do_all) { >- zfs_handle_t **dslist =3D NULL; >- size_t i, count =3D 0; > char *protocol =3D NULL; >=20 > if (op =3D=3D OP_SHARE && argc > 0) { >@@ -6216,35 +6251,48 @@ share_mount(int op, int argc, char **argv) > } >=20 > start_progress_timer(); >- get_all_datasets(&dslist, &count, verbose); >+ get_all_cb_t cb =3D { 0 }; >+ get_all_datasets(&cb, verbose); >=20 >- if (count =3D=3D 0) >+ if (cb=2Ecb_used =3D=3D 0) { >+ if (options !=3D NULL) >+ free(options); > return (0); >+ } >=20 >- qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp); > #ifdef illumos >- sa_init_selective_arg_t sharearg; >- sharearg=2Ezhandle_arr =3D dslist; >- sharearg=2Ezhandle_len =3D count; >- if ((ret =3D zfs_init_libshare_arg(zfs_get_handle(dslist[0]), >- SA_INIT_SHARE_API_SELECTIVE, &sharearg)) !=3D SA_OK) { >- (void) fprintf(stderr, >- gettext("Could not initialize libshare, %d"), ret); >- return (ret); >+ if (op =3D=3D OP_SHARE) { >+ sa_init_selective_arg_t sharearg; >+ sharearg=2Ezhandle_arr =3D cb=2Ecb_handles; >+ sharearg=2Ezhandle_len =3D cb=2Ecb_used; >+ if ((ret =3D zfs_init_libshare_arg(g_zfs, >+ SA_INIT_SHARE_API_SELECTIVE, &sharearg)) !=3D SA_OK) { >+ (void) fprintf(stderr, gettext( >+ "Could not initialize libshare, %d"), ret); >+ return (ret); >+ } > } > #endif >+ share_mount_state_t share_mount_state =3D { 0 }; >+ share_mount_state=2Esm_op =3D op; >+ share_mount_state=2Esm_verbose =3D verbose; >+ share_mount_state=2Esm_flags =3D flags; >+ share_mount_state=2Esm_options =3D options; >+ share_mount_state=2Esm_proto =3D protocol; >+ share_mount_state=2Esm_total =3D cb=2Ecb_used; >+ pthread_mutex_init(&share_mount_state=2Esm_lock, NULL); >=20 >- for (i =3D 0; i < count; i++) { >- if (verbose) >- report_mount_progress(i, count); >+ /* >+ * libshare isn't mt-safe, so only do the operation in parallel >+ * if we're mounting=2E >+ */ >+ zfs_foreach_mountpoint(g_zfs, cb=2Ecb_handles, cb=2Ecb_used, >+ share_mount_one_cb, &share_mount_state, op =3D=3D OP_MOUNT); >+ ret =3D share_mount_state=2Esm_status; >=20 >- if (share_mount_one(dslist[i], op, flags, protocol, >- B_FALSE, options) !=3D 0) >- ret =3D 1; >- zfs_close(dslist[i]); >- } >- >- free(dslist); >+ for (int i =3D 0; i < cb=2Ecb_used; i++) >+ zfs_close(cb=2Ecb_handles[i]); >+ free(cb=2Ecb_handles); > } else if (argc =3D=3D 0) { > struct mnttab entry; >=20 > >Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh >=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh Tue Feb 26 >06:22:10 2019 (r344568) >+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs=2Eh Tue Feb 26 >08:18:34 2019 (r344569) >@@ -579,12 +579,12 @@ typedef struct get_all_cb { > zfs_handle_t **cb_handles; > size_t cb_alloc; > size_t cb_used; >- boolean_t cb_verbose; >- int (*cb_getone)(zfs_handle_t *, void *); > } get_all_cb_t; >=20 >+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, >size_t, >+ zfs_iter_f, void*, boolean_t); >+ > void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); >-int libzfs_dataset_cmp(const void *, const void *); >=20 > /* > * Functions to create and destroy datasets=2E > >Modified: >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec >=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >--- >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec Tue >Feb 26 06:22:10 2019 (r344568) >+++ >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset=2Ec Tue >Feb 26 08:18:34 2019 (r344569) >@@ -799,6 +799,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const >vo > void > libzfs_mnttab_init(libzfs_handle_t *hdl) > { >+ pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); > assert(avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0); > avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, > sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); >@@ -839,6 +840,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl) > free(mtn); > } > avl_destroy(&hdl->libzfs_mnttab_cache); >+ (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); > } >=20 > void >@@ -853,6 +855,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char >*f > { > mnttab_node_t find; > mnttab_node_t *mtn; >+ int ret =3D ENOENT; >=20 > if (!hdl->libzfs_mnttab_enable) { > struct mnttab srch =3D { 0 }; >@@ -868,6 +871,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char >*f > return (ENOENT); > } >=20 >+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); > if (avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0) > libzfs_mnttab_update(hdl); >=20 >@@ -875,9 +879,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const >char *f > mtn =3D avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); > if (mtn) { > *entry =3D mtn->mtn_mt; >- return (0); >+ ret =3D 0; > } >- return (ENOENT); >+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); >+ return (ret); > } >=20 > void >@@ -886,15 +891,17 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const >char *sp > { > mnttab_node_t *mtn; >=20 >- if (avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0) >- return; >- mtn =3D zfs_alloc(hdl, sizeof (mnttab_node_t)); >- mtn->mtn_mt=2Emnt_special =3D zfs_strdup(hdl, special); >- mtn->mtn_mt=2Emnt_mountp =3D zfs_strdup(hdl, mountp); >- mtn->mtn_mt=2Emnt_fstype =3D zfs_strdup(hdl, MNTTYPE_ZFS); >- mtn->mtn_mt=2Emnt_mntopts =3D zfs_strdup(hdl, mntopts); >- avl_add(&hdl->libzfs_mnttab_cache, mtn); >-} >+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); >+ if (avl_numnodes(&hdl->libzfs_mnttab_cache) =3D=3D 0) { >+ mtn =3D zfs_alloc(hdl, sizeof (mnttab_node_t)); >+ mtn->mtn_mt=2Emnt_special =3D zfs_strdup(hdl, special); >+ mtn->mtn_mt=2Emnt_mountp =3D zfs_strdup(hdl, mountp); >+ mtn->mtn_mt=2Emnt_fstype =3D zfs_strdup(hdl, MNTTYPE_ZFS); >+ mtn->mtn_mt=2Emnt_mntopts =3D zfs_strdup(hdl, mntopts); >+ avl_add(&hdl->libzfs_mnttab_cache, mtn); >+ } >+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); >+} =09 >=20 > void > libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) >@@ -902,6 +909,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const >char=20 > mnttab_node_t find; > mnttab_node_t *ret; >=20 >+ pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); > find=2Emtn_mt=2Emnt_special =3D (char *)fsname; > if ((ret =3D avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) > !=3D NULL) { >@@ -912,6 +920,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const >char=20 > free(ret->mtn_mt=2Emnt_mntopts); > free(ret); > } >+ pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); > } >=20 > int > >Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh >=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh Tue >Feb 26 06:22:10 2019 (r344568) >+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl=2Eh Tue >Feb 26 08:18:34 2019 (r344569) >@@ -22,7 +22,7 @@ > /* >* Copyright (c) 2005, 2010, Oracle and/or its affiliates=2E All rights >reserved=2E > * Copyright (c) 2011 Pawel Jakub Dawidek=2E All rights reserved=2E >- * Copyright (c) 2011, 2016 by Delphix=2E All rights reserved=2E >+ * Copyright (c) 2011, 2017 by Delphix=2E All rights reserved=2E >* Copyright (c) 2013 Martin Matuska <mm@FreeBSD=2Eorg>=2E All rights >reserved=2E > */ >=20 >@@ -73,6 +73,13 @@ struct libzfs_handle { > int libzfs_storeerr; /* stuff error messages into buffer */ > void *libzfs_sharehdl; /* libshare handle */ > boolean_t libzfs_mnttab_enable; >+ /* >+ * We need a lock to handle the case where parallel mount >+ * threads are populating the mnttab cache simultaneously=2E The >+ * lock only protects the integrity of the avl tree, and does >+ * not protect the contents of the mnttab entries themselves=2E >+ */ >+ pthread_mutex_t libzfs_mnttab_cache_lock; > avl_tree_t libzfs_mnttab_cache; > int libzfs_pool_iter; > libzfs_fru_t **libzfs_fru_hash; > >Modified: >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec >=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D >--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec Tue >Feb 26 06:22:10 2019 (r344568) >+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount=2Ec Tue >Feb 26 08:18:34 2019 (r344569) >@@ -26,6 +26,7 @@ > * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail=2Ecom> > * Copyright 2017 Joyent, Inc=2E > * Copyright 2017 RackTop Systems=2E >+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association=2E > */ >=20 > /* >@@ -34,25 +35,25 @@ > * they are used by mount and unmount and when changing a filesystem's > * mountpoint=2E > * >- * zfs_is_mounted() >- * zfs_mount() >- * zfs_unmount() >- * zfs_unmountall() >+ * zfs_is_mounted() >+ * zfs_mount() >+ * zfs_unmount() >+ * zfs_unmountall() > * >* This file also contains the functions used to manage sharing >filesystems via > * NFS and iSCSI: > * >- * zfs_is_shared() >- * zfs_share() >- * zfs_unshare() >+ * zfs_is_shared() >+ * zfs_share() >+ * zfs_unshare() > * >- * zfs_is_shared_nfs() >- * zfs_is_shared_smb() >- * zfs_share_proto() >- * zfs_shareall(); >- * zfs_unshare_nfs() >- * zfs_unshare_smb() >- * zfs_unshareall_nfs() >+ * zfs_is_shared_nfs() >+ * zfs_is_shared_smb() >+ * zfs_share_proto() >+ * zfs_shareall(); >+ * zfs_unshare_nfs() >+ * zfs_unshare_smb() >+ * zfs_unshareall_nfs() > * zfs_unshareall_smb() > * zfs_unshareall() > * zfs_unshareall_bypath() >@@ -60,8 +61,8 @@ > * The following functions are available for pool consumers, and will > * mount/unmount and share/unshare all datasets within pool: > * >- * zpool_enable_datasets() >- * zpool_disable_datasets() >+ * zpool_enable_datasets() >+ * zpool_disable_datasets() > */ >=20 > #include <dirent=2Eh> >@@ -83,10 +84,14 @@ > #include <libzfs=2Eh> >=20 > #include "libzfs_impl=2Eh" >+#include <thread_pool=2Eh> >=20 > #include <libshare=2Eh> > #define MAXISALEN 257 /* based on sysinfo(2) man page */ >=20 >+static int mount_tp_nthr =3D 512; /* tpool threads for multi-threaded >mounting */ >+ >+static void zfs_mount_task(void *); > static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); > zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, > zfs_share_proto_t); >@@ -1134,25 +1139,32 @@ remove_mountpoint(zfs_handle_t *zhp) > } > } >=20 >+/* >+ * Add the given zfs handle to the cb_handles array, dynamically >reallocating >+ * the array if it is out of space >+ */ > void > libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp) > { > if (cbp->cb_alloc =3D=3D cbp->cb_used) { > size_t newsz; >- void *ptr; >+ zfs_handle_t **newhandles; >=20 >- newsz =3D cbp->cb_alloc ? cbp->cb_alloc * 2 : 64; >- ptr =3D zfs_realloc(zhp->zfs_hdl, >- cbp->cb_handles, cbp->cb_alloc * sizeof (void *), >- newsz * sizeof (void *)); >- cbp->cb_handles =3D ptr; >+ newsz =3D cbp->cb_alloc !=3D 0 ? cbp->cb_alloc * 2 : 64; >+ newhandles =3D zfs_realloc(zhp->zfs_hdl, >+ cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *), >+ newsz * sizeof (zfs_handle_t *)); >+ cbp->cb_handles =3D newhandles; > cbp->cb_alloc =3D newsz; > } > cbp->cb_handles[cbp->cb_used++] =3D zhp; > } >=20 >+/* >+ * Recursive helper function used during file system enumeration >+ */ > static int >-mount_cb(zfs_handle_t *zhp, void *data) >+zfs_iter_cb(zfs_handle_t *zhp, void *data) > { > get_all_cb_t *cbp =3D data; >=20 >@@ -1178,104 +1190,362 @@ mount_cb(zfs_handle_t *zhp, void *data) > } >=20 > libzfs_add_handle(cbp, zhp); >- if (zfs_iter_filesystems(zhp, mount_cb, cbp) !=3D 0) { >+ if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) !=3D 0) { > zfs_close(zhp); > return (-1); > } > return (0); > } >=20 >-int >-libzfs_dataset_cmp(const void *a, const void *b) >+/* >+ * Sort comparator that compares two mountpoint paths=2E We sort these >paths so >+ * that subdirectories immediately follow their parents=2E This means >that we >+ * effectively treat the '/' character as the lowest value non-nul >char=2E >+ * Since filesystems from non-global zones can have the same >mountpoint >+ * as other filesystems, the comparator sorts global zone filesystems >to >+ * the top of the list=2E This means that the global zone will traverse >the >+ * filesystem list in the correct order and can stop when it sees the >+ * first zoned filesystem=2E In a non-global zone, only the delegated >+ * filesystems are seen=2E >+ * >+ * An example sorted list using this comparator would look like: >+ * >+ * /foo >+ * /foo/bar >+ * /foo/bar/baz >+ * /foo/baz >+ * /foo=2Ebar >+ * /foo (NGZ1) >+ * /foo (NGZ2) >+ * >+ * The mount code depend on this ordering to deterministically iterate >+ * over filesystems in order to spawn parallel mount tasks=2E >+ */ >+static int >+mountpoint_cmp(const void *arga, const void *argb) > { >- zfs_handle_t **za =3D (zfs_handle_t **)a; >- zfs_handle_t **zb =3D (zfs_handle_t **)b; >+ zfs_handle_t *const *zap =3D arga; >+ zfs_handle_t *za =3D *zap; >+ zfs_handle_t *const *zbp =3D argb; >+ zfs_handle_t *zb =3D *zbp; > char mounta[MAXPATHLEN]; > char mountb[MAXPATHLEN]; >+ const char *a =3D mounta; >+ const char *b =3D mountb; > boolean_t gota, gotb; >+ uint64_t zoneda, zonedb; >=20 >- if ((gota =3D (zfs_get_type(*za) =3D=3D ZFS_TYPE_FILESYSTEM)) !=3D 0) >- verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta, >+ zoneda =3D zfs_prop_get_int(za, ZFS_PROP_ZONED); >+ zonedb =3D zfs_prop_get_int(zb, ZFS_PROP_ZONED); >+ if (zoneda && !zonedb) >+ return (1); >+ if (!zoneda && zonedb) >+ return (-1); >+ gota =3D (zfs_get_type(za) =3D=3D ZFS_TYPE_FILESYSTEM); >+ if (gota) >+ verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta, > sizeof (mounta), NULL, NULL, 0, B_FALSE) =3D=3D 0); >- if ((gotb =3D (zfs_get_type(*zb) =3D=3D ZFS_TYPE_FILESYSTEM)) !=3D 0) >- verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb, >+ gotb =3D (zfs_get_type(zb) =3D=3D ZFS_TYPE_FILESYSTEM); >+ if (gotb) >+ verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb, > sizeof (mountb), NULL, NULL, 0, B_FALSE) =3D=3D 0); >=20 >- if (gota && gotb) >- return (strcmp(mounta, mountb)); >+ if (gota && gotb) { >+ while (*a !=3D '\0' && (*a =3D=3D *b)) { >+ a++; >+ b++; >+ } >+ if (*a =3D=3D *b) >+ return (0); >+ if (*a =3D=3D '\0') >+ return (-1); >+ if (*b =3D=3D '\0') >+ return (-1); >+ if (*a =3D=3D '/') >+ return (-1); >+ if (*b =3D=3D '/') >+ return (-1); >+ return (*a < *b ? -1 : *a > *b); >+ } >=20 > if (gota) > return (-1); > if (gotb) > return (1); >=20 >- return (strcmp(zfs_get_name(a), zfs_get_name(b))); >+ /* >+ * If neither filesystem has a mountpoint, revert to sorting by >+ * datset name=2E >+ */ >+ return (strcmp(zfs_get_name(za), zfs_get_name(zb))); > } >=20 > /* >+ * Reutrn true if path2 is a child of path1 >+ */ >+static boolean_t >+libzfs_path_contains(const char *path1, const char *path2) >+{ >+ return (strstr(path2, path1) =3D=3D path2 && path2[strlen(path1)] =3D= =3D >'/'); >+} >+ >+ >+static int >+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int >idx) >+{ >+ char parent[ZFS_MAXPROPLEN]; >+ char child[ZFS_MAXPROPLEN]; >+ int i; >+ >+ verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent, >+ sizeof (parent), NULL, NULL, 0, B_FALSE) =3D=3D 0); >+ >+ for (i =3D idx + 1; i < num_handles; i++) { >+ verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child, >+ sizeof (child), NULL, NULL, 0, B_FALSE) =3D=3D 0); >+ if (!libzfs_path_contains(parent, child)) >+ break; >+ } >+ return (i); >+} >+ >+typedef struct mnt_param { >+ libzfs_handle_t *mnt_hdl; >+ tpool_t *mnt_tp; >+ zfs_handle_t **mnt_zhps; /* filesystems to mount */ >+ size_t mnt_num_handles; >+ int mnt_idx; /* Index of selected entry to mount */ >+ zfs_iter_f mnt_func; >+ void *mnt_data; >+} mnt_param_t; >+ >+/* >+ * Allocate and populate the parameter struct for mount function, and >+ * schedule mounting of the entry selected by idx=2E >+ */ >+static void >+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles, >+ size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t >*tp) >+{ >+ mnt_param_t *mnt_param =3D zfs_alloc(hdl, sizeof (mnt_param_t)); >+ >+ mnt_param->mnt_hdl =3D hdl; >+ mnt_param->mnt_tp =3D tp; >+ mnt_param->mnt_zhps =3D handles; >+ mnt_param->mnt_num_handles =3D num_handles; >+ mnt_param->mnt_idx =3D idx; >+ mnt_param->mnt_func =3D func; >+ mnt_param->mnt_data =3D data; >+ >+ (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param); >+} >+ >+/* >+ * This is the structure used to keep state of mounting or sharing >operations >+ * during a call to zpool_enable_datasets()=2E >+ */ >+typedef struct mount_state { >+ /* >+ * ms_mntstatus is set to -1 if any mount fails=2E While multiple >threads >+ * could update this variable concurrently, no synchronization is >+ * needed as it's only ever set to -1=2E >+ */ >+ int ms_mntstatus; >+ int ms_mntflags; >+ const char *ms_mntopts; >+} mount_state_t; >+ >+static int >+zfs_mount_one(zfs_handle_t *zhp, void *arg) >+{ >+ mount_state_t *ms =3D arg; >+ int ret =3D 0; >+ >+ if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) !=3D 0) >+ ret =3D ms->ms_mntstatus =3D -1; >+ return (ret); >+} >+ >+static int >+zfs_share_one(zfs_handle_t *zhp, void *arg) >+{ >+ mount_state_t *ms =3D arg; >+ int ret =3D 0; >+ >+ if (zfs_share(zhp) !=3D 0) >+ ret =3D ms->ms_mntstatus =3D -1; >+ return (ret); >+} >+ >+/* >+ * Thread pool function to mount one file system=2E On completion, it >finds and >+ * schedules its children to be mounted=2E This depends on the sorting >done in >+ * zfs_foreach_mountpoint()=2E Note that the degenerate case (chain of >entries >+ * each descending from the previous) will have no parallelism since >we always >+ * have to wait for the parent to finish mounting before we can >schedule >+ * its children=2E >+ */ >+static void >+zfs_mount_task(void *arg) >+{ >+ mnt_param_t *mp =3D arg; >+ int idx =3D mp->mnt_idx; >+ zfs_handle_t **handles =3D mp->mnt_zhps; >+ size_t num_handles =3D mp->mnt_num_handles; >+ char mountpoint[ZFS_MAXPROPLEN]; >+ >+ verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint, >+ sizeof (mountpoint), NULL, NULL, 0, B_FALSE) =3D=3D 0); >+ >+ if (mp->mnt_func(handles[idx], mp->mnt_data) !=3D 0) >+ return; >+ >+ /* >+ * We dispatch tasks to mount filesystems with mountpoints underneath >+ * this one=2E We do this by dispatching the next filesystem with a >+ * descendant mountpoint of the one we just mounted, then skip all of >+ * its descendants, dispatch the next descendant mountpoint, and so >on=2E >+ * The non_descendant_idx() function skips over filesystems that are >+ * descendants of the filesystem we just dispatched=2E >+ */ >+ for (int i =3D idx + 1; i < num_handles; >+ i =3D non_descendant_idx(handles, num_handles, i)) { >+ char child[ZFS_MAXPROPLEN]; >+ verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, >+ child, sizeof (child), NULL, NULL, 0, B_FALSE) =3D=3D 0); >+ >+ if (!libzfs_path_contains(mountpoint, child)) >+ break; /* not a descendant, return */ >+ zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i, >+ mp->mnt_func, mp->mnt_data, mp->mnt_tp); >+ } >+ free(mp); >+} >+ >+/* >+ * Issue the func callback for each ZFS handle contained in the >handles >+ * array=2E This function is used to mount all datasets, and so this >function >+ * guarantees that filesystems for parent mountpoints are called >before their >+ * children=2E As such, before issuing any callbacks, we first sort the >array >+ * of handles by mountpoint=2E >+ * >+ * Callbacks are issued in one of two ways: >+ * >+ * 1=2E Sequentially: If the parallel argument is B_FALSE or the >ZFS_SERIAL_MOUNT >+ * environment variable is set, then we issue callbacks >sequentially=2E >+ * >+ * 2=2E In parallel: If the parallel argument is B_TRUE and the >ZFS_SERIAL_MOUNT >+ * environment variable is not set, then we use a tpool to dispatch >threads >+ * to mount filesystems in parallel=2E This function dispatches tasks >to mount >+ * the filesystems at the top-level mountpoints, and these tasks in >turn >+ * are responsible for recursively mounting filesystems in their >children >+ * mountpoints=2E >+ */ >+void >+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, >+ size_t num_handles, zfs_iter_f func, void *data, boolean_t >parallel) >+{ >+ zoneid_t zoneid =3D getzoneid(); >+ >+ /* >+ * The ZFS_SERIAL_MOUNT environment variable is an undocumented >+ * variable that can be used as a convenience to do a/b comparison >+ * of serial vs=2E parallel mounting=2E >+ */ >+ boolean_t serial_mount =3D !parallel || >+ (getenv("ZFS_SERIAL_MOUNT") !=3D NULL); >+ >+ /* >+ * Sort the datasets by mountpoint=2E See mountpoint_cmp for details >+ * of how these are sorted=2E >+ */ >+ qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp); >+ >+ if (serial_mount) { >+ for (int i =3D 0; i < num_handles; i++) { >+ func(handles[i], data); >+ } >+ return; >+ } >+ >+ /* >+ * Issue the callback function for each dataset using a parallel >+ * algorithm that uses a thread pool to manage threads=2E >+ */ >+ tpool_t *tp =3D tpool_create(1, mount_tp_nthr, 0, NULL); >+ >+ /* >+ * There may be multiple "top level" mountpoints outside of the >pool's >+ * root mountpoint, e=2Eg=2E: /foo /bar=2E Dispatch a mount task for ea= ch >of >+ * these=2E >+ */ >+ for (int i =3D 0; i < num_handles; >+ i =3D non_descendant_idx(handles, num_handles, i)) { >+ /* >+ * Since the mountpoints have been sorted so that the zoned >+ * filesystems are at the end, a zoned filesystem seen from >+ * the global zone means that we're done=2E >+ */ >+ if (zoneid =3D=3D GLOBAL_ZONEID && >+ zfs_prop_get_int(handles[i], ZFS_PROP_ZONED)) >+ break; >+ zfs_dispatch_mount(hdl, handles, num_handles, i, func, data, >+ tp); >+ } >+ >+ tpool_wait(tp); /* wait for all scheduled mounts to complete */ >+ tpool_destroy(tp); >+} >+ >+/* >* Mount and share all datasets within the given pool=2E This assumes >that no >- * datasets within the pool are currently mounted=2E Because users can >create >- * complicated nested hierarchies of mountpoints, we first gather all >the >- * datasets and mountpoints within the pool, and sort them by >mountpoint=2E Once >- * we have the list of all filesystems, we iterate over them in order >and mount >- * and/or share each one=2E >+ * datasets within the pool are currently mounted=2E > */ > #pragma weak zpool_mount_datasets =3D zpool_enable_datasets > int >zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int >flags) > { > get_all_cb_t cb =3D { 0 }; >- libzfs_handle_t *hdl =3D zhp->zpool_hdl; >+ mount_state_t ms =3D { 0 }; > zfs_handle_t *zfsp; >- int i, ret =3D -1; >- int *good; >+ int ret =3D 0; >=20 >- /* >- * Gather all non-snap datasets within the pool=2E >- */ >- if ((zfsp =3D zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) =3D=3D >NULL) >+ if ((zfsp =3D zfs_open(zhp->zpool_hdl, zhp->zpool_name, >+ ZFS_TYPE_DATASET)) =3D=3D NULL) > goto out; >=20 >- libzfs_add_handle(&cb, zfsp); >- if (zfs_iter_filesystems(zfsp, mount_cb, &cb) !=3D 0) >- goto out; > /* >- * Sort the datasets by mountpoint=2E >+ * Gather all non-snapshot datasets within the pool=2E Start by adding >+ * the root filesystem for this pool to the list, and then iterate >+ * over all child filesystems=2E > */ >- qsort(cb=2Ecb_handles, cb=2Ecb_used, sizeof (void *), >- libzfs_dataset_cmp); >+ libzfs_add_handle(&cb, zfsp); >+ if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) !=3D 0) >+ goto out; >=20 > /* >- * And mount all the datasets, keeping track of which ones >- * succeeded or failed=2E >+ * Mount all filesystems > */ >- if ((good =3D zfs_alloc(zhp->zpool_hdl, >- cb=2Ecb_used * sizeof (int))) =3D=3D NULL) >- goto out; >+ ms=2Ems_mntopts =3D mntopts; >+ ms=2Ems_mntflags =3D flags; >+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb=2Ecb_handles, cb=2Ecb_used, >+ zfs_mount_one, &ms, B_TRUE); >+ if (ms=2Ems_mntstatus !=3D 0) >+ ret =3D ms=2Ems_mntstatus; >=20 >- ret =3D 0; >- for (i =3D 0; i < cb=2Ecb_used; i++) { >- if (zfs_mount(cb=2Ecb_handles[i], mntopts, flags) !=3D 0) >- ret =3D -1; >- else >- good[i] =3D 1; >- } >- > /* >- * Then share all the ones that need to be shared=2E This needs >- * to be a separate pass in order to avoid excessive reloading >- * of the configuration=2E Good should never be NULL since >- * zfs_alloc is supposed to exit if memory isn't available=2E >+ * Share all filesystems that need to be shared=2E This needs to be >+ * a separate pass because libshare is not mt-safe, and so we need >+ * to share serially=2E > */ >- for (i =3D 0; i < cb=2Ecb_used; i++) { >- if (good[i] && zfs_share(cb=2Ecb_handles[i]) !=3D 0) >- ret =3D -1; >- } >+ ms=2Ems_mntstatus =3D 0; >+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb=2Ecb_handles, cb=2Ecb_used, >+ zfs_share_one, &ms, B_FALSE); >+ if (ms=2Ems_mntstatus !=3D 0) >+ ret =3D ms=2Ems_mntstatus; >=20 >- free(good); >- > out: >- for (i =3D 0; i < cb=2Ecb_used; i++) >+ for (int i =3D 0; i < cb=2Ecb_used; i++) > zfs_close(cb=2Ecb_handles[i]); > free(cb=2Ecb_handles); >=20 This broke my systems, many filesystems fail to mount causing nullfs late = mounts to fail=2E No details now until tonight=2E Suggest we back this out until it is properly tested=2E --=20 Pardon the typos and autocorrect, small keyboard in use=2E Cheers, Cy Schubert <Cy=2ESchubert@cschubert=2Ecom> FreeBSD UNIX: <cy@FreeBSD=2Eorg> Web: http://www=2EFreeBSD=2Eorg The need of the many outweighs the greed of the few=2E
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?39F76FF0-B40A-43F9-AC77-5D535674A4B8>