Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 8 Mar 2020 19:18:33 +0000 (UTC)
From:      Rick Macklem <rmacklem@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r358772 - projects/nfs-over-tls/sys/fs/nfsserver
Message-ID:  <202003081918.028JIXTw043343@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rmacklem
Date: Sun Mar  8 19:18:33 2020
New Revision: 358772
URL: https://svnweb.freebsd.org/changeset/base/358772

Log:
  Add support for reception of ext_pgs mbufs to the NFS server code.
  This also includes a cleanup of nfs_fha_new.c to avoid use of all
  the function pointer indirection, no longer needed, since there is
  no old NFS server. It would be nice to make file handle affinity work
  for NFSv4, but I can't see how to do it.

Modified:
  projects/nfs-over-tls/sys/fs/nfsserver/nfs_fha_new.c
  projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdcache.c
  projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdkrpc.c
  projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdport.c
  projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdserv.c
  projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdsubs.c

Modified: projects/nfs-over-tls/sys/fs/nfsserver/nfs_fha_new.c
==============================================================================
--- projects/nfs-over-tls/sys/fs/nfsserver/nfs_fha_new.c	Sun Mar  8 19:09:13 2020	(r358771)
+++ projects/nfs-over-tls/sys/fs/nfsserver/nfs_fha_new.c	Sun Mar  8 19:18:33 2020	(r358772)
@@ -31,22 +31,24 @@ __FBSDID("$FreeBSD$");
 
 #include <fs/nfs/nfsport.h>
 
+#include <sys/sbuf.h>
 #include <rpc/rpc.h>
-#include <nfs/nfs_fha.h>
 #include <fs/nfs/xdr_subs.h>
 #include <fs/nfs/nfs.h>
 #include <fs/nfs/nfsproto.h>
 #include <fs/nfs/nfsm_subs.h>
 #include <fs/nfsserver/nfs_fha_new.h>
 
+static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");
+
 static void fhanew_init(void *foo);
 static void fhanew_uninit(void *foo);
 rpcproc_t fhanew_get_procnum(rpcproc_t procnum);
 int fhanew_realign(struct mbuf **mb, int malloc_flags);
-int fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md, caddr_t *dpos);
+int fhanew_get_fh(uint64_t *fh, int v3, struct nfsrv_descript *nd);
 int fhanew_is_read(rpcproc_t procnum);
 int fhanew_is_write(rpcproc_t procnum);
-int fhanew_get_offset(struct mbuf **md, caddr_t *dpos, int v3,
+int fhanew_get_offset(struct nfsrv_descript *nd, int v3,
 		      struct fha_info *info);
 int fhanew_no_offset(rpcproc_t procnum);
 void fhanew_set_locktype(rpcproc_t procnum, struct fha_info *info);
@@ -62,7 +64,324 @@ extern SVCPOOL	*nfsrvd_pool;
 SYSINIT(nfs_fhanew, SI_SUB_ROOT_CONF, SI_ORDER_ANY, fhanew_init, NULL);
 SYSUNINIT(nfs_fhanew, SI_SUB_ROOT_CONF, SI_ORDER_ANY, fhanew_uninit, NULL);
 
+static struct fha_hash_entry *
+fha_hash_entry_new(u_int64_t fh)
+{
+	struct fha_hash_entry *e;
+
+	e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
+	e->fh = fh;
+	e->num_rw = 0;
+	e->num_exclusive = 0;
+	e->num_threads = 0;
+	LIST_INIT(&e->threads);
+
+	return (e);
+}
+
 static void
+fha_hash_entry_destroy(struct fha_hash_entry *e)
+{
+
+	mtx_assert(e->mtx, MA_OWNED);
+	KASSERT(e->num_rw == 0,
+	    ("%d reqs on destroyed fhe %p", e->num_rw, e));
+	KASSERT(e->num_exclusive == 0,
+	    ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e));
+	KASSERT(e->num_threads == 0,
+	    ("%d threads on destroyed fhe %p", e->num_threads, e));
+	free(e, M_NFS_FHA);
+}
+
+static void
+fha_hash_entry_remove(struct fha_hash_entry *e)
+{
+
+	mtx_assert(e->mtx, MA_OWNED);
+	LIST_REMOVE(e, link);
+	fha_hash_entry_destroy(e);
+}
+
+static struct fha_hash_entry *
+fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
+{
+	struct fha_hash_slot *fhs;
+	struct fha_hash_entry *fhe, *new_fhe;
+
+	fhs = &softc->fha_hash[fh % FHA_HASH_SIZE];
+	new_fhe = fha_hash_entry_new(fh);
+	new_fhe->mtx = &fhs->mtx;
+	mtx_lock(&fhs->mtx);
+	LIST_FOREACH(fhe, &fhs->list, link)
+		if (fhe->fh == fh)
+			break;
+	if (!fhe) {
+		fhe = new_fhe;
+		LIST_INSERT_HEAD(&fhs->list, fhe, link);
+	} else
+		fha_hash_entry_destroy(new_fhe);
+	return (fhe);
+}
+
+static void
+fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
+{
+
+	mtx_assert(fhe->mtx, MA_OWNED);
+	thread->st_p2 = 0;
+	LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
+	fhe->num_threads++;
+}
+
+static void
+fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
+{
+
+	mtx_assert(fhe->mtx, MA_OWNED);
+	KASSERT(thread->st_p2 == 0,
+	    ("%d reqs on removed thread %p", thread->st_p2, thread));
+	LIST_REMOVE(thread, st_alink);
+	fhe->num_threads--;
+}
+
+/*
+ * Account for an ongoing operation associated with this file.
+ */
+static void
+fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
+{
+
+	mtx_assert(fhe->mtx, MA_OWNED);
+	if (LK_EXCLUSIVE == locktype)
+		fhe->num_exclusive += count;
+	else
+		fhe->num_rw += count;
+}
+
+/*
+ * Get the service thread currently associated with the fhe that is
+ * appropriate to handle this operation.
+ */
+static SVCTHREAD *
+fha_hash_entry_choose_thread(struct fha_params *softc,
+    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
+{
+	SVCTHREAD *thread, *min_thread = NULL;
+	int req_count, min_count = 0;
+	off_t offset1, offset2;
+
+	LIST_FOREACH(thread, &fhe->threads, st_alink) {
+		req_count = thread->st_p2;
+
+		/* If there are any writes in progress, use the first thread. */
+		if (fhe->num_exclusive) {
+#if 0
+			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
+			    "fha: %p(%d)w", thread, req_count);
+#endif
+			return (thread);
+		}
+
+		/* Check whether we should consider locality. */
+		if ((i->read && !softc->ctls.read) ||
+		    (i->write && !softc->ctls.write))
+			goto noloc;
+
+		/*
+		 * Check for locality, making sure that we won't
+		 * exceed our per-thread load limit in the process.
+		 */
+		offset1 = i->offset;
+		offset2 = thread->st_p3;
+
+		if (((offset1 >= offset2)
+		  && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
+		 || ((offset2 > offset1)
+		  && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
+			if ((softc->ctls.max_reqs_per_nfsd == 0) ||
+			    (req_count < softc->ctls.max_reqs_per_nfsd)) {
+#if 0
+				ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
+				    "fha: %p(%d)r", thread, req_count);
+#endif
+				return (thread);
+			}
+		}
+
+noloc:
+		/*
+		 * We don't have a locality match, so skip this thread,
+		 * but keep track of the most attractive thread in case
+		 * we need to come back to it later.
+		 */
+#if 0
+		ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
+		    "fha: %p(%d)s off1 %llu off2 %llu", thread,
+		    req_count, offset1, offset2);
+#endif
+		if ((min_thread == NULL) || (req_count < min_count)) {
+			min_count = req_count;
+			min_thread = thread;
+		}
+	}
+
+	/*
+	 * We didn't find a good match yet.  See if we can add
+	 * a new thread to this file handle entry's thread list.
+	 */
+	if ((softc->ctls.max_nfsds_per_fh == 0) ||
+	    (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
+		thread = this_thread;
+#if 0
+		ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
+		    "fha: %p(%d)t", thread, thread->st_p2);
+#endif
+		fha_hash_entry_add_thread(fhe, thread);
+	} else {
+		/*
+		 * We don't want to use any more threads for this file, so
+		 * go back to the most attractive nfsd we're already using.
+		 */
+		thread = min_thread;
+	}
+
+	return (thread);
+}
+
+static void
+fha_init(struct fha_params *softc)
+{
+	int i;
+
+	for (i = 0; i < FHA_HASH_SIZE; i++)
+		mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF);
+
+	/*
+	 * Set the default tuning parameters.
+	 */
+	softc->ctls.enable = FHA_DEF_ENABLE;
+	softc->ctls.read = FHA_DEF_READ;
+	softc->ctls.write = FHA_DEF_WRITE;
+	softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
+	softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
+	softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;
+
+	/*
+	 * Add sysctls so the user can change the tuning parameters.
+	 */
+	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+	    OID_AUTO, "enable", CTLFLAG_RWTUN,
+	    &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");
+
+	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+	    OID_AUTO, "read", CTLFLAG_RWTUN,
+	    &softc->ctls.read, 0, "Enable NFS FHA read locality");
+
+	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+	    OID_AUTO, "write", CTLFLAG_RWTUN,
+	    &softc->ctls.write, 0, "Enable NFS FHA write locality");
+
+	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+	    OID_AUTO, "bin_shift", CTLFLAG_RWTUN,
+	    &softc->ctls.bin_shift, 0, "Maximum locality distance 2^(bin_shift) bytes");
+
+	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+	    OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RWTUN,
+	    &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
+	    "should be working on requests for the same file handle");
+
+	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+	    OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RWTUN,
+	    &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that "
+	    "single nfsd thread should be working on at any time");
+
+	SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+	    OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
+	    fhenew_stats_sysctl, "A", "");
+
+}
+
+static void
+fha_uninit(struct fha_params *softc)
+{
+	int i;
+
+	sysctl_ctx_free(&softc->sysctl_ctx);
+	for (i = 0; i < FHA_HASH_SIZE; i++)
+		mtx_destroy(&softc->fha_hash[i].mtx);
+}
+
+/*
+ * This just specifies that offsets should obey affinity when within
+ * the same 1Mbyte (1<<20) chunk for the file (reads only for now).
+ */
+static void
+fha_extract_info(struct svc_req *req, struct fha_info *i)
+{
+	static u_int64_t random_fh = 0;
+	int error;
+	int v3 = (req->rq_vers == 3);
+	rpcproc_t procnum;
+	struct nfsrv_descript lnd, *nd;
+
+	nd = &lnd;
+	/*
+	 * We start off with a random fh.  If we get a reasonable
+	 * procnum, we set the fh.  If there's a concept of offset
+	 * that we're interested in, we set that.
+	 */
+	i->fh = ++random_fh;
+	i->offset = 0;
+	i->locktype = LK_EXCLUSIVE;
+	i->read = i->write = 0;
+
+	/*
+	 * Extract the procnum and convert to v3 form if necessary,
+	 * taking care to deal with out-of-range procnums.  Caller will
+	 * ensure that rq_vers is either 2 or 3.
+	 */
+	procnum = req->rq_proc;
+	if (!v3) {
+		rpcproc_t tmp_procnum;
+
+		tmp_procnum = fhanew_get_procnum(procnum);
+		if (tmp_procnum == -1)
+			goto out;
+		procnum = tmp_procnum;
+	}
+
+	/*
+	 * We do affinity for most.  However, we divide a realm of affinity
+	 * by file offset so as to allow for concurrent random access.  We
+	 * only do this for reads today, but this may change when IFS supports
+	 * efficient concurrent writes.
+	 */
+	if (fhanew_no_offset(procnum))
+		goto out;
+
+	i->read = fhanew_is_read(procnum);
+	i->write = fhanew_is_write(procnum);
+
+	error = fhanew_realign(&req->rq_args, M_NOWAIT);
+	if (error)
+		goto out;
+	nd->nd_md = req->rq_args;
+	nfsm_set(nd, req->rq_xprt->xp_mbufoffs, false);
+
+	/* Grab the filehandle. */
+	error = fhanew_get_fh(&i->fh, v3, nd);
+	if (error)
+		goto out;
+
+	/* Content ourselves with zero offset for all but reads. */
+	if (i->read || i->write)
+		fhanew_get_offset(nd, v3, i);
+
+out:
+	fhanew_set_locktype(procnum, i);
+}
+
+static void
 fhanew_init(void *foo)
 {
 	struct fha_params *softc;
@@ -71,19 +390,6 @@ fhanew_init(void *foo)
 
 	bzero(softc, sizeof(*softc));
 
-	/*
-	 * Setup the callbacks for this FHA personality.
-	 */
-	softc->callbacks.get_procnum = fhanew_get_procnum;
-	softc->callbacks.realign = fhanew_realign;
-	softc->callbacks.get_fh = fhanew_get_fh;
-	softc->callbacks.is_read = fhanew_is_read;
-	softc->callbacks.is_write = fhanew_is_write;
-	softc->callbacks.get_offset = fhanew_get_offset;
-	softc->callbacks.no_offset = fhanew_no_offset;
-	softc->callbacks.set_locktype = fhanew_set_locktype;
-	softc->callbacks.fhe_stats_sysctl = fhenew_stats_sysctl;
-
 	snprintf(softc->server_name, sizeof(softc->server_name),
 	    FHANEW_SERVER_NAME);
 
@@ -130,9 +436,8 @@ fhanew_realign(struct mbuf **mb, int malloc_flags)
 }
 
 int
-fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md, caddr_t *dpos)
+fhanew_get_fh(uint64_t *fh, int v3, struct nfsrv_descript *nd)
 {
-	struct nfsrv_descript lnd, *nd;
 	uint32_t *tl;
 	uint8_t *buf;
 	uint64_t t;
@@ -140,11 +445,7 @@ fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md, 
 
 	error = 0;
 	len = 0;
-	nd = &lnd;
 
-	nd->nd_md = *md;
-	nd->nd_dpos = *dpos;
-
 	if (v3) {
 		NFSM_DISSECT_NONBLOCK(tl, uint32_t *, NFSX_UNSIGNED);
 		if ((len = fxdr_unsigned(int, *tl)) <= 0 || len > NFSX_FHMAX) {
@@ -164,9 +465,6 @@ fhanew_get_fh(uint64_t *fh, int v3, struct mbuf **md, 
 	*fh = t;
 
 nfsmout:
-	*md = nd->nd_md;
-	*dpos = nd->nd_dpos;
-
 	return (error);
 }
 
@@ -189,19 +487,14 @@ fhanew_is_write(rpcproc_t procnum)
 }
 
 int
-fhanew_get_offset(struct mbuf **md, caddr_t *dpos, int v3,
+fhanew_get_offset(struct nfsrv_descript *nd, int v3,
 		  struct fha_info *info)
 {
-	struct nfsrv_descript lnd, *nd;
 	uint32_t *tl;
 	int error;
 
 	error = 0;
 
-	nd = &lnd;
-	nd->nd_md = *md;
-	nd->nd_dpos = *dpos;
-
 	if (v3) {
 		NFSM_DISSECT_NONBLOCK(tl, uint32_t *, 2 * NFSX_UNSIGNED);
 		info->offset = fxdr_hyper(tl);
@@ -211,9 +504,6 @@ fhanew_get_offset(struct mbuf **md, caddr_t *dpos, int
 	}
 
 nfsmout:
-	*md = nd->nd_md;
-	*dpos = nd->nd_dpos;
-
 	return (error);
 }
 
@@ -264,15 +554,158 @@ fhanew_set_locktype(rpcproc_t procnum, struct fha_info
 	}
 }
 
+SVCTHREAD *
+fhanew_assign(SVCTHREAD *this_thread, struct svc_req *req)
+{
+	SVCTHREAD *thread;
+	struct fha_info i;
+	struct fha_hash_entry *fhe;
+	struct fha_params *softc;
+
+	softc = &fhanew_softc;
+
+	/* Check to see whether we're enabled. */
+	if (softc->ctls.enable == 0)
+		goto thist;
+
+	/*
+	 * Only do placement if this is an NFS request.
+	 */
+	if (req->rq_prog != NFS_PROG)
+		goto thist;
+
+	if (req->rq_vers != 2 && req->rq_vers != 3)
+		goto thist;
+
+	fha_extract_info(req, &i);
+
+	/*
+	 * We save the offset associated with this request for later
+	 * nfsd matching.
+	 */
+	fhe = fha_hash_entry_lookup(softc, i.fh);
+	req->rq_p1 = fhe;
+	req->rq_p2 = i.locktype;
+	req->rq_p3 = i.offset;
+
+	/*
+	 * Choose a thread, taking into consideration locality, thread load,
+	 * and the number of threads already working on this file.
+	 */
+	thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
+	KASSERT(thread, ("fha_assign: NULL thread!"));
+	fha_hash_entry_add_op(fhe, i.locktype, 1);
+	thread->st_p2++;
+	thread->st_p3 = i.offset;
+
+	/*
+	 * Grab the pool lock here to not let chosen thread go away before
+	 * the new request inserted to its queue while we drop fhe lock.
+	 */
+	mtx_lock(&thread->st_lock);
+	mtx_unlock(fhe->mtx);
+
+	return (thread);
+thist:
+	req->rq_p1 = NULL;
+	mtx_lock(&this_thread->st_lock);
+	return (this_thread);
+}
+
 static int
 fhenew_stats_sysctl(SYSCTL_HANDLER_ARGS)
 {
-	return (fhe_stats_sysctl(oidp, arg1, arg2, req, &fhanew_softc));
-}
+	int error, i;
+	struct sbuf sb;
+	struct fha_hash_entry *fhe;
+	bool_t first, hfirst;
+	SVCTHREAD *thread;
+	struct fha_params *softc;
 
+	softc = &fhanew_softc;
 
-SVCTHREAD *
-fhanew_assign(SVCTHREAD *this_thread, struct svc_req *req)
+	sbuf_new(&sb, NULL, 65536, SBUF_FIXEDLEN);
+
+	if (!*softc->pool) {
+		sbuf_printf(&sb, "NFSD not running\n");
+		goto out;
+	}
+
+	for (i = 0; i < FHA_HASH_SIZE; i++)
+		if (!LIST_EMPTY(&softc->fha_hash[i].list))
+			break;
+
+	if (i == FHA_HASH_SIZE) {
+		sbuf_printf(&sb, "No file handle entries.\n");
+		goto out;
+	}
+
+	hfirst = TRUE;
+	for (; i < FHA_HASH_SIZE; i++) {
+		mtx_lock(&softc->fha_hash[i].mtx);
+		if (LIST_EMPTY(&softc->fha_hash[i].list)) {
+			mtx_unlock(&softc->fha_hash[i].mtx);
+			continue;
+		}
+		sbuf_printf(&sb, "%shash %d: {\n", hfirst ? "" : ", ", i);
+		first = TRUE;
+		LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) {
+			sbuf_printf(&sb, "%sfhe %p: {\n", first ? "  " : ", ", fhe);
+
+			sbuf_printf(&sb, "    fh: %ju\n", (uintmax_t) fhe->fh);
+			sbuf_printf(&sb, "    num_rw/exclusive: %d/%d\n",
+			    fhe->num_rw, fhe->num_exclusive);
+			sbuf_printf(&sb, "    num_threads: %d\n", fhe->num_threads);
+
+			LIST_FOREACH(thread, &fhe->threads, st_alink) {
+				sbuf_printf(&sb, "      thread %p offset %ju "
+				    "reqs %d\n", thread,
+				    thread->st_p3, thread->st_p2);
+			}
+
+			sbuf_printf(&sb, "  }");
+			first = FALSE;
+		}
+		sbuf_printf(&sb, "\n}");
+		mtx_unlock(&softc->fha_hash[i].mtx);
+		hfirst = FALSE;
+	}
+
+ out:
+	sbuf_trim(&sb);
+	sbuf_finish(&sb);
+	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+	sbuf_delete(&sb);
+	return (error);
+}
+
+/*
+ * Called when we're done with an operation.  The request has already
+ * been de-queued.
+ */
+void
+fhanew_nd_complete(SVCTHREAD *thread, struct svc_req *req)
 {
-	return (fha_assign(this_thread, req, &fhanew_softc));
+	struct fha_hash_entry *fhe = req->rq_p1;
+	struct mtx *mtx;
+
+	/*
+	 * This may be called for reqs that didn't go through
+	 * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS).
+	 */
+	if (!fhe)
+		return;
+
+	mtx = fhe->mtx;
+	mtx_lock(mtx);
+	fha_hash_entry_add_op(fhe, req->rq_p2, -1);
+	thread->st_p2--;
+	KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p",
+	    thread->st_p2, thread));
+	if (thread->st_p2 == 0) {
+		fha_hash_entry_remove_thread(fhe, thread);
+		if (0 == fhe->num_rw + fhe->num_exclusive)
+			fha_hash_entry_remove(fhe);
+	}
+	mtx_unlock(mtx);
 }

Modified: projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdcache.c
==============================================================================
--- projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdcache.c	Sun Mar  8 19:09:13 2020	(r358771)
+++ projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdcache.c	Sun Mar  8 19:18:33 2020	(r358772)
@@ -1023,8 +1023,22 @@ nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
 		len += mbuf_len(m);
 		m = mbuf_next(m);
 	}
-	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
-	*cksum = in_cksum(m1, cklen);
+	/*
+	 * in_cksum() doesn't work for ext_pgs mbufs, so just return a
+	 * random checksum to avoid a false hit.
+	 * Since NFSv4.1 and NFSv4.2 do not actually use
+	 * the DRC, due to sessions, I think this should be ok.
+	 * Also, most NFS over TCP implementations do not implement
+	 * a DRC at all.  Unfortunately, the DRC is used for NFSv4.0
+	 * for the cases where there are sequenced operations, such as
+	 * file lock operations, so it must still be enabled for NFSv4.0.
+	 */
+	if ((m1->m_flags & M_NOMAP) == 0) {
+		cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN :
+		    len;
+		*cksum = in_cksum(m1, cklen);
+	} else
+		*cksum = arc4random();
 	return (len);
 }
 

Modified: projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdkrpc.c
==============================================================================
--- projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdkrpc.c	Sun Mar  8 19:09:13 2020	(r358771)
+++ projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdkrpc.c	Sun Mar  8 19:18:33 2020	(r358772)
@@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$");
 #include <rpc/rpc.h>
 #include <rpc/rpcsec_gss.h>
 
-#include <nfs/nfs_fha.h>
 #include <fs/nfsserver/nfs_fha_new.h>
 
 #include <security/mac/mac_framework.h>
@@ -162,11 +161,9 @@ nfssvc_program(struct svc_req *rqst, SVCXPRT *xprt)
 	 */
 	nd.nd_mrep = rqst->rq_args;
 	rqst->rq_args = NULL;
-#ifdef notnow
 	newnfs_realign(&nd.nd_mrep, M_WAITOK);
-#endif
 	nd.nd_md = nd.nd_mrep;
-	nfsm_set(&nd, false);
+	nfsm_set(&nd, rqst->rq_xprt->xp_mbufoffs, false);
 	nd.nd_nam = svc_getrpccaller(rqst);
 	nd.nd_nam2 = rqst->rq_addr;
 	nd.nd_mreq = NULL;
@@ -604,7 +601,7 @@ nfsrvd_init(int terminating)
 		    SYSCTL_STATIC_CHILDREN(_vfs_nfsd));
 		nfsrvd_pool->sp_rcache = NULL;
 		nfsrvd_pool->sp_assign = fhanew_assign;
-		nfsrvd_pool->sp_done = fha_nd_complete;
+		nfsrvd_pool->sp_done = fhanew_nd_complete;
 		NFSD_LOCK();
 	}
 }

Modified: projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdport.c
==============================================================================
--- projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdport.c	Sun Mar  8 19:09:13 2020	(r358771)
+++ projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdport.c	Sun Mar  8 19:18:33 2020	(r358772)
@@ -122,14 +122,14 @@ static void nfsrv_pnfsremovesetup(struct vnode *, NFSP
 static void nfsrv_pnfsremove(struct vnode **, int, char *, fhandle_t *,
     NFSPROC_T *);
 static int nfsrv_proxyds(struct vnode *, off_t, int, struct ucred *,
-    struct thread *, int, struct mbuf **, char *, struct mbuf **,
-    struct nfsvattr *, struct acl *, off_t *, int, bool *);
+    struct thread *, int, struct mbuf **, struct nfsrv_descript *,
+    struct mbuf **, struct nfsvattr *, struct acl *, off_t *, int, bool *);
 static int nfsrv_setextattr(struct vnode *, struct nfsvattr *, NFSPROC_T *);
 static int nfsrv_readdsrpc(fhandle_t *, off_t, int, struct ucred *,
     NFSPROC_T *, struct nfsmount *, struct mbuf **, struct mbuf **);
 static int nfsrv_writedsrpc(fhandle_t *, off_t, int, struct ucred *,
     NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct mbuf **,
-    char *, int *);
+    struct nfsrv_descript *, int *);
 static int nfsrv_allocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *,
     NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *);
 static int nfsrv_setacldsrpc(fhandle_t *, struct ucred *, NFSPROC_T *,
@@ -820,7 +820,7 @@ nfsrv_createiovec(int len, struct mbuf **mpp, struct m
 	i = 0;
 	while (left > 0) {
 		if (m == NULL)
-			panic("nfsvno_read iov");
+			panic("nfsrv_createiovec iov");
 		siz = min(M_TRAILINGSPACE(m), left);
 		if (siz > 0) {
 			iv->iov_base = mtod(m, caddr_t) + m->m_len;
@@ -1012,7 +1012,7 @@ nfsrv_createiovecw(int retlen, struct mbuf *m, char *c
 	len = retlen;
 	while (len > 0) {
 		if (mp == NULL)
-			panic("nfsvno_write");
+			panic("nfsrv_createiovecw");
 		if (i > 0) {
 			i = min(i, len);
 			ivp->iov_base = cp;
@@ -1120,8 +1120,7 @@ nfsrv_createiovecw_extpgs(int retlen, struct mbuf *m, 
  */
 int
 nfsvno_write(struct vnode *vp, off_t off, int retlen, int *stable,
-    struct mbuf *mp, char *cp, int dextpg, int dextpgsiz,
-    struct ucred *cred, struct thread *p)
+    struct nfsrv_descript *nd, struct thread *p)
 {
 	struct iovec *iv;
 	int cnt, ioflags, error;
@@ -1132,23 +1131,25 @@ nfsvno_write(struct vnode *vp, off_t off, int retlen, 
 	 * Attempt to write to a DS file. A return of ENOENT implies
 	 * there is no DS file to write.
 	 */
-	error = nfsrv_proxyds(vp, off, retlen, cred, p, NFSPROC_WRITEDS,
-	    &mp, cp, NULL, NULL, NULL, NULL, 0, NULL);
+	error = nfsrv_proxyds(vp, off, retlen, nd->nd_cred, p,
+	    NFSPROC_WRITEDS, &nd->nd_md, nd, NULL, NULL, NULL,
+	    NULL, 0, NULL);
 	if (error != ENOENT) {
 		*stable = NFSWRITE_FILESYNC;
 		return (error);
 	}
 
-
 	if (*stable == NFSWRITE_UNSTABLE)
 		ioflags = IO_NODELOCKED;
 	else
 		ioflags = (IO_SYNC | IO_NODELOCKED);
-	if ((mp->m_flags & (M_EXT | M_NOMAP)) == (M_EXT | M_NOMAP))
-		error = nfsrv_createiovecw_extpgs(retlen, mp, cp, dextpg,
-		    dextpgsiz, &iv, &cnt);
+	if ((nd->nd_md->m_flags & M_NOMAP) != 0)
+		error = nfsrv_createiovecw_extpgs(retlen, nd->nd_md,
+		    nd->nd_dpos, nd->nd_dextpg, nd->nd_dextpgsiz,
+		    &iv, &cnt);
 	else
-		error = nfsrv_createiovecw(retlen, mp, cp, &iv, &cnt);
+		error = nfsrv_createiovecw(retlen, nd->nd_md,
+		    nd->nd_dpos, &iv, &cnt);
 	if (error != 0)
 		return (error);
 	uiop->uio_iov = iv;
@@ -1162,7 +1163,7 @@ nfsvno_write(struct vnode *vp, off_t off, int retlen, 
 	ioflags |= nh->nh_seqcount << IO_SEQSHIFT;
 	/* XXX KDM make this more systematic? */
 	nfsstatsv1.srvbytes[NFSV4OP_WRITE] += uiop->uio_resid;
-	error = VOP_WRITE(vp, uiop, ioflags, cred);
+	error = VOP_WRITE(vp, uiop, ioflags, nd->nd_cred);
 	if (error == 0)
 		nh->nh_nextoff = uiop->uio_offset;
 	free(iv, M_TEMP);
@@ -4635,7 +4636,7 @@ nfsrv_dssetacl(struct vnode *vp, struct acl *aclp, str
 
 static int
 nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
-    struct thread *p, int ioproc, struct mbuf **mpp, char *cp,
+    struct thread *p, int ioproc, struct mbuf **mpp, struct nfsrv_descript *nd,
     struct mbuf **mpp2, struct nfsvattr *nap, struct acl *aclp,
     off_t *offp, int content, bool *eofp)
 {
@@ -4767,7 +4768,7 @@ tryagain:
 			}
 		} else if (ioproc == NFSPROC_WRITEDS)
 			error = nfsrv_writedsrpc(fh, off, cnt, cred, p, vp,
-			    &nmp[0], mirrorcnt, mpp, cp, &failpos);
+			    &nmp[0], mirrorcnt, mpp, nd, &failpos);
 		else if (ioproc == NFSPROC_SETATTR)
 			error = nfsrv_setattrdsrpc(fh, cred, p, vp, &nmp[0],
 			    mirrorcnt, nap, &failpos);
@@ -5145,46 +5146,54 @@ nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, st
 			}
 	
 			/*
-			 * Now, adjust first mbuf so that any XDR before the
-			 * read data is skipped over.
+			 * Now, get rid of mbuf data that precedes the
+			 * current position.  For a regular mbuf, adjust
+			 * m_data, m_len and then find the end of the read
+			 * data and trim off any mbuf(s) after that.
+			 * For an ext_pgs mbuf, split it and free the first
+			 * and third mbuf chains.
 			 */
-			if ((nd->nd_md->m_flags & (M_EXT | M_NOMAP)) ==
-			    (M_EXT | M_NOMAP))
-				nfsm_trimatpos_extpgs(nd);
-			else {
+			tlen = NFSM_RNDUP(retlen);
+			if ((m->m_flags & M_NOMAP) != 0) {
+				trimlen = nfsm_extpgs_calc_offs(m,
+				    nd->nd_dextpg, nd->nd_dextpgsiz);
+				nd->nd_mrep = mb_splitatpos_ext(m, trimlen,
+				    M_WAITOK);
+				m_freem(m);
+				m = mb_splitatpos_ext(nd->nd_mrep, tlen,
+				    M_WAITOK);
+				m_freem(m);
+				m = m_last(nd->nd_mrep);
+			} else {
 				trimlen = nd->nd_dpos - mtod(m, char *);
 				if (trimlen > 0) {
 					m->m_len -= trimlen;
-					NFSM_DATAP(m, trimlen);
+					m->m_data += trimlen;
 				}
-			}
 	
-			/*
-			 * Truncate the mbuf chain at retlen bytes of data,
-			 * plus XDR padding that brings the length up to a
-			 * multiple of 4.
-			 */
-			tlen = NFSM_RNDUP(retlen);
-			do {
-				if (m->m_len >= tlen) {
-					if ((m->m_flags & (M_EXT | M_NOMAP)) ==
-					    (M_EXT | M_NOMAP))
-						nfsm_trimback_extpgs(m, tlen);
-					else
+				/*
+				 * Truncate the mbuf chain at retlen bytes of
+				 * data, plus XDR padding that brings the
+				 * length up to a multiple of 4.
+				 */
+				do {
+					if (m->m_len >= tlen) {
 						m->m_len = tlen;
-					tlen = 0;
-					m2 = m->m_next;
-					m->m_next = NULL;
-					m_freem(m2);
-					break;
+						tlen = 0;
+						m2 = m->m_next;
+						m->m_next = NULL;
+						m_freem(m2);
+						break;
+					}
+					tlen -= m->m_len;
+					m = m->m_next;
+				} while (m != NULL);
+				if (tlen > 0) {
+					printf("nfsrv_readdsrpc: busted mbuf "
+					    "list\n");
+					error = ENOENT;
+					goto nfsmout;
 				}
-				tlen -= m->m_len;
-				m = m->m_next;
-			} while (m != NULL);
-			if (tlen > 0) {
-				printf("nfsrv_readdsrpc: busted mbuf list\n");
-				error = ENOENT;
-				goto nfsmout;
 			}
 			*mpp = nd->nd_mrep;
 			*mpendp = m;
@@ -5258,12 +5267,14 @@ nfsrv_writedsdorpc(struct nfsmount *nmp, fhandle_t *fh
 
 	/* Put data in mbuf chain. */
 	nd->nd_mb->m_next = m;
+	if ((m->m_flags & M_NOMAP) != 0)
+		nd->nd_flag |= ND_EXTPG;
 
 	/* Set nd_mb and nd_bpos to end of data. */
 	while (m->m_next != NULL)
 		m = m->m_next;
 	nd->nd_mb = m;
-	nd->nd_bpos = mtod(m, char *) + m->m_len;
+	nfsm_set(nd, m->m_len, true);
 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: lastmb len=%d\n", m->m_len);
 
 	/* Do a Getattr for the attributes that change upon writing. */
@@ -5346,12 +5357,13 @@ start_writedsdorpc(void *arg, int pending)
 static int
 nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred,
     NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt,
-    struct mbuf **mpp, char *cp, int *failposp)
+    struct mbuf **mpp, struct nfsrv_descript *nd, int *failposp)
 {
 	struct nfsrvwritedsdorpc *drpc, *tdrpc = NULL;
 	struct nfsvattr na;
-	struct mbuf *m;
+	struct mbuf *m, *m1, *m2;
 	int error, i, offs, ret, timo;
+	bool gotnomap;
 
 	NFSD_DEBUG(4, "in nfsrv_writedsrpc\n");
 	KASSERT(*mpp != NULL, ("nfsrv_writedsrpc: NULL mbuf chain"));
@@ -5360,11 +5372,27 @@ nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, s
 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
 		    M_WAITOK);
 
-	/* Calculate offset in mbuf chain that data starts. */
-	offs = cp - mtod(*mpp, char *);
-	NFSD_DEBUG(4, "nfsrv_writedsrpc: mcopy offs=%d len=%d\n", offs, len);
+	NFSD_DEBUG(4, "nfsrv_writedsrpc: mcopy len=%d\n", len);
 
 	/*
+	 * For M_NOMAP mbufs, the mbuf chain needs to be split into 3 chains
+	 * so that m_copym() can be done with offs == 0 and M_COPYALL.
+	 * *mpp - Everything that preceeds the data to be written.
+	 * m1 - The data to be written.
+	 * m2 - Everything that follows the data to be written.
+	 */
+	m1 = *mpp;
+	gotnomap = false;
+	if ((m1->m_flags & M_NOMAP) != 0) {
+		gotnomap = true;
+		offs = nfsm_extpgs_calc_offs(nd->nd_md, nd->nd_dextpg,
+		    nd->nd_dextpgsiz);
+		m1 = mb_splitatpos_ext(m1, offs, M_WAITOK);
+		m2 = mb_splitatpos_ext(m1, NFSM_RNDUP(len), M_WAITOK);
+	} else
+		offs = nd->nd_dpos - mtod(m1, char *);
+
+	/*
 	 * Do the write RPC for every DS, using a separate kernel process
 	 * for every DS except the last one.
 	 */
@@ -5379,7 +5407,11 @@ nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, s
 		tdrpc->p = p;
 		tdrpc->inprog = 0;
 		tdrpc->err = 0;
-		tdrpc->m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK);
+		if (gotnomap)
+			tdrpc->m = m_copym(m1, 0, M_COPYALL, M_WAITOK);
+		else
+			tdrpc->m = m_copym(m1, offs, NFSM_RNDUP(len),
+			    M_WAITOK);
 		ret = EIO;
 		if (nfs_pnfsiothreads != 0) {
 			ret = nfs_pnfsio(start_writedsdorpc, tdrpc);
@@ -5397,7 +5429,10 @@ nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, s
 		nmpp++;
 		fhp++;
 	}
-	m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK);
+	if (gotnomap)
+		m = m_copym(m1, 0, M_COPYALL, M_WAITOK);
+	else
+		m = m_copym(m1, offs, NFSM_RNDUP(len), M_WAITOK);
 	ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, &na, m, cred, p);
 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
 		*failposp = mirrorcnt - 1;
@@ -5419,6 +5454,14 @@ nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, s
 		else if (error == 0 && tdrpc->err != 0)
 			error = tdrpc->err;
 	}
+
+	/* For gotnomap, chain the lists back together. */
+	if (gotnomap) {
+		m_last(*mpp)->m_next = m1;
+		m_last(m1)->m_next = m2;
+		nd->nd_md = m1;
+		nfsm_set(nd, 0, false);
+	}
 	free(drpc, M_TEMP);
 	return (error);
 }
@@ -6431,7 +6474,7 @@ nfsvno_setxattr(struct vnode *vp, char *name, int len,
 	uiop->uio_td = p;
 	uiop->uio_offset = 0;
 	uiop->uio_resid = len;
-	if ((m->m_flags & (M_EXT | M_NOMAP)) == (M_EXT | M_NOMAP))
+	if ((m->m_flags & M_NOMAP) != 0)
 		error = nfsrv_createiovecw_extpgs(len, m, cp, dextpg,
 		    dextpgsiz, &iv, &cnt);
 	else

Modified: projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdserv.c
==============================================================================
--- projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdserv.c	Sun Mar  8 19:09:13 2020	(r358771)
+++ projects/nfs-over-tls/sys/fs/nfsserver/nfs_nfsdserv.c	Sun Mar  8 19:18:33 2020	(r358772)
@@ -699,8 +699,7 @@ nfsrvd_readlink(struct nfsrv_descript *nd, __unused in
 	if (mp != NULL) {
 		nd->nd_mb->m_next = mp;
 		nd->nd_mb = mpend;
-		if ((mpend->m_flags & (M_EXT | M_NOMAP)) ==
-		    (M_EXT | M_NOMAP)) {
+		if ((mpend->m_flags & M_NOMAP) != 0) {
 			pgs = mpend->m_ext.ext_pgs;
 			nd->nd_bextpg = pgs->npgs - 1;
 			nd->nd_bpos = (char *)(void *)
@@ -863,7 +862,8 @@ nfsrvd_read(struct nfsrv_descript *nd, __unused int is
 		 * Always use ext_pgs if ND_EXTPG is set.
 		 */
 		if ((nd->nd_flag & ND_EXTPG) != 0 || (PMAP_HAS_DMAP != 0 &&
-		    ((nd->nd_flag & ND_TLS) != 0 || nfs_use_ext_pgs)))
+		    ((nd->nd_flag & ND_TLS) != 0 || (nfs_use_ext_pgs &&
+		    cnt > MCLBYTES))))
 			nd->nd_repstat = nfsvno_read(vp, off, cnt, nd->nd_cred,
 			    nd->nd_maxextsiz, p, &m3, &m2);
 		else
@@ -903,8 +903,7 @@ nfsrvd_read(struct nfsrv_descript *nd, __unused int is
 	if (m3) {
 		nd->nd_mb->m_next = m3;
 		nd->nd_mb = m2;
-		if ((m2->m_flags & (M_EXT | M_NOMAP)) ==
-		    (M_EXT | M_NOMAP)) {
+		if ((m2->m_flags & M_NOMAP) != 0) {
 			nd->nd_flag |= ND_EXTPG;
 			pgs = m2->m_ext.ext_pgs;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202003081918.028JIXTw043343>