Date: Wed, 22 Jan 2014 23:52:20 +0000 (UTC) From: Alexander Motin <mav@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r261054 - in stable/10/sys: nfs rpc Message-ID: <201401222352.s0MNqKTY028954@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mav Date: Wed Jan 22 23:52:20 2014 New Revision: 261054 URL: http://svnweb.freebsd.org/changeset/base/261054 Log: MFC r260097: Move most of NFS file handle affinity code out of the heavily congested global RPC thread pool lock and protect it with own set of locks. On synthetic benchmarks this improves peak NFS request rate by 40%. Modified: stable/10/sys/nfs/nfs_fha.c stable/10/sys/nfs/nfs_fha.h stable/10/sys/rpc/svc.c stable/10/sys/rpc/svc.h Directory Properties: stable/10/ (props changed) Modified: stable/10/sys/nfs/nfs_fha.c ============================================================================== --- stable/10/sys/nfs/nfs_fha.c Wed Jan 22 23:51:12 2014 (r261053) +++ stable/10/sys/nfs/nfs_fha.c Wed Jan 22 23:52:20 2014 (r261054) @@ -52,13 +52,10 @@ void fha_init(struct fha_params *softc) { char tmpstr[128]; + int i; - /* - * A small hash table to map filehandles to fha_hash_entry - * structures. - */ - softc->g_fha.hashtable = hashinit(256, M_NFS_FHA, - &softc->g_fha.hashmask); + for (i = 0; i < FHA_HASH_SIZE; i++) + mtx_init(&softc->fha_hash[i].mtx, "fhalock", NULL, MTX_DEF); /* * Set the default tuning parameters. @@ -117,8 +114,11 @@ fha_init(struct fha_params *softc) void fha_uninit(struct fha_params *softc) { + int i; + sysctl_ctx_free(&softc->sysctl_ctx); - hashdestroy(softc->g_fha.hashtable, M_NFS_FHA, softc->g_fha.hashmask); + for (i = 0; i < FHA_HASH_SIZE; i++) + mtx_destroy(&softc->fha_hash[i].mtx); } /* @@ -207,8 +207,13 @@ static void fha_hash_entry_destroy(struct fha_hash_entry *e) { - if (e->num_rw + e->num_exclusive) - panic("nonempty fhe"); + mtx_assert(e->mtx, MA_OWNED); + KASSERT(e->num_rw == 0, + ("%d reqs on destroyed fhe %p", e->num_rw, e)); + KASSERT(e->num_exclusive == 0, + ("%d exclusive reqs on destroyed fhe %p", e->num_exclusive, e)); + KASSERT(e->num_threads == 0, + ("%d threads on destroyed fhe %p", e->num_threads, e)); free(e, M_NFS_FHA); } @@ -216,6 +221,7 @@ static void fha_hash_entry_remove(struct fha_hash_entry *e) { + mtx_assert(e->mtx, MA_OWNED); LIST_REMOVE(e, link); fha_hash_entry_destroy(e); } @@ -224,36 +230,22 @@ static struct fha_hash_entry * fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh) { SVCPOOL *pool; - - pool = *softc->pool; - + struct fha_hash_slot *fhs; struct fha_hash_entry *fhe, *new_fhe; - LIST_FOREACH(fhe, &softc->g_fha.hashtable[fh % softc->g_fha.hashmask], - link) + pool = *softc->pool; + fhs = &softc->fha_hash[fh % FHA_HASH_SIZE]; + new_fhe = fha_hash_entry_new(fh); + new_fhe->mtx = &fhs->mtx; + mtx_lock(&fhs->mtx); + LIST_FOREACH(fhe, &fhs->list, link) if (fhe->fh == fh) break; - if (!fhe) { - /* Allocate a new entry. */ - mtx_unlock(&pool->sp_lock); - new_fhe = fha_hash_entry_new(fh); - mtx_lock(&pool->sp_lock); - - /* Double-check to make sure we still need the new entry. */ - LIST_FOREACH(fhe, - &softc->g_fha.hashtable[fh % softc->g_fha.hashmask], link) - if (fhe->fh == fh) - break; - if (!fhe) { - fhe = new_fhe; - LIST_INSERT_HEAD( - &softc->g_fha.hashtable[fh % softc->g_fha.hashmask], - fhe, link); - } else - fha_hash_entry_destroy(new_fhe); - } - + fhe = new_fhe; + LIST_INSERT_HEAD(&fhs->list, fhe, link); + } else + fha_hash_entry_destroy(new_fhe); return (fhe); } @@ -261,6 +253,8 @@ static void fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread) { + mtx_assert(fhe->mtx, MA_OWNED); + thread->st_p2 = 0; LIST_INSERT_HEAD(&fhe->threads, thread, st_alink); fhe->num_threads++; } @@ -269,6 +263,9 @@ static void fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread) { + mtx_assert(fhe->mtx, MA_OWNED); + KASSERT(thread->st_p2 == 0, + ("%d reqs on removed thread %p", thread->st_p2, thread)); LIST_REMOVE(thread, st_alink); fhe->num_threads--; } @@ -280,6 +277,7 @@ static void fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count) { + mtx_assert(fhe->mtx, MA_OWNED); if (LK_EXCLUSIVE == locktype) fhe->num_exclusive += count; else @@ -306,7 +304,7 @@ fha_hash_entry_choose_thread(struct fha_ pool = *softc->pool; LIST_FOREACH(thread, &fhe->threads, st_alink) { - req_count = thread->st_reqcount; + req_count = thread->st_p2; /* If there are any writes in progress, use the first thread. */ if (fhe->num_exclusive) { @@ -322,7 +320,7 @@ fha_hash_entry_choose_thread(struct fha_ * exceed our per-thread load limit in the process. */ offset1 = i->offset; - offset2 = STAILQ_FIRST(&thread->st_reqs)->rq_p3; + offset2 = thread->st_p3; if (((offset1 >= offset2) && ((offset1 - offset2) < (1 << softc->ctls.bin_shift))) @@ -360,28 +358,11 @@ fha_hash_entry_choose_thread(struct fha_ */ if ((softc->ctls.max_nfsds_per_fh == 0) || (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) { - /* - * We can add a new thread, so try for an idle thread - * first, and fall back to this_thread if none are idle. - */ - if (STAILQ_EMPTY(&this_thread->st_reqs)) { - thread = this_thread; + thread = this_thread; #if 0 - ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, - "fha: %p(%d)t", thread, thread->st_reqcount); -#endif - } else if ((thread = LIST_FIRST(&pool->sp_idlethreads))) { -#if 0 - ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, - "fha: %p(%d)i", thread, thread->st_reqcount); -#endif - } else { - thread = this_thread; -#if 0 - ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, - "fha: %p(%d)b", thread, thread->st_reqcount); + ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO, + "fha: %p(%d)t", thread, thread->st_p2); #endif - } fha_hash_entry_add_thread(fhe, thread); } else { /* @@ -411,16 +392,16 @@ fha_assign(SVCTHREAD *this_thread, struc /* Check to see whether we're enabled. */ if (softc->ctls.enable == 0) - return (this_thread); + goto thist; /* * Only do placement if this is an NFS request. */ if (req->rq_prog != NFS_PROG) - return (this_thread); + goto thist; if (req->rq_vers != 2 && req->rq_vers != 3) - return (this_thread); + goto thist; fha_extract_info(req, &i, cb); @@ -440,8 +421,21 @@ fha_assign(SVCTHREAD *this_thread, struc thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread); KASSERT(thread, ("fha_assign: NULL thread!")); fha_hash_entry_add_op(fhe, i.locktype, 1); + thread->st_p2++; + thread->st_p3 = i.offset; + + /* + * Grab the pool lock here to not let chosen thread go away before + * the new request inserted to its queue while we drop fhe lock. + */ + mtx_lock(&(*softc->pool)->sp_lock); + mtx_unlock(fhe->mtx); return (thread); +thist: + req->rq_p1 = NULL; + mtx_lock(&(*softc->pool)->sp_lock); + return (this_thread); } /* @@ -452,6 +446,7 @@ void fha_nd_complete(SVCTHREAD *thread, struct svc_req *req) { struct fha_hash_entry *fhe = req->rq_p1; + struct mtx *mtx; /* * This may be called for reqs that didn't go through @@ -460,13 +455,18 @@ fha_nd_complete(SVCTHREAD *thread, struc if (!fhe) return; + mtx = fhe->mtx; + mtx_lock(mtx); fha_hash_entry_add_op(fhe, req->rq_p2, -1); - - if (thread->st_reqcount == 0) { + thread->st_p2--; + KASSERT(thread->st_p2 >= 0, ("Negative request count %d on %p", + thread->st_p2, thread)); + if (thread->st_p2 == 0) { fha_hash_entry_remove_thread(fhe, thread); if (0 == fhe->num_rw + fhe->num_exclusive) fha_hash_entry_remove(fhe); } + mtx_unlock(mtx); } int @@ -489,10 +489,9 @@ fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, st } pool = *softc->pool; - mtx_lock(&pool->sp_lock); count = 0; - for (i = 0; i <= softc->g_fha.hashmask; i++) - if (!LIST_EMPTY(&softc->g_fha.hashtable[i])) + for (i = 0; i < FHA_HASH_SIZE; i++) + if (!LIST_EMPTY(&softc->fha_hash[i].list)) count++; if (count == 0) { @@ -500,8 +499,9 @@ fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, st goto out; } - for (i = 0; i <= softc->g_fha.hashmask; i++) { - LIST_FOREACH(fhe, &softc->g_fha.hashtable[i], link) { + for (i = 0; i < FHA_HASH_SIZE; i++) { + mtx_lock(&softc->fha_hash[i].mtx); + LIST_FOREACH(fhe, &softc->fha_hash[i].list, link) { sbuf_printf(&sb, "%sfhe %p: {\n", first ? "" : ", ", fhe); sbuf_printf(&sb, " fh: %ju\n", (uintmax_t) fhe->fh); @@ -512,8 +512,7 @@ fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, st LIST_FOREACH(thread, &fhe->threads, st_alink) { sbuf_printf(&sb, " thread %p offset %ju " "(count %d)\n", thread, - STAILQ_FIRST(&thread->st_reqs)->rq_p3, - thread->st_reqcount); + thread->st_p3, thread->st_p2); } sbuf_printf(&sb, "}"); @@ -525,11 +524,10 @@ fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, st break; } } + mtx_unlock(&softc->fha_hash[i].mtx); } out: - if (pool) - mtx_unlock(&pool->sp_lock); sbuf_trim(&sb); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); Modified: stable/10/sys/nfs/nfs_fha.h ============================================================================== --- stable/10/sys/nfs/nfs_fha.h Wed Jan 22 23:51:12 2014 (r261053) +++ stable/10/sys/nfs/nfs_fha.h Wed Jan 22 23:52:20 2014 (r261054) @@ -35,11 +35,7 @@ #define FHA_DEF_MAX_NFSDS_PER_FH 8 #define FHA_DEF_MAX_REQS_PER_NFSD 0 /* Unlimited */ -/* This is the global structure that represents the state of the fha system. */ -struct fha_global { - struct fha_hash_entry_list *hashtable; - u_long hashmask; -}; +#define FHA_HASH_SIZE 251 struct fha_ctls { int enable; @@ -62,6 +58,7 @@ struct fha_ctls { * avoid contention between threads over single files. */ struct fha_hash_entry { + struct mtx *mtx; LIST_ENTRY(fha_hash_entry) link; u_int64_t fh; u_int32_t num_rw; @@ -72,6 +69,11 @@ struct fha_hash_entry { LIST_HEAD(fha_hash_entry_list, fha_hash_entry); +struct fha_hash_slot { + struct fha_hash_entry_list list; + struct mtx mtx; +}; + /* A structure used for passing around data internally. */ struct fha_info { u_int64_t fh; @@ -93,7 +95,7 @@ struct fha_callbacks { }; struct fha_params { - struct fha_global g_fha; + struct fha_hash_slot fha_hash[FHA_HASH_SIZE]; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; struct fha_ctls ctls; Modified: stable/10/sys/rpc/svc.c ============================================================================== --- stable/10/sys/rpc/svc.c Wed Jan 22 23:51:12 2014 (r261053) +++ stable/10/sys/rpc/svc.c Wed Jan 22 23:52:20 2014 (r261054) @@ -71,6 +71,8 @@ static struct svc_callout *svc_find(SVCP char *); static void svc_new_thread(SVCPOOL *pool); static void xprt_unregister_locked(SVCXPRT *xprt); +static void svc_change_space_used(SVCPOOL *pool, int delta); +static bool_t svc_request_space_available(SVCPOOL *pool); /* *************** SVCXPRT related stuff **************** */ @@ -373,7 +375,8 @@ xprt_active(SVCXPRT *xprt) if (!xprt->xp_active) { xprt->xp_active = TRUE; if (xprt->xp_thread == NULL) { - if (!xprt_assignthread(xprt)) + if (!svc_request_space_available(pool) || + !xprt_assignthread(xprt)) TAILQ_INSERT_TAIL(&pool->sp_active, xprt, xp_alink); } @@ -965,56 +968,63 @@ svc_assign_waiting_sockets(SVCPOOL *pool { SVCXPRT *xprt; + mtx_lock(&pool->sp_lock); while ((xprt = TAILQ_FIRST(&pool->sp_active)) != NULL) { if (xprt_assignthread(xprt)) TAILQ_REMOVE(&pool->sp_active, xprt, xp_alink); else break; } + mtx_unlock(&pool->sp_lock); } -static bool_t -svc_request_space_available(SVCPOOL *pool) +static void +svc_change_space_used(SVCPOOL *pool, int delta) { + unsigned int value; - mtx_assert(&pool->sp_lock, MA_OWNED); - - if (pool->sp_space_throttled) { - /* - * Below the low-water yet? If so, assign any waiting sockets. - */ - if (pool->sp_space_used < pool->sp_space_low) { - pool->sp_space_throttled = FALSE; - svc_assign_waiting_sockets(pool); - return TRUE; - } - - return FALSE; - } else { - if (pool->sp_space_used - >= pool->sp_space_high) { + value = atomic_fetchadd_int(&pool->sp_space_used, delta) + delta; + if (delta > 0) { + if (value >= pool->sp_space_high && !pool->sp_space_throttled) { pool->sp_space_throttled = TRUE; pool->sp_space_throttle_count++; - return FALSE; } - - return TRUE; + if (value > pool->sp_space_used_highest) + pool->sp_space_used_highest = value; + } else { + if (value < pool->sp_space_low && pool->sp_space_throttled) { + pool->sp_space_throttled = FALSE; + svc_assign_waiting_sockets(pool); + } } } +static bool_t +svc_request_space_available(SVCPOOL *pool) +{ + + if (pool->sp_space_throttled) + return (FALSE); + return (TRUE); +} + static void svc_run_internal(SVCPOOL *pool, bool_t ismaster) { + struct svc_reqlist reqs; SVCTHREAD *st, *stpref; SVCXPRT *xprt; enum xprt_stat stat; struct svc_req *rqstp; + size_t sz; int error; st = mem_alloc(sizeof(*st)); + st->st_pool = pool; st->st_xprt = NULL; STAILQ_INIT(&st->st_reqs); cv_init(&st->st_cond, "rpcsvc"); + STAILQ_INIT(&reqs); mtx_lock(&pool->sp_lock); LIST_INSERT_HEAD(&pool->sp_threads, st, st_link); @@ -1108,15 +1118,14 @@ svc_run_internal(SVCPOOL *pool, bool_t i * RPCs. */ xprt->xp_lastactive = time_uptime; - stat = XPRT_IDLE; do { + mtx_unlock(&pool->sp_lock); if (!svc_request_space_available(pool)) break; rqstp = NULL; - mtx_unlock(&pool->sp_lock); stat = svc_getreq(xprt, &rqstp); - mtx_lock(&pool->sp_lock); if (rqstp) { + svc_change_space_used(pool, rqstp->rq_size); /* * See if the application has * a preference for some other @@ -1126,17 +1135,12 @@ svc_run_internal(SVCPOOL *pool, bool_t i if (pool->sp_assign) stpref = pool->sp_assign(st, rqstp); + else + mtx_lock(&pool->sp_lock); - pool->sp_space_used += - rqstp->rq_size; - if (pool->sp_space_used - > pool->sp_space_used_highest) - pool->sp_space_used_highest = - pool->sp_space_used; rqstp->rq_thread = stpref; STAILQ_INSERT_TAIL(&stpref->st_reqs, rqstp, rq_link); - stpref->st_reqcount++; /* * If we assigned the request @@ -1156,7 +1160,8 @@ svc_run_internal(SVCPOOL *pool, bool_t i stpref->st_idle = FALSE; cv_signal(&stpref->st_cond); } - } + } else + mtx_lock(&pool->sp_lock); } while (stat == XPRT_MOREREQS && pool->sp_state != SVCPOOL_CLOSING); @@ -1171,25 +1176,30 @@ svc_run_internal(SVCPOOL *pool, bool_t i xprt->xp_thread = NULL; st->st_xprt = NULL; if (xprt->xp_active) { - if (!xprt_assignthread(xprt)) + if (!svc_request_space_available(pool) || + !xprt_assignthread(xprt)) TAILQ_INSERT_TAIL(&pool->sp_active, xprt, xp_alink); } + STAILQ_CONCAT(&reqs, &st->st_reqs); mtx_unlock(&pool->sp_lock); SVC_RELEASE(xprt); - mtx_lock(&pool->sp_lock); + } else { + STAILQ_CONCAT(&reqs, &st->st_reqs); + mtx_unlock(&pool->sp_lock); } /* * Execute what we have queued. */ - while ((rqstp = STAILQ_FIRST(&st->st_reqs)) != NULL) { - size_t sz = rqstp->rq_size; - mtx_unlock(&pool->sp_lock); + sz = 0; + while ((rqstp = STAILQ_FIRST(&reqs)) != NULL) { + STAILQ_REMOVE_HEAD(&reqs, rq_link); + sz += rqstp->rq_size; svc_executereq(rqstp); - mtx_lock(&pool->sp_lock); - pool->sp_space_used -= sz; } + svc_change_space_used(pool, -sz); + mtx_lock(&pool->sp_lock); } if (st->st_xprt) { @@ -1309,24 +1319,13 @@ void svc_freereq(struct svc_req *rqstp) { SVCTHREAD *st; - SVCXPRT *xprt; SVCPOOL *pool; st = rqstp->rq_thread; - xprt = rqstp->rq_xprt; - if (xprt) - pool = xprt->xp_pool; - else - pool = NULL; if (st) { - mtx_lock(&pool->sp_lock); - KASSERT(rqstp == STAILQ_FIRST(&st->st_reqs), - ("Freeing request out of order")); - STAILQ_REMOVE_HEAD(&st->st_reqs, rq_link); - st->st_reqcount--; + pool = st->st_pool; if (pool->sp_done) pool->sp_done(st, rqstp); - mtx_unlock(&pool->sp_lock); } if (rqstp->rq_auth.svc_ah_ops) Modified: stable/10/sys/rpc/svc.h ============================================================================== --- stable/10/sys/rpc/svc.h Wed Jan 22 23:51:12 2014 (r261053) +++ stable/10/sys/rpc/svc.h Wed Jan 22 23:52:20 2014 (r261054) @@ -275,14 +275,16 @@ STAILQ_HEAD(svc_reqlist, svc_req); * thread to read and execute pending RPCs. */ typedef struct __rpc_svcthread { + struct __rpc_svcpool *st_pool; SVCXPRT *st_xprt; /* transport we are processing */ struct svc_reqlist st_reqs; /* RPC requests to execute */ - int st_reqcount; /* number of queued reqs */ int st_idle; /* thread is on idle list */ struct cv st_cond; /* sleeping for work */ LIST_ENTRY(__rpc_svcthread) st_link; /* all threads list */ LIST_ENTRY(__rpc_svcthread) st_ilink; /* idle threads list */ LIST_ENTRY(__rpc_svcthread) st_alink; /* application thread list */ + int st_p2; /* application workspace */ + uint64_t st_p3; /* application workspace */ } SVCTHREAD; LIST_HEAD(svcthread_list, __rpc_svcthread);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201401222352.s0MNqKTY028954>