Date: Fri, 25 Dec 2020 11:36:18 GMT From: "Alexander V. Chernikov" <melifaro@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: f5baf8bb12f3 - Add modular fib lookup framework. Message-ID: <202012251136.0BPBaIh2041079@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by melifaro: URL: https://cgit.FreeBSD.org/src/commit/?id=f5baf8bb12f39d0e8d64508c47eb6c4386ef716d commit f5baf8bb12f39d0e8d64508c47eb6c4386ef716d Author: Alexander V. Chernikov <melifaro@FreeBSD.org> AuthorDate: 2020-12-25 10:39:52 +0000 Commit: Alexander V. Chernikov <melifaro@FreeBSD.org> CommitDate: 2020-12-25 11:33:17 +0000 Add modular fib lookup framework. This change introduces a framework that allows dynamically attaching or detaching longest prefix match (lpm) lookup algorithms to speed up datapath route table lookups. The framework takes care of handling initial synchronisation, route subscription, nhop/nhop groups reference and indexing, dataplane attachments and fib instance algorithm setup/teardown. The framework features automatic algorithm selection, allowing for picking the best matching algorithm on-the-fly based on the number of routes in the routing table. Currently the framework code is guarded under the FIB_ALGO config option. The idea is to enable it by default in the next couple of weeks. 
The following algorithms are provided by default: IPv4: * bsearch4 (lockless binary search in a special IP array), tailored for small-fib (<16 routes) * radix4_lockless (lockless immutable radix, re-created on every rtable change), tailored for small-fib (<1000 routes) * radix4 (base system radix backend) * dpdk_lpm4 (DPDK DIR24-8-based lookups), lockless datastructure, optimized for large-fib (D27412) IPv6: * radix6_lockless (lockless immutable radix, re-created on every rtable change), tailored for small-fib (<1000 routes) * radix6 (base system radix backend) * dpdk_lpm6 (DPDK DIR24-8-based lookups), lockless datastructure, optimized for large-fib (D27412) Performance changes: Micro benchmarks (I7-7660U, single-core lookups, 2048k dst, code in D27604): IPv4: 8 routes: radix4: ~20mpps radix4_lockless: ~24.8mpps bsearch4: ~69mpps dpdk_lpm4: ~67 mpps 700k routes: radix4_lockless: 3.3mpps dpdk_lpm4: 46mpps IPv6: 8 routes: radix6_lockless: ~20mpps dpdk_lpm6: ~70mpps 100k routes: radix6_lockless: 13.9mpps dpdk_lpm6: 57mpps Forwarding benchmarks: + 10-15% IPv4 forwarding performance (small-fib, bsearch4) + 25% IPv4 forwarding performance (full-view, dpdk_lpm4) + 20% IPv6 forwarding performance (full-view, dpdk_lpm6) Control: Framework adds the following runtime sysctls: List algos * net.route.algo.inet.algo_list: bsearch4, radix4_lockless, radix4 * net.route.algo.inet6.algo_list: radix6_lockless, radix6, dpdk_lpm6 Debug level (7=LOG_DEBUG, per-route) net.route.algo.debug_level: 5 Algo selection (currently only for fib 0): net.route.algo.inet.algo: bsearch4 net.route.algo.inet6.algo: radix6_lockless Support for manually changing algos in non-default fib will be added soon. Some sysctl names will be changed in the near future. 
Differential Revision: https://reviews.freebsd.org/D27401 --- sys/conf/files | 3 + sys/conf/options | 1 + sys/net/route.c | 8 + sys/net/route/fib_algo.c | 1608 ++++++++++++++++++++++++++++++++++++++++++ sys/net/route/fib_algo.h | 109 +++ sys/net/route/route_tables.c | 29 +- sys/net/route/route_var.h | 10 + sys/netinet/in_fib.c | 37 + sys/netinet/in_fib_algo.c | 765 ++++++++++++++++++++ sys/netinet6/in6_fib.c | 35 + sys/netinet6/in6_fib.h | 2 + sys/netinet6/in6_fib_algo.c | 361 ++++++++++ 12 files changed, 2962 insertions(+), 6 deletions(-) diff --git a/sys/conf/files b/sys/conf/files index aa376b0bb9a3..2a71e21ace71 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4178,6 +4178,7 @@ net/route/nhgrp_ctl.c optional route_mpath net/route/nhop.c standard net/route/nhop_ctl.c standard net/route/nhop_utils.c standard +net/route/fib_algo.c optional fib_algo net/route/route_ctl.c standard net/route/route_ddb.c optional ddb net/route/route_helpers.c standard @@ -4329,6 +4330,7 @@ netinet/in_debug.c optional inet ddb netinet/in_kdtrace.c optional inet | inet6 netinet/ip_carp.c optional inet carp | inet6 carp netinet/in_fib.c optional inet +netinet/in_fib_algo.c optional inet fib_algo netinet/in_gif.c optional gif inet | netgraph_gif inet netinet/ip_gre.c optional gre inet netinet/ip_id.c optional inet @@ -4405,6 +4407,7 @@ netinet6/icmp6.c optional inet6 netinet6/in6.c optional inet6 netinet6/in6_cksum.c optional inet6 netinet6/in6_fib.c optional inet6 +netinet6/in6_fib_algo.c optional inet6 fib_algo netinet6/in6_gif.c optional gif inet6 | netgraph_gif inet6 netinet6/in6_ifattach.c optional inet6 netinet6/in6_jail.c optional inet6 diff --git a/sys/conf/options b/sys/conf/options index 24f984930dc2..68c8a2e0d3ca 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -454,6 +454,7 @@ PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h +FIB_ALGO opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPDEBUG diff --git 
a/sys/net/route.c b/sys/net/route.c index 6c051e389d82..2f4ae40e83aa 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -151,6 +151,14 @@ void rt_table_destroy(struct rib_head *rh) { + RIB_WLOCK(rh); + rh->rib_dying = true; + RIB_WUNLOCK(rh); + +#ifdef FIB_ALGO + fib_destroy_rib(rh); +#endif + tmproutes_destroy(rh); rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); diff --git a/sys/net/route/fib_algo.c b/sys/net/route/fib_algo.c new file mode 100644 index 000000000000..afb009d4c8cf --- /dev/null +++ b/sys/net/route/fib_algo.c @@ -0,0 +1,1608 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" + +#include <sys/param.h> +#include <sys/eventhandler.h> +#include <sys/kernel.h> +#include <sys/sbuf.h> +#include <sys/lock.h> +#include <sys/rmlock.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <sys/kernel.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/queue.h> +#include <net/vnet.h> + +#include <net/if.h> +#include <net/if_var.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif + +#include <net/route.h> +#include <net/route/nhop.h> +#include <net/route/route_ctl.h> +#include <net/route/route_var.h> +#include <net/route/fib_algo.h> + +#include <machine/stdarg.h> + +/* + * Fib lookup framework. + * + * This framework enables accelerated longest-prefix-match lookups for the + * routing tables by adding the ability to dynamically attach/detach lookup + * algorithms implementation to/from the datapath. + * + * flm - fib lookup modules - implementation of particular lookup algorithm + * fd - fib data - instance of an flm bound to specific routing table + * + * This file provides main framework functionality. + * + * The following are the features provided by the framework + * + * 1) nexhops abstraction -> provides transparent referencing, indexing + * and efficient idx->ptr mappings for nexthop and nexthop groups. + * 2) Routing table synchronisation + * 3) dataplane attachment points + * 4) automatic algorithm selection based on the provided preference. + * + * + * DATAPATH + * For each supported address family, there is a an allocated array of fib_dp + * structures, indexed by fib number. 
Each array entry contains callback function + * and its argument. This function will be called with a family-specific lookup key, + * scope and provided argument. This array gets re-created every time when new algo + * instance gets created. Please take a look at the replace_rtables_family() function + * for more details. + * + */ + +SYSCTL_DECL(_net_route); +SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Fib algorithm lookups"); + +#ifdef INET6 +bool algo_fixed_inet6 = false; +SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "IPv6 longest prefix match lookups"); +#endif +#ifdef INET +bool algo_fixed_inet = false; +SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "IPv4 longest prefix match lookups"); +#endif + +struct nhop_ref_table { + uint32_t count; + int32_t refcnt[0]; +}; + +/* + * Data structure for the fib lookup instance tied to the particular rib. + */ +struct fib_data { + uint32_t number_nhops; /* current # of nhops */ + uint8_t hit_nhops; /* true if out of nhop limit */ + uint8_t init_done; /* true if init is competed */ + uint32_t fd_dead:1; /* Scheduled for deletion */ + uint32_t fd_linked:1; /* true if linked */ + uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */ + uint32_t fd_force_eval:1;/* true if rebuild scheduled */ + uint8_t fd_family; /* family */ + uint32_t fd_fibnum; /* fibnum */ + uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */ + uint32_t fd_algo_mask; /* bitmask for algo data */ + struct callout fd_callout; /* rebuild callout */ + void *fd_algo_data; /* algorithm data */ + struct nhop_object **nh_idx; /* nhop idx->ptr array */ + struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */ + struct rib_head *fd_rh; /* RIB table we're attached to */ + struct rib_subscription *fd_rs; /* storing table subscription */ + struct fib_dp fd_dp; /* fib datapath data */ + struct vnet *fd_vnet; /* vnet fib belongs to */ + struct 
epoch_context fd_epoch_ctx; /* epoch context for deletion */ + struct fib_lookup_module *fd_flm;/* pointer to the lookup module */ + uint32_t fd_num_changes; /* number of changes since last callout */ + TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */ +}; + +static void rebuild_fd_callout(void *_data); +static void destroy_fd_instance_epoch(epoch_context_t ctx); +static enum flm_op_result attach_datapath(struct fib_data *fd); +static bool is_idx_free(struct fib_data *fd, uint32_t index); +static void set_algo_fixed(struct rib_head *rh); + +static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh); +static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh); + +static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh, + struct fib_lookup_module *orig_flm); +static void fib_unref_algo(struct fib_lookup_module *flm); +static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum); + +struct mtx fib_mtx; +#define FIB_MOD_LOCK() mtx_lock(&fib_mtx) +#define FIB_MOD_UNLOCK() mtx_unlock(&fib_mtx) +#define FIB_MOD_LOCK_ASSERT() mtx_assert(&fib_mtx, MA_OWNED) + + +/* Algorithm has to be this percent better than the current to switch */ +#define BEST_DIFF_PERCENT (5 * 256 / 100) +/* Schedule algo re-evaluation X seconds after a change */ +#define ALGO_EVAL_DELAY_MS 30000 +/* Force algo re-evaluation after X changes */ +#define ALGO_EVAL_NUM_ROUTES 100 +/* Try to setup algorithm X times */ +#define FIB_MAX_TRIES 32 +/* Max amount of supported nexthops */ +#define FIB_MAX_NHOPS 262144 +#define FIB_CALLOUT_DELAY_MS 50 + +/* Debug */ +static int flm_debug_level = LOG_NOTICE; +SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN, + &flm_debug_level, 0, "debuglevel"); +#define FLM_MAX_DEBUG_LEVEL LOG_DEBUG + +#define _PASS_MSG(_l) (flm_debug_level >= (_l)) +#define ALGO_PRINTF(_fmt, ...) 
printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__) +#define _ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \ + printf("[fib_algo] %s.%u (%s) %s: " _fmt "\n",\ + print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__) +#define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \ + printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__) +#define RH_PRINTF(_l, _rh, _fmt, ...) if (_PASS_MSG(_l)) { \ + _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\ +} +#define FD_PRINTF(_l, _fd, _fmt, ...) FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__) +#define _FD_PRINTF(_l, _fd, _fmt, ...) if (_PASS_MSG(_l)) { \ + _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \ + __func__, _fmt, ## __VA_ARGS__); \ +} +#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG +#define FD_PRINTF_LOG_DEBUG _FD_PRINTF +#else +#define FD_PRINTF_LOG_DEBUG() +#endif +#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO +#define FD_PRINTF_LOG_INFO _FD_PRINTF +#else +#define FD_PRINTF_LOG_INFO() +#endif +#define FD_PRINTF_LOG_NOTICE _FD_PRINTF +#define FD_PRINTF_LOG_ERR _FD_PRINTF +#define FD_PRINTF_LOG_WARNING _FD_PRINTF + + +/* List of all registered lookup algorithms */ +static TAILQ_HEAD(, fib_lookup_module) all_algo_list; + +/* List of all fib lookup instances in the vnet */ +VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list); +#define V_fib_data_list VNET(fib_data_list) + +/* Datastructure for storing non-transient fib lookup module failures */ +struct fib_error { + int fe_family; + uint32_t fe_fibnum; /* failed rtable */ + struct fib_lookup_module *fe_flm; /* failed module */ + TAILQ_ENTRY(fib_error) entries;/* list of all errored entries */ +}; +VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list); +#define V_fib_error_list VNET(fib_error_list) + +/* Per-family array of fibnum -> {func, arg} mappings used in datapath */ +struct fib_dp_header { + struct epoch_context fdh_epoch_ctx; + uint32_t fdh_num_tables; + 
struct fib_dp fdh_idx[0]; +}; + +/* + * Tries to add new non-transient algorithm error to the list of + * errors. + * Returns true on success. + */ +static bool +flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum) +{ + struct fib_error *fe; + + fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO); + if (fe == NULL) + return (false); + fe->fe_flm = flm; + fe->fe_family = flm->flm_family; + fe->fe_fibnum = fibnum; + + FIB_MOD_LOCK(); + /* Avoid duplicates by checking if error already exists first */ + if (flm_error_check(flm, fibnum)) { + FIB_MOD_UNLOCK(); + free(fe, M_TEMP); + return (true); + } + TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries); + FIB_MOD_UNLOCK(); + + return (true); +} + +/* + * True if non-transient error has been registered for @flm in @fibnum. + */ +static bool +flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum) +{ + const struct fib_error *fe; + + TAILQ_FOREACH(fe, &V_fib_error_list, entries) { + if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum)) + return (true); + } + + return (false); +} + +/* + * Clear all errors of algo specified by @flm. + */ +static void +fib_error_clear_flm(struct fib_lookup_module *flm) +{ + struct fib_error *fe, *fe_tmp; + + FIB_MOD_LOCK_ASSERT(); + + TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { + if (fe->fe_flm == flm) { + TAILQ_REMOVE(&V_fib_error_list, fe, entries); + free(fe, M_TEMP); + } + } +} + +/* + * Clears all errors in current VNET. + */ +static void +fib_error_clear() +{ + struct fib_error *fe, *fe_tmp; + + FIB_MOD_LOCK_ASSERT(); + + TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) { + TAILQ_REMOVE(&V_fib_error_list, fe, entries); + free(fe, M_TEMP); + } +} + +static const char * +print_family(int family) +{ + + if (family == AF_INET) + return ("inet"); + else if (family == AF_INET6) + return ("inet6"); + else + return ("unknown"); +} + +/* + * Debug function used by lookup algorithms. 
+ * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) " + */ +void +fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...) +{ + char buf[128]; + va_list ap; + + if (level > flm_debug_level) + return; + + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name, + func, "%s", buf); +} + +/* + * Outputs list of algorithms supported by the provided address family. + */ +static int +print_algos_sysctl(struct sysctl_req *req, int family) +{ + struct fib_lookup_module *flm; + struct sbuf sbuf; + int error, count = 0; + + error = sysctl_wire_old_buffer(req, 0); + if (error == 0) { + sbuf_new_for_sysctl(&sbuf, NULL, 512, req); + TAILQ_FOREACH(flm, &all_algo_list, entries) { + if (flm->flm_family == family) { + if (count++ > 0) + sbuf_cat(&sbuf, ", "); + sbuf_cat(&sbuf, flm->flm_name); + } + } + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + } + return (error); +} + +#ifdef INET6 +static int +print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos_sysctl(req, AF_INET6)); +} +SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms"); +#endif + +#ifdef INET +static int +print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS) +{ + + return (print_algos_sysctl(req, AF_INET)); +} +SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms"); +#endif + +/* + * Calculate delay between repeated failures. + * Returns current delay in milliseconds. 
+ */ +static uint32_t +callout_calc_delay_ms(struct fib_data *fd) +{ + uint32_t shift; + + if (fd->fd_failed_rebuilds > 10) + shift = 10; + else + shift = fd->fd_failed_rebuilds; + + return ((1 << shift) * FIB_CALLOUT_DELAY_MS); +} + +static void +schedule_callout(struct fib_data *fd, int delay_ms) +{ + + callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms, + rebuild_fd_callout, fd, 0); +} + +static void +schedule_fd_rebuild(struct fib_data *fd) +{ + + FIB_MOD_LOCK(); + if (!fd->fd_need_rebuild) { + fd->fd_need_rebuild = true; + + /* + * Potentially re-schedules pending callout + * initiated by schedule_algo_eval. + */ + FD_PRINTF(LOG_INFO, fd, "Scheduling rebuilt"); + schedule_callout(fd, callout_calc_delay_ms(fd)); + } + FIB_MOD_UNLOCK(); +} + +static void +schedule_algo_eval(struct fib_data *fd) +{ + + if (fd->fd_num_changes++ == 0) { + /* Start callout to consider switch */ + FIB_MOD_LOCK(); + if (!callout_pending(&fd->fd_callout)) + schedule_callout(fd, ALGO_EVAL_DELAY_MS); + FIB_MOD_UNLOCK(); + } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) { + /* Reset callout to exec immediately */ + FIB_MOD_LOCK(); + if (!fd->fd_need_rebuild) { + fd->fd_force_eval = true; + schedule_callout(fd, 1); + } + FIB_MOD_UNLOCK(); + } +} + +/* + * Rib subscription handler. Checks if the algorithm is ready to + * receive updates, handles nexthop refcounting and passes change + * data to the algorithm callback. + */ +static void +handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc, + void *_data) +{ + struct fib_data *fd = (struct fib_data *)_data; + enum flm_op_result result; + + RIB_WLOCK_ASSERT(rnh); + + /* + * There is a small gap between subscribing for route changes + * and initiating rtable dump. Avoid receiving route changes + * prior to finishing rtable dump by checking `init_done`. + */ + if (!fd->init_done) + return; + /* + * If algo requested rebuild, stop sending updates by default. 
+ * This simplifies nexthop refcount handling logic. + */ + if (fd->fd_need_rebuild) + return; + + /* Consider scheduling algorithm re-evaluation */ + schedule_algo_eval(fd); + + /* + * Maintain guarantee that every nexthop returned by the dataplane + * lookup has > 0 refcount, so can be safely referenced within current + * epoch. + */ + if (rc->rc_nh_new != NULL) { + if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) { + /* ran out of indexes */ + schedule_fd_rebuild(fd); + return; + } + } + + result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data); + + switch (result) { + case FLM_SUCCESS: + /* Unref old nexthop on success */ + if (rc->rc_nh_old != NULL) + fib_unref_nhop(fd, rc->rc_nh_old); + break; + case FLM_REBUILD: + + /* + * Algo is not able to apply the update. + * Schedule algo rebuild. + */ + schedule_fd_rebuild(fd); + break; + case FLM_ERROR: + + /* + * Algo reported a non-recoverable error. + * Record the error and schedule rebuild, which will + * trigger best algo selection. + */ + FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error"); + if (!flm_error_add(fd->fd_flm, fd->fd_fibnum)) + FD_PRINTF(LOG_ERR, fd, "failed to ban algo"); + schedule_fd_rebuild(fd); + } +} + +static void +estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd) +{ + + if (old_fd == NULL) { + // TODO: read from rtable + fd->number_nhops = 16; + return; + } + + if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS) + fd->number_nhops = 2 * old_fd->number_nhops; + else + fd->number_nhops = old_fd->number_nhops; +} + +struct walk_cbdata { + struct fib_data *fd; + flm_dump_t *func; + enum flm_op_result result; +}; + +/* + * Handler called after all rtenties have been dumped. + * Performs post-dump framework checks and calls + * algo:flm_dump_end_cb(). + * + * Updates walk_cbdata result. 
+ */ +static void +sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data) +{ + struct walk_cbdata *w = (struct walk_cbdata *)_data; + struct fib_data *fd = w->fd; + + RIB_WLOCK_ASSERT(w->fd->fd_rh); + + if (rnh->rib_dying) { + w->result = FLM_ERROR; + return; + } + + if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS) + return; + + /* Post-dump hook, dump successful */ + + if (fd->hit_nhops) { + FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops", + fd->nh_ref_table->count); + w->result = FLM_REBUILD; + return; + } + + w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp); + + if (w->result == FLM_SUCCESS) { + /* Mark init as done to allow routing updates */ + fd->init_done = 1; + } +} + +/* + * Callback for each entry in rib. + * Calls algo:flm_dump_rib_item_cb func as a part of initial + * route table synchronisation. + */ +static int +sync_algo_cb(struct rtentry *rt, void *_data) +{ + struct walk_cbdata *w = (struct walk_cbdata *)_data; + + RIB_WLOCK_ASSERT(w->fd->fd_rh); + + if (w->result == FLM_SUCCESS && w->func) { + + /* + * Reference nexthops to maintain guarantee that + * each nexthop returned by datapath has > 0 references + * and can be safely referenced within current epoch. + */ + struct nhop_object *nh = rt_get_raw_nhop(rt); + if (fib_ref_nhop(w->fd, nh) != 0) + w->result = w->func(rt, w->fd->fd_algo_data); + else + w->result = FLM_REBUILD; + } + + return (0); +} + +/* + * Dump all routing table state to the algo instance. + */ +static enum flm_op_result +sync_algo(struct fib_data *fd) +{ + struct walk_cbdata w = { + .fd = fd, + .func = fd->fd_flm->flm_dump_rib_item_cb, + .result = FLM_SUCCESS, + }; + + rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w); + + FD_PRINTF(LOG_INFO, fd, "initial dump completed."); + + return (w.result); +} + +/* + * Schedules epoch-backed @fd instance deletion. + * * Unlinks @fd from the list of active algo instances. 
+ * * Removes rib subscription. + * * Stops callout. + * * Schedules actual deletion. + * + * Assume @fd is already unlinked from the datapath. + */ +static int +schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout) +{ + bool is_dead; + + NET_EPOCH_ASSERT(); + + FIB_MOD_LOCK(); + is_dead = fd->fd_dead; + if (!is_dead) + fd->fd_dead = true; + if (fd->fd_linked) { + TAILQ_REMOVE(&V_fib_data_list, fd, entries); + fd->fd_linked = false; + } + FIB_MOD_UNLOCK(); + if (is_dead) + return (0); + + FD_PRINTF(LOG_INFO, fd, "DETACH"); + + if (fd->fd_rs != NULL) + rib_unsibscribe(fd->fd_rs); + + /* + * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls + * will be executed, hence no _new_ callout schedules will happen. + * + * There can be 2 possible scenarious here: + * 1) we're running inside a callout when we're deleting ourselves + * due to migration to a newer fd + * 2) we're running from rt_table_destroy() and callout is scheduled + * for execution OR is executing + * + * For (2) we need to wait for the callout termination, as the routing table + * will be destroyed after this function returns. + * For (1) we cannot call drain, but can ensure that this is the last invocation. + */ + + if (in_callout) + callout_stop(&fd->fd_callout); + else + callout_drain(&fd->fd_callout); + + FD_PRINTF(LOG_INFO, fd, "destroying old instance"); + epoch_call(net_epoch_preempt, destroy_fd_instance_epoch, + &fd->fd_epoch_ctx); + + return (0); +} + +/* + * Wipe all fd instances from the list matching rib specified by @rh. + * If @keep_first is set, remove all but the first record. 
+ */ +static void +fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout) +{ + struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head); + struct fib_data *fd, *fd_tmp; + struct epoch_tracker et; + + FIB_MOD_LOCK(); + TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) { + if (fd->fd_rh == rh) { + if (keep_first) { + keep_first = false; + continue; + } + TAILQ_REMOVE(&V_fib_data_list, fd, entries); + fd->fd_linked = false; + TAILQ_INSERT_TAIL(&tmp_head, fd, entries); + } + } + FIB_MOD_UNLOCK(); + + /* Pass 2: remove each entry */ + NET_EPOCH_ENTER(et); + TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) { + schedule_destroy_fd_instance(fd, in_callout); + } + NET_EPOCH_EXIT(et); +} + +void +fib_destroy_rib(struct rib_head *rh) +{ + + /* + * rnh has `is_dying` flag set, so setup of new fd's will fail at + * sync_algo() stage, preventing new entries to be added to the list + * of active algos. Remove all existing entries for the particular rib. + */ + fib_cleanup_algo(rh, false, false); +} + +/* + * Finalises fd destruction by freeing all fd resources. 
+ */ +static void +destroy_fd_instance(struct fib_data *fd) +{ + + FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd); + + /* Call destroy callback first */ + if (fd->fd_algo_data != NULL) + fd->fd_flm->flm_destroy_cb(fd->fd_algo_data); + + /* Nhop table */ + if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) { + for (int i = 0; i < fd->number_nhops; i++) { + if (!is_idx_free(fd, i)) { + FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", + i, fd->nh_idx[i]); + nhop_free_any(fd->nh_idx[i]); + } + } + free(fd->nh_idx, M_RTABLE); + } + if (fd->nh_ref_table != NULL) + free(fd->nh_ref_table, M_RTABLE); + + fib_unref_algo(fd->fd_flm); + + free(fd, M_RTABLE); +} + +/* + * Epoch callback indicating fd is safe to destroy + */ +static void +destroy_fd_instance_epoch(epoch_context_t ctx) +{ + struct fib_data *fd; + + fd = __containerof(ctx, struct fib_data, fd_epoch_ctx); + + destroy_fd_instance(fd); +} + +/* + * Tries to setup fd instance. + * - Allocates fd/nhop table + * - Runs algo:flm_init_cb algo init + * - Subscribes fd to the rib + * - Runs rtable dump + * - Adds instance to the list of active instances. + * + * Returns: operation result. Fills in @pfd with resulting fd on success. + * + */ +static enum flm_op_result +try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh, + struct fib_data *old_fd, struct fib_data **pfd) +{ + struct fib_data *fd; + size_t size; + enum flm_op_result result; + *** 2314 LINES SKIPPED ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202012251136.0BPBaIh2041079>