Date: Wed, 7 Feb 2018 18:59:55 +0000 (UTC) From: "Andrey V. Elsukov" <ae@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r328988 - in head/sys: conf modules/ipfw netinet netpfil/ipfw Message-ID: <201802071859.w17IxtxX060804@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: ae Date: Wed Feb 7 18:59:54 2018 New Revision: 328988 URL: https://svnweb.freebsd.org/changeset/base/328988 Log: Rework ipfw dynamic states implementation to be lockless on fast path. o added struct ipfw_dyn_info that keeps all needed for ipfw_chk and for dynamic states implementation information; o added DYN_LOOKUP_NEEDED() macro that can be used to determine the need of new lookup of dynamic states; o ipfw_dyn_rule now becomes obsolete. Currently it used to pass information from kernel to userland only. o IPv4 and IPv6 states now described by different structures dyn_ipv4_state and dyn_ipv6_state; o IPv6 scope zones support is added; o ipfw(4) now depends from Concurrency Kit; o states are linked with "entry" field using CK_SLIST. This allows lockless lookup and protected by mutex modifications. o the "expired" SLIST field is used for states expiring. o struct dyn_data is used to keep generic information for both IPv4 and IPv6; o struct dyn_parent is used to keep O_LIMIT_PARENT information; o IPv4 and IPv6 states are stored in different hash tables; o O_LIMIT_PARENT states now are kept separately from O_LIMIT and O_KEEP_STATE states; o per-cpu dyn_hp pointers are used to implement hazard pointers and they prevent freeing states that are locklessly used by lookup threads; o mutexes to protect modification of lists in hash tables now kept in separate arrays. 65535 limit to maximum number of hash buckets now removed. o Separate lookup and install functions added for IPv4 and IPv6 states and for parent states. o By default now is used Jenkinks hash function. Obtained from: Yandex LLC MFC after: 42 days Sponsored by: Yandex LLC Differential Revision: https://reviews.freebsd.org/D12685 Modified: head/sys/conf/files head/sys/modules/ipfw/Makefile head/sys/netinet/ip_fw.h head/sys/netpfil/ipfw/ip_fw2.c head/sys/netpfil/ipfw/ip_fw_dynamic.c head/sys/netpfil/ipfw/ip_fw_private.h head/sys/netpfil/ipfw/ip_fw_sockopt.c Modified: head/sys/conf/files ============================================================================== --- head/sys/conf/files Wed Feb 7 18:50:36 2018 (r328987) +++ head/sys/conf/files Wed Feb 7 18:59:54 2018 (r328988) @@ -4374,7 +4374,8 @@ netpfil/ipfw/ip_dn_io.c optional inet dummynet netpfil/ipfw/ip_dn_glue.c optional inet dummynet netpfil/ipfw/ip_fw2.c optional inet ipfirewall netpfil/ipfw/ip_fw_bpf.c optional inet ipfirewall -netpfil/ipfw/ip_fw_dynamic.c optional inet ipfirewall +netpfil/ipfw/ip_fw_dynamic.c optional inet ipfirewall \ + compile-with "${NORMAL_C} -I$S/contrib/ck/include" netpfil/ipfw/ip_fw_eaction.c optional inet ipfirewall netpfil/ipfw/ip_fw_log.c optional inet ipfirewall netpfil/ipfw/ip_fw_pfil.c optional inet ipfirewall Modified: head/sys/modules/ipfw/Makefile ============================================================================== --- head/sys/modules/ipfw/Makefile Wed Feb 7 18:50:36 2018 (r328987) +++ head/sys/modules/ipfw/Makefile Wed Feb 7 18:59:54 2018 (r328988) @@ -9,7 +9,7 @@ SRCS+= ip_fw_sockopt.c ip_fw_table.c ip_fw_table_algo. SRCS+= ip_fw_table_value.c SRCS+= opt_inet.h opt_inet6.h opt_ipdivert.h opt_ipfw.h -CFLAGS+= -DIPFIREWALL +CFLAGS+= -DIPFIREWALL -I${SRCTOP}/sys/contrib/ck/include # #If you want it verbose #CFLAGS+= -DIPFIREWALL_VERBOSE Modified: head/sys/netinet/ip_fw.h ============================================================================== --- head/sys/netinet/ip_fw.h Wed Feb 7 18:50:36 2018 (r328987) +++ head/sys/netinet/ip_fw.h Wed Feb 7 18:59:54 2018 (r328988) @@ -671,7 +671,7 @@ struct ipfw_flow_id { uint32_t src_ip; uint16_t dst_port; uint16_t src_port; - uint8_t fib; + uint8_t fib; /* XXX: must be uint16_t */ uint8_t proto; uint8_t _flags; /* protocol-specific flags */ uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ @@ -682,6 +682,7 @@ struct ipfw_flow_id { }; #endif +#define IS_IP4_FLOW_ID(id) ((id)->addr_type == 4) #define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) /* Modified: head/sys/netpfil/ipfw/ip_fw2.c ============================================================================== --- head/sys/netpfil/ipfw/ip_fw2.c Wed Feb 7 18:50:36 2018 (r328987) +++ head/sys/netpfil/ipfw/ip_fw2.c Wed Feb 7 18:59:54 2018 (r328988) @@ -1387,8 +1387,7 @@ ipfw_chk(struct ip_fw_args *args) * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ - int dyn_dir = MATCH_UNKNOWN; - uint16_t dyn_name = 0; + struct ipfw_dyn_info dyn_info; struct ip_fw *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; @@ -1420,6 +1419,7 @@ ipfw_chk(struct ip_fw_args *args) proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ + DYN_INFO_INIT(&dyn_info); /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the @@ -2605,7 +2605,8 @@ do { \ case O_LIMIT: case O_KEEP_STATE: if (ipfw_dyn_install_state(chain, f, - (ipfw_insn_limit *)cmd, args, tablearg)) { + (ipfw_insn_limit *)cmd, args, ulp, + pktlen, &dyn_info, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ @@ -2619,34 +2620,15 @@ do { \ /* * dynamic rules are checked at the first * keep-state or check-state occurrence, - * with the result being stored in dyn_dir - * and dyn_name. + * with the result being stored in dyn_info. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). - * - * (dyn_dir == MATCH_UNKNOWN) means this is - * first lookup for such f_id. Do lookup. - * - * (dyn_dir != MATCH_UNKNOWN && - * dyn_name != 0 && dyn_name != cmd->arg1) - * means previous lookup didn't find dynamic - * rule for specific state name and current - * lookup will search rule with another state - * name. Redo lookup. - * - * (dyn_dir != MATCH_UNKNOWN && dyn_name == 0) - * means previous lookup was for `any' name - * and it didn't find rule. No need to do - * lookup again. */ - if ((dyn_dir == MATCH_UNKNOWN || - (dyn_name != 0 && - dyn_name != cmd->arg1)) && - (q = ipfw_dyn_lookup_state(&args->f_id, - ulp, pktlen, &dyn_dir, - (dyn_name = cmd->arg1))) != NULL) { + if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) && + (q = ipfw_dyn_lookup_state(args, ulp, + pktlen, cmd, &dyn_info)) != NULL) { /* * Found dynamic entry, jump to the * 'action' part of the parent rule @@ -2654,13 +2636,7 @@ do { \ * cmdlen. */ f = q; - /* XXX we would like to have f_pos - * readily accessible in the dynamic - * rule, instead of having to - * lookup q->rule. - */ - f_pos = ipfw_find_rule(chain, - f->rulenum, f->id); + f_pos = dyn_info.f_pos; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; cmdlen = 0; @@ -2877,7 +2853,8 @@ do { \ case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; - if (q != f || dyn_dir == MATCH_FORWARD) { + if (q != f || + dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); @@ -2937,7 +2914,8 @@ do { \ case O_FORWARD_IP6: if (args->eh) /* not valid on layer2 pkts */ break; - if (q != f || dyn_dir == MATCH_FORWARD) { + if (q != f || + dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); @@ -3089,7 +3067,7 @@ do { \ * @args content, and it may be * used for new state lookup later. */ - dyn_dir = MATCH_UNKNOWN; + DYN_INFO_INIT(&dyn_info); } break; Modified: head/sys/netpfil/ipfw/ip_fw_dynamic.c ============================================================================== --- head/sys/netpfil/ipfw/ip_fw_dynamic.c Wed Feb 7 18:50:36 2018 (r328987) +++ head/sys/netpfil/ipfw/ip_fw_dynamic.c Wed Feb 7 18:59:54 2018 (r328988) @@ -1,6 +1,8 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * + * Copyright (c) 2017-2018 Yandex LLC + * Copyright (c) 2017-2018 Andrey V. Elsukov <ae@FreeBSD.org> * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without @@ -28,32 +30,27 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#define DEB(x) -#define DDB(x) x - -/* - * Dynamic rule support for ipfw - */ - -#include "opt_ipfw.h" #include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipfw.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ -#include "opt_inet6.h" #include <sys/param.h> #include <sys/systm.h> -#include <sys/malloc.h> +#include <sys/hash.h> #include <sys/mbuf.h> #include <sys/kernel.h> -#include <sys/ktr.h> #include <sys/lock.h> +#include <sys/pcpu.h> +#include <sys/queue.h> #include <sys/rmlock.h> +#include <sys/smp.h> #include <sys/socket.h> #include <sys/sysctl.h> #include <sys/syslog.h> -#include <net/ethernet.h> /* for ETHERTYPE_IP */ +#include <net/ethernet.h> #include <net/if.h> #include <net/if_var.h> #include <net/pfil.h> @@ -61,8 +58,9 @@ __FBSDID("$FreeBSD$"); #include <netinet/in.h> #include <netinet/ip.h> -#include <netinet/ip_var.h> /* ip_defttl */ +#include <netinet/ip_var.h> #include <netinet/ip_fw.h> +#include <netinet/ip_var.h> #include <netinet/tcp_var.h> #include <netinet/udp.h> @@ -70,6 +68,7 @@ __FBSDID("$FreeBSD$"); #ifdef INET6 #include <netinet6/in6_var.h> #include <netinet6/ip6_var.h> +#include <netinet6/scope6_var.h> #endif #include <netpfil/ipfw/ip_fw_private.h> @@ -79,86 +78,261 @@ __FBSDID("$FreeBSD$"); #ifdef MAC #include <security/mac/mac_framework.h> #endif +#include <ck_queue.h> /* - * Description of dynamic rules. + * Description of dynamic states. * - * Dynamic rules are stored in lists accessed through a hash table - * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can - * be modified through the sysctl variable dyn_buckets which is - * updated when the table becomes empty. + * Dynamic states are stored in lists accessed through a hash tables + * whose size is curr_dyn_buckets. This value can be modified through + * the sysctl variable dyn_buckets. * - * XXX currently there is only one list, ipfw_dyn. + * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent, + * and dyn_ipv6_parent. * - * When a packet is received, its address fields are first masked - * with the mask defined for the rule, then hashed, then matched - * against the entries in the corresponding list. - * Dynamic rules can be used for different purposes: + * When a packet is received, its address fields hashed, then matched + * against the entries in the corresponding list by addr_type. + * Dynamic states can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * - * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * The lifetime of dynamic states is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * - * The total number of dynamic rules is equal to UMA zone items count. - * The max number of dynamic rules is dyn_max. When we reach + * The total number of dynamic states is equal to UMA zone items count. + * The max number of dynamic states is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * - * Each dynamic rule holds a pointer to the parent ipfw rule so - * we know what action to perform. Dynamic rules are removed when - * the parent rule is deleted. This can be changed by dyn_keep_states - * sysctl. + * Each state holds a pointer to the parent ipfw rule so we know what + * action to perform. Dynamic rules are removed when the parent rule is + * deleted. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ -struct ipfw_dyn_bucket { - struct mtx mtx; /* Bucket protecting lock */ - ipfw_dyn_rule *head; /* Pointer to first rule */ +/* By default use jenkins hash function */ +#define IPFIREWALL_JENKINSHASH + +#define DYN_COUNTER_INC(d, dir, pktlen) do { \ + (d)->pcnt_ ## dir++; \ + (d)->bcnt_ ## dir += pktlen; \ + } while (0) + +struct dyn_data { + void *parent; /* pointer to parent rule */ + uint32_t chain_id; /* cached ruleset id */ + uint32_t f_pos; /* cached rule index */ + + uint32_t hashval; /* hash value used for hash resize */ + uint16_t fibnum; /* fib used to send keepalives */ + uint8_t _pad[3]; + uint8_t set; /* parent rule set number */ + uint16_t rulenum; /* parent rule number */ + uint32_t ruleid; /* parent rule id */ + + uint32_t state; /* TCP session state and flags */ + uint32_t ack_fwd; /* most recent ACKs in forward */ + uint32_t ack_rev; /* and reverse direction (used */ + /* to generate keepalives) */ + uint32_t sync; /* synchronization time */ + uint32_t expire; /* expire time */ + + uint64_t pcnt_fwd; /* bytes counter in forward */ + uint64_t bcnt_fwd; /* packets counter in forward */ + uint64_t pcnt_rev; /* bytes counter in reverse */ + uint64_t bcnt_rev; /* packets counter in reverse */ }; +#define DPARENT_COUNT_DEC(p) do { \ + MPASS(p->count > 0); \ + ck_pr_dec_32(&(p)->count); \ +} while (0) +#define DPARENT_COUNT_INC(p) ck_pr_inc_32(&(p)->count) +#define DPARENT_COUNT(p) ck_pr_load_32(&(p)->count) +struct dyn_parent { + void *parent; /* pointer to parent rule */ + uint32_t count; /* number of linked states */ + uint8_t _pad; + uint8_t set; /* parent rule set number */ + uint16_t rulenum; /* parent rule number */ + uint32_t ruleid; /* parent rule id */ + uint32_t hashval; /* hash value used for hash resize */ + uint32_t expire; /* expire time */ +}; + +struct dyn_ipv4_state { + uint8_t type; /* State type */ + uint8_t proto; /* UL Protocol */ + uint16_t kidx; /* named object index */ + uint16_t sport, dport; /* ULP source and destination ports */ + in_addr_t src, dst; /* IPv4 source and destination */ + + union { + struct dyn_data *data; + struct dyn_parent *limit; + }; + CK_SLIST_ENTRY(dyn_ipv4_state) entry; + SLIST_ENTRY(dyn_ipv4_state) expired; +}; +CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state); +static VNET_DEFINE(struct dyn_ipv4ck_slist *, dyn_ipv4); +static VNET_DEFINE(struct dyn_ipv4ck_slist *, dyn_ipv4_parent); + +SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state); +static VNET_DEFINE(struct dyn_ipv4_slist, dyn_expired_ipv4); +#define V_dyn_ipv4 VNET(dyn_ipv4) +#define V_dyn_ipv4_parent VNET(dyn_ipv4_parent) +#define V_dyn_expired_ipv4 VNET(dyn_expired_ipv4) + +#ifdef INET6 +struct dyn_ipv6_state { + uint8_t type; /* State type */ + uint8_t proto; /* UL Protocol */ + uint16_t kidx; /* named object index */ + uint16_t sport, dport; /* ULP source and destination ports */ + struct in6_addr src, dst; /* IPv6 source and destination */ + uint32_t zoneid; /* IPv6 scope zone id */ + union { + struct dyn_data *data; + struct dyn_parent *limit; + }; + CK_SLIST_ENTRY(dyn_ipv6_state) entry; + SLIST_ENTRY(dyn_ipv6_state) expired; +}; +CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state); +static VNET_DEFINE(struct dyn_ipv6ck_slist *, dyn_ipv6); +static VNET_DEFINE(struct dyn_ipv6ck_slist *, dyn_ipv6_parent); + +SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state); +static VNET_DEFINE(struct dyn_ipv6_slist, dyn_expired_ipv6); +#define V_dyn_ipv6 VNET(dyn_ipv6) +#define V_dyn_ipv6_parent VNET(dyn_ipv6_parent) +#define V_dyn_expired_ipv6 VNET(dyn_expired_ipv6) +#endif /* INET6 */ + /* - * Static variables followed by global ones + * Per-CPU pointer indicates that specified state is currently in use + * and must not be reclaimed by expiration callout. */ -static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v); -static VNET_DEFINE(u_int32_t, dyn_buckets_max); -static VNET_DEFINE(u_int32_t, curr_dyn_buckets); -static VNET_DEFINE(struct callout, ipfw_timeout); -#define V_ipfw_dyn_v VNET(ipfw_dyn_v) -#define V_dyn_buckets_max VNET(dyn_buckets_max) -#define V_curr_dyn_buckets VNET(curr_dyn_buckets) -#define V_ipfw_timeout VNET(ipfw_timeout) +static void **dyn_hp_cache; +static DPCPU_DEFINE(void *, dyn_hp); +#define DYNSTATE_GET(cpu) ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp)) +#define DYNSTATE_PROTECT(v) ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v)) +#define DYNSTATE_RELEASE() DYNSTATE_PROTECT(NULL) +#define DYNSTATE_CRITICAL_ENTER() critical_enter() +#define DYNSTATE_CRITICAL_EXIT() do { \ + DYNSTATE_RELEASE(); \ + critical_exit(); \ +} while (0); -static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone); -#define V_ipfw_dyn_rule_zone VNET(ipfw_dyn_rule_zone) +/* + * We keep two version numbers, one is updated when new entry added to + * the list. Second is updated when an entry deleted from the list. + * Versions are updated under bucket lock. + * + * Bucket "add" version number is used to know, that in the time between + * state lookup (i.e. ipfw_dyn_lookup_state()) and the followed state + * creation (i.e. ipfw_dyn_install_state()) another concurrent thread did + * not install some state in this bucket. Using this info we can avoid + * additional state lookup, because we are sure that we will not install + * the state twice. + * + * Also doing the tracking of bucket "del" version during lookup we can + * be sure, that state entry was not unlinked and freed in time between + * we read the state pointer and protect it with hazard pointer. + * + * An entry unlinked from CK list keeps unchanged until it is freed. + * Unlinked entries are linked into expired lists using "expired" field. + */ -#define IPFW_BUCK_LOCK_INIT(b) \ - mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF) -#define IPFW_BUCK_LOCK_DESTROY(b) \ - mtx_destroy(&(b)->mtx) -#define IPFW_BUCK_LOCK(i) mtx_lock(&V_ipfw_dyn_v[(i)].mtx) -#define IPFW_BUCK_UNLOCK(i) mtx_unlock(&V_ipfw_dyn_v[(i)].mtx) -#define IPFW_BUCK_ASSERT(i) mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED) +/* + * dyn_expire_lock is used to protect access to dyn_expired_xxx lists. + * dyn_bucket_lock is used to get write access to lists in specific bucket. + * Currently one dyn_bucket_lock is used for all ipv4, ipv4_parent, ipv6, + * and ipv6_parent lists. + */ +static VNET_DEFINE(struct mtx, dyn_expire_lock); +static VNET_DEFINE(struct mtx *, dyn_bucket_lock); +#define V_dyn_expire_lock VNET(dyn_expire_lock) +#define V_dyn_bucket_lock VNET(dyn_bucket_lock) +/* + * Bucket's add/delete generation versions. + */ +static VNET_DEFINE(uint32_t *, dyn_ipv4_add); +static VNET_DEFINE(uint32_t *, dyn_ipv4_del); +static VNET_DEFINE(uint32_t *, dyn_ipv4_parent_add); +static VNET_DEFINE(uint32_t *, dyn_ipv4_parent_del); +#define V_dyn_ipv4_add VNET(dyn_ipv4_add) +#define V_dyn_ipv4_del VNET(dyn_ipv4_del) +#define V_dyn_ipv4_parent_add VNET(dyn_ipv4_parent_add) +#define V_dyn_ipv4_parent_del VNET(dyn_ipv4_parent_del) -static VNET_DEFINE(int, dyn_keep_states); -#define V_dyn_keep_states VNET(dyn_keep_states) +#ifdef INET6 +static VNET_DEFINE(uint32_t *, dyn_ipv6_add); +static VNET_DEFINE(uint32_t *, dyn_ipv6_del); +static VNET_DEFINE(uint32_t *, dyn_ipv6_parent_add); +static VNET_DEFINE(uint32_t *, dyn_ipv6_parent_del); +#define V_dyn_ipv6_add VNET(dyn_ipv6_add) +#define V_dyn_ipv6_del VNET(dyn_ipv6_del) +#define V_dyn_ipv6_parent_add VNET(dyn_ipv6_parent_add) +#define V_dyn_ipv6_parent_del VNET(dyn_ipv6_parent_del) +#endif /* INET6 */ +#define DYN_BUCKET(h, b) ((h) & (b - 1)) +#define DYN_BUCKET_VERSION(b, v) ck_pr_load_32(&V_dyn_ ## v[(b)]) +#define DYN_BUCKET_VERSION_BUMP(b, v) ck_pr_inc_32(&V_dyn_ ## v[(b)]) + +#define DYN_BUCKET_LOCK_INIT(lock, b) \ + mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF) +#define DYN_BUCKET_LOCK_DESTROY(lock, b) mtx_destroy(&lock[(b)]) +#define DYN_BUCKET_LOCK(b) mtx_lock(&V_dyn_bucket_lock[(b)]) +#define DYN_BUCKET_UNLOCK(b) mtx_unlock(&V_dyn_bucket_lock[(b)]) +#define DYN_BUCKET_ASSERT(b) mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED) + +#define DYN_EXPIRED_LOCK_INIT() \ + mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF) +#define DYN_EXPIRED_LOCK_DESTROY() mtx_destroy(&V_dyn_expire_lock) +#define DYN_EXPIRED_LOCK() mtx_lock(&V_dyn_expire_lock) +#define DYN_EXPIRED_UNLOCK() mtx_unlock(&V_dyn_expire_lock) + +static VNET_DEFINE(uint32_t, dyn_buckets_max); +static VNET_DEFINE(uint32_t, curr_dyn_buckets); +static VNET_DEFINE(struct callout, dyn_timeout); +#define V_dyn_buckets_max VNET(dyn_buckets_max) +#define V_curr_dyn_buckets VNET(curr_dyn_buckets) +#define V_dyn_timeout VNET(dyn_timeout) + +/* Maximum length of states chain in a bucket */ +static VNET_DEFINE(uint32_t, curr_max_length); +#define V_curr_max_length VNET(curr_max_length) + +static VNET_DEFINE(uma_zone_t, dyn_data_zone); +static VNET_DEFINE(uma_zone_t, dyn_parent_zone); +static VNET_DEFINE(uma_zone_t, dyn_ipv4_zone); +#ifdef INET6 +static VNET_DEFINE(uma_zone_t, dyn_ipv6_zone); +#define V_dyn_ipv6_zone VNET(dyn_ipv6_zone) +#endif /* INET6 */ +#define V_dyn_data_zone VNET(dyn_data_zone) +#define V_dyn_parent_zone VNET(dyn_parent_zone) +#define V_dyn_ipv4_zone VNET(dyn_ipv4_zone) + /* * Timeouts for various events in handing dynamic rules. */ -static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); -static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); -static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); -static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); -static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); -static VNET_DEFINE(u_int32_t, dyn_short_lifetime); +static VNET_DEFINE(uint32_t, dyn_ack_lifetime); +static VNET_DEFINE(uint32_t, dyn_syn_lifetime); +static VNET_DEFINE(uint32_t, dyn_fin_lifetime); +static VNET_DEFINE(uint32_t, dyn_rst_lifetime); +static VNET_DEFINE(uint32_t, dyn_udp_lifetime); +static VNET_DEFINE(uint32_t, dyn_short_lifetime); #define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) #define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) @@ -174,10 +348,10 @@ static VNET_DEFINE(u_int32_t, dyn_short_lifetime); * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ - -static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); -static VNET_DEFINE(u_int32_t, dyn_keepalive_period); -static VNET_DEFINE(u_int32_t, dyn_keepalive); +#define DYN_KEEPALIVE_MAXQ 512 +static VNET_DEFINE(uint32_t, dyn_keepalive_interval); +static VNET_DEFINE(uint32_t, dyn_keepalive_period); +static VNET_DEFINE(uint32_t, dyn_keepalive); static VNET_DEFINE(time_t, dyn_keepalive_last); #define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) @@ -185,113 +359,208 @@ static VNET_DEFINE(time_t, dyn_keepalive_last); #define V_dyn_keepalive VNET(dyn_keepalive) #define V_dyn_keepalive_last VNET(dyn_keepalive_last) -static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ - -#define DYN_COUNT uma_zone_get_cur(V_ipfw_dyn_rule_zone) +static VNET_DEFINE(uint32_t, dyn_max); /* max # of dynamic states */ +static VNET_DEFINE(uint32_t, dyn_count); /* number of states */ +static VNET_DEFINE(uint32_t, dyn_parent_max); /* max # of parent states */ +static VNET_DEFINE(uint32_t, dyn_parent_count); /* number of parent states */ #define V_dyn_max VNET(dyn_max) +#define V_dyn_count VNET(dyn_count) +#define V_dyn_parent_max VNET(dyn_parent_max) +#define V_dyn_parent_count VNET(dyn_parent_count) -/* for userspace, we emulate the uma_zone_counter with ipfw_dyn_count */ -static int ipfw_dyn_count; /* number of objects */ +#define DYN_COUNT_DEC(name) do { \ + MPASS((V_ ## name) > 0); \ + ck_pr_dec_32(&(V_ ## name)); \ +} while (0) +#define DYN_COUNT_INC(name) ck_pr_inc_32(&(V_ ## name)) +#define DYN_COUNT(name) ck_pr_load_32(&(V_ ## name)) -#ifdef USERSPACE /* emulation of UMA object counters for userspace */ -#define uma_zone_get_cur(x) ipfw_dyn_count -#endif /* USERSPACE */ +static time_t last_log; /* Log ratelimiting */ -static int last_log; /* Log ratelimiting */ +/* + * Get/set maximum number of dynamic states in given VNET instance. + */ +static int +sysctl_dyn_max(SYSCTL_HANDLER_ARGS) +{ + uint32_t nstates; + int error; -static void ipfw_dyn_tick(void *vnetx); -static void check_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *, int, int); -#ifdef SYSCTL_NODE + nstates = V_dyn_max; + error = sysctl_handle_32(oidp, &nstates, 0, req); + /* Read operation or some error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); -static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS); -static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS); + V_dyn_max = nstates; + uma_zone_set_max(V_dyn_data_zone, V_dyn_max); + return (0); +} -SYSBEGIN(f2) +static int +sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS) +{ + uint32_t nstates; + int error; + nstates = V_dyn_parent_max; + error = sysctl_handle_32(oidp, &nstates, 0, req); + /* Read operation or some error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); + + V_dyn_parent_max = nstates; + uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max); + return (0); +} + +static int +sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS) +{ + uint32_t nbuckets; + int error; + + nbuckets = V_dyn_buckets_max; + error = sysctl_handle_32(oidp, &nbuckets, 0, req); + /* Read operation or some error */ + if ((error != 0) || (req->newptr == NULL)) + return (error); + + if (nbuckets > 256) + V_dyn_buckets_max = 1 << fls(nbuckets - 1); + else + return (EINVAL); + return (0); +} + SYSCTL_DECL(_net_inet_ip_fw); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0, - "Max number of dyn. buckets"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, + +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0, + "Current number of dynamic states."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0, + "Current number of parent states. "); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, - "Current Number of dyn. buckets"); -SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count, - CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU", - "Number of dyn. rules"); + "Current number of buckets for states hash table."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0, + "Current maximum length of states chains in hash buckets."); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets, + CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_buckets, + "IU", "Max number of buckets for dynamic states hash table."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, - CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU", - "Max number of dyn. rules"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, + CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_max, + "IU", "Max number of dynamic states."); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max, + CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW, 0, 0, sysctl_dyn_parent_max, + "IU", "Max number of parent dynamic states."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, - "Lifetime of dyn. rules for acks"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, + "Lifetime of dynamic states for TCP ACK."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, - "Lifetime of dyn. rules for syn"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, + "Lifetime of dynamic states for TCP SYN."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, - "Lifetime of dyn. rules for fin"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, + "Lifetime of dynamic states for TCP FIN."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, - "Lifetime of dyn. rules for rst"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, + "Lifetime of dynamic states for TCP RST."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, - "Lifetime of dyn. rules for UDP"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, + "Lifetime of dynamic states for UDP."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, - "Lifetime of dyn. rules for other situations"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, + "Lifetime of dynamic states for other situations."); +SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, - "Enable keepalives for dyn. rules"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0, - "Do not flush dynamic states on rule deletion"); + "Enable keepalives for dynamic states."); -SYSEND +#ifdef IPFIREWALL_DYNDEBUG +#define DYN_DEBUG(fmt, ...) do { \ + printf("%s: " fmt "\n", __func__, __VA_ARGS__); \ +} while (0) +#else +#define DYN_DEBUG(fmt, ...) +#endif /* !IPFIREWALL_DYNDEBUG */ -#endif /* SYSCTL_NODE */ - - #ifdef INET6 -static __inline int -hash_packet6(const struct ipfw_flow_id *id) -{ - u_int32_t i; - i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ - (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ - (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ - (id->src_ip6.__u6_addr.__u6_addr32[3]); - return ntohl(i); -} -#endif +/* Functions to work with IPv6 states */ +static struct dyn_ipv6_state *dyn_lookup_ipv6_state( + const struct ipfw_flow_id *, uint32_t, const void *, + struct ipfw_dyn_info *, int); +static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *, + uint32_t, const void *, int, const void *, uint32_t, uint16_t, uint32_t, + uint16_t); +static struct dyn_ipv6_state *dyn_alloc_ipv6_state( + const struct ipfw_flow_id *, uint32_t, uint16_t, uint8_t); +static int dyn_add_ipv6_state(void *, uint32_t, uint16_t, uint8_t, + const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t, + struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t); +static void dyn_export_ipv6_state(const struct dyn_ipv6_state *, + ipfw_dyn_rule *); -/* - * IMPORTANT: the hash function for dynamic rules must be commutative - * in source and destination (ip,port), because rules are bidirectional - * and we want to find both in the same bucket. - */ -static __inline int -hash_packet(const struct ipfw_flow_id *id, int buckets) -{ - u_int32_t i; +static uint32_t dyn_getscopeid(const struct ip_fw_args *); +static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *, + const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t, + uint16_t); +static void dyn_enqueue_keepalive_ipv6(struct mbufq *, + const struct dyn_ipv6_state *); +static void dyn_send_keepalive_ipv6(struct ip_fw_chain *); -#ifdef INET6 - if (IS_IP6_FLOW_ID(id)) - i = hash_packet6(id); - else +static struct dyn_ipv6_state *dyn_lookup_ipv6_parent( + const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t, + uint32_t); +static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked( + const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t, + uint32_t); +static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint16_t, + uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t, + uint16_t); #endif /* INET6 */ - i = (id->dst_ip) ^ (id->src_ip); - i ^= (id->dst_port) ^ (id->src_port); - return (i & (buckets - 1)); -} -#if 0 -#define DYN_DEBUG(fmt, ...) do { \ - printf("%s: " fmt "\n", __func__, __VA_ARGS__); \ -} while (0) -#else -#define DYN_DEBUG(fmt, ...) -#endif +/* Functions to work with limit states */ +static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t, + struct ip_fw *, uint32_t, uint32_t, uint16_t); +static struct dyn_ipv4_state *dyn_lookup_ipv4_parent( + const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t); +static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked( + const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t); +static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint16_t, + uint8_t, uint32_t); +static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint16_t, + uint8_t, const struct ipfw_flow_id *, uint32_t, uint32_t, uint16_t); +static void dyn_tick(void *); +static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *); +static void dyn_free_states(struct ip_fw_chain *); +static void dyn_export_parent(const struct dyn_parent *, uint16_t, + ipfw_dyn_rule *); +static void dyn_export_data(const struct dyn_data *, uint16_t, uint8_t, + ipfw_dyn_rule *); +static uint32_t dyn_update_tcp_state(struct dyn_data *, + const struct ipfw_flow_id *, const struct tcphdr *, int); +static void dyn_update_proto_state(struct dyn_data *, + const struct ipfw_flow_id *, const void *, int, int); + +/* Functions to work with IPv4 states */ +struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *, + const void *, struct ipfw_dyn_info *, int); +static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *, + const void *, int, const void *, uint32_t, uint16_t, uint32_t, uint16_t); +static struct dyn_ipv4_state *dyn_alloc_ipv4_state( + const struct ipfw_flow_id *, uint16_t, uint8_t); +static int dyn_add_ipv4_state(void *, uint32_t, uint16_t, uint8_t, + const struct ipfw_flow_id *, const void *, int, uint32_t, + struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t); +static void dyn_export_ipv4_state(const struct dyn_ipv4_state *, + ipfw_dyn_rule *); + +/* + * Named states support. + */ static char *default_state_name = "default"; struct dyn_state_obj { struct named_object no; @@ -438,7 +707,6 @@ dyn_destroy(struct ip_fw_chain *ch, struct named_objec KASSERT(no->refcnt == 1, ("Destroying object '%s' (type %u, idx %u) with refcnt %u", no->name, no->etlv, no->kidx, no->refcnt)); - DYN_DEBUG("kidx %d", no->kidx); obj = SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = NULL; @@ -474,7 +742,137 @@ static struct opcode_obj_rewrite dyn_opcodes[] = { dyn_create, dyn_destroy }, }; -/** + +/* + * IMPORTANT: the hash function for dynamic rules must be commutative + * in source and destination (ip,port), because rules are bidirectional + * and we want to find both in the same bucket. + */ +#ifndef IPFIREWALL_JENKINSHASH +static __inline uint32_t +hash_packet(const struct ipfw_flow_id *id) +{ + uint32_t i; + +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) + i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[3])); + else +#endif /* INET6 */ + i = (id->dst_ip) ^ (id->src_ip); + i ^= (id->dst_port) ^ (id->src_port); + return (i); +} + +static __inline uint32_t +hash_parent(const struct ipfw_flow_id *id, const void *rule) +{ + + return (hash_packet(id) ^ ((uintptr_t)rule)); +} + +#else /* IPFIREWALL_JENKINSHASH */ + +static VNET_DEFINE(uint32_t, dyn_hashseed); +#define V_dyn_hashseed VNET(dyn_hashseed) + +static __inline int +addrcmp4(const struct ipfw_flow_id *id) +{ + + if (id->src_ip < id->dst_ip) + return (0); + if (id->src_ip > id->dst_ip) + return (1); + if (id->src_port <= id->dst_port) + return (0); + return (1); +} + +#ifdef INET6 +static __inline int +addrcmp6(const struct ipfw_flow_id *id) +{ + int ret; + + ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr)); + if (ret < 0) + return (0); + if (ret > 0) + return (1); + if (id->src_port <= id->dst_port) + return (0); + return (1); +} + +static __inline uint32_t +hash_packet6(const struct ipfw_flow_id *id) +{ + struct tuple6 { + struct in6_addr addr[2]; + uint16_t port[2]; + } t6; + + if (addrcmp6(id) == 0) { + t6.addr[0] = id->src_ip6; + t6.addr[1] = id->dst_ip6; + t6.port[0] = id->src_port; + t6.port[1] = id->dst_port; + } else { + t6.addr[0] = id->dst_ip6; + t6.addr[1] = id->src_ip6; + t6.port[0] = id->dst_port; + t6.port[1] = id->src_port; + } + return (jenkins_hash32((const uint32_t *)&t6, + sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed)); +} +#endif + +static __inline uint32_t +hash_packet(const struct ipfw_flow_id *id) +{ + struct tuple4 { + in_addr_t addr[2]; + uint16_t port[2]; + } t4; + + if (IS_IP4_FLOW_ID(id)) { + /* All fields are in host byte order */ + if (addrcmp4(id) == 0) { + t4.addr[0] = id->src_ip; + t4.addr[1] = id->dst_ip; + t4.port[0] = id->src_port; + t4.port[1] = id->dst_port; + } else { + t4.addr[0] = id->dst_ip; + t4.addr[1] = id->src_ip; + t4.port[0] = id->dst_port; + t4.port[1] = id->src_port; + } + return (jenkins_hash32((const uint32_t *)&t4, + sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed)); + } else +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) + return (hash_packet6(id)); +#endif + return (0); +} + +static __inline uint32_t +hash_parent(const struct ipfw_flow_id *id, const void *rule) +{ + + return (jenkins_hash32((const uint32_t *)&rule, + sizeof(rule) / sizeof(uint32_t), hash_packet(id))); +} +#endif /* IPFIREWALL_JENKINSHASH */ + +/* * Print customizable flow id description via log(9) facility. */ static void @@ -502,903 +900,1809 @@ print_dyn_rule_flags(const struct ipfw_flow_id *id, in } log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n", prefix, dyn_type, src, id->src_port, dst, - id->dst_port, DYN_COUNT, postfix); + id->dst_port, V_dyn_count, postfix); } #define print_dyn_rule(id, dtype, prefix, postfix) \ print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix) -#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) -#define TIME_LE(a,b) ((int)((a)-(b)) < 0) +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define TIME_LE(a,b) ((int)((a)-(b)) < 0) *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201802071859.w17IxtxX060804>