Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 16 Jul 2012 23:22:49 +0200
From:      Luigi Rizzo <rizzo@iet.unipi.it>
To:        "Alexander V. Chernikov" <melifaro@FreeBSD.org>
Cc:        Doug Barton <dougb@freebsd.org>, net@freebsd.org
Subject:   Re: FreeBSD 10G forwarding performance @Intel
Message-ID:  <20120716212249.GA14607@onelab2.iet.unipi.it>
In-Reply-To: <500452A5.3070501@FreeBSD.org>
References:  <4FF36438.2030902@FreeBSD.org> <4FF3E2C4.7050701@FreeBSD.org> <4FF3FB14.8020006@FreeBSD.org> <4FF402D1.4000505@FreeBSD.org> <20120704091241.GA99164@onelab2.iet.unipi.it> <4FF412B9.3000406@FreeBSD.org> <20120704154856.GC3680@onelab2.iet.unipi.it> <4FF59955.5090406@FreeBSD.org> <20120706061126.GA65432@onelab2.iet.unipi.it> <500452A5.3070501@FreeBSD.org>

next in thread | previous in thread | raw e-mail | index | archive | help
On Mon, Jul 16, 2012 at 09:43:01PM +0400, Alexander V. Chernikov wrote:
> On 06.07.2012 10:11, Luigi Rizzo wrote:
> >On Thu, Jul 05, 2012 at 05:40:37PM +0400, Alexander V. Chernikov wrote:
> >>On 04.07.2012 19:48, Luigi Rizzo wrote:
> >the thing discussed a few years ago (at least the one i took out of the
> >discussion) was that the counter fields in rules should hold the
> >index of a per-cpu counter associated to the rule. So CTR_INC(rule->ctr)
> >becomes something like pcpu->ipfw_ctrs[rule->ctr]++
> >Once you create a new rule you also grab one free index from ipfw_ctrs[],
> >and the same should go for dummynet counters.
> 
> Old kernel from previous letters, same setup:
> 
> net.inet.ip.fw.enable=0
> 2.3 MPPS
> net.inet.ip.fw.update_counters=0
> net.inet.ip.fw.enable=1
> 1.93MPPS
> net.inet.ip.fw.update_counters=1
> 1.74MPPS
> 
> Kernel with ipfw pcpu counters:
> 
> net.inet.ip.fw.enable=0
> 2.3 MPPS
> net.inet.ip.fw.update_counters=0
> net.inet.ip.fw.enable=1
> 1.93MPPS
> net.inet.ip.fw.update_counters=1
> 1.93MPPS
> 
> Counters seems to be working without any (significant) overhead.
> (Maybe I'm wrong somewhere?)

well, it seems  that the counters are costing some 10% which is
not negligible (60ns per packet according to your test).
Also i'd be curious if you get better savings if you
have actual conflicts on the rulesets (e.g. what happens
with a ruleset that has, say, ten "count ip from any to any" rules) ?

> Additionally, I've got (from my previous pcpu attempt) a small patch 
> permitting ipfw to re-use rule map allocation instead of reallocating 
> on every rule. This saves a bit of system time:
> 
> loading 20k rules with ipfw binary gives us:
> 5.1s system time before and 4.1s system time after.

not bad but in this case i wonder if one wouldn't
get much higher savings by support multiple rule loading with a
single syscall. The format used in IPFW3 should help that.

cheers
luigi

> 
> -- 
> WBR, Alexander

> >From 931ac84b88829fe15bb4410bd2b06614f41c99b5 Mon Sep 17 00:00:00 2001
> From: "Alexander V. Chernikov" <melifaro@ipfw.ru>
> Date: Mon, 9 Jul 2012 23:10:02 +0400
> Subject: [PATCH 1/3] Do not allocate data buffers per each rule
>  addition/deletion
> 
> ---
>  sys/netinet/ipfw/ip_fw2.c        |    2 +
>  sys/netinet/ipfw/ip_fw_private.h |    4 ++
>  sys/netinet/ipfw/ip_fw_sockopt.c |  134 ++++++++++++++++++++++++++++++++++----
>  3 files changed, 126 insertions(+), 14 deletions(-)
> 
> diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
> index 98766f3..bf06c1e 100644
> --- a/sys/netinet/ipfw/ip_fw2.c
> +++ b/sys/netinet/ipfw/ip_fw2.c
> @@ -2701,6 +2701,8 @@ vnet_ipfw_uninit(const void *unused)
>  	}
>  	if (chain->map)
>  		free(chain->map, M_IPFW);
> +	if (chain->b_map)
> +		free(chain->b_map, M_IPFW);
>  	IPFW_WUNLOCK(chain);
>  	IPFW_UH_WUNLOCK(chain);
>  	if (reap != NULL)
> diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
> index 2806741..272cef3 100644
> --- a/sys/netinet/ipfw/ip_fw_private.h
> +++ b/sys/netinet/ipfw/ip_fw_private.h
> @@ -232,6 +232,10 @@ struct ip_fw_chain {
>  #endif
>  	uint32_t	id;		/* ruleset id */
>  	uint32_t	gencnt;		/* generation count */
> +	int		n_allocated;	/* Size of primary map in items */
> +	struct ip_fw	**b_map;	/* Backup map to use in swapping */
> +	int		b_allocated;	/* Size of backup map in items */
> +
>  };
>  
>  struct sockopt;	/* used by tcp_var.h */
> diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
> index a245143..4e71a11 100644
> --- a/sys/netinet/ipfw/ip_fw_sockopt.c
> +++ b/sys/netinet/ipfw/ip_fw_sockopt.c
> @@ -97,29 +97,92 @@ ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
>  	return hi;
>  }
>  
> +
> +
> +/*
> + * In order to minimize block allocations on every rule addition we
> + * use the following technique:
> + * 1) round up allocation to MAP_GRANULARITY
> + * 2) Save previously allocated map to the special bakup pointer.
> + *
> + * This permits us do backup <> primary (and vise versa) map swapping
> + * without reallocations in most cases.
> + *
> + */
> +
> +struct map_ptrs {
> +	int		backup;		/* 1 if backup map used */
> +	int		items;		/* number of items allocated */
> +	struct ip_fw	**map;
> +	struct ip_fw	**free_map;	/* map pointer to free */
> +};
> +int
> +get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs *ptrs);
> +
> +#define	MAP_GRANULARITY	128	/* 1k overhead per buffer */
> +#define	MAP_ROUNDUP(i)	(i) + MAP_GRANULARITY - ((i) % MAP_GRANULARITY)
>  /*
>   * allocate a new map, returns the chain locked. extra is the number
>   * of entries to add or delete.
>   */
> -static struct ip_fw **
> -get_map(struct ip_fw_chain *chain, int extra, int locked)
> +int
> +get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs *ptrs)
>  {
>  
>  	for (;;) {
>  		struct ip_fw **map;
>  		int i;
>  
> +		/* Be optimistic by default */
> +		if (!locked)
> +			IPFW_UH_WLOCK(chain);
> +
>  		i = chain->n_rules + extra;
> +
> +		/*
> +		 * We have to fit to the following conditions in order
> +		 * to use backup map:
> +		 *
> +		 * 1) i <= chain->b_allocated (add case)
> +		 * 2) roundup(i) == chain->b_allocated (del case) (saves memory)
> +		 *
> +		 * Since roundup adds 0..(MAP_GRANULARITY-1)
> +		 * rule 2 requires rule 1 to be true.
> +		 */
> +		CTR4(KTR_NET, "%s: req: %d roundup: %d, b_allocated: %d",
> +			__func__, i, MAP_ROUNDUP(i), chain->b_allocated);
> +
> +		if (MAP_ROUNDUP(i) == chain->b_allocated) {
> +			/* no new allocation. Just return backup map */
> +			ptrs->backup = 1;
> +			ptrs->items = chain->b_allocated;
> +			ptrs->map = chain->b_map;
> +			CTR3(KTR_NET, "%s: BACKUP SELECTED: %p items: %d", __func__, ptrs->map, ptrs->items);
> +
> +			return 1;
> +		}
> +
> +		/* Pessimistic scenario - we need reallocation */
> +		i = MAP_ROUNDUP(i);
> +
> +		if (!locked)
> +			IPFW_UH_WUNLOCK(chain);
> +
>  		map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
>  			locked ? M_NOWAIT : M_WAITOK);
>  		if (map == NULL) {
>  			printf("%s: cannot allocate map\n", __FUNCTION__);
> -			return NULL;
> +			return 0;
>  		}
>  		if (!locked)
>  			IPFW_UH_WLOCK(chain);
> -		if (i >= chain->n_rules + extra) /* good */
> -			return map;
> +		if (i >= chain->n_rules + extra) { /* good */
> +			ptrs->backup = 0;
> +			ptrs->items = i;
> +			ptrs->map = map;
> +			CTR3(KTR_NET, "%s: NEW MAP: %p items: %d", __func__, ptrs->map, ptrs->items);
> +			return 1;
> +		}
>  		/* otherwise we lost the race, free and retry */
>  		if (!locked)
>  			IPFW_UH_WUNLOCK(chain);
> @@ -128,10 +191,11 @@ get_map(struct ip_fw_chain *chain, int extra, int locked)
>  }
>  
>  /*
> - * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
> + * swap the maps. It is supposed to be called with IPFW_UH_WLOCK.
> + * Returns old map pointer to free.
>   */
>  static struct ip_fw **
> -swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
> +swap_map(struct ip_fw_chain *chain, struct map_ptrs *ptrs, int new_len)
>  {
>  	struct ip_fw **old_map;
>  
> @@ -139,8 +203,43 @@ swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
>  	chain->id++;
>  	chain->n_rules = new_len;
>  	old_map = chain->map;
> -	chain->map = new_map;
> +	chain->map = ptrs->map;
>  	IPFW_WUNLOCK(chain);
> +
> +	CTR4(KTR_NET, "%s: old: %p new: %p, backup: %d",
> +		__func__, old_map, chain->map, ptrs->backup);
> +
> +	/* Finalize map change */
> +	if (ptrs->backup) {
> +		/*
> +		 * Old map has (at least) the same size as current map.
> +		 * Save it as backup map.
> +		 */
> +		chain->b_allocated = chain->n_allocated;
> +		chain->n_allocated = ptrs->items;
> +		chain->b_map = old_map;
> +		ptrs->free_map = NULL;
> +		return old_map;
> +	}
> +
> +	/*
> +	 * Backup map is not used. Free old map and use
> +	 * current map as candidate.
> +	 */
> +
> +	/* Set free pointer to old backup map */
> +	ptrs->free_map = chain->b_map;
> +	/* Set backup map to be old current map */
> +	chain->b_map = old_map;
> +
> +	/* Save backup map size */
> +	chain->b_allocated = chain->n_allocated;
> +	chain->n_allocated = ptrs->items;
> +
> +	CTR6(KTR_NET, "%s: new: %p alloc: %d backup: %p alloc: %d free: %p",
> +		__func__, chain->map, chain->n_allocated,
> +		chain->b_map, chain->b_allocated, ptrs->free_map);
> +
>  	return old_map;
>  }
>  
> @@ -156,6 +255,7 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
>  {
>  	struct ip_fw *rule;
>  	int i, l, insert_before;
> +	struct map_ptrs ptrs;
>  	struct ip_fw **map;	/* the new array of pointers */
>  
>  	if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
> @@ -164,12 +264,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
>  	l = RULESIZE(input_rule);
>  	rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
>  	/* get_map returns with IPFW_UH_WLOCK if successful */
> -	map = get_map(chain, 1, 0 /* not locked */);
> -	if (map == NULL) {
> +	if (get_map(chain, 1, 0 /* not locked */, &ptrs) == 0) {
>  		free(rule, M_IPFW);
>  		return ENOSPC;
>  	}
>  
> +	map = ptrs.map;
> +
>  	bcopy(input_rule, rule, l);
>  	/* clear fields not settable from userland */
>  	rule->x_next = NULL;
> @@ -201,7 +302,8 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
>  	}
>  
>  	rule->id = chain->id + 1;
> -	map = swap_map(chain, map, chain->n_rules + 1);
> +	swap_map(chain, &ptrs, chain->n_rules + 1);
> +	map = ptrs.free_map;
>  	chain->static_len += l;
>  	IPFW_UH_WUNLOCK(chain);
>  	if (map)
> @@ -283,6 +385,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
>  	uint32_t num;	/* rule number or old_set */
>  	uint8_t cmd, new_set;
>  	int start, end, i, ofs, n;
> +	struct map_ptrs ptrs;
>  	struct ip_fw **map = NULL;
>  	int error = 0;
>  
> @@ -353,12 +456,13 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
>  		}
>  
>  		/* We have something to delete. Allocate the new map */
> -		map = get_map(chain, -n, 1 /* locked */);
> -		if (map == NULL) {
> +		if (get_map(chain, -n, 1 /* locked */, &ptrs) == 0) {
>  			error = EINVAL;
>  			break;
>  		}
>  
> +		map = ptrs.map;
> +
>  		/* 1. bcopy the initial part of the map */
>  		if (start > 0)
>  			bcopy(chain->map, map, start * sizeof(struct ip_fw *));
> @@ -372,7 +476,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
>  		bcopy(chain->map + end, map + ofs,
>  			(chain->n_rules - end) * sizeof(struct ip_fw *));
>  		/* 4. swap the maps (under BH_LOCK) */
> -		map = swap_map(chain, map, chain->n_rules - n);
> +		map = swap_map(chain, &ptrs, chain->n_rules - n);
>  		/* 5. now remove the rules deleted from the old map */
>  		for (i = start; i < end; i++) {
>  			int l;
> @@ -385,6 +489,8 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
>  			rule->x_next = chain->reap;
>  			chain->reap = rule;
>  		}
> +		/* Free map if we need to */
> +		map = ptrs.free_map;
>  		break;
>  
>  	/*
> -- 
> 1.7.9.4
> 

> >From baad4a88bba898baa3aa5f0dffc8a40182bcfcd6 Mon Sep 17 00:00:00 2001
> From: Charlie Root <root@test15.yandex.net>
> Date: Thu, 12 Jul 2012 17:19:10 +0000
> Subject: [PATCH 1/2] Make accounting in ipfw optional
> 
> ---
>  sys/netinet/ipfw/ip_fw2.c |   32 +++++++++++++++++++++-----------
>  1 files changed, 21 insertions(+), 11 deletions(-)
> 
> diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
> index e5a13e4..7bb73b0 100644
> --- a/sys/netinet/ipfw/ip_fw2.c
> +++ b/sys/netinet/ipfw/ip_fw2.c
> @@ -112,6 +112,8 @@ static int default_to_accept = 1;
>  static int default_to_accept;
>  #endif
>  
> +static int update_counters = 0;
> +
>  VNET_DEFINE(int, autoinc_step);
>  
>  VNET_DEFINE(unsigned int, fw_tables_max);
> @@ -190,6 +192,8 @@ SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
>      "Permit single packet IPv6 fragments");
>  #endif /* INET6 */
>  
> +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, update_counters, CTLFLAG_RW,
> +    &update_counters, 0, "Update counters on match");
>  SYSEND
>  
>  #endif /* SYSCTL_NODE */
> @@ -2011,16 +2015,20 @@ do {								\
>  				break;
>  
>  			case O_COUNT:
> -				f->pcnt++;	/* update stats */
> -				f->bcnt += pktlen;
> -				f->timestamp = time_uptime;
> +				if (update_counters) {
> +					f->pcnt++;	/* update stats */
> +					f->bcnt += pktlen;
> +					f->timestamp = time_uptime;
> +				}
>  				l = 0;		/* exit inner loop */
>  				break;
>  
>  			case O_SKIPTO:
> -			    f->pcnt++;	/* update stats */
> -			    f->bcnt += pktlen;
> -			    f->timestamp = time_uptime;
> +			    if (update_counters) {
> +				    f->pcnt++;	/* update stats */
> +				    f->bcnt += pktlen;
> +				    f->timestamp = time_uptime;
> +			    }
>  			    /* If possible use cached f_pos (in f->next_rule),
>  			     * whose version is written in f->next_rule
>  			     * (horrible hacks to avoid changing the ABI).
> @@ -2375,11 +2383,13 @@ do {								\
>  	}		/* end of outer for, scan rules */
>  
>  	if (done) {
> -		struct ip_fw *rule = chain->map[f_pos];
> -		/* Update statistics */
> -		rule->pcnt++;
> -		rule->bcnt += pktlen;
> -		rule->timestamp = time_uptime;
> +		if (update_counters) {
> +			struct ip_fw *rule = chain->map[f_pos];
> +			/* Update statistics */
> +			rule->pcnt++;
> +			rule->bcnt += pktlen;
> +			rule->timestamp = time_uptime;
> +		}
>  	} else {
>  		retval = IP_FW_DENY;
>  		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
> -- 
> 1.7.6.1
> 

> diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
> index 589c19c..b1f83bb 100644
> --- a/sys/netinet/ip_fw.h
> +++ b/sys/netinet/ip_fw.h
> @@ -493,9 +493,14 @@ struct ip_fw {
>  	uint32_t	id;		/* rule id */
>  
>  	/* These fields are present in all rules.			*/
> +#if 0
>  	uint64_t	pcnt;		/* Packet counter		*/
>  	uint64_t	bcnt;		/* Byte counter			*/
>  	uint32_t	timestamp;	/* tv_sec of last match		*/
> +#endif
> +	uint64_t	_pad0;
> +	uint64_t	_pad1;
> +	uint32_t	cntr_idx;	/* Index in pcpu counters */
>  
>  	ipfw_insn	cmd[1];		/* storage for commands		*/
>  };
> diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
> index 7bb73b0..db93a5c 100644
> --- a/sys/netinet/ipfw/ip_fw2.c
> +++ b/sys/netinet/ipfw/ip_fw2.c
> @@ -2016,18 +2016,20 @@ do {								\
>  
>  			case O_COUNT:
>  				if (update_counters) {
> -					f->pcnt++;	/* update stats */
> -					f->bcnt += pktlen;
> -					f->timestamp = time_uptime;
> +					struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
> +					c->pcnt++;	/* update stats */
> +					c->bcnt += pktlen;
> +					c->timestamp = time_uptime;
>  				}
>  				l = 0;		/* exit inner loop */
>  				break;
>  
>  			case O_SKIPTO:
>  			    if (update_counters) {
> -				    f->pcnt++;	/* update stats */
> -				    f->bcnt += pktlen;
> -				    f->timestamp = time_uptime;
> +				    struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
> +				    c->pcnt++;	/* update stats */
> +				    c->bcnt += pktlen;
> +				    c->timestamp = time_uptime;
>  			    }
>  			    /* If possible use cached f_pos (in f->next_rule),
>  			     * whose version is written in f->next_rule
> @@ -2125,9 +2127,10 @@ do {								\
>  					break;
>  				}
>  
> -				f->pcnt++;	/* update stats */
> -				f->bcnt += pktlen;
> -				f->timestamp = time_uptime;
> +				struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
> +				c->pcnt++;	/* update stats */
> +				c->bcnt += pktlen;
> +				c->timestamp = time_uptime;
>  				stack = (uint16_t *)(mtag + 1);
>  
>  				/*
> @@ -2263,9 +2266,10 @@ do {								\
>  			case O_SETFIB: {
>  				uint32_t fib;
>  
> -				f->pcnt++;	/* update stats */
> -				f->bcnt += pktlen;
> -				f->timestamp = time_uptime;
> +				struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
> +				c->pcnt++;	/* update stats */
> +				c->bcnt += pktlen;
> +				c->timestamp = time_uptime;
>  				fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg:
>  				    cmd->arg1;
>  				if (fib >= rt_numfibs)
> @@ -2315,8 +2319,9 @@ do {								\
>  			case O_REASS: {
>  				int ip_off;
>  
> -				f->pcnt++;
> -				f->bcnt += pktlen;
> +				struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
> +				c->pcnt++;
> +				c->bcnt += pktlen;
>  				l = 0;	/* in any case exit inner loop */
>  				ip_off = ntohs(ip->ip_off);
>  
> @@ -2386,9 +2391,10 @@ do {								\
>  		if (update_counters) {
>  			struct ip_fw *rule = chain->map[f_pos];
>  			/* Update statistics */
> -			rule->pcnt++;
> -			rule->bcnt += pktlen;
> -			rule->timestamp = time_uptime;
> +			struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, rule->cntr_idx);
> +			c->pcnt++;
> +			c->bcnt += pktlen;
> +			c->timestamp = time_uptime;
>  		}
>  	} else {
>  		retval = IP_FW_DENY;
> @@ -2439,6 +2445,7 @@ ipfw_init(void)
>  {
>  	int error = 0;
>  
> +	_dpcpu_init();
>  	ipfw_dyn_attach();
>  	/*
>   	 * Only print out this stuff the first time around,
> @@ -2500,6 +2507,7 @@ ipfw_destroy(void)
>  
>  	ipfw_log_bpf(0); /* uninit */
>  	ipfw_dyn_detach();
> +	_dpcpu_uninit();
>  	printf("IP firewall unloaded\n");
>  }
>  
> @@ -2546,6 +2554,9 @@ vnet_ipfw_init(const void *unused)
>  		return (ENOSPC);
>  	}
>  
> +	/* Init per-cpu counters */
> +	ipfw_init_counters(chain);
> +
>  	/* fill and insert the default rule */
>  	rule->act_ofs = 0;
>  	rule->rulenum = IPFW_DEFAULT_RULE;
> @@ -2559,6 +2570,9 @@ vnet_ipfw_init(const void *unused)
>  	IPFW_LOCK_INIT(chain);
>  	ipfw_dyn_init();
>  
> +	/* Allocate first pcpu counter */
> +	rule->cntr_idx = ipfw_alloc_counter(chain);
> +
>  	/* First set up some values that are compile time options */
>  	V_ipfw_vnet_ready = 1;		/* Open for business */
>  
> @@ -2619,10 +2633,11 @@ vnet_ipfw_uninit(const void *unused)
>  	}
>  	if (chain->map)
>  		free(chain->map, M_IPFW);
> +	_dpcpu_free(chain->cntrs);
>  	IPFW_WUNLOCK(chain);
>  	IPFW_UH_WUNLOCK(chain);
>  	if (reap != NULL)
> -		ipfw_reap_rules(reap);
> +		ipfw_reap_rules(chain, reap);
>  	IPFW_LOCK_DESTROY(chain);
>  	ipfw_dyn_uninit(1);	/* free the remaining parts */
>  	return 0;
> diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
> index 46d011c..bd4fad3 100644
> --- a/sys/netinet/ipfw/ip_fw_private.h
> +++ b/sys/netinet/ipfw/ip_fw_private.h
> @@ -222,6 +222,7 @@ struct ip_fw_chain {
>  	struct radix_node_head **tables;	/* IPv4 tables */
>  	struct radix_node_head **xtables;	/* extended tables */
>  	uint8_t		*tabletype;	/* Array of table types */
> +	uintptr_t	*cntrs;		/* Per-cpu counters */
>  #if defined( __linux__ ) || defined( _WIN32 )
>  	spinlock_t rwmtx;
>  	spinlock_t uh_lock;
> @@ -231,8 +232,25 @@ struct ip_fw_chain {
>  #endif
>  	uint32_t	id;		/* ruleset id */
>  	uint32_t	gencnt;		/* generation count */
> +	int		cntr_allocated;	/* Size of allocated counters map */
> +	uint32_t	*cntr_idx;	/* bit index of counters map */
>  };
>  
> +struct ip_fw_cntr {
> +	uint64_t	pcnt;		/* Packet counter		*/
> +	uint64_t	bcnt;		/* Byte counter			*/
> +	uint32_t	timestamp;	/* tv_sec of last match		*/
> +};
> +#define	IPFW_PCPU_CNTR(chain, idx)	((struct ip_fw_cntr *)(chain)->cntrs[curcpu] + idx)
> +#define	IPFW_PCPU_DEFAULT_COUNTERS	128
> +void _dpcpu_init(void);
> +void _dpcpu_uninit(void);
> +uintptr_t *_dpcpu_alloc(size_t size, int flags);
> +void _dpcpu_free(uintptr_t *ptr);
> +void ipfw_init_counters(struct ip_fw_chain *chain);
> +int ipfw_alloc_counter(struct ip_fw_chain *chain);
> +void ipfw_free_counter(struct ip_fw_chain *chain, int idx);
> +
>  struct sockopt;	/* used by tcp_var.h */
>  
>  /*
> @@ -267,7 +285,10 @@ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
>  int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
>  int ipfw_ctl(struct sockopt *sopt);
>  int ipfw_chk(struct ip_fw_args *args);
> -void ipfw_reap_rules(struct ip_fw *head);
> +void ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head);
> +
> +void ipfw_zero_counter(struct ip_fw_chain *chain, int idx);
> +void ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr *cntr);
>  
>  /* In ip_fw_pfil */
>  int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
> diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
> index 0aecc2c..db308c2 100644
> --- a/sys/netinet/ipfw/ip_fw_sockopt.c
> +++ b/sys/netinet/ipfw/ip_fw_sockopt.c
> @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/malloc.h>
>  #include <sys/mbuf.h>	/* struct m_tag used by nested headers */
>  #include <sys/kernel.h>
> +#include <sys/smp.h>
>  #include <sys/lock.h>
>  #include <sys/priv.h>
>  #include <sys/proc.h>
> @@ -72,6 +73,212 @@ MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
>   * static variables followed by global ones (none in this file)
>   */
>  
> +MALLOC_DEFINE(M_XDPCPU, "XPCPU", "PCPU data");
> +
> +uma_zone_t _dpcpu_zone;
> +
> +static void ipfw_extend_counters(struct ip_fw_chain *chain);
> +
> +void
> +_dpcpu_init()
> +{
> +	_dpcpu_zone = uma_zcreate("pcpu ptrs", sizeof(uintptr_t) * MAXCPU,
> +		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
> +}
> +
> +void
> +_dpcpu_uninit()
> +{
> +	uma_zdestroy(_dpcpu_zone);
> +}
> +
> +#define	_DPCPU_ALIGN	512
> +uintptr_t *
> +_dpcpu_alloc(size_t size, int flags)
> +{
> +	uintptr_t *ptr;
> +	int id, fail = 0;
> +
> +	if (size % _DPCPU_ALIGN)
> +		size += _DPCPU_ALIGN - (size % _DPCPU_ALIGN);
> +
> +	if ((ptr = uma_zalloc(_dpcpu_zone, flags | M_ZERO)) == NULL)
> +		return NULL;
> +
> +	CPU_FOREACH(id) {
> +		/*
> +		 * We believe in aligned allocation here
> +		 */
> +		if ((ptr[id] = (uintptr_t)malloc(size, M_XDPCPU, flags)) == 0) {
> +			fail = 1;
> +			break;
> +		}
> +
> +		KASSERT((uintptr_t)ptr % _DPCPU_ALIGN == 0, ("malloc(9) returns unaligned data :("));
> +	}
> +
> +	if (fail == 0)
> +		return ptr;
> +
> +	CPU_FOREACH(id) {
> +		if (ptr[id] == 0)
> +			break;
> +
> +		free((void *)ptr[id], M_XDPCPU);
> +	}
> +
> +	uma_zfree(_dpcpu_zone, ptr);
> +
> +	return NULL;
> +}
> +
> +void
> +_dpcpu_free(uintptr_t *ptr)
> +{
> +	int id;
> +
> +	CPU_FOREACH(id) {
> +		if (ptr[id] == 0)
> +			continue;
> +		free((void *)ptr[id], M_XDPCPU);
> +	}
> +
> +	uma_zfree(_dpcpu_zone, ptr);
> +}
> +
> +void
> +ipfw_init_counters(struct ip_fw_chain *chain)
> +{
> +	chain->cntrs = _dpcpu_alloc(IPFW_PCPU_DEFAULT_COUNTERS *
> +		sizeof(struct ip_fw_cntr), M_WAITOK | M_ZERO);
> +	chain->cntr_idx = malloc(IPFW_PCPU_DEFAULT_COUNTERS / 8, M_IPFW, M_WAITOK);
> +	/* Free positions are marked by 1 */
> +	memset(chain->cntr_idx, 0xFF, IPFW_PCPU_DEFAULT_COUNTERS / 8);
> +	chain->cntr_allocated = IPFW_PCPU_DEFAULT_COUNTERS;
> +}
> +
> +
> +static void
> +ipfw_extend_counters(struct ip_fw_chain *chain)
> +{
> +	uintptr_t *new_ptr, *old_ptr;
> +	uint32_t *old_idx, *new_idx;
> +	int i, id;
> +
> +	for (;;) {
> +
> +		i = 2 * chain->cntr_allocated;
> +
> +		new_ptr = _dpcpu_alloc(i * sizeof(struct ip_fw_cntr), M_WAITOK | M_ZERO);
> +		new_idx = malloc(i / 8, M_IPFW, M_WAITOK);
> +		/* Free positions are marked by 1 */
> +		memset(new_idx, 0xFF, i / 8);
> +
> +		IPFW_UH_WLOCK(chain);
> +		if (i == 2 * chain->cntr_allocated) {
> +			CTR4(KTR_NET, "%s: NEW INDEX: %p cntr_idx: %d count: %d", __func__, new_ptr, new_idx, i);
> +			break;
> +		}
> +		/* otherwise we lost the race, free and retry */
> +		IPFW_UH_WUNLOCK(chain);
> +
> +		free(new_idx, M_IPFW);
> +		_dpcpu_free(new_ptr);
> +	}
> +
> +	/* Pointers allocated, let's migrate data */
> +
> +	IPFW_WLOCK(chain);
> +	/* Copy pcpu counters */
> +	CPU_FOREACH(id) {
> +		memcpy((void *)new_ptr[id], (void *)chain->cntrs[id], chain->cntr_allocated * sizeof(struct ip_fw_cntr));
> +	}
> +	/* Copy index */
> +	memcpy(new_idx, chain->cntr_idx, chain->cntr_allocated / 8);
> +	/* Swap old and new pointers */
> +	old_ptr = chain->cntrs;
> +	chain->cntrs = new_ptr;
> +	old_idx = chain->cntr_idx;
> +	chain->cntr_idx = new_idx;
> +	chain->cntr_allocated = i;
> +
> +	IPFW_WUNLOCK(chain);
> +	IPFW_UH_WUNLOCK(chain);
> +
> +	/* Free old counters and index */
> +	free(old_idx, M_IPFW);
> +	_dpcpu_free(old_ptr);
> +}
> +
> +int
> +ipfw_alloc_counter(struct ip_fw_chain *chain)
> +{
> +	int i, x, idx;
> +	uint32_t *v;
> +
> +	for (;;) {
> +		IPFW_UH_WLOCK(chain);
> +		for (i = 0, v = chain->cntr_idx; i < chain->cntr_allocated / 8 / 4; i++, v++) {
> +			if ((x = ffsl(*v)) != 0) {
> +				/* Found. Mark as used */
> +				*v &= ~ (1 << (x - 1));
> +				IPFW_UH_WUNLOCK(chain);
> +				idx = i * 32 + x - 1;
> +				ipfw_zero_counter(chain, idx);
> +				CTR3(KTR_NET, "%s: allocated counter %d, map after: %X", __func__, idx, *v);
> +				return idx;
> +			}
> +		}
> +		IPFW_UH_WUNLOCK(chain);
> +
> +		/* Not found. Let's allocate data and retry */
> +		ipfw_extend_counters(chain);
> +	}
> +}
> +
> +void
> +ipfw_free_counter(struct ip_fw_chain *chain, int idx)
> +{
> +	uint32_t *v;
> +
> +	IPFW_UH_WLOCK(chain);
> +	v = chain->cntr_idx;
> +	v += idx / 32;
> +	CTR3(KTR_NET, "%s: returned counter %d, map before: %X", __func__, idx, *v);
> +	idx %= 32;
> +	*v |= 1 << idx;
> +
> +	IPFW_UH_WUNLOCK(chain);
> +}
> +
> +void
> +ipfw_zero_counter(struct ip_fw_chain *chain, int idx)
> +{
> +	int id;
> +	struct ip_fw_cntr *pcpu;
> +
> +	CTR2(KTR_NET, "%s: zeroing counter %d", __func__, idx);
> +	CPU_FOREACH(id) {
> +		pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
> +		memset(pcpu, 0, sizeof(struct ip_fw_cntr));
> +	}
> +}
> +
> +void
> +ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr *cntr)
> +{
> +	int id;
> +	struct ip_fw_cntr *pcpu;
> +
> +	CPU_FOREACH(id) {
> +		pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
> +		cntr->pcnt += pcpu->pcnt;
> +		cntr->bcnt += pcpu->bcnt;
> +		if (cntr->timestamp < pcpu->timestamp)
> +			cntr->timestamp = pcpu->timestamp;
> +	}
> +}
> +
>  /*
>   * Find the smallest rule >= key, id.
>   * We could use bsearch but it is so simple that we code it directly
> @@ -155,7 +362,7 @@ int
>  ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
>  {
>  	struct ip_fw *rule;
> -	int i, l, insert_before;
> +	int i, l, cntr_idx, insert_before;
>  	struct ip_fw **map;	/* the new array of pointers */
>  
>  	if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
> @@ -163,12 +370,14 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
>  
>  	l = RULESIZE(input_rule);
>  	rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
> +	cntr_idx = ipfw_alloc_counter(chain);
>  	if (rule == NULL)
>  		return (ENOSPC);
>  	/* get_map returns with IPFW_UH_WLOCK if successful */
>  	map = get_map(chain, 1, 0 /* not locked */);
>  	if (map == NULL) {
>  		free(rule, M_IPFW);
> +		ipfw_free_counter(chain, cntr_idx);
>  		return ENOSPC;
>  	}
>  
> @@ -176,10 +385,12 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
>  	/* clear fields not settable from userland */
>  	rule->x_next = NULL;
>  	rule->next_rule = NULL;
> +	rule->cntr_idx = cntr_idx;
> +#if 0
>  	rule->pcnt = 0;
>  	rule->bcnt = 0;
>  	rule->timestamp = 0;
> -
> +#endif
>  	if (V_autoinc_step < 1)
>  		V_autoinc_step = 1;
>  	else if (V_autoinc_step > 1000)
> @@ -217,12 +428,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
>   * A NULL pointer on input is handled correctly.
>   */
>  void
> -ipfw_reap_rules(struct ip_fw *head)
> +ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head)
>  {
>  	struct ip_fw *rule;
>  
>  	while ((rule = head) != NULL) {
>  		head = head->x_next;
> +		ipfw_free_counter(chain, rule->cntr_idx);
>  		free(rule, M_IPFW);
>  	}
>  }
> @@ -424,7 +636,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
>  	rule = chain->reap;
>  	chain->reap = NULL;
>  	IPFW_UH_WUNLOCK(chain);
> -	ipfw_reap_rules(rule);
> +	ipfw_reap_rules(chain, rule);
>  	if (map)
>  		free(map, M_IPFW);
>  	return error;
> @@ -436,13 +648,17 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
>   * so we only care that rules do not disappear.
>   */
>  static void
> -clear_counters(struct ip_fw *rule, int log_only)
> +clear_counters(struct ip_fw_chain *chain, struct ip_fw *rule, int log_only)
>  {
>  	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
>  
>  	if (log_only == 0) {
> +#if 0
>  		rule->bcnt = rule->pcnt = 0;
>  		rule->timestamp = 0;
> +#endif
> +	ipfw_zero_counter(chain, rule->cntr_idx);
> +
>  	}
>  	if (l->o.opcode == O_LOG)
>  		l->log_left = l->max_log;
> @@ -481,7 +697,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
>  			/* Skip rules not in our set. */
>  			if (cmd == 1 && rule->set != set)
>  				continue;
> -			clear_counters(rule, log_only);
> +			clear_counters(chain, rule, log_only);
>  		}
>  		msg = log_only ? "All logging counts reset" :
>  		    "Accounting cleared";
> @@ -491,7 +707,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
>  			rule = chain->map[i];
>  			if (rule->rulenum == rulenum) {
>  				if (cmd == 0 || rule->set == set)
> -					clear_counters(rule, log_only);
> +					clear_counters(chain, rule, log_only);
>  				cleared = 1;
>  			}
>  			if (rule->rulenum > rulenum)
> @@ -854,7 +1070,7 @@ struct ip_fw7 {
>  	ipfw_insn	cmd[1];		/* storage for commands     */
>  };
>  
> -	int convert_rule_to_7(struct ip_fw *rule);
> +int convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule);
>  int convert_rule_to_8(struct ip_fw *rule);
>  
>  #ifndef RULESIZE7
> @@ -887,7 +1103,7 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
>  		    if (bp + l + sizeof(uint32_t) <= ep) {
>  			int error;
>  			bcopy(rule, bp, l + sizeof(uint32_t));
> -			error = convert_rule_to_7((struct ip_fw *) bp);
> +			error = convert_rule_to_7(chain, (struct ip_fw *) bp);
>  			if (error)
>  				return 0; /*XXX correct? */
>  			/*
> @@ -913,14 +1129,19 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
>  		}
>  		dst = (struct ip_fw *)bp;
>  		bcopy(rule, dst, l);
> +		/* copy statistics */
> +		struct ip_fw_cntr *c = (struct ip_fw_cntr *)&dst->_pad0;
> +		ipfw_export_counter(chain, rule->cntr_idx, c);
> +		if (c->timestamp)
> +			c->timestamp += boot_seconds;
> +
>  		/*
>  		 * XXX HACK. Store the disable mask in the "next"
>  		 * pointer in a wild attempt to keep the ABI the same.
>  		 * Why do we do this on EVERY rule?
>  		 */
>  		bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
> -		if (dst->timestamp)
> -			dst->timestamp += boot_seconds;
> +
>  		bp += l;
>  	}
>  	ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
> @@ -1051,7 +1272,7 @@ ipfw_ctl(struct sockopt *sopt)
>  			size = RULESIZE(rule);
>  			if (!error && sopt->sopt_dir == SOPT_GET) {
>  				if (is7) {
> -					error = convert_rule_to_7(rule);
> +					error = convert_rule_to_7(chain, rule);
>  					size = RULESIZE7(rule);
>  					if (error)
>  						return error;
> @@ -1329,7 +1550,7 @@ ipfw_ctl(struct sockopt *sopt)
>  
>  /* Functions to convert rules 7.2 <==> 8.0 */
>  int
> -convert_rule_to_7(struct ip_fw *rule)
> +convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule)
>  {
>  	/* Used to modify original rule */
>  	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
> @@ -1355,9 +1576,12 @@ convert_rule_to_7(struct ip_fw *rule)
>  	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
>  	rule7->next = (struct ip_fw7 *)tmp->x_next;
>  	rule7->cmd_len = tmp->cmd_len;
> +	ipfw_export_counter(chain, rule->cntr_idx, (struct ip_fw_cntr *)&rule7->pcnt);
> +#if 0
>  	rule7->pcnt = tmp->pcnt;
>  	rule7->bcnt = tmp->bcnt;
>  	rule7->timestamp = tmp->timestamp;
> +#endif
>  
>  	/* Copy commands */
>  	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
> @@ -1429,10 +1653,11 @@ convert_rule_to_8(struct ip_fw *rule)
>  	rule->x_next = (struct ip_fw *)tmp->next;
>  	rule->cmd_len = tmp->cmd_len;
>  	rule->id = 0; /* XXX see if is ok = 0 */
> +#if 0
>  	rule->pcnt = tmp->pcnt;
>  	rule->bcnt = tmp->bcnt;
>  	rule->timestamp = tmp->timestamp;
> -
> +#endif
>  	free (tmp, M_TEMP);
>  	return 0;
>  }




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20120716212249.GA14607>