Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 16 Jul 2012 21:43:01 +0400
From:      "Alexander V. Chernikov" <melifaro@FreeBSD.org>
To:        Luigi Rizzo <rizzo@iet.unipi.it>
Cc:        Doug Barton <dougb@freebsd.org>, net@freebsd.org
Subject:   Re: FreeBSD 10G forwarding performance @Intel
Message-ID:  <500452A5.3070501@FreeBSD.org>
In-Reply-To: <20120706061126.GA65432@onelab2.iet.unipi.it>
References:  <4FF361CA.4000506@FreeBSD.org> <20120703214419.GC92445@onelab2.iet.unipi.it> <4FF36438.2030902@FreeBSD.org> <4FF3E2C4.7050701@FreeBSD.org> <4FF3FB14.8020006@FreeBSD.org> <4FF402D1.4000505@FreeBSD.org> <20120704091241.GA99164@onelab2.iet.unipi.it> <4FF412B9.3000406@FreeBSD.org> <20120704154856.GC3680@onelab2.iet.unipi.it> <4FF59955.5090406@FreeBSD.org> <20120706061126.GA65432@onelab2.iet.unipi.it>

next in thread | previous in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.
--------------040503000001040306070306
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

On 06.07.2012 10:11, Luigi Rizzo wrote:
> On Thu, Jul 05, 2012 at 05:40:37PM +0400, Alexander V. Chernikov wrote:
>> On 04.07.2012 19:48, Luigi Rizzo wrote:
> the thing discussed a few years ago (at least the one i took out of the
> discussion) was that the counter fields in rules should hold the
> index of a per-cpu counter associated to the rule. So CTR_INC(rule->ctr)
> becomes something like pcpu->ipfw_ctrs[rule->ctr]++
> Once you create a new rule you also grab one free index from ipfw_ctrs[],
> and the same should go for dummynet counters.

Old kernel from previous letters, same setup:

net.inet.ip.fw.enable=0
2.3 MPPS
net.inet.ip.fw.update_counters=0
net.inet.ip.fw.enable=1
1.93MPPS
net.inet.ip.fw.update_counters=1
1.74MPPS

Kernel with ipfw pcpu counters:

net.inet.ip.fw.enable=0
2.3 MPPS
net.inet.ip.fw.update_counters=0
net.inet.ip.fw.enable=1
1.93MPPS
net.inet.ip.fw.update_counters=1
1.93MPPS

Counters seem to be working without any (significant) overhead.
(Maybe I'm wrong somewhere?)

Additionally, I've got (from my previous pcpu attempt) a small patch 
permitting ipfw to re-use rule map allocation instead of reallocating 
on every rule. This saves a bit of system time:

loading 20k rules with ipfw binary gives us:
5.1s system time before and 4.1s system time after.


-- 
WBR, Alexander

--------------040503000001040306070306
Content-Type: text/plain; charset=UTF-8;
	name="0001-Do-not-allocate-data-buffers-per-each-rule-addition-.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename*0="0001-Do-not-allocate-data-buffers-per-each-rule-addition-.pa";
	filename*1="tch"

>From 931ac84b88829fe15bb4410bd2b06614f41c99b5 Mon Sep 17 00:00:00 2001
From: "Alexander V. Chernikov" <melifaro@ipfw.ru>
Date: Mon, 9 Jul 2012 23:10:02 +0400
Subject: [PATCH 1/3] Do not allocate data buffers per each rule
 addition/deletion

---
 sys/netinet/ipfw/ip_fw2.c        |    2 +
 sys/netinet/ipfw/ip_fw_private.h |    4 ++
 sys/netinet/ipfw/ip_fw_sockopt.c |  134 ++++++++++++++++++++++++++++++++++----
 3 files changed, 126 insertions(+), 14 deletions(-)

diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 98766f3..bf06c1e 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -2701,6 +2701,8 @@ vnet_ipfw_uninit(const void *unused)
 	}
 	if (chain->map)
 		free(chain->map, M_IPFW);
+	if (chain->b_map)
+		free(chain->b_map, M_IPFW);
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	if (reap != NULL)
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 2806741..272cef3 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -232,6 +232,10 @@ struct ip_fw_chain {
 #endif
 	uint32_t	id;		/* ruleset id */
 	uint32_t	gencnt;		/* generation count */
+	int		n_allocated;	/* Size of primary map in items */
+	struct ip_fw	**b_map;	/* Backup map to use in swapping */
+	int		b_allocated;	/* Size of backup map in items */
+
 };
 
 struct sockopt;	/* used by tcp_var.h */
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index a245143..4e71a11 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -97,29 +97,92 @@ ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
 	return hi;
 }
 
+
+
+/*
+ * In order to minimize block allocations on every rule addition we
+ * use the following technique:
+ * 1) round up allocation to MAP_GRANULARITY
+ * 2) Save previously allocated map to the special backup pointer.
+ *
+ * This permits us to do backup <> primary (and vice versa) map swapping
+ * without reallocations in most cases.
+ *
+ */
+
+struct map_ptrs {
+	int		backup;		/* 1 if backup map used */
+	int		items;		/* number of items allocated */
+	struct ip_fw	**map;
+	struct ip_fw	**free_map;	/* map pointer to free */
+};
+int
+get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs *ptrs);
+
+#define	MAP_GRANULARITY	128	/* 1k overhead per buffer */
+#define	MAP_ROUNDUP(i)	(i) + MAP_GRANULARITY - ((i) % MAP_GRANULARITY)
 /*
  * allocate a new map, returns the chain locked. extra is the number
  * of entries to add or delete.
  */
-static struct ip_fw **
-get_map(struct ip_fw_chain *chain, int extra, int locked)
+int
+get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs *ptrs)
 {
 
 	for (;;) {
 		struct ip_fw **map;
 		int i;
 
+		/* Be optimistic by default */
+		if (!locked)
+			IPFW_UH_WLOCK(chain);
+
 		i = chain->n_rules + extra;
+
+		/*
+		 * We have to fit to the following conditions in order
+		 * to use backup map:
+		 *
+		 * 1) i <= chain->b_allocated (add case)
+		 * 2) roundup(i) == chain->b_allocated (del case) (saves memory)
+		 *
+		 * Since roundup adds 0..(MAP_GRANULARITY-1)
+		 * rule 2 requires rule 1 to be true.
+		 */
+		CTR4(KTR_NET, "%s: req: %d roundup: %d, b_allocated: %d",
+			__func__, i, MAP_ROUNDUP(i), chain->b_allocated);
+
+		if (MAP_ROUNDUP(i) == chain->b_allocated) {
+			/* no new allocation. Just return backup map */
+			ptrs->backup = 1;
+			ptrs->items = chain->b_allocated;
+			ptrs->map = chain->b_map;
+			CTR3(KTR_NET, "%s: BACKUP SELECTED: %p items: %d", __func__, ptrs->map, ptrs->items);
+
+			return 1;
+		}
+
+		/* Pessimistic scenario - we need reallocation */
+		i = MAP_ROUNDUP(i);
+
+		if (!locked)
+			IPFW_UH_WUNLOCK(chain);
+
 		map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
 			locked ? M_NOWAIT : M_WAITOK);
 		if (map == NULL) {
 			printf("%s: cannot allocate map\n", __FUNCTION__);
-			return NULL;
+			return 0;
 		}
 		if (!locked)
 			IPFW_UH_WLOCK(chain);
-		if (i >= chain->n_rules + extra) /* good */
-			return map;
+		if (i >= chain->n_rules + extra) { /* good */
+			ptrs->backup = 0;
+			ptrs->items = i;
+			ptrs->map = map;
+			CTR3(KTR_NET, "%s: NEW MAP: %p items: %d", __func__, ptrs->map, ptrs->items);
+			return 1;
+		}
 		/* otherwise we lost the race, free and retry */
 		if (!locked)
 			IPFW_UH_WUNLOCK(chain);
@@ -128,10 +191,11 @@ get_map(struct ip_fw_chain *chain, int extra, int locked)
 }
 
 /*
- * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
+ * swap the maps. It is supposed to be called with IPFW_UH_WLOCK.
+ * Returns old map pointer to free.
  */
 static struct ip_fw **
-swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+swap_map(struct ip_fw_chain *chain, struct map_ptrs *ptrs, int new_len)
 {
 	struct ip_fw **old_map;
 
@@ -139,8 +203,43 @@ swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
 	chain->id++;
 	chain->n_rules = new_len;
 	old_map = chain->map;
-	chain->map = new_map;
+	chain->map = ptrs->map;
 	IPFW_WUNLOCK(chain);
+
+	CTR4(KTR_NET, "%s: old: %p new: %p, backup: %d",
+		__func__, old_map, chain->map, ptrs->backup);
+
+	/* Finalize map change */
+	if (ptrs->backup) {
+		/*
+		 * Old map has (at least) the same size as current map.
+		 * Save it as backup map.
+		 */
+		chain->b_allocated = chain->n_allocated;
+		chain->n_allocated = ptrs->items;
+		chain->b_map = old_map;
+		ptrs->free_map = NULL;
+		return old_map;
+	}
+
+	/*
+	 * Backup map is not used. Free old map and use
+	 * current map as candidate.
+	 */
+
+	/* Set free pointer to old backup map */
+	ptrs->free_map = chain->b_map;
+	/* Set backup map to be old current map */
+	chain->b_map = old_map;
+
+	/* Save backup map size */
+	chain->b_allocated = chain->n_allocated;
+	chain->n_allocated = ptrs->items;
+
+	CTR6(KTR_NET, "%s: new: %p alloc: %d backup: %p alloc: %d free: %p",
+		__func__, chain->map, chain->n_allocated,
+		chain->b_map, chain->b_allocated, ptrs->free_map);
+
 	return old_map;
 }
 
@@ -156,6 +255,7 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 {
 	struct ip_fw *rule;
 	int i, l, insert_before;
+	struct map_ptrs ptrs;
 	struct ip_fw **map;	/* the new array of pointers */
 
 	if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
@@ -164,12 +264,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 	l = RULESIZE(input_rule);
 	rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
 	/* get_map returns with IPFW_UH_WLOCK if successful */
-	map = get_map(chain, 1, 0 /* not locked */);
-	if (map == NULL) {
+	if (get_map(chain, 1, 0 /* not locked */, &ptrs) == 0) {
 		free(rule, M_IPFW);
 		return ENOSPC;
 	}
 
+	map = ptrs.map;
+
 	bcopy(input_rule, rule, l);
 	/* clear fields not settable from userland */
 	rule->x_next = NULL;
@@ -201,7 +302,8 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 	}
 
 	rule->id = chain->id + 1;
-	map = swap_map(chain, map, chain->n_rules + 1);
+	swap_map(chain, &ptrs, chain->n_rules + 1);
+	map = ptrs.free_map;
 	chain->static_len += l;
 	IPFW_UH_WUNLOCK(chain);
 	if (map)
@@ -283,6 +385,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
 	uint32_t num;	/* rule number or old_set */
 	uint8_t cmd, new_set;
 	int start, end, i, ofs, n;
+	struct map_ptrs ptrs;
 	struct ip_fw **map = NULL;
 	int error = 0;
 
@@ -353,12 +456,13 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
 		}
 
 		/* We have something to delete. Allocate the new map */
-		map = get_map(chain, -n, 1 /* locked */);
-		if (map == NULL) {
+		if (get_map(chain, -n, 1 /* locked */, &ptrs) == 0) {
 			error = EINVAL;
 			break;
 		}
 
+		map = ptrs.map;
+
 		/* 1. bcopy the initial part of the map */
 		if (start > 0)
 			bcopy(chain->map, map, start * sizeof(struct ip_fw *));
@@ -372,7 +476,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
 		bcopy(chain->map + end, map + ofs,
 			(chain->n_rules - end) * sizeof(struct ip_fw *));
 		/* 4. swap the maps (under BH_LOCK) */
-		map = swap_map(chain, map, chain->n_rules - n);
+		map = swap_map(chain, &ptrs, chain->n_rules - n);
 		/* 5. now remove the rules deleted from the old map */
 		for (i = start; i < end; i++) {
 			int l;
@@ -385,6 +489,8 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
 			rule->x_next = chain->reap;
 			chain->reap = rule;
 		}
+		/* Free map if we need to */
+		map = ptrs.free_map;
 		break;
 
 	/*
-- 
1.7.9.4


--------------040503000001040306070306
Content-Type: text/plain; charset=UTF-8;
	name="0001-Make-accounting-in-ipfw-optional.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="0001-Make-accounting-in-ipfw-optional.patch"

>From baad4a88bba898baa3aa5f0dffc8a40182bcfcd6 Mon Sep 17 00:00:00 2001
From: Charlie Root <root@test15.yandex.net>
Date: Thu, 12 Jul 2012 17:19:10 +0000
Subject: [PATCH 1/2] Make accounting in ipfw optional

---
 sys/netinet/ipfw/ip_fw2.c |   32 +++++++++++++++++++++-----------
 1 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index e5a13e4..7bb73b0 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -112,6 +112,8 @@ static int default_to_accept = 1;
 static int default_to_accept;
 #endif
 
+static int update_counters = 0;
+
 VNET_DEFINE(int, autoinc_step);
 
 VNET_DEFINE(unsigned int, fw_tables_max);
@@ -190,6 +192,8 @@ SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
     "Permit single packet IPv6 fragments");
 #endif /* INET6 */
 
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, update_counters, CTLFLAG_RW,
+    &update_counters, 0, "Update counters on match");
 SYSEND
 
 #endif /* SYSCTL_NODE */
@@ -2011,16 +2015,20 @@ do {								\
 				break;
 
 			case O_COUNT:
-				f->pcnt++;	/* update stats */
-				f->bcnt += pktlen;
-				f->timestamp = time_uptime;
+				if (update_counters) {
+					f->pcnt++;	/* update stats */
+					f->bcnt += pktlen;
+					f->timestamp = time_uptime;
+				}
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_SKIPTO:
-			    f->pcnt++;	/* update stats */
-			    f->bcnt += pktlen;
-			    f->timestamp = time_uptime;
+			    if (update_counters) {
+				    f->pcnt++;	/* update stats */
+				    f->bcnt += pktlen;
+				    f->timestamp = time_uptime;
+			    }
 			    /* If possible use cached f_pos (in f->next_rule),
 			     * whose version is written in f->next_rule
 			     * (horrible hacks to avoid changing the ABI).
@@ -2375,11 +2383,13 @@ do {								\
 	}		/* end of outer for, scan rules */
 
 	if (done) {
-		struct ip_fw *rule = chain->map[f_pos];
-		/* Update statistics */
-		rule->pcnt++;
-		rule->bcnt += pktlen;
-		rule->timestamp = time_uptime;
+		if (update_counters) {
+			struct ip_fw *rule = chain->map[f_pos];
+			/* Update statistics */
+			rule->pcnt++;
+			rule->bcnt += pktlen;
+			rule->timestamp = time_uptime;
+		}
 	} else {
 		retval = IP_FW_DENY;
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
-- 
1.7.6.1


--------------040503000001040306070306
Content-Type: text/plain; charset=UTF-8;
 name="ipfw_pcpu_8.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="ipfw_pcpu_8.diff"

diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index 589c19c..b1f83bb 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -493,9 +493,14 @@ struct ip_fw {
 	uint32_t	id;		/* rule id */
 
 	/* These fields are present in all rules.			*/
+#if 0
 	uint64_t	pcnt;		/* Packet counter		*/
 	uint64_t	bcnt;		/* Byte counter			*/
 	uint32_t	timestamp;	/* tv_sec of last match		*/
+#endif
+	uint64_t	_pad0;
+	uint64_t	_pad1;
+	uint32_t	cntr_idx;	/* Index in pcpu counters */
 
 	ipfw_insn	cmd[1];		/* storage for commands		*/
 };
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 7bb73b0..db93a5c 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -2016,18 +2016,20 @@ do {								\
 
 			case O_COUNT:
 				if (update_counters) {
-					f->pcnt++;	/* update stats */
-					f->bcnt += pktlen;
-					f->timestamp = time_uptime;
+					struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+					c->pcnt++;	/* update stats */
+					c->bcnt += pktlen;
+					c->timestamp = time_uptime;
 				}
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_SKIPTO:
 			    if (update_counters) {
-				    f->pcnt++;	/* update stats */
-				    f->bcnt += pktlen;
-				    f->timestamp = time_uptime;
+				    struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+				    c->pcnt++;	/* update stats */
+				    c->bcnt += pktlen;
+				    c->timestamp = time_uptime;
 			    }
 			    /* If possible use cached f_pos (in f->next_rule),
 			     * whose version is written in f->next_rule
@@ -2125,9 +2127,10 @@ do {								\
 					break;
 				}
 
-				f->pcnt++;	/* update stats */
-				f->bcnt += pktlen;
-				f->timestamp = time_uptime;
+				struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+				c->pcnt++;	/* update stats */
+				c->bcnt += pktlen;
+				c->timestamp = time_uptime;
 				stack = (uint16_t *)(mtag + 1);
 
 				/*
@@ -2263,9 +2266,10 @@ do {								\
 			case O_SETFIB: {
 				uint32_t fib;
 
-				f->pcnt++;	/* update stats */
-				f->bcnt += pktlen;
-				f->timestamp = time_uptime;
+				struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+				c->pcnt++;	/* update stats */
+				c->bcnt += pktlen;
+				c->timestamp = time_uptime;
 				fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg:
 				    cmd->arg1;
 				if (fib >= rt_numfibs)
@@ -2315,8 +2319,9 @@ do {								\
 			case O_REASS: {
 				int ip_off;
 
-				f->pcnt++;
-				f->bcnt += pktlen;
+				struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+				c->pcnt++;
+				c->bcnt += pktlen;
 				l = 0;	/* in any case exit inner loop */
 				ip_off = ntohs(ip->ip_off);
 
@@ -2386,9 +2391,10 @@ do {								\
 		if (update_counters) {
 			struct ip_fw *rule = chain->map[f_pos];
 			/* Update statistics */
-			rule->pcnt++;
-			rule->bcnt += pktlen;
-			rule->timestamp = time_uptime;
+			struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, rule->cntr_idx);
+			c->pcnt++;
+			c->bcnt += pktlen;
+			c->timestamp = time_uptime;
 		}
 	} else {
 		retval = IP_FW_DENY;
@@ -2439,6 +2445,7 @@ ipfw_init(void)
 {
 	int error = 0;
 
+	_dpcpu_init();
 	ipfw_dyn_attach();
 	/*
  	 * Only print out this stuff the first time around,
@@ -2500,6 +2507,7 @@ ipfw_destroy(void)
 
 	ipfw_log_bpf(0); /* uninit */
 	ipfw_dyn_detach();
+	_dpcpu_uninit();
 	printf("IP firewall unloaded\n");
 }
 
@@ -2546,6 +2554,9 @@ vnet_ipfw_init(const void *unused)
 		return (ENOSPC);
 	}
 
+	/* Init per-cpu counters */
+	ipfw_init_counters(chain);
+
 	/* fill and insert the default rule */
 	rule->act_ofs = 0;
 	rule->rulenum = IPFW_DEFAULT_RULE;
@@ -2559,6 +2570,9 @@ vnet_ipfw_init(const void *unused)
 	IPFW_LOCK_INIT(chain);
 	ipfw_dyn_init();
 
+	/* Allocate first pcpu counter */
+	rule->cntr_idx = ipfw_alloc_counter(chain);
+
 	/* First set up some values that are compile time options */
 	V_ipfw_vnet_ready = 1;		/* Open for business */
 
@@ -2619,10 +2633,11 @@ vnet_ipfw_uninit(const void *unused)
 	}
 	if (chain->map)
 		free(chain->map, M_IPFW);
+	_dpcpu_free(chain->cntrs);
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	if (reap != NULL)
-		ipfw_reap_rules(reap);
+		ipfw_reap_rules(chain, reap);
 	IPFW_LOCK_DESTROY(chain);
 	ipfw_dyn_uninit(1);	/* free the remaining parts */
 	return 0;
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 46d011c..bd4fad3 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -222,6 +222,7 @@ struct ip_fw_chain {
 	struct radix_node_head **tables;	/* IPv4 tables */
 	struct radix_node_head **xtables;	/* extended tables */
 	uint8_t		*tabletype;	/* Array of table types */
+	uintptr_t	*cntrs;		/* Per-cpu counters */
 #if defined( __linux__ ) || defined( _WIN32 )
 	spinlock_t rwmtx;
 	spinlock_t uh_lock;
@@ -231,8 +232,25 @@ struct ip_fw_chain {
 #endif
 	uint32_t	id;		/* ruleset id */
 	uint32_t	gencnt;		/* generation count */
+	int		cntr_allocated;	/* Size of allocated counters map */
+	uint32_t	*cntr_idx;	/* bit index of counters map */
 };
 
+struct ip_fw_cntr {
+	uint64_t	pcnt;		/* Packet counter		*/
+	uint64_t	bcnt;		/* Byte counter			*/
+	uint32_t	timestamp;	/* tv_sec of last match		*/
+};
+#define	IPFW_PCPU_CNTR(chain, idx)	((struct ip_fw_cntr *)(chain)->cntrs[curcpu] + idx)
+#define	IPFW_PCPU_DEFAULT_COUNTERS	128
+void _dpcpu_init(void);
+void _dpcpu_uninit(void);
+uintptr_t *_dpcpu_alloc(size_t size, int flags);
+void _dpcpu_free(uintptr_t *ptr);
+void ipfw_init_counters(struct ip_fw_chain *chain);
+int ipfw_alloc_counter(struct ip_fw_chain *chain);
+void ipfw_free_counter(struct ip_fw_chain *chain, int idx);
+
 struct sockopt;	/* used by tcp_var.h */
 
 /*
@@ -267,7 +285,10 @@ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
 int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
 int ipfw_ctl(struct sockopt *sopt);
 int ipfw_chk(struct ip_fw_args *args);
-void ipfw_reap_rules(struct ip_fw *head);
+void ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head);
+
+void ipfw_zero_counter(struct ip_fw_chain *chain, int idx);
+void ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr *cntr);
 
 /* In ip_fw_pfil */
 int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index 0aecc2c..db308c2 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/mbuf.h>	/* struct m_tag used by nested headers */
 #include <sys/kernel.h>
+#include <sys/smp.h>
 #include <sys/lock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
@@ -72,6 +73,212 @@ MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
  * static variables followed by global ones (none in this file)
  */
 
+MALLOC_DEFINE(M_XDPCPU, "XPCPU", "PCPU data");
+
+uma_zone_t _dpcpu_zone;
+
+static void ipfw_extend_counters(struct ip_fw_chain *chain);
+
+void
+_dpcpu_init()
+{
+	_dpcpu_zone = uma_zcreate("pcpu ptrs", sizeof(uintptr_t) * MAXCPU,
+		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+void
+_dpcpu_uninit()
+{
+	uma_zdestroy(_dpcpu_zone);
+}
+
+#define	_DPCPU_ALIGN	512
+uintptr_t *
+_dpcpu_alloc(size_t size, int flags)
+{
+	uintptr_t *ptr;
+	int id, fail = 0;
+
+	if (size % _DPCPU_ALIGN)
+		size += _DPCPU_ALIGN - (size % _DPCPU_ALIGN);
+
+	if ((ptr = uma_zalloc(_dpcpu_zone, flags | M_ZERO)) == NULL)
+		return NULL;
+
+	CPU_FOREACH(id) {
+		/*
+		 * We believe in aligned allocation here
+		 */
+		if ((ptr[id] = (uintptr_t)malloc(size, M_XDPCPU, flags)) == 0) {
+			fail = 1;
+			break;
+		}
+
+		KASSERT((uintptr_t)ptr % _DPCPU_ALIGN == 0, ("malloc(9) returns unaligned data :("));
+	}
+
+	if (fail == 0)
+		return ptr;
+
+	CPU_FOREACH(id) {
+		if (ptr[id] == 0)
+			break;
+
+		free((void *)ptr[id], M_XDPCPU);
+	}
+
+	uma_zfree(_dpcpu_zone, ptr);
+
+	return NULL;
+}
+
+void
+_dpcpu_free(uintptr_t *ptr)
+{
+	int id;
+
+	CPU_FOREACH(id) {
+		if (ptr[id] == 0)
+			continue;
+		free((void *)ptr[id], M_XDPCPU);
+	}
+
+	uma_zfree(_dpcpu_zone, ptr);
+}
+
+void
+ipfw_init_counters(struct ip_fw_chain *chain)
+{
+	chain->cntrs = _dpcpu_alloc(IPFW_PCPU_DEFAULT_COUNTERS *
+		sizeof(struct ip_fw_cntr), M_WAITOK | M_ZERO);
+	chain->cntr_idx = malloc(IPFW_PCPU_DEFAULT_COUNTERS / 8, M_IPFW, M_WAITOK);
+	/* Free positions are marked by 1 */
+	memset(chain->cntr_idx, 0xFF, IPFW_PCPU_DEFAULT_COUNTERS / 8);
+	chain->cntr_allocated = IPFW_PCPU_DEFAULT_COUNTERS;
+}
+
+
+static void
+ipfw_extend_counters(struct ip_fw_chain *chain)
+{
+	uintptr_t *new_ptr, *old_ptr;
+	uint32_t *old_idx, *new_idx;
+	int i, id;
+
+	for (;;) {
+
+		i = 2 * chain->cntr_allocated;
+
+		new_ptr = _dpcpu_alloc(i * sizeof(struct ip_fw_cntr), M_WAITOK | M_ZERO);
+		new_idx = malloc(i / 8, M_IPFW, M_WAITOK);
+		/* Free positions are marked by 1 */
+		memset(new_idx, 0xFF, i / 8);
+
+		IPFW_UH_WLOCK(chain);
+		if (i == 2 * chain->cntr_allocated) {
+			CTR4(KTR_NET, "%s: NEW INDEX: %p cntr_idx: %d count: %d", __func__, new_ptr, new_idx, i);
+			break;
+		}
+		/* otherwise we lost the race, free and retry */
+		IPFW_UH_WUNLOCK(chain);
+
+		free(new_idx, M_IPFW);
+		_dpcpu_free(new_ptr);
+	}
+
+	/* Pointers allocated, let's migrate data */
+
+	IPFW_WLOCK(chain);
+	/* Copy pcpu counters */
+	CPU_FOREACH(id) {
+		memcpy((void *)new_ptr[id], (void *)chain->cntrs[id], chain->cntr_allocated * sizeof(struct ip_fw_cntr));
+	}
+	/* Copy index */
+	memcpy(new_idx, chain->cntr_idx, chain->cntr_allocated / 8);
+	/* Swap old and new pointers */
+	old_ptr = chain->cntrs;
+	chain->cntrs = new_ptr;
+	old_idx = chain->cntr_idx;
+	chain->cntr_idx = new_idx;
+	chain->cntr_allocated = i;
+
+	IPFW_WUNLOCK(chain);
+	IPFW_UH_WUNLOCK(chain);
+
+	/* Free old counters and index */
+	free(old_idx, M_IPFW);
+	_dpcpu_free(old_ptr);
+}
+
+int
+ipfw_alloc_counter(struct ip_fw_chain *chain)
+{
+	int i, x, idx;
+	uint32_t *v;
+
+	for (;;) {
+		IPFW_UH_WLOCK(chain);
+		for (i = 0, v = chain->cntr_idx; i < chain->cntr_allocated / 8 / 4; i++, v++) {
+			if ((x = ffsl(*v)) != 0) {
+				/* Found. Mark as used */
+				*v &= ~ (1 << (x - 1));
+				IPFW_UH_WUNLOCK(chain);
+				idx = i * 32 + x - 1;
+				ipfw_zero_counter(chain, idx);
+				CTR3(KTR_NET, "%s: allocated counter %d, map after: %X", __func__, idx, *v);
+				return idx;
+			}
+		}
+		IPFW_UH_WUNLOCK(chain);
+
+		/* Not found. Let's allocate data and retry */
+		ipfw_extend_counters(chain);
+	}
+}
+
+void
+ipfw_free_counter(struct ip_fw_chain *chain, int idx)
+{
+	uint32_t *v;
+
+	IPFW_UH_WLOCK(chain);
+	v = chain->cntr_idx;
+	v += idx / 32;
+	CTR3(KTR_NET, "%s: returned counter %d, map before: %X", __func__, idx, *v);
+	idx %= 32;
+	*v |= 1 << idx;
+
+	IPFW_UH_WUNLOCK(chain);
+}
+
+void
+ipfw_zero_counter(struct ip_fw_chain *chain, int idx)
+{
+	int id;
+	struct ip_fw_cntr *pcpu;
+
+	CTR2(KTR_NET, "%s: zeroing counter %d", __func__, idx);
+	CPU_FOREACH(id) {
+		pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
+		memset(pcpu, 0, sizeof(struct ip_fw_cntr));
+	}
+}
+
+void
+ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr *cntr)
+{
+	int id;
+	struct ip_fw_cntr *pcpu;
+
+	CPU_FOREACH(id) {
+		pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
+		cntr->pcnt += pcpu->pcnt;
+		cntr->bcnt += pcpu->bcnt;
+		if (cntr->timestamp < pcpu->timestamp)
+			cntr->timestamp = pcpu->timestamp;
+	}
+}
+
 /*
  * Find the smallest rule >= key, id.
  * We could use bsearch but it is so simple that we code it directly
@@ -155,7 +362,7 @@ int
 ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 {
 	struct ip_fw *rule;
-	int i, l, insert_before;
+	int i, l, cntr_idx, insert_before;
 	struct ip_fw **map;	/* the new array of pointers */
 
 	if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
@@ -163,12 +370,14 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 
 	l = RULESIZE(input_rule);
 	rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+	cntr_idx = ipfw_alloc_counter(chain);
 	if (rule == NULL)
 		return (ENOSPC);
 	/* get_map returns with IPFW_UH_WLOCK if successful */
 	map = get_map(chain, 1, 0 /* not locked */);
 	if (map == NULL) {
 		free(rule, M_IPFW);
+		ipfw_free_counter(chain, cntr_idx);
 		return ENOSPC;
 	}
 
@@ -176,10 +385,12 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 	/* clear fields not settable from userland */
 	rule->x_next = NULL;
 	rule->next_rule = NULL;
+	rule->cntr_idx = cntr_idx;
+#if 0
 	rule->pcnt = 0;
 	rule->bcnt = 0;
 	rule->timestamp = 0;
-
+#endif
 	if (V_autoinc_step < 1)
 		V_autoinc_step = 1;
 	else if (V_autoinc_step > 1000)
@@ -217,12 +428,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
  * A NULL pointer on input is handled correctly.
  */
 void
-ipfw_reap_rules(struct ip_fw *head)
+ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head)
 {
 	struct ip_fw *rule;
 
 	while ((rule = head) != NULL) {
 		head = head->x_next;
+		ipfw_free_counter(chain, rule->cntr_idx);
 		free(rule, M_IPFW);
 	}
 }
@@ -424,7 +636,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
 	rule = chain->reap;
 	chain->reap = NULL;
 	IPFW_UH_WUNLOCK(chain);
-	ipfw_reap_rules(rule);
+	ipfw_reap_rules(chain, rule);
 	if (map)
 		free(map, M_IPFW);
 	return error;
@@ -436,13 +648,17 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
  * so we only care that rules do not disappear.
  */
 static void
-clear_counters(struct ip_fw *rule, int log_only)
+clear_counters(struct ip_fw_chain *chain, struct ip_fw *rule, int log_only)
 {
 	ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
 
 	if (log_only == 0) {
+#if 0
 		rule->bcnt = rule->pcnt = 0;
 		rule->timestamp = 0;
+#endif
+	ipfw_zero_counter(chain, rule->cntr_idx);
+
 	}
 	if (l->o.opcode == O_LOG)
 		l->log_left = l->max_log;
@@ -481,7 +697,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 			/* Skip rules not in our set. */
 			if (cmd == 1 && rule->set != set)
 				continue;
-			clear_counters(rule, log_only);
+			clear_counters(chain, rule, log_only);
 		}
 		msg = log_only ? "All logging counts reset" :
 		    "Accounting cleared";
@@ -491,7 +707,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 			rule = chain->map[i];
 			if (rule->rulenum == rulenum) {
 				if (cmd == 0 || rule->set == set)
-					clear_counters(rule, log_only);
+					clear_counters(chain, rule, log_only);
 				cleared = 1;
 			}
 			if (rule->rulenum > rulenum)
@@ -854,7 +1070,7 @@ struct ip_fw7 {
 	ipfw_insn	cmd[1];		/* storage for commands     */
 };
 
-	int convert_rule_to_7(struct ip_fw *rule);
+int convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule);
 int convert_rule_to_8(struct ip_fw *rule);
 
 #ifndef RULESIZE7
@@ -887,7 +1103,7 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
 		    if (bp + l + sizeof(uint32_t) <= ep) {
 			int error;
 			bcopy(rule, bp, l + sizeof(uint32_t));
-			error = convert_rule_to_7((struct ip_fw *) bp);
+			error = convert_rule_to_7(chain, (struct ip_fw *) bp);
 			if (error)
 				return 0; /*XXX correct? */
 			/*
@@ -913,14 +1129,19 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
 		}
 		dst = (struct ip_fw *)bp;
 		bcopy(rule, dst, l);
+		/* copy statistics */
+		struct ip_fw_cntr *c = (struct ip_fw_cntr *)&dst->_pad0;
+		ipfw_export_counter(chain, rule->cntr_idx, c);
+		if (c->timestamp)
+			c->timestamp += boot_seconds;
+
 		/*
 		 * XXX HACK. Store the disable mask in the "next"
 		 * pointer in a wild attempt to keep the ABI the same.
 		 * Why do we do this on EVERY rule?
 		 */
 		bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
-		if (dst->timestamp)
-			dst->timestamp += boot_seconds;
+
 		bp += l;
 	}
 	ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
@@ -1051,7 +1272,7 @@ ipfw_ctl(struct sockopt *sopt)
 			size = RULESIZE(rule);
 			if (!error && sopt->sopt_dir == SOPT_GET) {
 				if (is7) {
-					error = convert_rule_to_7(rule);
+					error = convert_rule_to_7(chain, rule);
 					size = RULESIZE7(rule);
 					if (error)
 						return error;
@@ -1329,7 +1550,7 @@ ipfw_ctl(struct sockopt *sopt)
 
 /* Functions to convert rules 7.2 <==> 8.0 */
 int
-convert_rule_to_7(struct ip_fw *rule)
+convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule)
 {
 	/* Used to modify original rule */
 	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
@@ -1355,9 +1576,12 @@ convert_rule_to_7(struct ip_fw *rule)
 	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
 	rule7->next = (struct ip_fw7 *)tmp->x_next;
 	rule7->cmd_len = tmp->cmd_len;
+	ipfw_export_counter(chain, rule->cntr_idx, (struct ip_fw_cntr *)&rule7->pcnt);
+#if 0
 	rule7->pcnt = tmp->pcnt;
 	rule7->bcnt = tmp->bcnt;
 	rule7->timestamp = tmp->timestamp;
+#endif
 
 	/* Copy commands */
 	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
@@ -1429,10 +1653,11 @@ convert_rule_to_8(struct ip_fw *rule)
 	rule->x_next = (struct ip_fw *)tmp->next;
 	rule->cmd_len = tmp->cmd_len;
 	rule->id = 0; /* XXX see if is ok = 0 */
+#if 0
 	rule->pcnt = tmp->pcnt;
 	rule->bcnt = tmp->bcnt;
 	rule->timestamp = tmp->timestamp;
-
+#endif
 	free (tmp, M_TEMP);
 	return 0;
 }

--------------040503000001040306070306--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?500452A5.3070501>