Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 13 Feb 2014 09:48:12 +0400
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        Adrian Chadd <adrian@freebsd.org>
Cc:        FreeBSD Net <freebsd-net@freebsd.org>
Subject:   Re: flowtable, collisions, locking and CPU affinity
Message-ID:  <20140213054812.GK26785@FreeBSD.org>
In-Reply-To: <CAJ-VmonNCzFED=20_C2fV1g1jvFNRE=N-H+09Wb2OdxdzHp9JQ@mail.gmail.com>
References:  <CAJ-VmonNCzFED=20_C2fV1g1jvFNRE=N-H+09Wb2OdxdzHp9JQ@mail.gmail.com>

next in thread | previous in thread | raw e-mail | index | archive | help

--BXVAT5kNtrzKuDFl
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Fri, Feb 07, 2014 at 04:12:56PM -0800, Adrian Chadd wrote:
A> I've been knee deep in the flowtable code looking at some of the less
A> .. predictable ways it behaves.
A> 
A> One of them is the collisions that do pop up from time to time.
A> 
A> I dug into it in quite some depth and found out what's going on. This
A> assumes it's a per-CPU flowtable.
A> 
A> * A flowtable lookup is performed, on say CPU #0
A> * the flowtable lookup fails, so it goes to do a flowtable insert
A> * .. but since in between the two, the flowtable "lock" is released so
A> it can do a route/adjacency lookup, and that grabs a lock
A> * .. then the flowtable insert is done on a totally different CPU
A> * .. which happens to _have_ the flowtable entry already, so it fails
A> as a collision which already has a matching entry.
A> 
A> Now, the reason for this is primarily because there's no CPU pinning
A> in the lookup path and if there's contention during the route lookup
A> phase, the scheduler may decide to schedule the kernel thread on a
A> totally different CPU to the one that was running the code when the
A> lock was entered.
A> 
A> Now, Gleb's recent changes seem to have made the instances of this
A> drop, but he didn't set out to fix it. So there's something about his
A> changes that has changed the locking/contention profile that I was
A> using to easily reproduce it.
A> 
A> In any case - the reason it's happening above is because there's no
A> actual lock held over the whole lookup/insert path. It's a per-CPU
A> critical enter/exit path, so the only way to guarantee consistency is
A> to use sched_pin() for the entirety of the function.
A> 
A> I'll go and test that out in a moment and see if it quietens the
A> collisions that I see in lab testing.
A> 
A> Has anyone already debugged/diagnosed this? Can anyone think of an
A> alternate (better) way to fix this?

Can't we just reuse the colliding entry?

Can you evaluate patch attached (against head) in your testing
conditions?

-- 
Totus tuus, Glebius.

--BXVAT5kNtrzKuDFl
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="flowtable_collisions.diff"

Index: sys/net/flowtable.c
===================================================================
--- sys/net/flowtable.c	(revision 261825)
+++ sys/net/flowtable.c	(working copy)
@@ -716,21 +716,39 @@ flow_full(struct flowtable *ft)
 }
 
 static int
+flow_matches(struct flentry *fle, uint32_t hash, uint32_t *key, uint8_t
+   proto, uint32_t fibnum)
+{
+
+	if (fle->f_fhash == hash &&
+	    bcmp(&fle->f_flow, key, KEYLEN(fle->f_flags)) == 0 &&
+	    proto == fle->f_proto && fibnum == fle->f_fibnum &&
+	    (fle->f_rt->rt_flags & RTF_UP) &&
+	    fle->f_rt->rt_ifp != NULL &&
+	    (fle->f_lle->la_flags & LLE_VALID))
+		return (1);
+
+	return (0);
+}
+
+static struct flentry *
 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
     uint32_t fibnum, struct route *ro, uint16_t flags)
 {
 	struct flist *flist;
 	struct flentry *fle, *iter;
+	bitstr_t *mask;
 	int depth;
-	bitstr_t *mask;
+	uint8_t proto;
 
 	fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
 	if (fle == NULL)
-		return (ENOMEM);
+		return (NULL);
 
+	proto = flags_to_proto(flags);
 	bcopy(key, &fle->f_flow, KEYLEN(flags));
 	fle->f_flags |= (flags & FL_IPV6);
-	fle->f_proto = flags_to_proto(flags);
+	fle->f_proto = proto;
 	fle->f_rt = ro->ro_rt;
 	fle->f_lle = ro->ro_lle;
 	fle->f_fhash = hash;
@@ -748,21 +766,24 @@ flowtable_insert(struct flowtable *ft, uint32_t ha
 	}
 
 	depth = 0;
-	FLOWSTAT_INC(ft, ft_collisions);
 	/*
 	 * find end of list and make sure that we were not
 	 * preempted by another thread handling this flow
 	 */
 	SLIST_FOREACH(iter, flist, f_next) {
-		if (iter->f_fhash == hash && !flow_stale(ft, iter)) {
+		if (flow_matches(iter, hash, key, proto, fibnum)) {
 			/*
-			 * there was either a hash collision
-			 * or we lost a race to insert
+			 * We probably migrated to another CPU after the
+			 * lookup in flowtable_lookup_common() failed.
+			 * It appears that this CPU already has the flow
+			 * entry.
 			 */
+			iter->f_uptime = time_uptime;
+			iter->f_flags |= flags;
 			critical_exit();
+			FLOWSTAT_INC(ft, ft_collisions);
 			uma_zfree(flow_zone, fle);
-
-			return (EEXIST);
+			return (iter);
 		}
 		depth++;
 	}
@@ -773,8 +794,9 @@ flowtable_insert(struct flowtable *ft, uint32_t ha
 	SLIST_INSERT_HEAD(flist, fle, f_next);
 skip:
 	critical_exit();
+	FLOWSTAT_INC(ft, ft_inserts);
 
-	return (0);
+	return (fle);
 }
 
 struct flentry *
@@ -885,12 +907,7 @@ flowtable_lookup_common(struct flowtable *ft, stru
 	critical_enter();
 	flist = flowtable_list(ft, hash);
 	SLIST_FOREACH(fle, flist, f_next)
-		if (fle->f_fhash == hash && bcmp(&fle->f_flow, key,
-		    KEYLEN(fle->f_flags)) == 0 &&
-		    proto == fle->f_proto && fibnum == fle->f_fibnum &&
-		    (fle->f_rt->rt_flags & RTF_UP) &&
-		    fle->f_rt->rt_ifp != NULL &&
-		    (fle->f_lle->la_flags & LLE_VALID)) {
+		if (flow_matches(fle, hash, key, proto, fibnum)) {
 			fle->f_uptime = time_uptime;
 			fle->f_flags |= flags;
 			critical_exit();
@@ -968,7 +985,8 @@ flowtable_lookup_common(struct flowtable *ft, stru
 	}
 	ro->ro_lle = lle;
 
-	if (flowtable_insert(ft, hash, key, fibnum, ro, flags) != 0) {
+	fle = flowtable_insert(ft, hash, key, fibnum, ro, flags);
+	if (fle == NULL) {
 		RTFREE(rt);
 		LLE_FREE(lle);
 		return (NULL);
@@ -975,10 +993,11 @@ flowtable_lookup_common(struct flowtable *ft, stru
 	}
 
 success:
-	if (fle != NULL && (m->m_flags & M_FLOWID) == 0) {
+	if ((m->m_flags & M_FLOWID) == 0) {
 		m->m_flags |= M_FLOWID;
 		m->m_pkthdr.flowid = fle->f_fhash;
 	}
+
 	return (fle);
 }
 
Index: sys/net/flowtable.h
===================================================================
--- sys/net/flowtable.h	(revision 261825)
+++ sys/net/flowtable.h	(working copy)
@@ -39,6 +39,7 @@ struct flowtable_stat {
 	uint64_t	ft_frees;
 	uint64_t	ft_hits;
 	uint64_t	ft_lookups;
+	uint64_t	ft_inserts;
 };
 
 #ifdef	_KERNEL
Index: usr.bin/netstat/flowtable.c
===================================================================
--- usr.bin/netstat/flowtable.c	(revision 261825)
+++ usr.bin/netstat/flowtable.c	(working copy)
@@ -52,6 +52,7 @@ print_stats(struct flowtable_stat *stat)
 	p(ft_lookups, "\t%ju lookup%s\n");
 	p(ft_hits, "\t%ju hit%s\n");
 	p2(ft_misses, "\t%ju miss%s\n");
+	p(ft_inserts, "\t%ju insert%s\n");
 	p(ft_collisions, "\t%ju collision%s\n");
 	p(ft_free_checks, "\t%ju free check%s\n");
 	p(ft_frees, "\t%ju free%s\n");

--BXVAT5kNtrzKuDFl--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20140213054812.GK26785>