Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 8 Feb 2012 15:50:23 +0700 (NOVT)
From:      Eugene Grosbein <eugen@eg.sd.rdtc.ru>
To:        FreeBSD-gnats-submit@FreeBSD.org
Subject:   kern/164901: [regression] [patch] [lagg] igb/lagg poor traffic distribution
Message-ID:  <201202080850.q188oNrw099468@eg.sd.rdtc.ru>
Resent-Message-ID: <201202080900.q1890LSf039648@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         164901
>Category:       kern
>Synopsis:       [regression] [patch] [lagg] igb/lagg poor traffic distribution
>Confidential:   no
>Severity:       serious
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Feb 08 09:00:20 UTC 2012
>Closed-Date:
>Last-Modified:
>Originator:     Eugene Grosbein
>Release:        FreeBSD 8.2-STABLE i386
>Organization:
RDTC JSC
>Environment:
System: FreeBSD eg.sd.rdtc.ru 8.2-STABLE FreeBSD 8.2-STABLE #36: Fri Dec 23 15:04:05 NOVT 2011 root@eg.sd.rdtc.ru:/usr/local/obj/usr/local/src/sys/EG i386

>Description:

	Suppose we have a router (BRAS) using two lagg(4) interfaces in LACP mode.

	Two-port lagg0 has an IP address and its ports carry untagged IPoE frames.
	lagg1 has no IP address and has two ports (82576-based igb0 and igb1)
	that carry 1000 dot-q vlans with PPPoE frames only.

	In RELENG_7, lagg(4) evenly distributes traffic going from lagg1 to lagg0.
	Since 8.0-RELEASE, all this traffic goes out through one of lagg0's ports only.

	82576-based NICs and igb(4) support Microsoft Receive-Side Scaling (RSS),
	see http://download.intel.com/design/network/datashts/82576_Datasheet.pdf
	
	RSS states that the queue number for non-IP frames (PPPoE/GRE/etc.)
	is not computed with a hash. So, all these frames get the same (zero)
	queue number and igb(4) assigns tag M_FLOWID=0 to mbufs.

	Since 8.0-RELEASE, lagg(4) skips its own hash computation for mbufs
	having M_FLOWID tag attached. Hence, it directs all such traffic
	to its first port only in this setup.

>How-To-Repeat:
	
	See above.

>Fix:

	The following patch fixes the regression by introducing new sysctls
	that disable usage of M_FLOWID per lagg interface:

net.link.lagg.0.use_flowid
net.link.lagg.1.use_flowid

	The default value is 1, which corresponds to the current behaviour of lagg(4).
	To fix our issue, we set net.link.lagg.0.use_flowid=0
	that restores pre-8 behaviour for lagg0 only, so it ignores misleading
	M_FLOWID assigned to mbufs by lagg1's ports.

--- sys/net/if_lagg.h.orig	2010-12-27 12:59:59.000000000 +0600
+++ sys/net/if_lagg.h	2012-01-23 16:34:15.000000000 +0700
@@ -21,6 +21,8 @@
 #ifndef _NET_LAGG_H
 #define _NET_LAGG_H
 
+#include <sys/sysctl.h>
+
 /*
  * Global definitions
  */
@@ -202,6 +204,8 @@ struct lagg_softc {
 	eventhandler_tag vlan_attach;
 	eventhandler_tag vlan_detach;
 #endif
+	struct sysctl_ctx_list		ctx;		/* sysctl variables */
+	int				use_flowid;	/* use M_FLOWID */
 };
 
 struct lagg_port {
--- sys/net/if_lagg.c.orig	2011-08-08 19:16:42.000000000 +0700
+++ sys/net/if_lagg.c	2012-01-23 16:33:04.000000000 +0700
@@ -257,6 +257,8 @@ lagg_clone_create(struct if_clone *ifc, 
 	struct ifnet *ifp;
 	int i, error = 0;
 	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
+	struct sysctl_oid *oid;
+	char num[14];			/* sufficient for 32 bits */
 
 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
 	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
@@ -265,6 +267,15 @@ lagg_clone_create(struct if_clone *ifc, 
 		return (ENOSPC);
 	}
 
+	sysctl_ctx_init(&sc->ctx);
+	snprintf(num, sizeof(num), "%u", unit);
+	sc->use_flowid = 1;
+	oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg),
+		OID_AUTO, num, CTLFLAG_RD, NULL, "");
+	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
+		"use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid,
+		"Use flow id for load sharing");
+
 	sc->sc_proto = LAGG_PROTO_NONE;
 	for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
 		if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
@@ -344,6 +355,7 @@ lagg_clone_destroy(struct ifnet *ifp)
 
 	LAGG_WUNLOCK(sc);
 
+	sysctl_ctx_free(&sc->ctx);
 	ifmedia_removeall(&sc->sc_media);
 	ether_ifdetach(ifp);
 	if_free_type(ifp, IFT_ETHER);
@@ -1668,7 +1680,7 @@ lagg_lb_start(struct lagg_softc *sc, str
 	struct lagg_port *lp = NULL;
 	uint32_t p = 0;
 
-	if (m->m_flags & M_FLOWID)
+	if (sc->use_flowid && (m->m_flags & M_FLOWID))
 		p = m->m_pkthdr.flowid;
 	else
 		p = lagg_hashmbuf(m, lb->lb_key);
--- sys/net/ieee8023ad_lacp.c.orig	2009-08-03 16:13:06.000000000 +0800
+++ sys/net/ieee8023ad_lacp.c	2012-01-23 13:44:00.000000000 +0700
@@ -812,7 +812,7 @@ lacp_select_tx_port(struct lagg_softc *s
 		return (NULL);
 	}
 
-	if (m->m_flags & M_FLOWID)
+	if (sc->use_flowid && (m->m_flags & M_FLOWID))
 		hash = m->m_pkthdr.flowid;
 	else
 		hash = lagg_hashmbuf(m, lsc->lsc_hashkey);
>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201202080850.q188oNrw099468>