Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 6 Apr 2012 06:55:21 +0000 (UTC)
From:      "Alexander V. Chernikov" <melifaro@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r233938 - in head: share/man/man4 sys/net
Message-ID:  <201204060655.q366tM7t096280@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: melifaro
Date: Fri Apr  6 06:55:21 2012
New Revision: 233938
URL: http://svn.freebsd.org/changeset/base/233938

Log:
  - Improve performance for writer-only BPF users.
  
  Linux and Solaris (at least OpenSolaris) have PF_PACKET socket families to send
  raw ethernet frames. The only FreeBSD interface that can be used to send raw frames
  is BPF. As a result, many programs such as cdpd, lldpd and various DHCP tools use
  BPF only to send data. This leads to a situation where software like cdpd,
  when run on a high-traffic-volume interface, significantly reduces overall performance
  since we have to acquire additional locks for every packet.
  
  Here we add a sysctl that changes BPF behavior in the following way:
  If a program opens a BPF socket without explicitly specifying a read filter, we
  assume it to be write-only and add it to a special writer-only per-interface list.
  This makes bpf_peers_present() return 0, so no additional overhead is introduced.
  After a filter is supplied, the descriptor is added to the original per-interface list,
  permitting packets to be captured.
  
  Unfortunately, pcap_open_live() sets catch-all filter itself for the purpose of
  setting snap length.
  
  Fortunately, most programs explicitly set a filter (even a catch-all one) after that.
  tcpdump(1) is a good example.
  
  So a somewhat hackish approach is taken: we upgrade the descriptor only after the
  second BIOCSETF is received.
  
  Sysctl is named net.bpf.optimize_writers and is turned off by default.
  
  - While here, document all sysctl variables in bpf.4
  
  Sponsored by Yandex LLC
  
  Reviewed by:    glebius (previous version)
  Reviewed by:    silence on -net@
  Approved by:    (mentor)
  
  MFC after:      4 weeks

Modified:
  head/share/man/man4/bpf.4
  head/sys/net/bpf.c
  head/sys/net/bpf.h
  head/sys/net/bpfdesc.h

Modified: head/share/man/man4/bpf.4
==============================================================================
--- head/share/man/man4/bpf.4	Fri Apr  6 06:53:58 2012	(r233937)
+++ head/share/man/man4/bpf.4	Fri Apr  6 06:55:21 2012	(r233938)
@@ -952,10 +952,33 @@ array initializers:
 .Fn BPF_STMT opcode operand
 and
 .Fn BPF_JUMP opcode operand true_offset false_offset .
-.Sh FILES
-.Bl -tag -compact -width /dev/bpf
-.It Pa /dev/bpf
-the packet filter device
+.Sh SYSCTL VARIABLES
+A set of
+.Xr sysctl 8
+variables controls the behaviour of the
+.Nm
+subsystem
+.Bl -tag -width indent
+.It Va net.bpf.optimize_writers: No 0
+Various programs use BPF to send (but not receive) raw packets
+(cdpd, lldpd, dhcpd, dhcp relays, etc. are good examples of such programs).
+They do not need incoming packets to be sent to them. Turning this option on
+makes new BPF users attach to the write-only interface list until the program
+explicitly specifies a read filter via
+.Cm pcap_set_filter() .
+This removes any performance degradation for high-speed interfaces.
+.It Va net.bpf.stats:
+Binary interface for retrieving general statistics.
+.It Va net.bpf.zerocopy_enable: No 0
+Permits zero-copy to be used with net BPF readers. Use with caution.
+.It Va net.bpf.maxinsns: No 512
+Maximum number of instructions that BPF program can contain. Use
+.Xr tcpdump 1
+-d option to determine the approximate number of instructions for any filter.
+.It Va net.bpf.maxbufsize: No 524288
+Maximum buffer size to allocate for packets buffer.
+.It Va net.bpf.bufsize: No 4096
+Default buffer size to allocate for packets buffer.
 .El
 .Sh EXAMPLES
 The following filter is taken from the Reverse ARP Daemon.

Modified: head/sys/net/bpf.c
==============================================================================
--- head/sys/net/bpf.c	Fri Apr  6 06:53:58 2012	(r233937)
+++ head/sys/net/bpf.c	Fri Apr  6 06:55:21 2012	(r233938)
@@ -176,6 +176,12 @@ SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_
 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
     bpf_stats_sysctl, "bpf statistics portal");
 
+static VNET_DEFINE(int, bpf_optimize_writers) = 0;
+#define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
+SYSCTL_VNET_INT(_net_bpf, OID_AUTO, optimize_writers,
+    CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0,
+    "Do not send packets until BPF program is set");
+
 static	d_open_t	bpfopen;
 static	d_read_t	bpfread;
 static	d_write_t	bpfwrite;
@@ -572,17 +578,66 @@ static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
 	/*
-	 * Point d at bp, and add d to the interface's list of listeners.
-	 * Finally, point the driver's bpf cookie at the interface so
-	 * it will divert packets to bpf.
+	 * Point d at bp, and add d to the interface's list.
+	 * Since there are many applications using BPF for
+	 * sending raw packets only (dhcpd, cdpd are good examples)
+	 * we can delay adding d to the list of active listeners until
+	 * some filter is configured.
 	 */
-	BPFIF_WLOCK(bp);
 	d->bd_bif = bp;
-	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 
+	BPFIF_WLOCK(bp);
+
+	if (V_bpf_optimize_writers != 0) {
+		/* Add to writers-only list */
+		LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
+		/*
+		 * We decrement bd_writer on every filter set operation.
+		 * First BIOCSETF is done by pcap_open_live() to set up
+		 * snap length. After that application usually sets its own filter
+		 */
+		d->bd_writer = 2;
+	} else
+		LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
+
+	BPFIF_WUNLOCK(bp);
+
+	BPF_LOCK();
 	bpf_bpfd_cnt++;
+	BPF_UNLOCK();
+
+	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
+	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
+
+	if (V_bpf_optimize_writers == 0)
+		EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
+}
+
+/*
+ * Add d to the list of active bp filters.
+ * Requires bpf_attachd() to be called before.
+ */
+static void
+bpf_upgraded(struct bpf_d *d)
+{
+	struct bpf_if *bp;
+
+	bp = d->bd_bif;
+
+	BPFIF_WLOCK(bp);
+	BPFD_WLOCK(d);
+
+	/* Remove from writers-only list */
+	LIST_REMOVE(d, bd_next);
+	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
+	/* Mark d as reader */
+	d->bd_writer = 0;
+
+	BPFD_WUNLOCK(d);
 	BPFIF_WUNLOCK(bp);
 
+	CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
+
 	EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
 }
 
@@ -596,12 +651,17 @@ bpf_detachd(struct bpf_d *d)
 	struct bpf_if *bp;
 	struct ifnet *ifp;
 
+	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
+
 	BPF_LOCK_ASSERT();
 
 	bp = d->bd_bif;
 	BPFIF_WLOCK(bp);
 	BPFD_WLOCK(d);
 
+	/* Save bd_writer value */
+	error = d->bd_writer;
+
 	/*
 	 * Remove d from the interface's descriptor list.
 	 */
@@ -615,7 +675,9 @@ bpf_detachd(struct bpf_d *d)
 	/* We're already protected by global lock. */
 	bpf_bpfd_cnt--;
 
-	EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
+	/* Call event handler iff d is attached */
+	if (error == 0)
+		EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
 
 	/*
 	 * Check if this descriptor had requested promiscuous mode.
@@ -1536,6 +1598,7 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
 #ifdef COMPAT_FREEBSD32
 	struct bpf_program32 *fp32;
 	struct bpf_program fp_swab;
+	int need_upgrade = 0;
 
 	if (cmd == BIOCSETWF32 || cmd == BIOCSETF32 || cmd == BIOCSETFNR32) {
 		fp32 = (struct bpf_program32 *)fp;
@@ -1611,6 +1674,16 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
 #endif
 			if (cmd == BIOCSETF)
 				reset_d(d);
+
+			/*
+			 * Do not require upgrade by first BIOCSETF
+			 * (used to set snaplen) by pcap_open_live()
+			 */
+			if ((d->bd_writer != 0) && (--d->bd_writer == 0))
+				need_upgrade = 1;
+			CTR4(KTR_NET, "%s: filter function set by pid %d, "
+			    "bd_writer counter %d, need_upgrade %d",
+			    __func__, d->bd_pid, d->bd_writer, need_upgrade);
 		}
 		BPFD_WUNLOCK(d);
 		BPFIF_WUNLOCK(d->bd_bif);
@@ -1621,6 +1694,10 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
 			bpf_destroy_jit_filter(ofunc);
 #endif
 
+		/* Move d to active readers list */
+		if (need_upgrade != 0)
+			bpf_upgraded(d);
+
 		return (0);
 	}
 	free((caddr_t)fcode, M_BPF);
@@ -2265,6 +2342,7 @@ bpfattach2(struct ifnet *ifp, u_int dlt,
 		panic("bpfattach");
 
 	LIST_INIT(&bp->bif_dlist);
+	LIST_INIT(&bp->bif_wlist);
 	bp->bif_ifp = ifp;
 	bp->bif_dlt = dlt;
 	rw_init(&bp->bif_lock, "bpf interface lock");
@@ -2520,6 +2598,13 @@ bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
 	index = 0;
 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
 		BPFIF_RLOCK(bp);
+		/* Send writers-only first */
+		LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
+			xbd = &xbdbuf[index++];
+			BPFD_RLOCK(bd);
+			bpfstats_fill_xbpf(xbd, bd);
+			BPFD_RUNLOCK(bd);
+		}
 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
 			xbd = &xbdbuf[index++];
 			BPFD_RLOCK(bd);

Modified: head/sys/net/bpf.h
==============================================================================
--- head/sys/net/bpf.h	Fri Apr  6 06:53:58 2012	(r233937)
+++ head/sys/net/bpf.h	Fri Apr  6 06:55:21 2012	(r233938)
@@ -1104,6 +1104,7 @@ struct bpf_if {
 	u_int bif_hdrlen;		/* length of link header */
 	struct ifnet *bif_ifp;		/* corresponding interface */
 	struct rwlock bif_lock;		/* interface lock */
+	LIST_HEAD(, bpf_d)	bif_wlist;	/* writer-only list */
 #endif
 };
 

Modified: head/sys/net/bpfdesc.h
==============================================================================
--- head/sys/net/bpfdesc.h	Fri Apr  6 06:53:58 2012	(r233937)
+++ head/sys/net/bpfdesc.h	Fri Apr  6 06:55:21 2012	(r233938)
@@ -79,6 +79,7 @@ struct bpf_d {
 	u_char		bd_promisc;	/* true if listening promiscuously */
 	u_char		bd_state;	/* idle, waiting, or timed out */
 	u_char		bd_immediate;	/* true to return on packet arrival */
+	u_char		bd_writer;	/* non-zero if d is writer-only */
 	int		bd_hdrcmplt;	/* false to fill in src lladdr automatically */
 	int		bd_direction;	/* select packet direction */
 	int		bd_tstamp;	/* select time stamping function */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201204060655.q366tM7t096280>