Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 27 Dec 2012 10:46:49 +0100
From:      Luigi Rizzo <rizzo@iet.unipi.it>
To:        current@freebsd.org, jfvogel@gmail.com
Cc:        emulation@freebsd.org
Subject:   [RFC] proposed 'lem' patch to improve behaviour under emulation
Message-ID:  <20121227094649.GA48891@onelab2.iet.unipi.it>

next in thread | raw e-mail | index | archive | help

--wRRV7LY7NUeQGEoC
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

This patch implements two features for the 'lem' driver that
greatly improve the throughput under proper hypervisor.
This is joint work with Vincenzo Maffione and Giuseppe Lettieri,
I am posting it here for review, will then commit it 
if there are no objections.
 
The first change is to implement a sysctl to access the 'itr'
interrupt moderation register for the devices supported by this
driver. It is little more than adding a struct into the device
descriptor, and one line to create the dynamic sysctl entry, same
as it is done for the other mitigation registers.
 
The second change is more interesting and has huge benefits on througput.

Under virtualization, "VM exits" (which happen every time there is
an access to a register of the emulated peripheral) are extremely
expensive.  In the tx path of the 'lem' driver, there is a write
to the TDT register on every packet sent. 

The patch we propose, if enabled through a sysctl (defaults off, 
so no change from current behaviour) defers writes to the TDT 
register when there is a pending transmit interrupt. 
This means that, together with proper emulation of interrupt
mitigation on the hypervisor side, the number of VM exits
is dramatically reduced. To give you an idea, on a modern
system with qemu-kvm and companion patches, UDP throughput is
 
					KVM		QEMU
standard KVM, standard driver            20 Kpps	 6.3 Kpps
modified KVM, standard driver            37 Kpps	28 Kpps
modified KVM, modified driver           200 Kpps	34 Kpps
 
As you can see, on kvm this change gives a 5x speedup to the tx path,
which combines nicely with the 2x speedup that comes from supporting
interrupt mitigation alone in the hypervisor. Without kvm (or kqemu ?)
the benefits are much lower, as the guest becomes too slow.

Patch follows. It would be good if people with real hardware
using the 'lem' driver could test it to make sure it does no
harm on their devices (in any case the sysctl variable
dev.em.0.mit_enable must be set to explicitly enable it
at runtime).

(for those curious to test it under kvm, i am also attaching a
patch that you need to apply to qemu in order to exploit the
effect of interrupt mitigation; it is a followup of a similar
patch i posted in july to the qemu mailing list, and i will
post it the update there as well, shortly. Unfortunately
we do not have kvm on freebsd..)

cheers
luigi

--wRRV7LY7NUeQGEoC
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="patch-e1000-emulation-20121227.diff"

Index: if_lem.c
===================================================================
--- if_lem.c	(revision 244673)
+++ if_lem.c	(working copy)
@@ -32,6 +32,8 @@
 ******************************************************************************/
 /*$FreeBSD$*/
 
+#define MITIGATION
+
 #ifdef HAVE_KERNEL_OPTION_HEADERS
 #include "opt_device_polling.h"
 #include "opt_inet.h"
@@ -281,6 +283,9 @@
 #define EM_TICKS_TO_USECS(ticks)	((1024 * (ticks) + 500) / 1000)
 #define EM_USECS_TO_TICKS(usecs)	((1000 * (usecs) + 512) / 1024)
 
+#define MAX_INTS_PER_SEC	8000
+#define DEFAULT_ITR	     1000000000/(MAX_INTS_PER_SEC * 256)
+
 static int lem_tx_int_delay_dflt = EM_TICKS_TO_USECS(EM_TIDV);
 static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
 static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
@@ -442,6 +447,11 @@
 		    &adapter->tx_abs_int_delay,
 		    E1000_REGISTER(&adapter->hw, E1000_TADV),
 		    lem_tx_abs_int_delay_dflt);
+		lem_add_int_delay_sysctl(adapter, "itr",
+		    "interrupt delay limit in usecs/4",
+		    &adapter->tx_itr,
+		    E1000_REGISTER(&adapter->hw, E1000_ITR),
+		    DEFAULT_ITR);
 	}
 
 	/* Sysctls for limiting the amount of work done in the taskqueue */
@@ -449,6 +459,12 @@
 	    "max number of rx packets to process", &adapter->rx_process_limit,
 	    lem_rx_process_limit);
 
+#ifdef MITIGATION
+	/* Sysctls to control mitigation */
+	lem_add_rx_process_limit(adapter, "mit_enable",
+	    "driver TDT mitigation", &adapter->mit_enable, 0);
+#endif /* MITIGATION */
+
         /* Sysctl for setting the interface flow control */
 	lem_set_flow_cntrl(adapter, "flow_control",
 	    "flow control setting",
@@ -1702,6 +1718,17 @@
 	 */
 	bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+#ifdef MITIGATION
+	if (adapter->mit_enable) {
+		if (adapter->shadow_tdt & MIT_PENDING_INT) {
+			/* signal intr and data pending */
+			adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff);
+			return (0);
+		} else {
+			adapter->shadow_tdt = MIT_PENDING_INT;
+		}
+	}
+#endif /* MITIGATION */
 	if (adapter->hw.mac.type == e1000_82547 &&
 	    adapter->link_duplex == HALF_DUPLEX)
 		lem_82547_move_tail(adapter);
@@ -3027,6 +3054,16 @@
         adapter->next_tx_to_clean = first;
         adapter->num_tx_desc_avail = num_avail;
 
+#ifdef MITIGATION
+	if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
+		/* a tdt write is pending, do it */
+		E1000_WRITE_REG(&adapter->hw, E1000_TDT(0),
+			0xffff & adapter->shadow_tdt);
+		adapter->shadow_tdt = MIT_PENDING_INT;
+	} else {
+		adapter->shadow_tdt = 0; // disable
+	}
+#endif /* MITIGATION */
         /*
          * If we have enough room, clear IFF_DRV_OACTIVE to
          * tell the stack that it is OK to send packets.
@@ -3246,8 +3283,6 @@
  *  Enable receive unit.
  *
  **********************************************************************/
-#define MAX_INTS_PER_SEC	8000
-#define DEFAULT_ITR	     1000000000/(MAX_INTS_PER_SEC * 256)
 
 static void
 lem_initialize_receive_unit(struct adapter *adapter)
@@ -4588,6 +4623,8 @@
 		return (EINVAL);
 	info->value = usecs;
 	ticks = EM_USECS_TO_TICKS(usecs);
+	if (info->offset == E1000_ITR)	/* units are 256ns here */
+		ticks *= 4;
 
 	adapter = info->adapter;
 	
Index: if_lem.h
===================================================================
--- if_lem.h	(revision 244673)
+++ if_lem.h	(working copy)
@@ -363,6 +363,7 @@
 	struct em_int_delay_info tx_abs_int_delay;
 	struct em_int_delay_info rx_int_delay;
 	struct em_int_delay_info rx_abs_int_delay;
+	struct em_int_delay_info tx_itr;
 
 	/*
 	 * Transmit definitions
@@ -436,6 +437,13 @@
 	boolean_t       pcix_82544;
 	boolean_t       in_detach;
 
+#ifdef MITIGATION
+	/* values for shadow_tdt: 0 = idle, 16-low bits: tdt) */
+#define MIT_PENDING_INT	0x10000	/* pending interrupt */
+#define MIT_PENDING_TDT	0x30000	/* both intr and tdt write are pending */
+	uint32_t shadow_tdt;
+	uint32_t mit_enable;	/* control driver tx mitigation */
+#endif /* MITIGATION */
 
 	struct e1000_hw_stats stats;
 };

--wRRV7LY7NUeQGEoC
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="patch-qemu-mitigation-20121227.diff"

diff -urp qemu-1.3.0-orig/hw/e1000.c qemu-1.3.0/hw/e1000.c
--- qemu-1.3.0-orig/hw/e1000.c	2012-12-03 20:37:05.000000000 +0100
+++ qemu-1.3.0/hw/e1000.c	2012-12-27 09:47:16.000000000 +0100
@@ -35,6 +35,8 @@
 
 #include "e1000_hw.h"
 
+static int mit_on = 1;	/* interrupt mitigation enable */
+
 #define E1000_DEBUG
 
 #ifdef E1000_DEBUG
@@ -129,6 +131,9 @@ typedef struct E1000State_st {
     } eecd_state;
 
     QEMUTimer *autoneg_timer;
+    QEMUTimer *mit_timer;	// handle for the timer
+    uint32_t mit_timer_on;	// mitigation timer active
+    uint32_t mit_cause;		// pending interrupt cause
 } E1000State;
 
 #define	defreg(x)	x = (E1000_##x>>2)
@@ -144,6 +149,7 @@ enum {
     defreg(TPR),	defreg(TPT),	defreg(TXDCTL),	defreg(WUFC),
     defreg(RA),		defreg(MTA),	defreg(CRCERRS),defreg(VFTA),
     defreg(VET),
+    defreg(RDTR),	defreg(RADV),	defreg(TADV),	defreg(ITR),
 };
 
 static void
@@ -639,6 +645,68 @@ static uint64_t tx_desc_base(E1000State
     return (bah << 32) + bal;
 }
 
+/* helper function, 0 means the value is not set */
+static inline void
+mit_update_delay(uint32_t *curr, uint32_t value)
+{
+    if (value && (*curr == 0 || value < *curr))
+	*curr = value;
+}
+
+/*
+ * If necessary, rearm the timer and post an interrupt.
+ * Called at the end of tx/rx routines (mit_timer_on == 0),
+ * and when the timer fires (mit_timer_on == 1).
+ * We provide a partial implementation of interrupt mitigation,
+ * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for
+ * RADV and TADV, 256ns units for ITR). RDTR is only used to enable RADV;
+ * relative timers based on TIDV and RDTR are not implemented.
+ */
+static void
+mit_rearm_and_int(void *opaque)
+{
+    E1000State *s = opaque;
+    uint32_t mit_delay = 0;
+
+    /*
+     * Clear the flag. It is only set when the callback fires,
+     * and we need to clear it anyways.
+     */
+    s->mit_timer_on = 0;
+    if (s->mit_cause == 0) /* no events pending, we are done */
+	return;
+    /*
+     * Compute the next mitigation delay according to pending interrupts
+     * and the current values of RADV (provided RDTR!=0), TADV and ITR.
+     * Then rearm the timer.
+     */
+    if (s->mit_cause & (E1000_ICR_TXQE | E1000_ICR_TXDW))
+	mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
+    if (s->mac_reg[RDTR] && (s->mit_cause & E1000_ICS_RXT0))
+	mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
+    mit_update_delay(&mit_delay, s->mac_reg[ITR]);  
+
+    if (mit_delay) {
+	s->mit_timer_on = 1;
+	qemu_mod_timer(s->mit_timer,
+		qemu_get_clock_ns(vm_clock) + mit_delay * 256);
+    }
+    set_ics(s, 0, s->mit_cause);
+    s->mit_cause = 0;
+}
+
+static void
+mit_set_ics(E1000State *s, uint32_t cause)
+{
+    if (mit_on == 0) {
+	set_ics(s, 0, cause);
+	return;
+    }
+    s->mit_cause |= cause;
+    if (!s->mit_timer_on)
+	mit_rearm_and_int(s);
+}
+
 static void
 start_xmit(E1000State *s)
 {
@@ -676,7 +744,7 @@ start_xmit(E1000State *s)
             break;
         }
     }
-    set_ics(s, 0, cause);
+    mit_set_ics(s, cause);
 }
 
 static int
@@ -894,7 +962,7 @@ e1000_receive(NetClientState *nc, const
         s->rxbuf_min_shift)
         n |= E1000_ICS_RXDMT0;
 
-    set_ics(s, 0, n);
+    mit_set_ics(s, n);
 
     return size;
 }
@@ -999,6 +1067,7 @@ static uint32_t (*macreg_readops[])(E100
     getreg(RDH),	getreg(RDT),	getreg(VET),	getreg(ICS),
     getreg(TDBAL),	getreg(TDBAH),	getreg(RDBAH),	getreg(RDBAL),
     getreg(TDLEN),	getreg(RDLEN),
+    getreg(RDTR),	getreg(RADV),	getreg(TADV),	getreg(ITR),
 
     [TOTH] = mac_read_clr8,	[TORH] = mac_read_clr8,	[GPRC] = mac_read_clr4,
     [GPTC] = mac_read_clr4,	[TPR] = mac_read_clr4,	[TPT] = mac_read_clr4,
@@ -1015,6 +1084,8 @@ static void (*macreg_writeops[])(E1000St
     putreg(PBA),	putreg(EERD),	putreg(SWSM),	putreg(WUFC),
     putreg(TDBAL),	putreg(TDBAH),	putreg(TXDCTL),	putreg(RDBAH),
     putreg(RDBAL),	putreg(LEDCTL), putreg(VET),
+    [RDTR] = set_16bit,	[RADV] = set_16bit,	[TADV] = set_16bit,
+    [ITR] = set_16bit,
     [TDLEN] = set_dlen,	[RDLEN] = set_dlen,	[TCTL] = set_tctl,
     [TDT] = set_tctl,	[MDIC] = set_mdic,	[ICS] = set_ics,
     [TDH] = set_16bit,	[RDH] = set_16bit,	[RDT] = set_rdt,
@@ -1286,6 +1357,9 @@ static int pci_e1000_init(PCIDevice *pci
     add_boot_device_path(d->conf.bootindex, &pci_dev->qdev, "/ethernet-phy@0");
 
     d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d);
+    d->mit_cause = 0;
+    d->mit_timer_on = 0;
+    d->mit_timer = qemu_new_timer_ns(vm_clock, mit_rearm_and_int, d);
 
     return 0;
 }

--wRRV7LY7NUeQGEoC--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20121227094649.GA48891>