Date: Wed, 16 Aug 2017 18:48:54 +0000 (UTC)
From: Conrad Meyer <cem@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r322588 - in head/sys: amd64/include i386/include x86/x86
Message-ID: <201708161848.v7GImsDv079412@repo.freebsd.org>
Author: cem
Date: Wed Aug 16 18:48:53 2017
New Revision: 322588
URL: https://svnweb.freebsd.org/changeset/base/322588

Log:
  x86: Add dynamic interrupt rebalancing

  Add an option to dynamically rebalance interrupts across cores
  (hw.intrbalance); off by default.  The goal is to minimize preemption.
  By placing interrupt sources on distinct CPUs, ithreads get
  preferentially scheduled on distinct CPUs.  Overall preemption is
  reduced and latency is reduced.  In our workflow it reduced "fighting"
  between two high-frequency interrupt sources.  Reduced latency was
  proven by, e.g., SPEC2008.

  Submitted by:	jeff@ (earlier version)
  Reviewed by:	kib@
  Sponsored by:	Dell EMC Isilon
  Differential Revision:	https://reviews.freebsd.org/D10435
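Since both knobs land as ordinary sysctls, the feature can be exercised
from userland with no new tooling.  Here is a minimal sketch, not part
of this commit and with only basic error handling, that uses
sysctlbyname(3) to request a 60-second rebalance interval and then dump
the per-CPU layout reported by the new hw.intrs string sysctl:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int interval = 60;
	char *buf;
	size_t len;

	/* hw.intrbalance is the SYSCTL_INT added by this commit. */
	if (sysctlbyname("hw.intrbalance", NULL, NULL, &interval,
	    sizeof(interval)) != 0)
		err(1, "set hw.intrbalance");

	/* Size the buffer first, then fetch the formatted table. */
	if (sysctlbyname("hw.intrs", NULL, &len, NULL, 0) != 0)
		err(1, "size hw.intrs");
	if ((buf = malloc(len)) == NULL)
		err(1, "malloc");
	if (sysctlbyname("hw.intrs", buf, &len, NULL, 0) != 0)
		err(1, "get hw.intrs");
	fputs(buf, stdout);	/* one "name:index @cpu: count" per line */
	free(buf);
	return (0);
}

From the shell the equivalent is simply "sysctl hw.intrbalance=60"
followed by "sysctl hw.intrs".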
Modified:
  head/sys/amd64/include/intr_machdep.h
  head/sys/i386/include/intr_machdep.h
  head/sys/x86/x86/intr_machdep.c

Modified: head/sys/amd64/include/intr_machdep.h
==============================================================================
--- head/sys/amd64/include/intr_machdep.h	Wed Aug 16 18:00:32 2017	(r322587)
+++ head/sys/amd64/include/intr_machdep.h	Wed Aug 16 18:48:53 2017	(r322588)
@@ -130,6 +130,7 @@ struct intsrc {
 	u_long *is_straycount;
 	u_int is_index;
 	u_int is_handlers;
+	u_int is_cpu;
 };
 
 struct trapframe;

Modified: head/sys/i386/include/intr_machdep.h
==============================================================================
--- head/sys/i386/include/intr_machdep.h	Wed Aug 16 18:00:32 2017	(r322587)
+++ head/sys/i386/include/intr_machdep.h	Wed Aug 16 18:48:53 2017	(r322588)
@@ -130,6 +130,7 @@ struct intsrc {
 	u_long *is_straycount;
 	u_int is_index;
 	u_int is_handlers;
+	u_int is_cpu;
 };
 
 struct trapframe;

Modified: head/sys/x86/x86/intr_machdep.c
==============================================================================
--- head/sys/x86/x86/intr_machdep.c	Wed Aug 16 18:00:32 2017	(r322587)
+++ head/sys/x86/x86/intr_machdep.c	Wed Aug 16 18:48:53 2017	(r322588)
@@ -45,10 +45,14 @@
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
+#include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
+#include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 #include <machine/clock.h>
 #include <machine/intr_machdep.h>
@@ -71,6 +75,12 @@ typedef void (*mask_fn)(void *);
 
 static int intrcnt_index;
 static struct intsrc *interrupt_sources[NUM_IO_INTS];
+static struct intsrc *interrupt_sorted[NUM_IO_INTS];
+CTASSERT(sizeof(interrupt_sources) == sizeof(interrupt_sorted));
+static int intrbalance;
+SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RW, &intrbalance, 0,
+    "Interrupt auto-balance interval (seconds).  Zero disables.");
+static struct timeout_task intrbalance_task;
 static struct sx intrsrc_lock;
 static struct mtx intrpic_lock;
 static struct mtx intrcnt_lock;
@@ -325,6 +335,8 @@ intr_assign_cpu(void *arg, int cpu)
 		isrc = arg;
 		sx_xlock(&intrsrc_lock);
 		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
+		if (error == 0)
+			isrc->is_cpu = cpu;
 		sx_xunlock(&intrsrc_lock);
 	} else
 		error = 0;
@@ -559,6 +571,7 @@ static void
 intr_shuffle_irqs(void *arg __unused)
 {
 	struct intsrc *isrc;
+	u_int cpu;
 	int i;
 
 	/* Don't bother on UP. */
@@ -578,13 +591,15 @@ intr_shuffle_irqs(void *arg __unused)
 			 * this is careful to only advance the
 			 * round-robin if the CPU assignment succeeds.
 			 */
-			if (isrc->is_event->ie_cpu != NOCPU)
-				(void)isrc->is_pic->pic_assign_cpu(isrc,
-				    cpu_apic_ids[isrc->is_event->ie_cpu]);
-			else if (isrc->is_pic->pic_assign_cpu(isrc,
-			    cpu_apic_ids[current_cpu]) == 0)
-				(void)intr_next_cpu();
-
+			cpu = isrc->is_event->ie_cpu;
+			if (cpu == NOCPU)
+				cpu = current_cpu;
+			if (isrc->is_pic->pic_assign_cpu(isrc,
+			    cpu_apic_ids[cpu]) == 0) {
+				isrc->is_cpu = cpu;
+				if (isrc->is_event->ie_cpu == NOCPU)
+					intr_next_cpu();
+			}
 		}
 	}
 	sx_xunlock(&intrsrc_lock);
@@ -592,6 +607,123 @@ intr_shuffle_irqs(void *arg __unused)
 SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
     NULL);
 #endif
+
+/*
+ * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
+ */
+static int
+sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	struct intsrc *isrc;
+	int error;
+	int i;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	sx_slock(&intrsrc_lock);
+	for (i = 0; i < NUM_IO_INTS; i++) {
+		isrc = interrupt_sources[i];
+		if (isrc == NULL)
+			continue;
+		sbuf_printf(&sbuf, "%s:%d @%d: %ld\n",
+		    isrc->is_event->ie_fullname,
+		    isrc->is_index,
+		    isrc->is_cpu,
+		    *isrc->is_count);
+	}
+
+	sx_sunlock(&intrsrc_lock);
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
+SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RW,
+    0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count");
+
+/*
+ * Compare two, possibly NULL, entries in the interrupt source array
+ * by load.
+ */
+static int
+intrcmp(const void *one, const void *two)
+{
+	const struct intsrc *i1, *i2;
+
+	i1 = *(const struct intsrc * const *)one;
+	i2 = *(const struct intsrc * const *)two;
+	if (i1 != NULL && i2 != NULL)
+		return (*i1->is_count - *i2->is_count);
+	if (i1 != NULL)
+		return (1);
+	if (i2 != NULL)
+		return (-1);
+	return (0);
+}
+
+/*
+ * Balance IRQs across available CPUs according to load.
+ */
+static void
+intr_balance(void *dummy __unused, int pending __unused)
+{
+	struct intsrc *isrc;
+	int interval;
+	u_int cpu;
+	int i;
+
+	interval = intrbalance;
+	if (interval == 0)
+		goto out;
+
+	/*
+	 * Sort interrupts according to count.
+	 */
+	sx_xlock(&intrsrc_lock);
+	memcpy(interrupt_sorted, interrupt_sources, sizeof(interrupt_sorted));
+	qsort(interrupt_sorted, NUM_IO_INTS, sizeof(interrupt_sorted[0]),
+	    intrcmp);
+
+	/*
+	 * Restart the scan from the same location to avoid moving in the
+	 * common case.
+	 */
+	current_cpu = 0;
+
+	/*
+	 * Assign round-robin from most loaded to least.
+	 */
+	for (i = NUM_IO_INTS - 1; i >= 0; i--) {
+		isrc = interrupt_sorted[i];
+		if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU)
+			continue;
+		cpu = current_cpu;
+		intr_next_cpu();
+		if (isrc->is_cpu != cpu &&
+		    isrc->is_pic->pic_assign_cpu(isrc,
+		    cpu_apic_ids[cpu]) == 0)
+			isrc->is_cpu = cpu;
+	}
+	sx_xunlock(&intrsrc_lock);
+out:
+	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
+	    interval ? hz * interval : hz * 60);
+
+}
+
+static void
+intr_balance_init(void *dummy __unused)
+{
+
+	TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
+	    NULL);
+	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
+}
+SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);
+
 #else
 /*
  * Always route interrupts to the current processor in the UP case.
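One caveat in intrcmp() above: it compares two u_long counters by
subtraction and returns the difference as an int, so once two counters
differ by more than INT_MAX the converted result can wrap and report
the wrong ordering.  For counters that stay close together this is
harmless, but a subtraction-free variant is straightforward.  A sketch
(the name intrcmp_safe is ours, not the commit's) that keeps the same
contract, NULL entries sorting first and live entries ascending by
count:

static int
intrcmp_safe(const void *one, const void *two)
{
	const struct intsrc *i1, *i2;

	i1 = *(const struct intsrc * const *)one;
	i2 = *(const struct intsrc * const *)two;
	/* NULL entries sort before any live source, as in intrcmp(). */
	if (i1 == NULL || i2 == NULL)
		return ((i1 != NULL) - (i2 != NULL));
	/* Three-way compare without u_long subtraction. */
	return ((*i1->is_count > *i2->is_count) -
	    (*i1->is_count < *i2->is_count));
}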
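The periodic machinery itself, a timeout_task set up with
TIMEOUT_TASK_INIT whose handler re-queues itself with
taskqueue_enqueue_timeout(), is a reusable idiom beyond interrupt
balancing.  A standalone sketch of the same self-rearming pattern
packaged as a trivial kernel module follows; every demo_* name is
invented for the example, and the stop flag plus the cancel/drain loop
from taskqueue(9) handle the unload case that the commit's SYSINIT
never faces:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>

static struct timeout_task demo_task;
static int demo_stop;

static void
demo_fn(void *arg __unused, int pending __unused)
{

	printf("demo_task fired\n");
	/* Re-arm ourselves, as intr_balance() does, unless unloading. */
	if (!demo_stop)
		taskqueue_enqueue_timeout(taskqueue_thread, &demo_task,
		    10 * hz);
}

static int
demo_modevent(module_t mod __unused, int what, void *arg __unused)
{

	switch (what) {
	case MOD_LOAD:
		TIMEOUT_TASK_INIT(taskqueue_thread, &demo_task, 0, demo_fn,
		    NULL);
		taskqueue_enqueue_timeout(taskqueue_thread, &demo_task, hz);
		return (0);
	case MOD_UNLOAD:
		/* Stop re-arming, then cancel/drain per taskqueue(9). */
		demo_stop = 1;
		while (taskqueue_cancel_timeout(taskqueue_thread, &demo_task,
		    NULL) != 0)
			taskqueue_drain_timeout(taskqueue_thread, &demo_task);
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t demo_mod = {
	"demo_task",
	demo_modevent,
	NULL
};
DECLARE_MODULE(demo_task, demo_mod, SI_SUB_KLD, SI_ORDER_ANY);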