From owner-freebsd-net@FreeBSD.ORG Wed Aug 22 14:17:26 2012
Delivered-To: freebsd-net@FreeBSD.org
Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52])
	by hub.freebsd.org (Postfix) with ESMTP id 8BFA31065676;
	Wed, 22 Aug 2012 14:17:26 +0000 (UTC)
	(envelope-from luigi@onelab2.iet.unipi.it)
Received: from onelab2.iet.unipi.it (onelab2.iet.unipi.it [131.114.59.238])
	by mx1.freebsd.org (Postfix) with ESMTP id B2CD68FC17;
	Wed, 22 Aug 2012 14:17:25 +0000 (UTC)
Received: by onelab2.iet.unipi.it (Postfix, from userid 275)
	id 5D42E7300A; Wed, 22 Aug 2012 16:36:32 +0200 (CEST)
Date: Wed, 22 Aug 2012 16:36:32 +0200
From: Luigi Rizzo
To: Bruce Evans
Message-ID: <20120822143632.GA64686@onelab2.iet.unipi.it>
References: <20120821112415.GA50078@onelab2.iet.unipi.it>
	<201208220232.q7M2WLCL020204@ref10-i386.freebsd.org>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <201208220232.q7M2WLCL020204@ref10-i386.freebsd.org>
User-Agent: Mutt/1.4.2.3i
Cc: freebsd-hackers@FreeBSD.org, freebsd-net@FreeBSD.org, mitya@cabletv.dp.ua,
	marius@alchemy.franken.de
Subject: speed tests (Re: Replace bcopy() to update ether_addr)
X-BeenThere: freebsd-net@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: Networking and TCP/IP with FreeBSD
X-List-Received-Date: Wed, 22 Aug 2012 14:17:26 -0000

On Wed, Aug 22, 2012 at 02:32:21AM +0000, Bruce Evans wrote:
> luigi wrote:
>
> > even more orthogonal:
> >
> > I found that copying 8n + (5, 6 or 7) bytes was much much slower than
> > copying a multiple of 8 bytes. For n=0, 1, 2, 4, 8 bytes are efficient,
> > other cases are slow (turned into 2 or 3 different writes).
> >
> > The netmap code uses a pkt_copy routine that does exactly this
> > rounding, gaining some 10-20ns per packet for small sizes.
>
> I don't believe 10-20ns for just the extra bytes.  memcpy() ends up
> with a movsb to copy the extra bytes.  This can be slow, but I don't
> believe 10-20ns (except on machines running at i486 speeds of course).

I am adding at the end a test program so people can try things on
their own hardware. Build it with

    cc -O2 -Werror -Wall -Wextra -lpthread -lrt testlock.c -o testlock

and on my i7 I get these results:

    ./testlock -m memcpy   -l 7   ->  ~23 Mops/s   43 ns/cycle
    ./testlock -m bcopy    -l 7   ->  ~10 Mops/s  100 ns/cycle
    ./testlock -m fastcopy -l 7   ->  ~64 Mops/s   16 ns/cycle

(fastcopy rounds the length up to the next multiple of 8.)

Changing the length (-l ...) changes the speed, of course. For some
reason my machine is fast for 8n+(0,1,2,3) and slow for 8n+(4,5,6,7).
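
For readers who only want the idea behind the rounding trick, here is a
minimal sketch (illustrative only, not the actual netmap pkt_copy; the
function name is made up): round the requested length up to a multiple
of 8 and copy whole 64-bit words, so the tail is never broken into
1/2/4-byte stores. It assumes both buffers are 8-byte aligned and padded
so the extra bytes can be written safely, as packet buffers typically are.

    /* Sketch of the rounding idea; not the netmap code.
     * Both buffers must have room for the length rounded up to 8. */
    #include <stdint.h>

    static void
    copy_rounded8(void *_dst, const void *_src, int len)
    {
            uint64_t *dst = _dst;
            const uint64_t *src = _src;
            int n = (len + 7) / 8;  /* 8-byte words, rounded up */

            while (n-- > 0)
                    *dst++ = *src++;
    }

fast_bcopy() in the test program below is the same idea, just unrolled
to 64-byte chunks.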

cheers
luigi

--------------------

/*
 * Copyright (C) 2012 Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: testlock.c 11731 2012-08-22 14:19:50Z luigi $
 *
 * Test program to study various ops and concurrency issues.
 * Create multiple threads, possibly bind to cpus, and run a workload.
 *
 * cc -O2 -Werror -Wall testlock.c -o testlock -lpthread
 * you might need -lrt
 */

#include <inttypes.h>
#include <sys/types.h>
#include <pthread.h>	/* pthread_* */

#if defined(__APPLE__)

#include <libkern/OSAtomic.h>
#define atomic_add_int(p, n)		OSAtomicAdd32(n, (int *)p)
#define atomic_cmpset_32(p, o, n)	OSAtomicCompareAndSwap32(o, n, (int *)p)

#elif defined(linux)

/* not atomic: simple placeholder for the Linux build */
int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
	int ret = *p == old;
	*p = new;
	return ret;
}

#if defined(HAVE_GCC_ATOMICS)
int atomic_add_int(volatile int *p, int v)
{
	return __sync_fetch_and_add(p, v);
}
#else
inline
uint32_t atomic_add_int(uint32_t *p, int v)
{
	__asm __volatile (
	"	lock   xaddl   %0, %1 ;	"
	: "+r" (v),			/* 0 (result) */
	  "=m" (*p)			/* 1 */
	: "m" (*p));			/* 2 */
	return (v);
}
#endif

#else /* FreeBSD */

#include <sys/param.h>
#include <machine/atomic.h>
#include <pthread_np.h>	/* pthread w/ affinity */

#if __FreeBSD_version > 500000
#include <sys/cpuset.h>	/* cpu_set */
#if __FreeBSD_version > 800000
#define HAVE_AFFINITY
#endif

inline void prefetch (const void *x)
{
	__asm volatile("prefetcht0 %0" :: "m" (*(const unsigned long *)x));
}
#else /* FreeBSD 4.x */
int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
	int ret = *p == old;
	*p = new;
	return ret;
}
#define PRIu64	"llu"
#endif /* FreeBSD 4.x */

#endif /* FreeBSD */

#include <signal.h>	/* signal */
#include <stdlib.h>
#include <stdio.h>
#include <poll.h>
#include <inttypes.h>	/* PRI* macros */
#include <string.h>	/* strcmp */
#include <fcntl.h>	/* open */
#include <unistd.h>	/* getopt */
#include <sys/sysctl.h>	/* sysctl */
#include <sys/time.h>	/* timersub */

static inline int min(int a, int b) { return a < b ? a : b; }

#define ONE_MILLION	1000000

/* debug support */
#define ND(format, ...)
#define D(format, ...)				\
	fprintf(stderr, "%s [%d] " format "\n",	\
	__FUNCTION__, __LINE__, ##__VA_ARGS__)

int verbose = 0;

#if 1 // def MY_RDTSC
/* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */
#define my_rdtsc(t)				\
	do {					\
		u_int __regs[4];		\
						\
		do_cpuid(0, __regs);		\
		(t) = rdtsc();			\
	} while (0)

static __inline void
do_cpuid(u_int ax, u_int *p)
{
	__asm __volatile("cpuid"
	    : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
	    : "0" (ax));
}

static __inline uint64_t
rdtsc(void)
{
	uint64_t rv;

	// XXX does not work on linux-64 bit
	__asm __volatile("rdtscp" : "=A" (rv) : : "%rax");
	return (rv);
}
#endif /* 1 */

struct targ;

/*** global arguments for all threads ***/
struct glob_arg {
	struct {
		uint32_t ctr[1024];
	} v __attribute__ ((aligned(256)));
	int m_cycles;	/* million cycles */
	int nthreads;
	int cpus;
	int privs;	// 1 if has IO privileges
	int arg;	// microseconds in usleep
	char *test_name;
	void (*fn)(struct targ *);
	uint64_t scale;		// scaling factor
	char *scale_name;	// scaling factor
};

/*
 * Arguments for a new thread.
 */
struct targ {
	struct glob_arg *g;
	int completed;
	u_int *glob_ctr;
	uint64_t volatile count;
	struct timeval tic, toc;
	int me;
	pthread_t thread;
	int affinity;
};

static struct targ *ta;
static int global_nthreads;

/* control-C handler */
static void
sigint_h(int sig)
{
	int i;

	(void)sig;	/* UNUSED */
	for (i = 0; i < global_nthreads; i++) {
		/* cancel active threads. */
		if (ta[i].completed)
			continue;
		D("Cancelling thread #%d\n", i);
		pthread_cancel(ta[i].thread);
		ta[i].completed = 0;
	}
	signal(SIGINT, SIG_DFL);
}

/* sysctl wrapper to return the number of active CPUs */
static int
system_ncpus(void)
{
#ifdef linux
	return 1;
#else
	int mib[2] = { CTL_HW, HW_NCPU }, ncpus;
	size_t len = sizeof(mib);

	sysctl(mib, len / sizeof(mib[0]), &ncpus, &len, NULL, 0);
	D("system had %d cpus", ncpus);
	return (ncpus);
#endif
}

/*
 * try to get I/O privileges so we can execute cli/sti etc.
 */
int
getprivs(void)
{
	int fd = open("/dev/io", O_RDWR);

	if (fd < 0) {
		D("cannot open /dev/io, fd %d", fd);
		return 0;
	}
	return 1;
}

/* set the thread affinity. */
/* ARGSUSED */
#ifdef HAVE_AFFINITY
static int
setaffinity(pthread_t me, int i)
{
	cpuset_t cpumask;

	if (i == -1)
		return 0;

	/* Set thread affinity. */
	CPU_ZERO(&cpumask);
	CPU_SET(i, &cpumask);

	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
		D("Unable to set affinity");
		return 1;
	}
	return 0;
}
#endif

static void *
td_body(void *data)
{
	struct targ *t = (struct targ *) data;

#ifdef HAVE_AFFINITY
	if (0 == setaffinity(t->thread, t->affinity))
#endif
	{
		/* main loop. */
		D("testing %d cycles", t->g->m_cycles);
		gettimeofday(&t->tic, NULL);
		t->g->fn(t);
		gettimeofday(&t->toc, NULL);
	}
	t->completed = 1;
	return (NULL);
}

void
test_sel(struct targ *t)
{
	int m;

	for (m = 0; m < t->g->m_cycles; m++) {
		fd_set r;
		struct timeval to = { 0, t->g->arg };

		FD_ZERO(&r);
		FD_SET(0, &r);
		// FD_SET(1, &r);
		select(1, &r, NULL, NULL, &to);
		t->count++;
	}
}

void
test_poll(struct targ *t)
{
	int m, ms = t->g->arg / 1000;

	for (m = 0; m < t->g->m_cycles; m++) {
		struct pollfd x;

		x.fd = 0;
		x.events = POLLIN;
		poll(&x, 1, ms);
		t->count++;
	}
}

void
test_usleep(struct targ *t)
{
	int m;

	for (m = 0; m < t->g->m_cycles; m++) {
		usleep(t->g->arg);
		t->count++;
	}
}

void
test_cli(struct targ *t)
{
	int m, i;

	if (!t->g->privs) {
		D("%s", "privileged instructions not available");
		return;
	}
	for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			__asm __volatile("cli;");
			__asm __volatile("and %eax, %eax;");
			__asm __volatile("sti;");
			t->count++;
		}
	}
}

void
test_nop(struct targ *t)
{
	int m, i;

	for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			__asm __volatile("nop;");
			__asm __volatile("nop; nop; nop; nop; nop;");
			//__asm __volatile("nop; nop; nop; nop; nop;");
			t->count++;
		}
	}
}

void
test_rdtsc1(struct targ *t)
{
	int m, i;
	uint64_t v;
	(void)v;

	for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			my_rdtsc(v);
			t->count++;
		}
	}
}

void
test_rdtsc(struct targ *t)
{
	int m, i;
	volatile uint64_t v;
	(void)v;

	for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			v = rdtsc();
			t->count++;
		}
	}
}

void
test_add(struct targ *t)
{
	int m, i;

	for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			t->glob_ctr[0]++;
			t->count++;
		}
	}
}

void
test_atomic_add(struct targ *t)
{
	int m, i;

	for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			atomic_add_int(t->glob_ctr, 1);
			t->count++;
		}
	}
}

void
test_atomic_cmpset(struct targ *t)
{
	int m, i;

	for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			atomic_cmpset_32(t->glob_ctr, m, i);
			t->count++;
		}
	}
}

void
test_time(struct targ *t)
{
	int m;

	for (m = 0; m < t->g->m_cycles; m++) {
#ifndef __APPLE__
		struct timespec ts;
		clock_gettime(t->g->arg, &ts);
#endif
		t->count++;
	}
}

void
test_gettimeofday(struct targ *t)
{
	int m;
	struct timeval ts;

	for (m = 0; m < t->g->m_cycles; m++) {
		gettimeofday(&ts, NULL);
		t->count++;
	}
}

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

static void
fast_bcopy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;

	if (unlikely(l >= 1024)) {
		bcopy(src, dst, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}

// XXX if you want to make sure there is no inlining...
// static void (*fp)(void *_src, void *_dst, int l) = fast_bcopy;

#define HU	0x3ffff
static struct glob_arg huge[HU+1];

void
test_fastcopy(struct targ *t)
{
	int m, len;

	len = t->g->arg;
	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("fast copying %d bytes", len);
	for (m = 0; m < t->g->m_cycles; m++) {
		fast_bcopy(t->g, (void *)&huge[m & HU], len);
		t->count += 1;
	}
}

void
test_bcopy(struct targ *t)
{
	int m, len;

	len = t->g->arg;
	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("bcopying %d bytes", len);
	for (m = 0; m < t->g->m_cycles; m++) {
		__builtin_memcpy(t->g, (void *)&huge[m & HU], len);
		t->count += 1;
	}
}

void
test_memcpy(struct targ *t)
{
	int m, len;

	len = t->g->arg;
	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("memcopying %d bytes", len);
	for (m = 0; m < t->g->m_cycles; m++) {
		memcpy((void *)&huge[m & HU], t->g, len);
		t->count += 1;
	}
}

struct entry {
	void (*fn)(struct targ *);
	char *name;
	uint64_t scale;
	uint64_t m_cycles;
};

struct entry tests[] = {
	{ test_sel,		"select",	1,		1000 },
	{ test_poll,		"poll",		1,		1000 },
	{ test_usleep,		"usleep",	1,		1000 },
	{ test_time,		"time",		1,		1000 },
	{ test_gettimeofday,	"gettimeofday",	1,		1000000 },
	{ test_bcopy,		"bcopy",	1,		100000000 },
	{ test_memcpy,		"memcpy",	1,		100000000 },
	{ test_fastcopy,	"fastcopy",	1,		100000000 },
	{ test_add,		"add",		ONE_MILLION,	100000000 },
	{ test_nop,		"nop",		ONE_MILLION,	100000000 },
	{ test_atomic_add,	"atomic-add",	ONE_MILLION,	100000000 },
	{ test_cli,		"cli",		ONE_MILLION,	100000000 },
	{ test_rdtsc,		"rdtsc",	ONE_MILLION,	100000000 },	// unserialized
	{ test_rdtsc1,		"rdtsc1",	ONE_MILLION,	100000000 },	// serialized
	{ test_atomic_cmpset,	"cmpset",	ONE_MILLION,	100000000 },
	{ NULL,			NULL,		0,		0 }
};

static void
usage(void)
{
	const char *cmd = "test";
	int i;

	fprintf(stderr,
		"Usage:\n"
		"%s arguments\n"
		"\t-m name		test name\n"
		"\t-n cycles		(millions) of cycles\n"
		"\t-l arg		bytes, usec, ...\n"
		"\t-t threads		total threads\n"
		"\t-c cores		cores to use\n"
		"\t-a n			force affinity every n cores\n"
		"\t-A n			cache contention every n bytes\n"
		"\t-w report_ms		milliseconds between reports\n"
		"",
		cmd);

	fprintf(stderr, "Available tests:\n");
	for (i = 0; tests[i].name; i++) {
		fprintf(stderr, "%12s\n", tests[i].name);
	}

	exit(0);
}

struct glob_arg g;

int
main(int argc, char **argv)
{
	int i, ch, report_interval, affinity, align;

	ND("g has size %d", (int)sizeof(g));
	report_interval = 250;	/* ms */
	affinity = 0;		/* no affinity */
	align = 0;		/* global variable */

	bzero(&g, sizeof(g));

	g.privs = getprivs();
	g.nthreads = 1;
	g.cpus = 1;
	g.m_cycles = 0;

	while ( (ch = getopt(argc, argv, "A:a:m:n:w:c:t:vl:")) != -1) {
		switch(ch) {
		default:
			D("bad option %c %s", ch, optarg);
			usage();
			break;
		case 'A':	/* align */
			align = atoi(optarg);
			break;
		case 'a':	/* force affinity */
			affinity = atoi(optarg);
			break;
		case 'n':	/* cycles */
			g.m_cycles = atoi(optarg);
			break;
		case 'w':	/* report interval */
			report_interval = atoi(optarg);
			break;
		case 'c':
			g.cpus = atoi(optarg);
			break;
		case 't':
			g.nthreads = atoi(optarg);
			break;
		case 'm':
			g.test_name = optarg;
			break;
		case 'l':
			g.arg = atoi(optarg);
			break;
		case 'v':
			verbose++;
			break;
		}
	}

	argc -= optind;
	argv += optind;
	if (!g.test_name && argc > 0)
		g.test_name = argv[0];
	if (g.test_name) {
		for (i = 0; tests[i].name; i++) {
			if (!strcmp(g.test_name, tests[i].name)) {
				g.fn = tests[i].fn;
				g.scale = tests[i].scale;
				if (g.m_cycles == 0)
					g.m_cycles = tests[i].m_cycles;
				if (g.scale == ONE_MILLION)
					g.scale_name = "M";
				else if (g.scale == 1000)
					g.scale_name = "K";
				else {
					g.scale = 1;
					g.scale_name = "";
				}
				break;
			}
		}
	}
	if (!g.fn) {
		D("%s", "missing/unknown test name");
		usage();
	}
	i = system_ncpus();
	if (g.cpus < 0 || g.cpus > i) {
		D("%d cpus is too high, have only %d cpus", g.cpus, i);
		usage();
	}
	if (g.cpus == 0)
		g.cpus = i;
	if (g.nthreads < 1) {
		D("bad nthreads %d, using 1", g.nthreads);
		g.nthreads = 1;
	}
	i = sizeof(g.v.ctr) / g.nthreads*sizeof(g.v.ctr[0]);
	if (align < 0 || align > i) {
		D("bad align %d, max is %d", align, i);
		align = i;
	}

	/* Install ^C handler. */
	global_nthreads = g.nthreads;
	signal(SIGINT, sigint_h);

	ta = calloc(g.nthreads, sizeof(*ta));
	/*
	 * Now create the desired number of threads, each one
	 * using a single descriptor.
	 */
	D("start %d threads on %d cores", g.nthreads, g.cpus);
	for (i = 0; i < g.nthreads; i++) {
		struct targ *t = &ta[i];

		bzero(t, sizeof(*t));
		t->g = &g;
		t->me = i;
		t->glob_ctr = &g.v.ctr[(i*align)/sizeof(g.v.ctr[0])];
		D("thread %d ptr %p", i, t->glob_ctr);
		t->affinity = affinity ? (affinity*i) % g.cpus : -1;
		if (pthread_create(&t->thread, NULL, td_body, t) != 0) {
			D("Unable to create thread %d", i);
			t->completed = 1;
		}
	}

	/* the main loop */
    {
	uint64_t my_count = 0, prev = 0;
	uint64_t count = 0;
	double delta_t;
	struct timeval tic, toc;

	gettimeofday(&toc, NULL);
	for (;;) {
		struct timeval now, delta;
		uint64_t pps;
		int done = 0;

		delta.tv_sec = report_interval/1000;
		delta.tv_usec = (report_interval%1000)*1000;
		select(0, NULL, NULL, NULL, &delta);
		gettimeofday(&now, NULL);
		timersub(&now, &toc, &toc);
		my_count = 0;
		for (i = 0; i < g.nthreads; i++) {
			my_count += ta[i].count;
			if (ta[i].completed)
				done++;
		}
		pps = toc.tv_sec * ONE_MILLION + toc.tv_usec;
		if (pps < 10000)
			continue;
		pps = (my_count - prev)*ONE_MILLION / pps;
		D("%" PRIu64 " %scycles/s scale %" PRIu64 " in %dus",
			pps/g.scale, g.scale_name, g.scale,
			(int)(toc.tv_sec * ONE_MILLION + toc.tv_usec));
		prev = my_count;
		toc = now;
		if (done == g.nthreads)
			break;
	}

	D("total %" PRIu64 " cycles", prev);

	timerclear(&tic);
	timerclear(&toc);
	for (i = 0; i < g.nthreads; i++) {
		pthread_join(ta[i].thread, NULL);

		if (ta[i].completed == 0)
			continue;

		/*
		 * Collect threads' output and extract information about
		 * how long it took to send all the packets.
		 */
		count += ta[i].count;
		if (!timerisset(&tic) || timercmp(&ta[i].tic, &tic, <))
			tic = ta[i].tic;
		if (!timerisset(&toc) || timercmp(&ta[i].toc, &toc, >))
			toc = ta[i].toc;
	}

	/* print output. */
	timersub(&toc, &tic, &toc);
	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
	D("total %8.6f seconds", delta_t);
    }

	return (0);
}
/* end of file */