Date: Sat, 23 Jun 2001 14:02:41 -0700 (PDT) From: Matt Dillon <dillon@earth.backplane.com> To: Mikhail Teterin <mi@aldan.algebra.com>, jlemon@FreeBSD.ORG, cvs-committers@FreeBSD.ORG, cvs-all@FreeBSD.ORG Subject: Inline optimized bzero (was Re: cvs commit: src/sys/netinet tcp_subr.c) Message-ID: <200106232102.f5NL2fY73920@earth.backplane.com> References: <200106231912.f5NJCUE01011@aldan.algebra.com>
next in thread | previous in thread | raw e-mail | index | archive | help
I would propose adding a new kernel bzero() function, called bzerol(), which is an inline integer-aligned implementation. This implementation should be called for integer-aligned buffers with known (constant) sizes, and generates about the same number of assembly instructions as calling bzero() eats. I did a quick perusal of the kernel code and an unbelievable number of bzero() calls could be converted.

Test1 - bcopy 20x2 bytes 204.90 nS/loop
Test2 - manual load data 26.61 nS/loop
Test3 - man load w/ptrs 36.38 nS/loop
Test4 - mlptrs & bzero 163.96 nS/loop
Test5 - mlptrszer & call 182.46 nS/loop
Test6 - mlptrszerc/bzerol 67.21 nS/loop
Test7 - bigbuf/libc-bzero 621.11 nS/loop
Test8 - bigbuf/bzerol 669.10 nS/loop

/*
 * MEMTEST.C
 *
 * Micro-benchmark comparing libc bcopy()/bzero() against hand-written
 * stores and the inline bzerol() below.  Each test is run LOOPS times
 * and the average cost per iteration is printed in nanoseconds.
 */
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>	/* bcopy(), bzero() */
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>

#define LOOPS 1000000

/*
 * Time one test.  `call` is a complete call expression such as
 * test3(&Template); it is pasted textually so that the benchmarked
 * function is still called directly from the timing loop.
 *
 * The sequence matches what each test originally did by hand: one
 * untimed call of the test, one throw-away gettimeofday(), then the
 * timed loop of LOOPS iterations, then the report.
 */
#define RUN_TEST(label, call)                               \
    do {                                                    \
        struct timeval tbeg_;                               \
        struct timeval tend_;                               \
        int i_;                                             \
                                                            \
        call;                                               \
        gettimeofday(&tend_, NULL);                         \
        gettimeofday(&tbeg_, NULL);                         \
        for (i_ = LOOPS; i_; --i_)                          \
            call;                                           \
        gettimeofday(&tend_, NULL);                         \
        showtimes(&tbeg_, &tend_, (label), LOOPS);          \
    } while (0)

struct DBuf {
    int x[5];
    int y[5];
    char notonsamecacheline[256];
} DBuf, Template, Template2, *GlobPtr = &Template2;

static void showtimes(struct timeval *t1, struct timeval *t2,
                const char *str, int loops);
static void test1(void);
static void test2(void);
static void test3(struct DBuf *template);
static void test4(struct DBuf *template);
static void test5(struct DBuf *template);
static void test6(struct DBuf *template);
static void test7(struct DBuf *template);
static void test8(struct DBuf *template);
static int simplecall(int a, int b, int c);
static void bzerol(void *s, int bytes);

/*
 * 1024-byte scratch buffer for the "bigbuf" tests.  Declared as an int
 * array (same size in bytes) so that it satisfies bzerol()'s alignment
 * requirement; a plain char array carries no such guarantee.
 */
int XBuf[1024 / sizeof(int)];

/*
 * bzerol() - aligned bzero.  The buffer must be integer aligned and sized.
 *
 *	s	- start of the buffer; must be suitably aligned for int.
 *	bytes	- number of bytes to clear; must be non-negative and a
 *		  multiple of sizeof(int) (both are asserted).
 *
 * This routine should only be called with constant sizes, so GCC can
 * optimize it.  This routine typically optimizes down to just a few
 * instructions.
 *
 * Sizes of up to five ints are handled by a fall-through switch.
 * Sizes of eight ints or more are cleared four ints at a time, and
 * whatever remains (including the six and seven int cases) is cleared
 * one int at a time from the end of the remainder downwards.
 */
static __inline void
bzerol(void *s, int bytes)
{
    /* Signed copy of sizeof(int) so the size comparisons stay signed. */
    const int isize = (int)sizeof(int);
    int *p = s;

    /*
     * A negative count would otherwise be converted to a huge unsigned
     * value when compared against sizeof() and run off the buffer.
     */
    assert(bytes >= 0);
    assert((bytes & (isize - 1)) == 0);

    switch (bytes) {
    case sizeof(int) * 5:
        p[4] = 0;
        /* fall through */
    case sizeof(int) * 4:
        p[3] = 0;
        /* fall through */
    case sizeof(int) * 3:
        p[2] = 0;
        /* fall through */
    case sizeof(int) * 2:
        p[1] = 0;
        /* fall through */
    case sizeof(int) * 1:
        p[0] = 0;
        /* fall through */
    case 0:
        return;
    default:
        if (bytes >= isize * 8) {
            while (bytes >= isize * 4) {
                p[0] = 0;
                p[1] = 0;
                p[2] = 0;
                p[3] = 0;
                p += 4;
                bytes -= isize * 4;
            }
        }
        /* Step by sizeof(int), not a literal 4, to match the rest. */
        while (bytes > 0) {
            bytes -= isize;
            *(int *)((char *)p + bytes) = 0;
        }
    }
}

/*
 * Run all eight tests in order and print one timing line per test.
 * The command line arguments are not used.
 */
int
main(int ac, char **av)
{
    (void)ac;
    (void)av;

    RUN_TEST("Test1 - bcopy 20x2 bytes", test1());
    RUN_TEST("Test2 - manual load data", test2());
    RUN_TEST("Test3 - man load w/ptrs ", test3(&Template));
    RUN_TEST("Test4 - mlptrs & bzero ", test4(&Template));
    RUN_TEST("Test5 - mlptrszer & call", test5(&Template));
    RUN_TEST("Test6 - mlptrszerc/bzerol", test6(&Template));
    RUN_TEST("Test7 - bigbuf/libc-bzero", test7(&Template));
    RUN_TEST("Test8 - bigbuf/bzerol", test8(&Template));

    return(0);
}

/*
 * Print the average time per loop iteration, in nanoseconds.
 *
 *	t1, t2	- start and end timestamps of the measured run.
 *	str	- label printed at the start of the line.
 *	loops	- number of iterations executed between t1 and t2.
 */
static void
showtimes(struct timeval *t1, struct timeval *t2, const char *str, int loops)
{
    long us;

    /*
     * Borrow one second into the microsecond term so that the
     * subtraction of the tv_usec fields cannot go negative.
     */
    us = (t2->tv_usec + 1000000 - t1->tv_usec) +
        (t2->tv_sec - t1->tv_sec - 1) * 1000000;
    printf("%s\t%6.2f nS/loop\n", str, (double)us * 1000.0 / (double)loops);
}

/*
 * Test1: copy both five-int arrays from Template into DBuf with bcopy().
 */
static void
test1(void)
{
    bcopy(Template.x, DBuf.x, sizeof(DBuf.x));
    bcopy(Template.y, DBuf.y, sizeof(DBuf.y));
}

/*
 * Test2: store zero into every element of DBuf.x and DBuf.y by hand.
 */
static void
test2(void)
{
    DBuf.x[0] = 0;
    DBuf.x[1] = 0;
    DBuf.x[2] = 0;
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;
    DBuf.y[0] = 0;
    DBuf.y[1] = 0;
    DBuf.y[2] = 0;
    DBuf.y[3] = 0;
    DBuf.y[4] = 0;
}

/*
 * Test3: fill DBuf by hand with a mix of constants and values loaded
 * through the GlobPtr and template pointers.
 */
static void
test3(struct DBuf *template)
{
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;
    DBuf.y[0] = template->y[0];
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * Test4: the same stores as test3, preceded by a libc bzero() of DBuf.x.
 */
static void
test4(struct DBuf *template)
{
    bzero(&DBuf.x, sizeof(DBuf.x));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;
    DBuf.y[0] = template->y[0];
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * Test5: as test4, but DBuf.y[0] comes from a call to simplecall().
 */
static void
test5(struct DBuf *template)
{
    bzero(&DBuf.x, sizeof(DBuf.x));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;
    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * Test6: as test5, but DBuf.x is cleared with the inline bzerol()
 * instead of libc bzero().
 */
static void
test6(struct DBuf *template)
{
    bzerol(&DBuf.x, sizeof(DBuf.x));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;
    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * Test7: clear the whole of XBuf with libc bzero(), then perform the
 * same stores as test5.
 */
static void
test7(struct DBuf *template)
{
    bzero(XBuf, sizeof(XBuf));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;
    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * Test8: as test7, but XBuf is cleared with the inline bzerol().
 */
static void
test8(struct DBuf *template)
{
    bzerol(XBuf, sizeof(XBuf));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;
    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * Return the sum of the three arguments.
 */
static int
simplecall(int a, int b, int c)
{
    return(a + b + c);
}

To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe cvs-all" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200106232102.f5NL2fY73920>