Date: Sat, 23 Jun 2001 14:02:41 -0700 (PDT) From: Matt Dillon <dillon@earth.backplane.com> To: Mikhail Teterin <mi@aldan.algebra.com>, jlemon@FreeBSD.ORG, cvs-committers@FreeBSD.ORG, cvs-all@FreeBSD.ORG Subject: Inline optimized bzero (was Re: cvs commit: src/sys/netinet tcp_subr.c) Message-ID: <200106232102.f5NL2fY73920@earth.backplane.com> References: <200106231912.f5NJCUE01011@aldan.algebra.com>
next in thread | previous in thread | raw e-mail | index | archive | help
I would propose adding a new kernel bzero() function, called bzerol(),
which is an inline integer-aligned implementation.
This implementation should be used only for integer-aligned buffers whose
sizes are known (constant) at compile time. For such calls it generates about
as many assembly instructions as the call to bzero() itself costs.
I did a quick perusal of the kernel code and an unbelievable number of
bzero() calls could be converted.
Test1 - bcopy 20x2 bytes 204.90 nS/loop
Test2 - manual load data 26.61 nS/loop
Test3 - man load w/ptrs 36.38 nS/loop
Test4 - mlptrs & bzero 163.96 nS/loop
Test5 - mlptrszer & call 182.46 nS/loop
Test6 - mlptrszerc/bzerol 67.21 nS/loop
Test7 - bigbuf/libc-bzero 621.11 nS/loop
Test8 - bigbuf/bzerol 669.10 nS/loop
/*
* MEMTEST.C
*/
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#define LOOPS 1000000
/*
 * struct DBuf - the record the benchmark cases load from and store into.
 *
 * Only x[] and y[] are touched by the tests in this file.
 */
struct DBuf {
	int	x[5];
	int	y[5];
	/*
	 * Not referenced by any code in this file.
	 * NOTE(review): the name suggests it is padding meant to keep the
	 * global instances on different cache lines -- confirm.
	 */
	char	notonsamecacheline[256];
};

struct DBuf DBuf;			/* destination every test stores into */
struct DBuf Template;			/* source for test1; main() passes its address to test3..test8 */
struct DBuf Template2;			/* reached only through GlobPtr */
struct DBuf *GlobPtr = &Template2;	/* global pointer the tests read x[1] through */
/*
 * Forward declarations.  main() is defined before the helpers and test
 * cases it calls, so everything it uses is declared here first.
 */

/* Print the per-iteration time for one test from its start/end timestamps. */
static void showtimes(struct timeval *t1, struct timeval *t2, const char *str, int loops);
/* test1 and test2 operate on the globals only. */
static void test1(void);
static void test2(void);
/* test3..test8 additionally read from the struct DBuf they are handed. */
static void test3(struct DBuf *template);
static void test4(struct DBuf *template);
static void test5(struct DBuf *template);
static void test6(struct DBuf *template);
static void test7(struct DBuf *template);
static void test8(struct DBuf *template);
/* Returns a + b + c; called by test5..test8. */
static int simplecall(int a, int b, int c);
/* Zeroes an int-aligned buffer whose size is a multiple of sizeof(int). */
static void bzerol(void *s, int bytes);
/*
 * Large buffer cleared by test7 (bzero) and test8 (bzerol).
 * NOTE(review): it is declared as char, so the language itself does not
 * promise the int alignment that bzerol() requires -- confirm that the
 * toolchain aligns it, or align it explicitly.
 */
char XBuf[1024];
/*
 * bzerol() - inline bzero for int-aligned, int-sized buffers.
 *
 *	s	start of the buffer.  It is written through an (int *), so
 *		it must be suitably aligned for int.
 *	bytes	number of bytes to clear.  Must be a multiple of
 *		sizeof(int) (asserted).  A zero or negative size writes
 *		nothing.
 *
 * Sizes of one to five ints are handled by a fall-through switch that
 * issues one store per int.  Any other size takes the default path:
 * buffers of at least eight ints are cleared four ints per iteration, and
 * whatever is left is cleared one int at a time starting from the end.
 *
 * Call this with compile-time-constant sizes so the compiler can resolve
 * the switch statically and keep only the stores that are needed.
 */
static __inline void
bzerol(void *s, int bytes)
{
	/*
	 * All size arithmetic is done in (signed) int.  Comparing 'bytes'
	 * directly against a sizeof expression would convert it to size_t,
	 * turning a negative size into an enormous count.
	 */
	const int word = (int)sizeof(int);
	int *p = s;

	assert((bytes & (word - 1)) == 0);

	switch (bytes) {
	case sizeof(int) * 5:
		p[4] = 0;
		/* fall through */
	case sizeof(int) * 4:
		p[3] = 0;
		/* fall through */
	case sizeof(int) * 3:
		p[2] = 0;
		/* fall through */
	case sizeof(int) * 2:
		p[1] = 0;
		/* fall through */
	case sizeof(int) * 1:
		p[0] = 0;
		/* fall through */
	case 0:
		return;
	default:
		/* Bulk phase: four ints per iteration for larger buffers. */
		if (bytes >= word * 8) {
			while (bytes >= word * 4) {
				p[0] = 0;
				p[1] = 0;
				p[2] = 0;
				p[3] = 0;
				p += 4;
				bytes -= word * 4;
			}
		}
		/* Tail phase: one int at a time, walking back from the end. */
		while (bytes > 0) {
			bytes -= word;
			p[bytes / word] = 0;
		}
	}
}
/*
 * TIME_TEST() - run and report one benchmark case.
 *
 * 'call' is expanded textually, so every test is still invoked directly
 * rather than through a function pointer.  The sequence is: one untimed
 * call and one untimed gettimeofday() (presumably to warm things up before
 * measuring), then LOOPS timed calls bracketed by gettimeofday(), then the
 * report line.  Uses main()'s locals tbeg, tend and i.
 */
#define TIME_TEST(label, call) do {					\
	call;								\
	gettimeofday(&tend, NULL);					\
	gettimeofday(&tbeg, NULL);					\
	for (i = LOOPS; i; --i)						\
		call;							\
	gettimeofday(&tend, NULL);					\
	showtimes(&tbeg, &tend, label, LOOPS);				\
} while (0)

/*
 * main() - time test1..test8 in order and print one line per test.
 *
 * The command line is ignored.  Always returns 0.
 */
int
main(int ac, char **av)
{
	struct timeval tbeg;
	struct timeval tend;
	int i;

	(void)ac;
	(void)av;

	TIME_TEST("Test1 - bcopy 20x2 bytes", test1());
	TIME_TEST("Test2 - manual load data", test2());
	TIME_TEST("Test3 - man load w/ptrs ", test3(&Template));
	TIME_TEST("Test4 - mlptrs & bzero ", test4(&Template));
	TIME_TEST("Test5 - mlptrszer & call", test5(&Template));
	/* Test6 and Test8 exercise bzerol(), so the labels name it. */
	TIME_TEST("Test6 - mlptrszerc/bzerol", test6(&Template));
	TIME_TEST("Test7 - bigbuf/libc-bzero", test7(&Template));
	TIME_TEST("Test8 - bigbuf/bzerol ", test8(&Template));

	return(0);
}

#undef TIME_TEST
/*
 * showtimes() - print the average time per iteration for one test.
 *
 *	t1	timestamp taken before the timed loop
 *	t2	timestamp taken after the timed loop
 *	str	label printed at the start of the line
 *	loops	number of iterations the interval covers
 *
 * The interval is converted to microseconds, scaled to nanoseconds, and
 * divided by 'loops'.  One second is borrowed from the seconds term and
 * added to the microseconds term so that term is never negative.
 */
static void
showtimes(struct timeval *t1, struct timeval *t2, const char *str, int loops)
{
	const struct timeval *start = t1;
	const struct timeval *stop = t2;
	long elapsed_us;
	double ns_per_loop;

	elapsed_us = (stop->tv_sec - start->tv_sec - 1) * 1000000 +
	    (stop->tv_usec + 1000000 - start->tv_usec);
	ns_per_loop = (double)elapsed_us * 1000.0 / (double)loops;

	printf("%s\t%6.2f nS/loop\n", str, ns_per_loop);
}
/*
 * test1() - copy Template's x[] and y[] into DBuf with two bcopy() calls,
 * one per five-int array.
 */
static void
test1(void)
{
	const size_t xlen = sizeof(DBuf.x);
	const size_t ylen = sizeof(DBuf.y);

	bcopy(Template.x, DBuf.x, xlen);
	bcopy(Template.y, DBuf.y, ylen);
}
/*
 * test2() - clear DBuf.x[] and DBuf.y[] with ten individual stores of 0,
 * x[0..4] first and then y[0..4].
 */
static void
test2(void)
{
	int *dx = DBuf.x;
	int *dy = DBuf.y;

	dx[0] = 0;
	dx[1] = 0;
	dx[2] = 0;
	dx[3] = 0;
	dx[4] = 0;

	dy[0] = 0;
	dy[1] = 0;
	dy[2] = 0;
	dy[3] = 0;
	dy[4] = 0;
}
/*
 * test3() - fill DBuf.x[] and DBuf.y[] field by field from a mix of
 * constants, the global GlobPtr, and the 'template' argument.
 *
 * Loads and stores are kept in their original order because the sources
 * could alias DBuf.
 */
static void
test3(struct DBuf *template)
{
	int *dx = DBuf.x;
	int *dy = DBuf.y;

	dx[0] = 0;
	dx[1] = GlobPtr->x[1];
	dx[2] = template->x[2];
	dx[3] = 0;
	dx[4] = 0;

	dy[0] = template->y[0];
	dy[1] = template->y[1];
	dy[2] = template->y[2];
	dy[3] = 5;
	dy[4] = 0;
}
/*
 * test4() - clear DBuf.x[] with the library bzero(), then perform the same
 * field-by-field stores as test3().
 */
static void
test4(struct DBuf *template)
{
	int *dx = DBuf.x;
	int *dy = DBuf.y;

	bzero(dx, sizeof(DBuf.x));

	dx[0] = 0;
	dx[1] = GlobPtr->x[1];
	dx[2] = template->x[2];
	dx[3] = 0;
	dx[4] = 0;

	dy[0] = template->y[0];
	dy[1] = template->y[1];
	dy[2] = template->y[2];
	dy[3] = 5;
	dy[4] = 0;
}
/*
 * test5() - like test4(), except that DBuf.y[0] is set from the result of
 * simplecall(1, 2, 3) instead of from template->y[0].
 */
static void
test5(struct DBuf *template)
{
	int *dx = DBuf.x;
	int *dy = DBuf.y;

	bzero(dx, sizeof(DBuf.x));

	dx[0] = 0;
	dx[1] = GlobPtr->x[1];
	dx[2] = template->x[2];
	dx[3] = 0;
	dx[4] = 0;

	dy[0] = simplecall(1, 2, 3);
	dy[1] = template->y[1];
	dy[2] = template->y[2];
	dy[3] = 5;
	dy[4] = 0;
}
/*
 * test6() - the same stores as test5(), but DBuf.x[] is cleared with the
 * inline bzerol() instead of the library bzero().
 */
static void
test6(struct DBuf *template)
{
	int *dx = DBuf.x;
	int *dy = DBuf.y;

	bzerol(dx, sizeof(DBuf.x));

	dx[0] = 0;
	dx[1] = GlobPtr->x[1];
	dx[2] = template->x[2];
	dx[3] = 0;
	dx[4] = 0;

	dy[0] = simplecall(1, 2, 3);
	dy[1] = template->y[1];
	dy[2] = template->y[2];
	dy[3] = 5;
	dy[4] = 0;
}
/*
 * test7() - clear the whole of XBuf with the library bzero(), then perform
 * the same DBuf stores as test5().
 */
static void
test7(struct DBuf *template)
{
	int *dx = DBuf.x;
	int *dy = DBuf.y;

	bzero(XBuf, sizeof(XBuf));

	dx[0] = 0;
	dx[1] = GlobPtr->x[1];
	dx[2] = template->x[2];
	dx[3] = 0;
	dx[4] = 0;

	dy[0] = simplecall(1, 2, 3);
	dy[1] = template->y[1];
	dy[2] = template->y[2];
	dy[3] = 5;
	dy[4] = 0;
}
/*
 * test8() - clear the whole of XBuf with the inline bzerol(), then perform
 * the same DBuf stores as test5().
 */
static void
test8(struct DBuf *template)
{
	int *dx = DBuf.x;
	int *dy = DBuf.y;

	bzerol(XBuf, sizeof(XBuf));

	dx[0] = 0;
	dx[1] = GlobPtr->x[1];
	dx[2] = template->x[2];
	dx[3] = 0;
	dx[4] = 0;

	dy[0] = simplecall(1, 2, 3);
	dy[1] = template->y[1];
	dy[2] = template->y[2];
	dy[3] = 5;
	dy[4] = 0;
}
/*
 * simplecall() - return the sum of the three arguments (a + b, then + c).
 */
static int
simplecall(int a, int b, int c)
{
	int sum = a;

	sum += b;
	sum += c;
	return sum;
}
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe cvs-all" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200106232102.f5NL2fY73920>
