From owner-cvs-src@FreeBSD.ORG  Tue Nov  9 21:13:25 2004
Date: Tue, 9 Nov 2004 21:12:16 +0000 (GMT)
From: Robert Watson
X-Sender: robert@fledge.watson.org
To: Stephan Uphoff
cc: src-committers@FreeBSD.org, John Baldwin, Alan Cox, cvs-src@FreeBSD.org,
    Mike Silbersack, cvs-all@FreeBSD.org
Subject: Re: cvs commit: src/sys/i386/i386 pmap.c
In-Reply-To: <1100027518.29384.87.camel@palm.tree.com>
List-Id: CVS commit messages for the src tree

On Tue, 9 Nov 2004, Stephan Uphoff wrote:

> > Interestingly, I've now run through some more "macro" benchmarks.  I
> > saw a couple of percent improvement on UP from the change, but indeed,
> > I saw a slight decrease in performance for the rapid packet send
> > benchmark on SMP.
> >
> > So I guess my recommendation is to get this in the tree for UP, and
> > see if we can figure out why it's having the slow-down effect on SMP.
>
> We are probably talking cache line effects here.  My guess is that we
> should:
>
> 1) Make sure that important spin mutexes are alone in a cache line.
> 2) Take care not to dirty the cache line unnecessarily.
>
> I think for 2) we need to change the spin mutex slightly (for SMP) so
> that it never issues LOCK cmpxchgl before a simple load operation finds
> m->mtx_lock == MTX_UNOWNED, since LOCK cmpxchgl always seems to dirty
> the cache line.

Micro-benchmark attached below.  It has some obvious imperfections, but
seems functional.  Divide the output of the sysctls by 50,000 (the
ITERATIONS count) to get the per-operation cost in cycles.  Watch out for
collisions with hardclock and softclock.
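To make the two suggestions above concrete, here is a minimal sketch of a
cache-line-padded, test-and-test-and-set spin lock.  This is illustrative
only, not the real struct mtx code: CACHE_LINE_SIZE and LOCK_UNOWNED are
stand-in values, and it assumes FreeBSD's __aligned() attribute and
atomic_cmpset_acq_int() from <machine/atomic.h>:

#include <sys/types.h>
#include <sys/cdefs.h>
#include <machine/atomic.h>

#define CACHE_LINE_SIZE 64      /* assumed line size; check the target CPU */
#define LOCK_UNOWNED    4       /* stand-in for MTX_UNOWNED */

/* Point 1: pad the lock word so it sits alone in its cache line. */
struct padded_lock {
        volatile u_int  pl_lock;
        char            pl_pad[CACHE_LINE_SIZE - sizeof(u_int)];
} __aligned(CACHE_LINE_SIZE);

static __inline void
padded_lock_acquire(struct padded_lock *pl, u_int owner)
{

        for (;;) {
                /*
                 * Point 2: spin with plain loads, which leave the cache
                 * line in the shared state, and only issue the LOCK
                 * cmpxchgl (via atomic_cmpset_acq_int()) once the lock
                 * looks free.
                 */
                while (pl->pl_lock != LOCK_UNOWNED)
                        continue;
                if (atomic_cmpset_acq_int(&pl->pl_lock, LOCK_UNOWNED,
                    owner))
                        return;
        }
}

The plain-load spin keeps the line shared among waiters; only the final
compare-and-set takes it exclusive and dirties it.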
Robert N M Watson             FreeBSD Core Team, TrustedBSD Projects
robert@fledge.watson.org      Principal Research Scientist, McAfee Research

/*-
 * Copyright (c) 2004 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>        /* get_cyclecount() */

#define ITERATIONS      50000

SYSCTL_NODE(, OID_AUTO, test, CTLFLAG_RD, NULL, "test tree");

static __inline void
test_critical(void)
{

        critical_enter();
        critical_exit();
}

static int
sysctl_test_critical_timing(SYSCTL_HANDLER_ARGS)
{
        u_int64_t start_cycles, stop_cycles;
        u_int64_t i;

        start_cycles = get_cyclecount();
        for (i = 0; i < ITERATIONS; i++)
                test_critical();
        stop_cycles = get_cyclecount();
        i = stop_cycles - start_cycles;
        return (sysctl_handle_quad(oidp, &i, 0, req));
}

SYSCTL_PROC(_test, OID_AUTO, critical_timing, CTLTYPE_QUAD | CTLFLAG_RW,
    NULL, 0, sysctl_test_critical_timing, "Q",
    "Timing for critical sections");

static __inline void
test_sleep_mutex(struct mtx *mutex)
{

        mtx_lock(mutex);
        mtx_unlock(mutex);
}

static int
sysctl_test_sleep_mutex_timing(SYSCTL_HANDLER_ARGS)
{
        u_int64_t start_cycles, stop_cycles;
        struct mtx mutex;
        u_int64_t i;

        bzero(&mutex, sizeof(mutex));
        mtx_init(&mutex, "mutex", NULL, MTX_DEF);
        start_cycles = get_cyclecount();
        for (i = 0; i < ITERATIONS; i++)
                test_sleep_mutex(&mutex);
        stop_cycles = get_cyclecount();
        i = stop_cycles - start_cycles;
        mtx_destroy(&mutex);
        return (sysctl_handle_quad(oidp, &i, 0, req));
}

SYSCTL_PROC(_test, OID_AUTO, sleep_mutex_timing, CTLTYPE_QUAD | CTLFLAG_RW,
    NULL, 0, sysctl_test_sleep_mutex_timing, "Q",
    "Timing for sleep mutexes");

static __inline void
test_spin_mutex(struct mtx *mutex)
{

        mtx_lock_spin(mutex);
        mtx_unlock_spin(mutex);
}

static int
sysctl_test_spin_mutex_timing(SYSCTL_HANDLER_ARGS)
{
        u_int64_t start_cycles, stop_cycles;
        struct mtx mutex;
        u_int64_t i;

        bzero(&mutex, sizeof(mutex));
        mtx_init(&mutex, "mutex", NULL, MTX_SPIN);
        start_cycles = get_cyclecount();
        for (i = 0; i < ITERATIONS; i++)
                test_spin_mutex(&mutex);
        stop_cycles = get_cyclecount();
        i = stop_cycles - start_cycles;
        mtx_destroy(&mutex);
        return (sysctl_handle_quad(oidp, &i, 0, req));
}

SYSCTL_PROC(_test, OID_AUTO, spin_mutex_timing, CTLTYPE_QUAD | CTLFLAG_RW,
    NULL, 0, sysctl_test_spin_mutex_timing, "Q",
    "Timing for spin mutexes");

static __inline void
test_exclusive_sx(struct sx *sx)
{

        sx_xlock(sx);
        sx_xunlock(sx);
}

static int
sysctl_test_exclusive_sx_timing(SYSCTL_HANDLER_ARGS)
{
        u_int64_t start_cycles, stop_cycles;
        struct sx sx;
        u_int64_t i;

        bzero(&sx, sizeof(sx));
        sx_init(&sx, "sx");
        start_cycles = get_cyclecount();
        for (i = 0; i < ITERATIONS; i++)
                test_exclusive_sx(&sx);
        stop_cycles = get_cyclecount();
        i = stop_cycles - start_cycles;
        sx_destroy(&sx);
        return (sysctl_handle_quad(oidp, &i, 0, req));
}

SYSCTL_PROC(_test, OID_AUTO, exclusive_sx_timing, CTLTYPE_QUAD | CTLFLAG_RW,
    NULL, 0, sysctl_test_exclusive_sx_timing, "Q",
    "Timing for xlock sx");
static __inline void
test_shared_sx(struct sx *sx)
{

        sx_slock(sx);
        sx_sunlock(sx);
}

static int
sysctl_test_shared_sx_timing(SYSCTL_HANDLER_ARGS)
{
        u_int64_t start_cycles, stop_cycles;
        struct sx sx;
        u_int64_t i;

        bzero(&sx, sizeof(sx));
        sx_init(&sx, "sx");
        start_cycles = get_cyclecount();
        for (i = 0; i < ITERATIONS; i++)
                test_shared_sx(&sx);
        stop_cycles = get_cyclecount();
        i = stop_cycles - start_cycles;
        sx_destroy(&sx);
        return (sysctl_handle_quad(oidp, &i, 0, req));
}

SYSCTL_PROC(_test, OID_AUTO, shared_sx_timing, CTLTYPE_QUAD | CTLFLAG_RW,
    NULL, 0, sysctl_test_shared_sx_timing, "Q",
    "Timing for slock sx");
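For reading the results from userland, a sketch along these lines should
work once the module's sysctls are registered (the OID names follow from
the SYSCTL_PROC declarations above, and the handlers are assumed to export
64-bit cycle counts):

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdint.h>
#include <stdio.h>

#define ITERATIONS      50000           /* must match the kernel side */

int
main(void)
{
        static const char *oids[] = {
                "test.critical_timing",
                "test.sleep_mutex_timing",
                "test.spin_mutex_timing",
                "test.exclusive_sx_timing",
                "test.shared_sx_timing",
        };
        uint64_t cycles;
        size_t len;
        u_int i;

        for (i = 0; i < sizeof(oids) / sizeof(oids[0]); i++) {
                len = sizeof(cycles);
                if (sysctlbyname(oids[i], &cycles, &len, NULL, 0) == -1) {
                        perror(oids[i]);
                        continue;
                }
                /* Divide the raw count by ITERATIONS, as noted above. */
                printf("%s: %ju cycles/iteration\n", oids[i],
                    (uintmax_t)(cycles / ITERATIONS));
        }
        return (0);
}

Equivalently, sysctl(8) can read the same OIDs directly; the division by
ITERATIONS then has to be done by hand.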