Date: Tue, 13 Apr 2010 21:35:00 +0200 From: Roman Divacky <rdivacky@FreeBSD.org> To: Luigi Rizzo <luigi@FreeBSD.org> Cc: svn-src-head@FreeBSD.org, svn-src-all@FreeBSD.org, src-committers@FreeBSD.org Subject: Re: svn commit: r206497 - in head: sbin/geom/class sbin/geom/class/sched sys/geom/sched sys/modules/geom sys/modules/geom/geom_sched sys/modules/geom/geom_sched/gs_sched sys/modules/geom/geom_sched/gsc... Message-ID: <20100413193500.GA46839@freebsd.org> In-Reply-To: <201004121637.o3CGbjSK080066@svn.freebsd.org> References: <201004121637.o3CGbjSK080066@svn.freebsd.org>
next in thread | previous in thread | raw e-mail | index | archive | help
you don't seem to have committed the anticipatory scheduler gsched_as. why? On Mon, Apr 12, 2010 at 04:37:45PM +0000, Luigi Rizzo wrote: > Author: luigi > Date: Mon Apr 12 16:37:45 2010 > New Revision: 206497 > URL: http://svn.freebsd.org/changeset/base/206497 > > Log: > Bring in geom_sched, support for scheduling disk I/O requests > in a device independent manner. Also include an example anticipatory > scheduler, gsched_rr, which gives very nice performance improvements > in presence of competing random access patterns. > > This is joint work with Fabio Checconi, developed last year > and presented at BSDCan 2009. You can find details in the > README file or at > > http://info.iet.unipi.it/~luigi/geom_sched/ > > Added: > head/sbin/geom/class/sched/ > head/sbin/geom/class/sched/Makefile (contents, props changed) > head/sbin/geom/class/sched/geom_sched.c (contents, props changed) > head/sbin/geom/class/sched/gsched.8 (contents, props changed) > head/sys/geom/sched/ > head/sys/geom/sched/README (contents, props changed) > head/sys/geom/sched/g_sched.c (contents, props changed) > head/sys/geom/sched/g_sched.h (contents, props changed) > head/sys/geom/sched/gs_rr.c (contents, props changed) > head/sys/geom/sched/gs_scheduler.h (contents, props changed) > head/sys/geom/sched/subr_disk.c (contents, props changed) > head/sys/modules/geom/geom_sched/ > head/sys/modules/geom/geom_sched/Makefile (contents, props changed) > head/sys/modules/geom/geom_sched/Makefile.inc (contents, props changed) > head/sys/modules/geom/geom_sched/gs_sched/ > head/sys/modules/geom/geom_sched/gs_sched/Makefile (contents, props changed) > head/sys/modules/geom/geom_sched/gsched_rr/ > head/sys/modules/geom/geom_sched/gsched_rr/Makefile (contents, props changed) > Modified: > head/sbin/geom/class/Makefile > head/sys/modules/geom/Makefile > > Modified: head/sbin/geom/class/Makefile > ============================================================================== > --- head/sbin/geom/class/Makefile Mon 
Apr 12 13:46:20 2010 (r206496) > +++ head/sbin/geom/class/Makefile Mon Apr 12 16:37:45 2010 (r206497) > @@ -15,6 +15,7 @@ SUBDIR+=multipath > SUBDIR+=nop > SUBDIR+=part > SUBDIR+=raid3 > +SUBDIR+=sched > SUBDIR+=shsec > SUBDIR+=stripe > SUBDIR+=virstor > > Added: head/sbin/geom/class/sched/Makefile > ============================================================================== > --- /dev/null 00:00:00 1970 (empty, because file is newly added) > +++ head/sbin/geom/class/sched/Makefile Mon Apr 12 16:37:45 2010 (r206497) > @@ -0,0 +1,19 @@ > +# GEOM_LIBRARY_PATH > +# $FreeBSD$ > + > +.PATH: /usr/src/sbin/geom/misc > + > +CFLAGS += -I/usr/src/sbin/geom > + > +CLASS=sched > + > +WARNS?= 6 > +CLASS_DIR?=/lib/geom > + > +SHLIBDIR?=${CLASS_DIR} > +SHLIB_NAME?=geom_${CLASS}.so > +LINKS= ${BINDIR}/geom ${BINDIR}/g${CLASS} > +MAN= g${CLASS}.8 > +SRCS+= geom_${CLASS}.c subr.c > + > +.include <bsd.lib.mk> > > Added: head/sbin/geom/class/sched/geom_sched.c > ============================================================================== > --- /dev/null 00:00:00 1970 (empty, because file is newly added) > +++ head/sbin/geom/class/sched/geom_sched.c Mon Apr 12 16:37:45 2010 (r206497) > @@ -0,0 +1,123 @@ > +/*- > + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * $Id$ > + * $FreeBSD$ > + * > + * This file implements the userspace library used by the 'geom' > + * command to load and manipulate disk schedulers. > + */ > + > +#include <sys/cdefs.h> > +#include <sys/param.h> > +#include <sys/linker.h> > +#include <sys/module.h> > + > +#include <stdio.h> > +#include <stdint.h> > +#include <libgeom.h> > + > +#include "core/geom.h" > +#include "misc/subr.h" > + > +#define G_SCHED_VERSION 0 > + > +uint32_t lib_version = G_LIB_VERSION; > +uint32_t version = G_SCHED_VERSION; > + > +/* > + * storage for parameters used by this geom class. > + * Right now only the scheduler name is used. > + */ > +static char algo[] = "rr"; /* default scheduler */ > + > +/* > + * Adapt to differences in geom library. 
> + * in V1 struct g_command misses gc_argname, eld, and G_BOOL is undefined > + */ > +#if G_LIB_VERSION == 1 > +#define G_ARGNAME > +#define G_TYPE_BOOL G_TYPE_NUMBER > +#else > +#define G_ARGNAME NULL, > +#endif > + > +static void > +gcmd_createinsert(struct gctl_req *req, unsigned flags __unused) > +{ > + const char *reqalgo; > + char name[64]; > + > + if (gctl_has_param(req, "algo")) > + reqalgo = gctl_get_ascii(req, "algo"); > + else > + reqalgo = algo; > + > + snprintf(name, sizeof(name), "gsched_%s", reqalgo); > + /* > + * Do not complain about errors here, gctl_issue() > + * will fail anyway. > + */ > + if (modfind(name) < 0) > + kldload(name); > + gctl_issue(req); > +} > + > +struct g_command class_commands[] = { > + { "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, > + { > + { 'a', "algo", algo, G_TYPE_STRING }, > + G_OPT_SENTINEL > + }, > + G_ARGNAME "[-v] [-a algorithm_name] dev ..." > + }, > + { "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert, > + { > + { 'a', "algo", algo, G_TYPE_STRING }, > + G_OPT_SENTINEL > + }, > + G_ARGNAME "[-v] [-a algorithm_name] dev ..." > + }, > + { "configure", G_FLAG_VERBOSE, NULL, > + { > + { 'a', "algo", algo, G_TYPE_STRING }, > + G_OPT_SENTINEL > + }, > + G_ARGNAME "[-v] [-a algorithm_name] prov ..." > + }, > + { "destroy", G_FLAG_VERBOSE, NULL, > + { > + { 'f', "force", NULL, G_TYPE_BOOL }, > + G_OPT_SENTINEL > + }, > + G_ARGNAME "[-fv] prov ..." > + }, > + { "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, > + G_ARGNAME "[-v] prov ..." > + }, > + G_CMD_SENTINEL > +}; > > Added: head/sbin/geom/class/sched/gsched.8 > ============================================================================== > --- /dev/null 00:00:00 1970 (empty, because file is newly added) > +++ head/sbin/geom/class/sched/gsched.8 Mon Apr 12 16:37:45 2010 (r206497) > @@ -0,0 +1,161 @@ > +.\" Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo > +.\" All rights reserved. 
> +.\" $FreeBSD$ > +.\" > +.\" Redistribution and use in source and binary forms, with or without > +.\" modification, are permitted provided that the following conditions > +.\" are met: > +.\" 1. Redistributions of source code must retain the above copyright > +.\" notice, this list of conditions and the following disclaimer. > +.\" 2. Redistributions in binary form must reproduce the above copyright > +.\" notice, this list of conditions and the following disclaimer in the > +.\" documentation and/or other materials provided with the distribution. > +.\" > +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND > +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE > +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > +.\" SUCH DAMAGE. > +.\" > +.Dd April 12, 2010 > +.Dt GSCHED 8 > +.Os > +.Sh NAME > +.Nm gsched > +.Nd "control utility for disk scheduler GEOM class" > +.Sh SYNOPSIS > +.Nm > +.Cm create > +.Op Fl v > +.Op Fl a Ar algorithm > +.Ar provider ... > +.Nm > +.Cm insert > +.Op Fl v > +.Op Fl a Ar algorithm > +.Ar provider ... > +.Nm > +.Cm configure > +.Op Fl v > +.Op Fl a Ar algorithm > +.Ar node ... > +.Nm > +.Cm destroy > +.Op Fl fv > +.Ar node ... > +.Nm > +.Cm reset > +.Op Fl v > +.Ar node ... > +.Nm > +.Cm { list | status | load | unload } > +.Sh DESCRIPTION > +The > +.Nm > +utility (also callable as > +.Nm geom sched ... 
) > +changes the scheduling policy of the requests going to a provider. > +.Pp > +The first argument to > +.Nm > +indicates an action to be performed: > +.Bl -tag -width ".Cm configure" > +.It Cm create > +Create a new provider and geom node using the specified scheduling algorithm. > +.Ar algorithm > +is the name of the scheduling algorithm used for the provider. > +Available algorithms include: > +.Ar rr , > +which implements anticipatory scheduling with round robin service > +among clients; > +.Ar as , > +which implements a simple form of anticipatory scheduling with > +no per-client queue. > +.Pp > +If the operation succeeds, the new provider should appear with name > +.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. . > +The kernel module > +.Pa geom_sched.ko > +will be loaded if it is not loaded already. > +.It Cm insert > +Operates as "create", but the insertion is "transparent", > +i.e. the existing provider is rerouted to the newly created geom, > +which in turn forwards requests to the existing geom. > +This operation allows one to start/stop a scheduling service > +on an already existing provider. > +.Pp > +A subsequent 'destroy' will remove the newly created geom and > +hook the provider back to the original geom. > +.Ar algorithm > +.It Cm configure > +Configure existing scheduling provider. It supports the same options > +as the > +.Nm create > +command. > +.It Cm destroy > +Destroy the geom specified in the parameter. > +.It Cm reset > +Do nothing. > +.It Cm list | status | load | unload > +See > +.Xr geom 8 . > +.El > +.Pp > +Additional options: > +.Bl -tag -width ".Fl f" > +.It Fl f > +Force the removal of the specified provider. > +.It Fl v > +Be more verbose. > +.El > +.Sh SYSCTL VARIABLES > +The following > +.Xr sysctl 8 > +variables can be used to control the behavior of the > +.Nm SCHED > +GEOM class. > +The default value is shown next to each variable. 
> +.Bl -tag -width indent > +.It Va kern.geom.sched.debug : No 0 > +Debug level of the > +.Nm SCHED > +GEOM class. > +This can be set to a number between 0 and 2 inclusive. > +If set to 0 minimal debug information is printed, and if set to 2 the > +maximum amount of debug information is printed. > +.El > +.Sh EXIT STATUS > +Exit status is 0 on success, and 1 if the command fails. > +.Sh EXAMPLES > +The following example shows how to create a scheduling provider for disk > +.Pa /dev/da0 > +, and how to destroy it. > +.Bd -literal -offset indent > +# Load the geom_sched module: > +kldload geom_sched > +# Load some scheduler classes used by geom_sched: > +kldload gsched_rr gsched_as > +# Configure device ad0 to use scheduler 'rr': > +geom sched insert -s rr ad0 > +# Now provider ad0 uses the 'rr' algorithm; > +# the new geom is ad0.sched. > +# Remove the scheduler on the device: > +geom sched destroy -v ad0.sched. > +.Ed > +.Pp > +.Sh SEE ALSO > +.Xr geom 4 , > +.Xr geom 8 > +.Sh HISTORY > +The > +.Nm > +utility appeared in April 2010. > +.Sh AUTHORS > +.An Fabio Checconi Aq fabio@FreeBSD.org > +.An Luigi Rizzo Aq luigi@FreeBSD.org > > Added: head/sys/geom/sched/README > ============================================================================== > --- /dev/null 00:00:00 1970 (empty, because file is newly added) > +++ head/sys/geom/sched/README Mon Apr 12 16:37:45 2010 (r206497) > @@ -0,0 +1,162 @@ > + > + --- GEOM BASED DISK SCHEDULERS FOR FREEBSD --- > + > +This code contains a framework for GEOM-based disk schedulers and a > +couple of sample scheduling algorithms that use the framework and > +implement two forms of "anticipatory scheduling" (see below for more > +details). 
> + > +As a quick example of what this code can give you, try to run "dd", > +"tar", or some other program with highly SEQUENTIAL access patterns, > +together with "cvs", "cvsup", "svn" or other highly RANDOM access patterns > +(this is not a made-up example: it is pretty common for developers > +to have one or more apps doing random accesses, and others that do > +sequential accesses e.g., loading large binaries from disk, checking > +the integrity of tarballs, watching media streams and so on). > + > +These are the results we get on a local machine (AMD BE2400 dual > +core CPU, SATA 250GB disk): > + > + /mnt is a partition mounted on /dev/ad0s1f > + > + cvs: cvs -d /mnt/home/ncvs-local update -Pd /mnt/ports > + dd-read: dd bs=128k of=/dev/null if=/dev/ad0 (or ad0-sched-) > + dd-writew dd bs=128k if=/dev/zero of=/mnt/largefile > + > + NO SCHEDULER RR SCHEDULER > + dd cvs dd cvs > + > + dd-read only 72 MB/s ---- 72 MB/s --- > + dd-write only 55 MB/s --- 55 MB/s --- > + dd-read+cvs 6 MB/s ok 30 MB/s ok > + dd-write+cvs 55 MB/s slooow 14 MB/s ok > + > +As you can see, when a cvs is running concurrently with dd, the > +performance drops dramatically, and depending on read or write mode, > +one of the two is severely penalized. The use of the RR scheduler > +in this example makes the dd-reader go much faster when competing > +with cvs, and lets cvs progress when competing with a writer. > + > +To try it out: > + > +1. USERS OF FREEBSD 7, PLEASE READ CAREFULLY THE FOLLOWING: > + > + On loading, this module patches one kernel function (g_io_request()) > + so that I/O requests ("bio's") carry a classification tag, useful > + for scheduling purposes. > + > + ON FREEBSD 7, the tag is stored in an existing (though rarely used) > + field of the "struct bio", a solution which makes this module > + incompatible with other modules using it, such as ZFS and gjournal. 
> + Additionally, g_io_request() is patched in-memory to add a call > + to the function that initializes this field (i386/amd64 only; > + for other architectures you need to manually patch sys/geom/geom_io.c). > + See details in the file g_sched.c. > + > + On FreeBSD 8.0 and above, the above trick is not necessary, > + as the struct bio contains dedicated fields for the classifier, > + and hooks for request classifiers. > + > + If you don't like the above, don't run this code. > + > +2. PLEASE MAKE SURE THAT THE DISK THAT YOU WILL BE USING FOR TESTS > + DOES NOT CONTAIN PRECIOUS DATA. > + This is experimental code, so we make no guarantees, though > + I am routinely using it on my desktop and laptop. > + > +3. EXTRACT AND BUILD THE PROGRAMS > + A 'make install' in the directory should work (with root privs), > + or you can even try the binary modules. > + If you want to build the modules yourself, look at the Makefile. > + > +4. LOAD THE MODULE, CREATE A GEOM NODE, RUN TESTS > + > + The scheduler's module must be loaded first: > + > + # kldload gsched_rr > + > + substitute with gsched_as to test AS. Then, supposing that you are > + using /dev/ad0 for testing, a scheduler can be attached to it with: > + > + # geom sched insert ad0 > + > + The scheduler is inserted transparently in the geom chain, so > + mounted partitions and filesystems will keep working, but > + now requests will go through the scheduler. > + > + To change scheduler on-the-fly, you can reconfigure the geom: > + > + # geom sched configure -a as ad0.sched. > + > + assuming that gsched_as was loaded previously. > + > +5. SCHEDULER REMOVAL > + > + In principle it is possible to remove the scheduler module > + even on an active chain by doing > + > + # geom sched destroy ad0.sched. > + > + However, there is some race in the geom subsystem which makes > + the removal unsafe if there are active requests on a chain. 
> + So, in order to reduce the risk of data losses, make sure > + you don't remove a scheduler from a chain with ongoing transactions. > + > +--- NOTES ON THE SCHEDULERS --- > + > +The important contribution of this code is the framework to experiment > +with different scheduling algorithms. 'Anticipatory scheduling' > +is a very powerful technique based on the following reasoning: > + > + The disk throughput is much better if it serves sequential requests. > + If we have a mix of sequential and random requests, and we see a > + non-sequential request, do not serve it immediately but instead wait > + a little bit (2..5ms) to see if there is another one coming that > + the disk can serve more efficiently. > + > +There are many details that should be added to make sure that the > +mechanism is effective with different workloads and systems, to > +gain a few extra percent in performance, to improve fairness, > +insulation among processes etc. A discussion of the vast literature > +on the subject is beyond the purpose of this short note. > + > +-------------------------------------------------------------------------- > + > +TRANSPARENT INSERT/DELETE > + > +geom_sched is an ordinary geom module, however it is convenient > +to plug it transparently into the geom graph, so that one can > +enable or disable scheduling on a mounted filesystem, and the > +names in /etc/fstab do not depend on the presence of the scheduler. > + > +To understand how this works in practice, remember that in GEOM > +we have "providers" and "geom" objects. > +Say that we want to hook a scheduler on provider "ad0", > +accessible through pointer 'pp'. Originally, pp is attached to > +geom "ad0" (same name, different object) accessible through pointer old_gp > + > + BEFORE ---> [ pp --> old_gp ...] > + > +A normal "geom sched create ad0" call would create a new geom node > +on top of provider ad0/pp, and export a newly created provider > +("ad0.sched." accessible through pointer newpp). 
> + > + AFTER create ---> [ newpp --> gp --> cp ] ---> [ pp --> old_gp ... ] > + > +On top of newpp, a whole tree will be created automatically, and we > +can e.g. mount partitions on /dev/ad0.sched.s1d, and those requests > +will go through the scheduler, whereas any partition mounted on > +the pre-existing device entries will not go through the scheduler. > + > +With the transparent insert mechanism, the original provider "ad0"/pp > +is hooked to the newly created geom, as follows: > + > + AFTER insert ---> [ pp --> gp --> cp ] ---> [ newpp --> old_gp ... ] > + > +so anything that was previously using provider pp will now have > +the requests routed through the scheduler node. > + > +A removal ("geom sched destroy ad0.sched.") will restore the original > +configuration. > + > +# $FreeBSD$ > > Added: head/sys/geom/sched/g_sched.c > ============================================================================== > --- /dev/null 00:00:00 1970 (empty, because file is newly added) > +++ head/sys/geom/sched/g_sched.c Mon Apr 12 16:37:45 2010 (r206497) > @@ -0,0 +1,1901 @@ > +/*- > + * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * 1. Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * 2. Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > + * SUCH DAMAGE. > + */ > + > +/* > + * $Id$ > + * $FreeBSD$ > + * > + * Main control module for geom-based disk schedulers ('sched'). > + * > + * USER VIEW > + * A 'sched' node is typically inserted transparently between > + * an existing provider pp and its original geom gp > + * > + * [pp --> gp ..] > + * > + * using the command "geom sched insert <provider>" and > + * resulting in the following topology > + * > + * [pp --> sched_gp --> cp] [new_pp --> gp ... ] > + * > + * Deletion "geom sched destroy <provider>.sched." restores the > + * original chain. The normal "geom sched create <provide>" > + * is also supported. > + * > + * INTERNALS > + * Internally, the 'sched' uses the following data structures > + * > + * geom{} g_sched_softc{} g_gsched{} > + * +----------+ +---------------+ +-------------+ > + * | softc *-|--->| sc_gsched *-|-->| gs_init | > + * | ... | | | | gs_fini | > + * | | | [ hash table] | | gs_start | > + * +----------+ | | | ... | > + * | | +-------------+ > + * | | > + * | | g_*_softc{} > + * | | +-------------+ > + * | sc_data *-|-->| | > + * +---------------+ | algorithm- | > + * | specific | > + * +-------------+ > + * > + * A g_sched_softc{} is created with a "geom sched insert" call. > + * In turn this instantiates a specific scheduling algorithm, > + * which sets sc_gsched to point to the algorithm callbacks, > + * and calls gs_init() to create the g_*_softc{} . > + * The other callbacks (gs_start, gs_next, ...) 
are invoked > + * as needed > + * > + * g_sched_softc{} is defined in g_sched.h and mostly used here; > + * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h; > + * g_*_softc{} is defined/implemented by each algorithm (gs_*.c) > + * > + * DATA MOVING > + * When a bio is received on the provider, it goes to the > + * g_sched_start() which calls gs_start() to initially queue it; > + * then we call g_sched_dispatch() that loops around gs_next() > + * to select zero or more bio's to be sent downstream. > + * > + * g_sched_dispatch() can also be called as a result of a timeout, > + * e.g. when doing anticipation or pacing requests. > + * > + * When a bio comes back, it goes to g_sched_done() which in turn > + * calls gs_done(). The latter does any necessary housekeeping in > + * the scheduling algorithm, and may decide to call g_sched_dispatch() > + * to send more bio's downstream. > + * > + * If an algorithm needs per-flow queues, these are created > + * calling gs_init_class() and destroyed with gs_fini_class(), > + * and they are also inserted in the hash table implemented in > + * the g_sched_softc{} > + * > + * If an algorithm is replaced, or a transparently-inserted node is > + * removed with "geom sched destroy", we need to remove all references > + * to the g_*_softc{} and g_sched_softc from the bio's still in > + * the scheduler. g_sched_forced_dispatch() helps doing this. > + * XXX need to explain better. > + */ > + > +#include <sys/cdefs.h> > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/kernel.h> > +#include <sys/module.h> > +#include <sys/lock.h> > +#include <sys/mutex.h> > +#include <sys/bio.h> > +#include <sys/limits.h> > +#include <sys/hash.h> > +#include <sys/sysctl.h> > +#include <sys/malloc.h> > +#include <sys/proc.h> /* we access curthread */ > +#include <geom/geom.h> > +#include "gs_scheduler.h" > +#include "g_sched.h" /* geom hooks */ > + > +/* > + * Size of the per-geom hash table storing traffic classes. 
> + * We may decide to change it at a later time, it has no ABI > + * implications as it is only used for run-time allocations. > + */ > +#define G_SCHED_HASH_SIZE 32 > + > +static int g_sched_destroy(struct g_geom *gp, boolean_t force); > +static int g_sched_destroy_geom(struct gctl_req *req, > + struct g_class *mp, struct g_geom *gp); > +static void g_sched_config(struct gctl_req *req, struct g_class *mp, > + const char *verb); > +static struct g_geom *g_sched_taste(struct g_class *mp, > + struct g_provider *pp, int flags __unused); > +static void g_sched_dumpconf(struct sbuf *sb, const char *indent, > + struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); > +static void g_sched_init(struct g_class *mp); > +static void g_sched_fini(struct g_class *mp); > + > +struct g_class g_sched_class = { > + .name = G_SCHED_CLASS_NAME, > + .version = G_VERSION, > + .ctlreq = g_sched_config, > + .taste = g_sched_taste, > + .destroy_geom = g_sched_destroy_geom, > + .init = g_sched_init, > + .fini = g_sched_fini > +}; > + > +MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures"); > + > +/* > + * Global variables describing the state of the geom_sched module. > + * There is only one static instance of this structure. > + */ > +LIST_HEAD(gs_list, g_gsched); /* type, link field */ > +struct geom_sched_vars { > + struct mtx gs_mtx; > + struct gs_list gs_scheds; /* list of algorithms */ > + u_int gs_debug; > + u_int gs_sched_count; /* how many algorithms ? */ > + u_int gs_patched; /* g_io_request was patched */ > + > + u_int gs_initialized; > + u_int gs_expire_secs; /* expiration of hash entries */ > + > + struct bio_queue_head gs_pending; > + u_int gs_npending; > + > + /* The following are for stats, usually protected by gs_mtx. 
*/ > + u_long gs_requests; /* total requests */ > + u_long gs_done; /* total done */ > + u_int gs_in_flight; /* requests in flight */ > + u_int gs_writes_in_flight; > + u_int gs_bytes_in_flight; > + u_int gs_write_bytes_in_flight; > + > + char gs_names[256]; /* names of schedulers */ > +}; > + > +static struct geom_sched_vars me = { > + .gs_expire_secs = 10, > +}; > + > +SYSCTL_DECL(_kern_geom); > +SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0, > + "GEOM_SCHED stuff"); > + > +SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD, > + &me.gs_write_bytes_in_flight, 0, "Write bytes in flight"); > + > +SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD, > + &me.gs_bytes_in_flight, 0, "Bytes in flight"); > + > +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD, > + &me.gs_writes_in_flight, 0, "Write Requests in flight"); > + > +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD, > + &me.gs_in_flight, 0, "Requests in flight"); > + > +SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD, > + &me.gs_done, 0, "Total done"); > + > +SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD, > + &me.gs_requests, 0, "Total requests"); > + > +SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD, > + &me.gs_names, 0, "Algorithm names"); > + > +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD, > + &me.gs_sched_count, 0, "Number of algorithms"); > + > +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW, > + &me.gs_debug, 0, "Debug level"); > + > +SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW, > + &me.gs_expire_secs, 0, "Expire time in seconds"); > + > +/* > + * g_sched calls the scheduler algorithms with this lock held. > + * The locking functions are exposed so the scheduler algorithms can also > + * protect themselves e.g. when running a callout handler. 
> + */ > +void > +g_sched_lock(struct g_geom *gp) > +{ > + struct g_sched_softc *sc = gp->softc; > + > + mtx_lock(&sc->sc_mtx); > +} > + > +void > +g_sched_unlock(struct g_geom *gp) > +{ > + struct g_sched_softc *sc = gp->softc; > + > + mtx_unlock(&sc->sc_mtx); > +} > + > +/* > + * Support functions to handle references to the module, > + * which are coming from devices using this scheduler. > + */ > +static inline void > +g_gsched_ref(struct g_gsched *gsp) > +{ > + > + atomic_add_int(&gsp->gs_refs, 1); > +} > + > +static inline void > +g_gsched_unref(struct g_gsched *gsp) > +{ > + > + atomic_add_int(&gsp->gs_refs, -1); > +} > + > +/* > + * Update the stats when this request is done. > + */ > +static void > +g_sched_update_stats(struct bio *bio) > +{ > + > + me.gs_done++; > + me.gs_in_flight--; > + me.gs_bytes_in_flight -= bio->bio_length; > + if (bio->bio_cmd & BIO_WRITE) { > + me.gs_writes_in_flight--; > + me.gs_write_bytes_in_flight -= bio->bio_length; > + } > +} > + > +/* > + * Dispatch any pending request. > + */ > +static void > +g_sched_forced_dispatch(struct g_geom *gp) > +{ > + struct g_sched_softc *sc = gp->softc; > + struct g_gsched *gsp = sc->sc_gsched; > + struct bio *bp; > + > + KASSERT(mtx_owned(&sc->sc_mtx), > + ("sc_mtx not owned during forced dispatch")); > + > + while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL) > + g_io_request(bp, LIST_FIRST(&gp->consumer)); > +} > + > +/* > + * The main dispatch loop, called either here after the start > + * routine, or by scheduling algorithms when they receive a timeout > + * or a 'done' notification. Does not share code with the forced > + * dispatch path, since the gs_done() callback can call us. 
> + */ > +void > +g_sched_dispatch(struct g_geom *gp) > +{ > + struct g_sched_softc *sc = gp->softc; > + struct g_gsched *gsp = sc->sc_gsched; > + struct bio *bp; > + > + KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch")); > + > + if ((sc->sc_flags & G_SCHED_FLUSHING)) > + return; > + > + while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL) > + g_io_request(bp, LIST_FIRST(&gp->consumer)); > +} > + > +/* > + * Recent (8.0 and above) versions of FreeBSD have support to > + * register classifiers of disk requests. The classifier is > + * invoked by g_io_request(), and stores the information into > + * bp->bio_classifier1. > + * > + * Support for older versions, which is left here only for > + * documentation purposes, relies on two hacks: > + * 1. classification info is written into the bio_caller1 > + * field of the topmost node in the bio chain. This field > + * is rarely used, but this module is incompatible with > + * those that use bio_caller1 for other purposes, > + * such as ZFS and gjournal; > + * 2. g_io_request() is patched in-memory when the module is > + * loaded, so that the function calls a classifier as its > + * first thing. g_io_request() is restored when the module > + * is unloaded. This functionality is only supported for > + * x86 and amd64, other architectures need source code changes. > + */ > + > +/* > + * Lookup the identity of the issuer of the original request. > + * In the current implementation we use the curthread of the > + * issuer, but different mechanisms may be implemented later > + * so we do not make assumptions on the return value which for > + * us is just an opaque identifier. > + */ > + > +static inline u_long > +g_sched_classify(struct bio *bp) > +{ > + > +#if __FreeBSD_version > 800098 > + /* we have classifier fields in the struct bio */ > +#define HAVE_BIO_CLASSIFIER > + return ((u_long)bp->bio_classifier1); > +#else > +#warning old version!!! 
> + while (bp->bio_parent != NULL) > + bp = bp->bio_parent; > + > + return ((u_long)bp->bio_caller1); > +#endif > +} > + > +/* Return the hash chain for the given key. */ > +static inline struct g_hash * > +g_sched_hash(struct g_sched_softc *sc, u_long key) > +{ > + > + return (&sc->sc_hash[key & sc->sc_mask]); > +} > + > +/* > + * Helper function for the children classes, which takes > + * a geom and a bio and returns the private descriptor > + * associated to the request. This involves fetching > + * the classification field and [al]locating the > + * corresponding entry in the hash table. > + */ > +void * > +g_sched_get_class(struct g_geom *gp, struct bio *bp) > +{ > + struct g_sched_softc *sc; > + struct g_sched_class *gsc; > + struct g_gsched *gsp; > + struct g_hash *bucket; > + u_long key; > + > + sc = gp->softc; > + key = g_sched_classify(bp); > + bucket = g_sched_hash(sc, key); > + LIST_FOREACH(gsc, bucket, gsc_clist) { > + if (key == gsc->gsc_key) { > + gsc->gsc_refs++; > + return (gsc->gsc_priv); > + } > + } > + > + gsp = sc->sc_gsched; > + gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size, > + M_GEOM_SCHED, M_NOWAIT | M_ZERO); > + if (!gsc) > + return (NULL); > + > + if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) { > + free(gsc, M_GEOM_SCHED); > + return (NULL); > + } > + > + gsc->gsc_refs = 2; /* 1 for the hash table, 1 for the caller. 
*/ > + gsc->gsc_key = key; > + LIST_INSERT_HEAD(bucket, gsc, gsc_clist); > + > + gsc->gsc_expire = ticks + me.gs_expire_secs * hz; > + > + return (gsc->gsc_priv); > +} > + > +/* > + * Release a reference to the per-client descriptor, > + */ > +void > +g_sched_put_class(struct g_geom *gp, void *priv) > +{ > + struct g_sched_class *gsc; > + struct g_sched_softc *sc; > + > + gsc = g_sched_priv2class(priv); > + gsc->gsc_expire = ticks + me.gs_expire_secs * hz; > + > + if (--gsc->gsc_refs > 0) > + return; > + > + sc = gp->softc; > + sc->sc_gsched->gs_fini_class(sc->sc_data, priv); > + > + LIST_REMOVE(gsc, gsc_clist); > + free(gsc, M_GEOM_SCHED); > +} > + > +static void > +g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask, > + struct g_gsched *gsp, void *data) > +{ > + struct g_sched_class *cp, *cp2; > + int i; > + > + if (!hp) > + return; > + > + if (data && gsp->gs_hash_unref) > + gsp->gs_hash_unref(data); > + > + for (i = 0; i < G_SCHED_HASH_SIZE; i++) { > + LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2) > + g_sched_put_class(gp, cp->gsc_priv); > + } > + > + hashdestroy(hp, M_GEOM_SCHED, mask); > +} > + > +static struct g_hash * > +g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags) > +{ > + struct g_hash *hash; > + > + if (gsp->gs_priv_size == 0) > + return (NULL); > + > + hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags); > + > + return (hash); > +} > + > +static void > +g_sched_flush_classes(struct g_geom *gp) > +{ > + struct g_sched_softc *sc; > + struct g_sched_class *cp, *cp2; > + int i; > + > + sc = gp->softc; > + > + if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0) > + return; > + > + for (i = 0; i < G_SCHED_HASH_SIZE; i++) { > + LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) { > + if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0) > + g_sched_put_class(gp, cp->gsc_priv); > + } > + } > + > + sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz; > +} > + > +/* > + * Wait for the completion of any 
outstanding request. To ensure > + * that this does not take forever the caller has to make sure that > + * no new requests enter the scheduler before calling us. > + * > + * Must be called with the gp mutex held and topology locked. > + */ > +static int > +g_sched_wait_pending(struct g_geom *gp) > +{ > + struct g_sched_softc *sc = gp->softc; > + int endticks = ticks + hz; > + > + g_topology_assert(); > + > + while (sc->sc_pending && endticks - ticks >= 0) > + msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4); > + > + return (sc->sc_pending ? ETIMEDOUT : 0); > +} > + > +static int > > *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20100413193500.GA46839>