Date: Mon, 2 Feb 2009 20:51:26 +0000 (UTC) From: Ulf Lilleengen <lulf@FreeBSD.org> To: src-committers@freebsd.org, svn-src-projects@freebsd.org Subject: svn commit: r188025 - in projects/geom_raid5: sbin/geom/class/raid5 sys/geom/raid5 sys/modules/geom/geom_raid5 Message-ID: <200902022051.n12KpQhM007111@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: lulf Date: Mon Feb 2 20:51:26 2009 New Revision: 188025 URL: http://svn.freebsd.org/changeset/base/188025 Log: - Import the geom_raid5 PP sources with a few modifications such as adding macros for metadata offsets as well as converting kthread to kproc. Added: projects/geom_raid5/sbin/geom/class/raid5/ projects/geom_raid5/sbin/geom/class/raid5/Makefile (contents, props changed) projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c (contents, props changed) projects/geom_raid5/sbin/geom/class/raid5/graid5.8 projects/geom_raid5/sys/geom/raid5/ projects/geom_raid5/sys/geom/raid5/g_raid5.c (contents, props changed) projects/geom_raid5/sys/geom/raid5/g_raid5.h (contents, props changed) projects/geom_raid5/sys/modules/geom/geom_raid5/ projects/geom_raid5/sys/modules/geom/geom_raid5/Makefile (contents, props changed) Added: projects/geom_raid5/sbin/geom/class/raid5/Makefile ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ projects/geom_raid5/sbin/geom/class/raid5/Makefile Mon Feb 2 20:51:26 2009 (r188025) @@ -0,0 +1,7 @@ +# $FreeBSD$ + +.PATH: ${.CURDIR}/../../misc + +CLASS= raid5 + +.include <bsd.lib.mk> Added: projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ projects/geom_raid5/sbin/geom/class/raid5/geom_raid5.c Mon Feb 2 20:51:26 2009 (r188025) @@ -0,0 +1,325 @@ +/*- + * Copyright (c) 2006 Arne Woerner <arne_woerner@yahoo.com> + * testing + tuning-tricks: veronica@fluffles.net + * derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd@FreeBSD.org>) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$Id: geom_raid5.c,v 1.33.1.12 2007/11/12 20:24:45 aw Exp aw $"); + +#include <sys/param.h> +#include <errno.h> +#include <paths.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <assert.h> +#include <libgeom.h> +#include <geom/raid5/g_raid5.h> + +#include "core/geom.h" +#include "misc/subr.h" + +uint32_t lib_version = G_LIB_VERSION; +uint32_t version = G_RAID5_VERSION; +static intmax_t default_stripesize = 64*1024; + +static void raid5_main(struct gctl_req *req, unsigned flags); +static void raid5_clear(struct gctl_req *req); +static void raid5_dump(struct gctl_req *req); +static void raid5_label(struct gctl_req *req); + +#ifndef G_TYPE_BOOL +#define G_TYPE_BOOL G_TYPE_NONE +#endif + +#if __FreeBSD_version >= 700000 +#define GCMD67 NULL, +#else +#define GCMD67 +#endif +struct g_command class_commands[] = { + { "clear", G_FLAG_VERBOSE, raid5_main, G_NULL_OPTS, GCMD67 + "[-v] prov ..." + }, + { "destroy", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + { 'y', "noyoyo", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, GCMD67 + "[-fvy] name ..." + }, + { "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS, GCMD67 + "[-v] name prov" + }, + { "insert", G_FLAG_VERBOSE, NULL, + { { 'h', "hardcode", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL}, GCMD67 + "[-hv] name prov" + }, + { "configure", G_FLAG_VERBOSE, NULL, + { { 'h', "hardcode", NULL, G_TYPE_BOOL }, + { 'a', "activate", NULL, G_TYPE_BOOL }, + { 'c', "cowop", NULL, G_TYPE_BOOL }, + { 'n', "nohot", NULL, G_TYPE_BOOL }, + { 'S', "safeop", NULL, G_TYPE_BOOL }, + { 'R', "rebuild", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL}, GCMD67 + "[-RSchnva] name" + }, + { "dump", 0, raid5_main, G_NULL_OPTS, GCMD67 + "prov ..." 
+ }, + { "label", G_FLAG_VERBOSE | G_FLAG_LOADKLD, raid5_main, + { { 'c', "cowop", NULL, G_TYPE_BOOL }, + { 'h', "hardcode", NULL, G_TYPE_BOOL }, + { 'n', "nohot", NULL, G_TYPE_BOOL }, + { 's', "stripesize", &default_stripesize, G_TYPE_NUMBER }, + { 'S', "safeop", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL}, GCMD67 + "[-chvn] [-s stripesize] [-S] name prov ..." + }, + { "stop", G_FLAG_VERBOSE, NULL, + { + { 'f', "force", NULL, G_TYPE_BOOL }, + { 'y', "noyoyo", NULL, G_TYPE_BOOL }, + G_OPT_SENTINEL + }, GCMD67 + "[-fv] name ..." + }, + G_CMD_SENTINEL +}; + +static int verbose = 0; + +static void +raid5_main(struct gctl_req *req, unsigned flags) +{ + const char *name; + + if ((flags & G_FLAG_VERBOSE) != 0) + verbose = 1; + + name = gctl_get_ascii(req, "verb"); + if (name == NULL) { + gctl_error(req, "No '%s' argument.", "verb"); + return; + } + if (strcmp(name, "label") == 0) + raid5_label(req); + else if (strcmp(name, "clear") == 0) + raid5_clear(req); + else if (strcmp(name, "dump") == 0) + raid5_dump(req); + else + gctl_error(req, "Unknown command: %s.", name); +} + +static void +raid5_label(struct gctl_req *req) +{ + struct g_raid5_metadata md; + const char *name; + int error, i, hardcode, nargs, safeop, nohot, cowop; + intmax_t stripesize; + + nargs = gctl_get_int(req, "nargs"); + if (nargs < 3) { + gctl_error(req, "Too few arguments."); + return; + } + nohot = gctl_get_int(req, "nohot"); + hardcode = gctl_get_int(req, "hardcode"); + safeop = gctl_get_int(req, "safeop"); + cowop = gctl_get_int(req, "cowop"); + stripesize = gctl_get_intmax(req, "stripesize"); + if (stripesize > 256*1024) { + gctl_error(req, "stripesize must be less than 512KB."); + return; + } + if (!powerof2(stripesize)) { + int cs; + for (cs=4096; cs < stripesize; cs<<=1); + gctl_error(req, "Invalid stripe size: %jd, recommended: %d.", + stripesize, cs); + return; + } + + + /* + * Clear last sector first to spoil all components if device exists. 
+ */ + for (i = 1; i < nargs; i++) { + name = gctl_get_ascii(req, "arg%d", i); + error = g_metadata_clear(name, NULL); + if (error != 0) { + gctl_error(req, "Can't store metadata on %s: %s.", name, + strerror(error)); + return; + } + } + + strlcpy(md.md_magic, G_RAID5_MAGIC, sizeof(md.md_magic)); + md.md_version = G_RAID5_VERSION; + name = gctl_get_ascii(req, "arg0"); + strlcpy(md.md_name, name, sizeof(md.md_name)); + md.md_id = arc4random(); + md.md_all = nargs - 1; + md.md_stripesize = stripesize; + md.md_verified = 0; + md.md_newest = -1; + md.md_no_hot = nohot; + md.md_state = nohot ? G_RAID5_STATE_CALM : + (G_RAID5_STATE_HOT|G_RAID5_STATE_VERIFY); + if (safeop) + md.md_state |= G_RAID5_STATE_SAFEOP; + if (cowop) + md.md_state |= G_RAID5_STATE_COWOP; + + /* + * Ok, store metadata. + */ + int64_t min = -1; + int64_t waste = 0; + for (i = 1; i < nargs; i++) { + u_char sector[512]; + int64_t pmin; + + name = gctl_get_ascii(req, "arg%d", i); + md.md_no = i - 1; + if (!hardcode) + bzero(md.md_provider, sizeof(md.md_provider)); + else { + if (strncmp(name, _PATH_DEV, strlen(_PATH_DEV)) == 0) + name += strlen(_PATH_DEV); + strlcpy(md.md_provider, name, sizeof(md.md_provider)); + } + md.md_provsize = g_get_mediasize(name); + pmin = md.md_provsize - g_get_sectorsize(name); + waste += pmin % stripesize; + if (min < 0) + min = pmin; + else if (min > pmin) { + waste += (i-1) * (min - pmin); + min = pmin; + } else + waste += pmin - min; + if (md.md_provsize == 0) { + fprintf(stderr, "Can't get mediasize of %s: %s.\n", + name, strerror(errno)); + gctl_error(req, "Not fully done."); + continue; + } + raid5_metadata_encode(&md, sector); + error = g_metadata_store(name, sector, sizeof(sector)); + if (error != 0) { + fprintf(stderr, "Can't store metadata on %s: %s.\n", + name, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (verbose) + printf("Metadata value stored on %s.\n", name); + } + if (waste > 0) + printf("Wasting %jd bytes (>=%jdGB).\n", 
waste, waste>>(3*10)); +} + +static void +raid5_clear(struct gctl_req *req) +{ + const char *name; + int error, i, nargs; + + nargs = gctl_get_int(req, "nargs"); + if (nargs < 1) { + gctl_error(req, "Too few arguments."); + return; + } + + for (i = 0; i < nargs; i++) { + name = gctl_get_ascii(req, "arg%d", i); + error = g_metadata_clear(name, G_RAID5_MAGIC); + if (error != 0) { + fprintf(stderr, "Can't clear metadata on %s: %s.\n", + name, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + if (verbose) + printf("Metadata cleared on %s.\n", name); + } +} + +static void +raid5_metadata_dump(const struct g_raid5_metadata *md) +{ + + printf(" Magic string: %s\n", md->md_magic); + printf(" Metadata version: %u\n", (u_int)md->md_version); + printf(" Device name: %s\n", md->md_name); + printf(" Device ID: %u\n", (u_int)md->md_id); + printf(" Disk number: %u\n", (u_int)md->md_no); + printf("Total number of disks: %u\n", (u_int)md->md_all); + printf(" Provider Size: %jd\n", md->md_provsize); + printf(" Verified: %jd\n", md->md_verified); + printf(" State: %u\n", (u_int)md->md_state); + printf(" Stripe size: %u\n", (u_int)md->md_stripesize); + printf(" Newest: %u\n", (u_int)md->md_newest); + printf(" NoHot: %s\n", md->md_no_hot?"Yes":"No"); + printf(" Hardcoded provider: %s\n", md->md_provider); +} + +static void +raid5_dump(struct gctl_req *req) +{ + struct g_raid5_metadata md, tmpmd; + const char *name; + int error, i, nargs; + + nargs = gctl_get_int(req, "nargs"); + if (nargs < 1) { + gctl_error(req, "Too few arguments."); + return; + } + + for (i = 0; i < nargs; i++) { + name = gctl_get_ascii(req, "arg%d", i); + error = g_metadata_read(name, (u_char *)&tmpmd, sizeof(tmpmd), + G_RAID5_MAGIC); + if (error != 0) { + fprintf(stderr, "Can't read metadata from %s: %s.\n", + name, strerror(error)); + gctl_error(req, "Not fully done."); + continue; + } + raid5_metadata_decode((u_char *)&tmpmd, &md); + printf("Metadata on %s:\n", name); + 
raid5_metadata_dump(&md); + printf("\n"); + } +} Added: projects/geom_raid5/sbin/geom/class/raid5/graid5.8 ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ projects/geom_raid5/sbin/geom/class/raid5/graid5.8 Mon Feb 2 20:51:26 2009 (r188025) @@ -0,0 +1,309 @@ +.\" Copyright (c) 2006 Arne Woerner <arne_woerner@yahoo.com> +.\" testing + tuning-tricks: veronica@fluffles.net +.\" testing: lev@FreeBSD.org +.\" derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd@FreeBSD.org>) +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $Id: graid5.8,v 1.18 2008/05/22 02:10:47 aw Exp $ +.\" +.Dd Dec 11, 2006 +.Dt GRAID5 8 +.Os +.Sh NAME +.Nm graid5 +.Nd "control utility for raid5 devices" +.Sh SYNOPSIS +.Nm +.Cm destroy +.Op Fl fvy +.Ar name ... +.Nm +.Cm label +.Op Fl hnSv +.Op Fl s Ar stripesize +.Ar name +.Ar prov prov ... +.Nm +.Cm configure +.Op Fl hnRS +.Ar name +.Nm +.Cm stop +.Op Fl fv +.Ar name ... +.Nm +.Cm insert +.Ar name prov +.Nm +.Cm remove +.Ar name prov +.Nm +.Cm clear +.Op Fl v +.Ar prov ... +.Nm +.Cm dump +.Ar prov ... +.Nm +.Cm list +.Nm +.Cm status +.Nm +.Cm load +.Nm +.Cm unload +.Sh DESCRIPTION +The +.Nm +utility is used for setting up a RAID-5 on two or more disks. +The RAID5'ed device can be configured using two different methods: +.Dq manual +or +.Dq automatic . +When using the +.Dq manual +method, no metadata are stored on the devices, so the RAID5 +device has to be configured by hand every time it is needed. +The +.Dq automatic +method uses on-disk metadata to detect devices. +Once devices are labeled, they will be automatically detected and +configured. +.Pp +The first argument to +.Nm +indicates an action to be performed: +.Bl -tag -width ".Cm destroy" +.It Cm label +Set up a RAID5 device from the given devices with the specified +.Ar name . +This is the +.Dq automatic +method, where metadata are stored in every device's last sector. +The kernel module +.Pa geom_raid5.ko +will be loaded if it is not loaded already. +.Pp +Additional options include: +.Bl -tag -width ".Fl s Ar stripesize" +.It Fl h +Hardcode providers' names in metadata. +.It Fl c +CowOp mode: Complete-Only-Write-Operation -- +do not write if not in status COMPLETE. +.It Fl S +SafeOp mode: read the whole stripe for every read and verify parity. +.It Fl n +never-hot-mode: A 2 disk graid5 device does not need the hot marker, +if it is used as swap space. Furthermore this flag is useful, if +a rebuild would be harmful even if a write request was pending. +.It Fl s Ar stripesize +Specify stripesize. 
+Recommendation: MAXPHYS (currently 128KiB) == stripesize. +The +.Ar stripesize +must be a power of 2 and +a multiple of the largest sector size of all the providers. +.El +.It Cm configure +Configure an existing graid5 device: +.Pp +Options are: +.Bl -tag -width "Fl h" +.It Fl h +Trigger: hardcoded option. +.It Fl a +Reset error flag of all disks. +.It Fl c +CowOp mode: Complete-Only-Write-Operation -- +do not write if not in status COMPLETE. +.It Fl n +Trigger: never-hot-mode option. +.It Fl S +Trigger: SafeOp-mode option. +.It Fl R +Trigger: start/stop re-sync. +.El +.It Cm stop +Turn off an existing RAID5 device by its +.Ar name . +This command does not touch on-disk metadata! +.Pp +Options are: +.Bl -tag -width "Fl y" +.It Fl f +Force destroy even if still busy. +.It Fl y +Do not do the Yo-Yo effect. +.El +.It Cm destroy +Same as +.Cm stop . +.It Cm clear +Clear metadata on the given devices. +.It Cm dump +Dump metadata stored on the given devices. +.It Cm list +See +.Xr geom 8 . +.It Cm status +See +.Xr geom 8 . +.It Cm load +See +.Xr geom 8 . +.It Cm unload +See +.Xr geom 8 . +.El +.Pp +Additional options: +.Bl -tag -width ".Fl f" +.It Fl f +Force the removal of the specified RAID5 device. +.It Fl v +Be more verbose. +.El +.Sh SYSCTL VARIABLES +The following +.Xr sysctl 8 +variables can be used to control the behavior of the +.Nm RAID5 +GEOM class. +The default value is shown next to each variable. +.Bl -tag -width indent +.It Va kern.geom.raid5.debug : No 0 +Debug level of the +.Nm RAID5 +GEOM class. +This can be set to a number between 0 and 3 inclusive. +If set to 0 minimal debug information is printed, and if set to 3 the +maximum amount of debug information is printed. +.It Va kern.geom.raid5.mhm : No 0 (read-only) +Number of malloc hamster cache misses. +.It Va kern.geom.raid5.mhh : No 0 (read-only) +Number of malloc hamster cache hits. 
+.It Va kern.geom.raid5.maxmem : No 8000000 (tunable) +This variable can be set any time to any 32-bit signed integer value. +It is cropped appropriately (0..128MB) and interpreted as bytes. +.It Va kern.geom.raid5.wqf : No 0 (read-only) +This value shows the number of write requests that were issued early due to +a conflicting read request. +.It Va kern.geom.raid5.wqp : No 0 (read-only) +This value shows the maximum number of pending write requests so far. +.It Va kern.geom.raid5.blked1 : No 0 (read-only) +This value shows the number of new write requests that could not be combined +because the corresponding area already has an issued but incomplete +write request. +.It Va kern.geom.raid5.blked2 : No 0 (read-only) +This value shows the number of due write (2-phase) requests that were blocked by +another such request due to parity area conflict. +.It Va kern.geom.raid5.dsk_ok : No 50 (read-only) +This value shows the healthiness of the underlying devices. +50 is perfect. 40 or lower triggers a soft-device-remove. +0 causes an error announced to the upper layer. +.It Va kern.geom.raid5.veri_nice : No 100 (tunable) +This value (milliseconds) enforces a delay after a user-land read request +for internal verify requests, which are certainly quite hindering for +user-land requests, because they read all disks and in some cases even +write a disk. +.It Va kern.geom.raid5.veri_w : No 0 (read-only) +This value shows the number of parity-failures (during rebuild). +.It Va kern.geom.raid5.veri : No 0 (read-only) +This value shows the number of parity checks (during rebuild). +.It Va kern.geom.raid5.wreq2_cnt : No 0 (read-only) +Number of 2-phase writes (1. phase: read data&parity (or "other" data in case +of three disks); 2. phase: write data&parity). +.It Va kern.geom.raid5.wreq1_cnt : No 0 (read-only) +Number of 1-phase writes (sufficiently long chunks can be written in one +phase). +.It Va kern.geom.raid5.wreq_cnt : No 0 (read-only) +Write requests started by upper layer. 
+.It Va kern.geom.raid5.rreq_cnt : No 0 (read-only) +Read requests started by upper layer. +.It Va kern.geom.raid5.maxwql : No 0 (tunable) +This variable gives a hint for the maximum length of the write queue. +Write requests are queued until they are long enough or old enough or +until there are too many of them. +.It Va kern.geom.raid5.wdt : No 10 (tunable) +This variable determines the maximum age of a write request before it +is issued. +.It Va kern.geom.raid5.tooc : No 3 (tunable) +This variable determines the time-out-on-create. The provider is not +created before all consumers are present or the timeout is over. +.El +.Sh EXIT STATUS +Exit status is 0 on success, and 1 if the command fails. +.Sh EXAMPLES +The following example shows how to set up a RAID5 device from four disks with a +128KB stripe size for automatic configuration, +create a file system on it, +and mount it: +.Bd -literal -offset indent +graid5 label -v -s 131072 data /dev/da0 /dev/da1 /dev/da2 /dev/da3 +newfs /dev/raid5/data +mount /dev/raid5/data /mnt +[...] +umount /mnt +graid5 stop data +graid5 unload +.Ed +.Sh COMPATIBILITY +The +.Nm +interleave is in number of bytes, +unlike +.Xr ccdconfig 8 +and +.Xr atacontrol 8 +which use the number of sectors. +A +.Xr ccdconfig 8 +.Ar ileave +of +.Ql 128 +is 64 KB (128 512B sectors). +The same stripe interleave would be specified as +.Ql 65536 +for +.Nm . +.Sh SEE ALSO +.Xr geom 4 , +.Xr loader.conf 5 , +.Xr atacontrol 8 , +.Xr ccdconfig 8 , +.Xr geom 8 , +.Xr mount 8 , +.Xr newfs 8 , +.Xr sysctl 8 , +.Xr umount 8 , +.Xr vinum 8 +.Sh HISTORY +The +.Nm +utility appeared in +.Fx 5.3 . 
+.Sh AUTHORS +.An Arne Woerner Aq arne_woerner@yahoo.com +.An testing & tuning: Aq veronica@fluffles.net Added: projects/geom_raid5/sys/geom/raid5/g_raid5.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ projects/geom_raid5/sys/geom/raid5/g_raid5.c Mon Feb 2 20:51:26 2009 (r188025) @@ -0,0 +1,4174 @@ +/* + * Copyright (c) 2006 Arne Woerner <arne_woerner@yahoo.com> + * testing + tuning-tricks: veronica@fluffles.net + * derived from gstripe/gmirror (Pawel Jakub Dawidek <pjd@FreeBSD.org>) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$Id: g_raid5.c,v 1.271.1.274 2008/07/29 13:58:03 aw Exp aw $"); + +#ifdef KASSERT +#define MYKASSERT(a,b) KASSERT(a,b) +#else +#define MYKASSERT(a,b) do {if (!(a)) { G_RAID5_DEBUG(0,"KASSERT in line %d.",__LINE__); panic b;}} while (0) +#endif +#define ORDER(a,b) do {if (a > b) { int tmp = a; a = b; b = tmp; }} while(0) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/bio.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/kthread.h> +#include <sys/eventhandler.h> +#include <sys/sched.h> +#include <geom/geom.h> +#include <geom/raid5/g_raid5.h> + +/* + * our sysctl-s + */ +SYSCTL_DECL(_kern_geom); +SYSCTL_NODE(_kern_geom, OID_AUTO, raid5, CTLFLAG_RW, 0, "GEOM_RAID5 stuff"); +static u_int g_raid5_cache_size_mem = 64*1024*1024; +TUNABLE_INT("kern.geom.raid5.csm", &g_raid5_cache_size_mem); +SYSCTL_INT(_kern_geom_raid5, OID_AUTO, csm, CTLFLAG_RW, &g_raid5_cache_size_mem, + 0, "cache size ((<disk count-1)*<stripe size> per bucket) in bytes"); +static int g_raid5_cache_size = -5; +TUNABLE_INT("kern.geom.raid5.cs", &g_raid5_cache_size); +SYSCTL_INT(_kern_geom_raid5, OID_AUTO, cs, CTLFLAG_RW, &g_raid5_cache_size,0, + "cache size ((<disk count-1)*<stripe size> per bucket)"); +static u_int g_raid5_debug = 0; +TUNABLE_INT("kern.geom.raid5.debug", &g_raid5_debug); +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, debug, CTLFLAG_RW, &g_raid5_debug, 0, + "Debug level"); +static u_int g_raid5_tooc = 5; +TUNABLE_INT("kern.geom.raid5.tooc", &g_raid5_tooc); +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, tooc, CTLFLAG_RW, &g_raid5_tooc, 0, + "timeout on create (in order to avoid unnecessary rebuilds on reboot)"); +static u_int g_raid5_wdt = 5; +TUNABLE_INT("kern.geom.raid5.wdt", &g_raid5_wdt); +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wdt, CTLFLAG_RW, &g_raid5_wdt, 0, + "write request delay (in seconds)"); 
+static u_int g_raid5_maxwql = 25; +TUNABLE_INT("kern.geom.raid5.maxwql", &g_raid5_maxwql); +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, maxwql, CTLFLAG_RW, &g_raid5_maxwql, 0, + "max wait queue length"); +static u_int g_raid5_veri_fac = 25; +TUNABLE_INT("kern.geom.raid5.veri_fac", &g_raid5_veri_fac); +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri_fac, CTLFLAG_RW, &g_raid5_veri_fac, + 0, "veri brake factor in case of veri_min * X < veri_max"); +static u_int g_raid5_veri_nice = 100; +TUNABLE_INT("kern.geom.raid5.veri_nice", &g_raid5_veri_nice); +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO,veri_nice, CTLFLAG_RW,&g_raid5_veri_nice, + 0, "wait this many milli seconds after last user-read (less than 1sec)"); +static u_int g_raid5_vsc = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri, CTLFLAG_RD, &g_raid5_vsc, 0, + "verify stripe count"); +static u_int g_raid5_vwc = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, veri_w, CTLFLAG_RD, &g_raid5_vwc, 0, + "verify write count"); +static u_int g_raid5_rrc = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, rreq_cnt, CTLFLAG_RD, &g_raid5_rrc, 0, + "read request count"); +static u_int g_raid5_wrc = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq_cnt, CTLFLAG_RD, &g_raid5_wrc, 0, + "write request count"); +static u_int g_raid5_w1rc = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq1_cnt, CTLFLAG_RD, &g_raid5_w1rc, 0, + "write request count (1-phase)"); +static u_int g_raid5_w2rc = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wreq2_cnt, CTLFLAG_RD, &g_raid5_w2rc, 0, + "write request count (2-phase)"); +static u_int g_raid5_disks_ok = 50; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, dsk_ok, CTLFLAG_RD, &g_raid5_disks_ok,0, + "repeat EIO'ed request?"); +static u_int g_raid5_blked1 = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, blked1, CTLFLAG_RD, &g_raid5_blked1,0, + "1. kind block count"); +static u_int g_raid5_blked2 = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, blked2, CTLFLAG_RD, &g_raid5_blked2,0, + "2. 
kind block count"); +static u_int g_raid5_wqp = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, wqp, CTLFLAG_RD, &g_raid5_wqp,0, + "max. write queue length"); +static u_int g_raid5_mhm = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, mhm, CTLFLAG_RD, &g_raid5_mhm,0, + "memory hamster miss"); +static u_int g_raid5_mhh = 0; +SYSCTL_UINT(_kern_geom_raid5, OID_AUTO, mhh, CTLFLAG_RD, &g_raid5_mhh,0, + "memory hamster hit"); + +static MALLOC_DEFINE(M_RAID5, "raid5_data", "GEOM_RAID5 Data"); + +static int g_raid5_destroy(struct g_raid5_softc *sc, + boolean_t force, boolean_t noyoyo); +static int g_raid5_destroy_geom(struct gctl_req *req, struct g_class *mp, + struct g_geom *gp); + +static g_taste_t g_raid5_taste; +static g_ctl_req_t g_raid5_config; +static g_dumpconf_t g_raid5_dumpconf; + +static eventhandler_tag g_raid5_post_sync = NULL; + +static void g_raid5_init(struct g_class *mp); +static void g_raid5_fini(struct g_class *mp); + +struct g_class g_raid5_class = { + .name = G_RAID5_CLASS_NAME, + .version = G_VERSION, + .ctlreq = g_raid5_config, + .taste = g_raid5_taste, + .destroy_geom = g_raid5_destroy_geom, + .init = g_raid5_init, + .fini = g_raid5_fini +}; + +/* GCD & LCM */ +static __inline u_int +gcd(u_int a, u_int b) +{ + while (b != 0) { + u_int c = a; + a = b; + b = c % b; + } + return a; +} +static __inline u_int +g_raid5_lcm(u_int a, u_int b) +{ return ((a * b) / gcd(a, b)); } + +/* + * memory hamster stuff + * memory hamster stores in the first sizeof(int) bytes of each chunk + * that is requested * by malloc() the size of that chunk, + * while the bio-s only see that chunk at offset &[sizeof(int)]... 
+ */ +static __inline int +g_raid5_mh_sz_by_a(caddr_t m) +{ return ((int*)m)[-1]; } +static __inline int +g_raid5_mh_sz_by_i(struct g_raid5_softc *sc, int i) +{ return g_raid5_mh_sz_by_a(sc->mhl[i]); } +static __inline void +g_raid5_mh_sz(caddr_t m, int l) +{ ((int*)m)[-1] = l; } +static __inline void +g_raid5_free_by_a(caddr_t m) +{ free(m - sizeof(int), M_RAID5); } +static __inline void +g_raid5_free_by_i(struct g_raid5_softc *sc, int mi) +{ g_raid5_free_by_a(sc->mhl[mi]); } +static void +g_raid5_mh_all_free(struct g_raid5_softc *sc) { + for (int i=0; i<sc->mhc; i++) + g_raid5_free_by_i(sc,i); + sc->mhc = 0; +} +static caddr_t +g_raid5_malloc(struct g_raid5_softc *sc, int l, int force) +{ + mtx_lock(&sc->mh_mtx); + int h = l*2; + int fi = -1; + int fl = -1; + int i; + for (i=0; i<sc->mhc; i++) { + int ml = g_raid5_mh_sz_by_i(sc,i); + if (ml < l || ml > h) + continue; + if (fl > 0 && ml >= fl) + continue; + fl = ml; + fi = i; + if (ml == l) + break; + } + caddr_t m; + if (fi >= 0) { + m = sc->mhl[fi]; + sc->mhc--; + if (fi < sc->mhc) + sc->mhl[fi] = sc->mhl[sc->mhc]; + g_raid5_mhh++; + mtx_unlock(&sc->mh_mtx); + } else { + g_raid5_mhm++; + mtx_unlock(&sc->mh_mtx); + m = malloc(l+sizeof(fl), M_RAID5, M_NOWAIT); + if (m == NULL && force) { + g_raid5_mh_all_free(sc); + m = malloc(l+sizeof(fl), M_RAID5, M_WAITOK); + } + if (m != NULL) { + m += sizeof(fl); + g_raid5_mh_sz(m,l); + } + } + return m; +} +static void +g_raid5_free(struct g_raid5_softc *sc, caddr_t m) +{ + mtx_lock(&sc->mh_mtx); + MYKASSERT(((int*)m)[-1] > 0, ("this is no mem hamster chunk.")); + if (sc->mhc < sc->mhs) { + sc->mhl[sc->mhc] = m; + sc->mhc++; + } else { + int l = g_raid5_mh_sz_by_a(m); + int mi = -1; + int ml = -1; + for (int i=0; i<sc->mhc; i++) { + int nl = g_raid5_mh_sz_by_i(sc,i); + if (nl >= l) + continue; + if (ml > 0 && ml <= nl) + continue; + mi = i; + ml = nl; + } + if (mi < 0) + g_raid5_free_by_a(m); + else { + g_raid5_free_by_i(sc,mi); + sc->mhl[mi] = m; + } + } + 
mtx_unlock(&sc->mh_mtx); +} +static void +g_raid5_mh_destroy(struct g_raid5_softc *sc) +{ + g_raid5_mh_all_free(sc); + free(sc->mhl, M_RAID5); + mtx_destroy(&sc->mh_mtx); +} + +/* + * cache entry manager + * implements a simple queue (fst; for next bio it (ab)uses bio's bio_queue) + */ +static __inline int +g_raid5_ce_em(struct g_raid5_cache_entry *ce) +{ return ce->fst == NULL; } +static __inline struct g_raid5_cache_entry * +g_raid5_ce_by_i(struct g_raid5_softc *sc, int i) +{ return sc->ce + i; } +static struct g_raid5_cache_entry * +g_raid5_ce_by_sno(struct g_raid5_softc *sc, off_t s) +{ + struct g_raid5_cache_entry *fce = NULL; + MYKASSERT(s >= 0, ("s must not be negative.")); + s++; + int i = s % sc->cs; + for (int j=sc->cs; j>0; j--) { + struct g_raid5_cache_entry *ce = g_raid5_ce_by_i(sc,i); + if (ce->sno == s) + return ce; + if (fce==NULL && ce->sno == 0) + fce = ce; + i++; + if (i == sc->cs) + i = 0; + } + if (fce == NULL) { + sc->cfc++; + return NULL; + } + MYKASSERT(fce->fst == NULL, ("ce not free.")); + MYKASSERT(fce->dc == 0, ("%p dc inconsistency %d.",fce,fce->dc)); + MYKASSERT(fce->sno == 0, ("ce not free.")); + fce->sno = s; + return fce; +} +static __inline struct g_raid5_cache_entry * +g_raid5_ce_by_off(struct g_raid5_softc *sc, off_t o) +{ return g_raid5_ce_by_sno(sc, o/sc->fsl); } +static __inline struct g_raid5_cache_entry * +g_raid5_ce_by_bio(struct g_raid5_softc *sc, struct bio *bp) +{ return g_raid5_ce_by_off(sc, bp->bio_offset); } +#define G_RAID5_C_TRAVERSE(AAA,BBB,CCC) \ + for (int i = AAA->cs-1; i >= 0; i--) \ + G_RAID5_CE_TRAVERSE((CCC=g_raid5_ce_by_i(sc,i)), BBB) +#define G_RAID5_C_TRAVSAFE(AAA,BBB,CCC) \ + for (int i = AAA->cs-1; i >= 0; i--) \ + G_RAID5_CE_TRAVSAFE((CCC=g_raid5_ce_by_i(sc,i)), BBB) +#define G_RAID5_CE_TRAVERSE(AAA, BBB) \ + for (BBB = AAA->fst; BBB != NULL; BBB = g_raid5_q_nx(BBB)) +#define G_RAID5_CE_TRAVSAFE(AAA, BBB) \ + for (BBB = AAA->fst, BBB##_nxt = g_raid5_q_nx(BBB); \ + BBB != NULL; \ + BBB = BBB##_nxt, 
BBB##_nxt = g_raid5_q_nx(BBB)) +static __inline void +g_raid5_dc_inc(struct g_raid5_softc *sc, struct g_raid5_cache_entry *ce) +{ + MYKASSERT(ce->dc >= 0 && sc->dc >= 0 && sc->wqp >= 0, ("cannot happen.")); + if (ce->dc == 0) + sc->dc++; + ce->dc++; + sc->wqp++; +} +static __inline void +g_raid5_dc_dec(struct g_raid5_softc *sc, struct g_raid5_cache_entry *ce) +{ + MYKASSERT(ce->dc > 0 && sc->dc > 0 && sc->wqp > 0, ("cannot happen.")); + ce->dc--; + if (ce->dc == 0) + sc->dc--; *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200902022051.n12KpQhM007111>