Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 24 Mar 2011 21:31:32 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r219974 - in head: etc/mtree include sbin/geom/class sbin/geom/class/raid sys/conf sys/geom/raid sys/modules/geom sys/modules/geom/geom_raid
Message-ID:  <201103242131.p2OLVWxS058123@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Thu Mar 24 21:31:32 2011
New Revision: 219974
URL: http://svn.freebsd.org/changeset/base/219974

Log:
  MFgraid/head:
  Add new RAID GEOM class, that is going to replace ataraid(4) in supporting
  various BIOS-based software RAIDs. Unlike ataraid(4) this implementation
  does not depend on legacy ata(4) subsystem and can be used with any disk
  drivers, including new CAM-based ones (ahci(4), siis(4), mvs(4), ata(4)
  with `options ATA_CAM`). To make code more readable and extensible, this
  implementation follows modular design, including core part and two sets
  of modules, implementing support for different metadata formats and RAID
  levels.
  
  Support for such popular metadata formats is now implemented:
  Intel, JMicron, NVIDIA, Promise (also used by AMD/ATI) and SiliconImage.
  
  Such RAID levels are now supported:
  RAID0, RAID1, RAID1E, RAID10, SINGLE, CONCAT.
  
  For all of these RAID levels and metadata formats this class supports
  full cycle of volume operations: reading, writing, creation, deletion,
  disk removal and insertion, rebuilding, dirty shutdown detection
  and resynchronization, bad sector recovery, faulty disks tracking,
  hot-spare disks. For Intel and Promise formats there is support for multiple
  volumes per disk set.
  
  See the graid(8) manual page for additional details.
  
  Co-authored by:	imp
  Sponsored by:	Cisco Systems, Inc. and iXsystems, Inc.

Added:
  head/sbin/geom/class/raid/
  head/sbin/geom/class/raid/Makefile   (contents, props changed)
  head/sbin/geom/class/raid/geom_raid.c   (contents, props changed)
  head/sbin/geom/class/raid/graid.8   (contents, props changed)
  head/sys/geom/raid/
  head/sys/geom/raid/g_raid.c   (contents, props changed)
  head/sys/geom/raid/g_raid.h   (contents, props changed)
  head/sys/geom/raid/g_raid_ctl.c   (contents, props changed)
  head/sys/geom/raid/g_raid_md_if.m   (contents, props changed)
  head/sys/geom/raid/g_raid_tr_if.m   (contents, props changed)
  head/sys/geom/raid/md_intel.c   (contents, props changed)
  head/sys/geom/raid/md_jmicron.c   (contents, props changed)
  head/sys/geom/raid/md_nvidia.c   (contents, props changed)
  head/sys/geom/raid/md_promise.c   (contents, props changed)
  head/sys/geom/raid/md_sii.c   (contents, props changed)
  head/sys/geom/raid/tr_concat.c   (contents, props changed)
  head/sys/geom/raid/tr_raid0.c   (contents, props changed)
  head/sys/geom/raid/tr_raid1.c   (contents, props changed)
  head/sys/geom/raid/tr_raid1e.c   (contents, props changed)
  head/sys/modules/geom/geom_raid/
  head/sys/modules/geom/geom_raid/Makefile   (contents, props changed)
Modified:
  head/etc/mtree/BSD.include.dist
  head/include/Makefile
  head/sbin/geom/class/Makefile
  head/sys/conf/NOTES
  head/sys/conf/files
  head/sys/conf/options
  head/sys/modules/geom/Makefile

Modified: head/etc/mtree/BSD.include.dist
==============================================================================
--- head/etc/mtree/BSD.include.dist	Thu Mar 24 20:28:09 2011	(r219973)
+++ head/etc/mtree/BSD.include.dist	Thu Mar 24 21:31:32 2011	(r219974)
@@ -190,6 +190,8 @@
         ..
         nop
         ..
+        raid
+        ..
         raid3
         ..
         shsec

Modified: head/include/Makefile
==============================================================================
--- head/include/Makefile	Thu Mar 24 20:28:09 2011	(r219973)
+++ head/include/Makefile	Thu Mar 24 21:31:32 2011	(r219974)
@@ -47,7 +47,7 @@ LSUBDIRS=	cam/ata cam/scsi \
 	${_fs_nwfs} fs/portalfs fs/procfs fs/smbfs fs/udf fs/unionfs \
 	geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \
 	geom/mirror geom/mountver geom/multipath geom/nop \
-	geom/raid3 geom/shsec geom/stripe geom/virstor \
+	geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
 	netgraph/atm netgraph/netflow \
 	security/audit \
 	security/mac_biba security/mac_bsdextended security/mac_lomac \

Modified: head/sbin/geom/class/Makefile
==============================================================================
--- head/sbin/geom/class/Makefile	Thu Mar 24 20:28:09 2011	(r219973)
+++ head/sbin/geom/class/Makefile	Thu Mar 24 21:31:32 2011	(r219974)
@@ -14,6 +14,7 @@ SUBDIR+=mountver
 SUBDIR+=multipath
 SUBDIR+=nop
 SUBDIR+=part
+SUBDIR+=raid
 SUBDIR+=raid3
 SUBDIR+=sched
 SUBDIR+=shsec

Added: head/sbin/geom/class/raid/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sbin/geom/class/raid/Makefile	Thu Mar 24 21:31:32 2011	(r219974)
@@ -0,0 +1,10 @@
+# $FreeBSD$
+
+.PATH:	${.CURDIR}/../../misc
+
+GEOM_CLASS=	raid
+
+DPADD=	${LIBMD}
+LDADD=	-lmd
+
+.include <bsd.lib.mk>

Added: head/sbin/geom/class/raid/geom_raid.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sbin/geom/class/raid/geom_raid.c	Thu Mar 24 21:31:32 2011	(r219974)
@@ -0,0 +1,91 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <errno.h>
+#include <paths.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <libgeom.h>
+#include <geom/raid/g_raid.h>
+#include <core/geom.h>
+#include <misc/subr.h>
+
+uint32_t lib_version = G_LIB_VERSION;
+uint32_t version = G_RAID_VERSION;
+
+struct g_command class_commands[] = {
+	{ "label", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] [-S size] [-s stripsize] format label level prov ..."
+	},
+	{ "add", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		{ 'S', "size", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		{ 's', "strip", G_VAL_OPTIONAL, G_TYPE_NUMBER },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] [-S size] [-s stripsize] name label level"
+	},
+	{ "delete", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name [label|num]"
+	},
+	{ "insert", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "remove", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "fail", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    "[-v] name prov ..."
+	},
+	{ "stop", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    "[-fv] name"
+	},
+	G_CMD_SENTINEL
+};
+

Added: head/sbin/geom/class/raid/graid.8
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sbin/geom/class/raid/graid.8	Thu Mar 24 21:31:32 2011	(r219974)
@@ -0,0 +1,266 @@
+.\" Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 22, 2011
+.Dt GRAID 8
+.Os
+.Sh NAME
+.Nm graid
+.Nd "control utility for software RAID devices"
+.Sh SYNOPSIS
+.Nm
+.Cm label
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar format
+.Ar label
+.Ar level
+.Ar prov ...
+.Nm
+.Cm add
+.Op Fl f
+.Op Fl S Ar size
+.Op Fl s Ar strip
+.Ar name
+.Ar label
+.Ar level
+.Nm
+.Cm delete
+.Op Fl f
+.Ar name
+.Op Ar label | Ar num
+.Nm
+.Cm insert
+.Ar name
+.Ar prov ...
+.Nm
+.Cm remove
+.Ar name
+.Ar prov ...
+.Nm
+.Cm fail
+.Ar name
+.Ar prov ...
+.Nm
+.Cm stop
+.Op Fl fv
+.Ar name ...
+.Nm
+.Cm list
+.Nm
+.Cm status
+.Nm
+.Cm load
+.Nm
+.Cm unload
+.Sh DESCRIPTION
+The
+.Nm
+utility is used to manage software RAID configurations, supported by the
+GEOM RAID class.
+GEOM RAID class uses on-disk metadata to provide access to software-RAID
+volumes defined by different RAID BIOSes.
+Depending on RAID BIOS type and its metadata format, different subsets of
+configurations and features are supported.
+To allow booting from RAID volume, the metadata format should match the
+RAID BIOS type and its capabilities.
+To guarantee that these match, it is recommended to create volumes via the
+RAID BIOS interface, while experienced users are free to do it using this
+utility.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm destroy"
+.It Cm label
+Create an array with a single volume.
+The
+.Ar format
+argument specifies the on-disk metadata format to use for this array,
+such as "Intel".
+The
+.Ar label
+argument specifies the label of the created volume.
+The
+.Ar level
+argument specifies the RAID level of the created volume, such as:
+"RAID0", "RAID1", etc.
+The subsequent list enumerates providers to use as array components.
+The special name "NONE" can be used to reserve space for absent disks.
+The order of components can be important, depending on specific RAID level
+and metadata format.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl s Ar strip"
+.It Fl f
+Enforce specified configuration creation if it is officially unsupported,
+but technically can be created.
+.It Fl S Ar size
+Use
+.Ar size
+bytes on each component for this volume.
+Should be used if several volumes per array are planned, or if smaller
+components are going to be inserted later.
+Defaults to size of the smallest component.
+.It Fl s Ar strip
+Specifies strip size in bytes.
+Defaults to 131072.
+.El
+.It Cm add
+Create another volume on the existing array.
+The
+.Ar name
+argument is the name of the existing array, reported by label command.
+The rest of arguments are the same as for the label command.
+.It Cm delete
+Delete volume(s) from the existing array.
+When the last volume is deleted, the array is also deleted and its metadata
+erased.
+The
+.Ar name
+argument is the name of existing array.
+Optional
+.Ar label
+or
+.Ar num
+arguments allow specifying volume for deletion.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Delete volume(s) even if it is still open.
+.El
+.It Cm insert
+Insert specified provider(s) into specified array instead of the first missing
+or failed components.
+If there are no such components, mark disk(s) as spare.
+.It Cm remove
+Remove the specified provider(s) from the specified array and erase metadata.
+If there are spare disks present, the removed disk(s) will be replaced by
+spares.
+.It Cm fail
+Mark the given disk(s) as failed, removing from active use unless absolutely
+necessary due to exhausted redundancy.
+If there are spare disks present, failed disk(s) will be replaced with one
+of them.
+.It Cm stop
+Stop the given array.
+The metadata will not be erased.
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Stop the given array even if some of its volumes are opened.
+.El
+.It Cm list
+See
+.Xr geom 8 .
+.It Cm status
+See
+.Xr geom 8 .
+.It Cm load
+See
+.Xr geom 8 .
+.It Cm unload
+See
+.Xr geom 8 .
+.El
+.Pp
+Additional options include:
+.Bl -tag -width ".Fl v"
+.It Fl v
+Be more verbose.
+.El
+.Sh SUPPORTED METADATA FORMATS
+The GEOM RAID class follows a modular design, allowing different metadata
+formats to be used.
+Support is currently implemented for the following formats:
+.Bl -tag -width "Intel"
+.It Intel
+The format used by Intel RAID BIOS.
+Supports up to two volumes per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks).
+Configurations not supported by Intel RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks).
+.It JMicron
+The format used by JMicron RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID10 (4 disks), CONCAT (2+ disks).
+Configurations not supported by JMicron RAID BIOS, but enforceable at your own
+risk: RAID1 (3+ disks), RAID1E (3+ disks), RAID10 (6+ disks), RAID5 (3+ disks).
+.It NVIDIA
+The format used by NVIDIA MediaShield RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4+ disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by NVIDIA MediaShield RAID BIOS, but enforceable
+at your own risk: RAID1 (3+ disks).
+.It Promise
+The format used by Promise and AMD/ATI RAID BIOSes and FreeBSD ataraid(4)
+driver.
+Supports multiple volumes per array.
+Each disk can be split to be used by up to two arbitrary volumes.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by RAID BIOSes, but enforceable at your
+own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.It SiI
+The format used by SiliconImage RAID BIOS.
+Supports one volume per array.
+Supports configurations: RAID0 (2+ disks), RAID1 (2 disks),
+RAID5 (3+ disks), RAID10 (4 disks), SINGLE (1 disk), CONCAT (2+ disks).
+Configurations not supported by SiliconImage RAID BIOS, but enforceable at your
+own risk: RAID1 (3+ disks), RAID10 (6+ disks).
+.El
+.Sh SUPPORTED RAID LEVELS
+The GEOM RAID class follows a modular design, allowing different RAID levels
+to be used.
+Support for the following RAID levels is currently implemented: RAID0, RAID1,
+RAID1E, RAID10, SINGLE, CONCAT.
+.Sh RAID LEVEL MIGRATION
+The GEOM RAID class has no support for RAID level migration, allowed by some
+metadata formats.
+If you started migration using BIOS or in some other way, make sure to
+complete it there.
+Do not run GEOM RAID class on migrating volumes under pain of possible data
+corruption!
+.Sh EXIT STATUS
+Exit status is 0 on success, and non-zero if the command fails.
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr geom 8 ,
+.Xr vinum 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in
+.Fx 9.0 .
+.Sh AUTHORS
+.An Alexander Motin Aq mav@FreeBSD.org
+.An M. Warner Losh Aq imp@FreeBSD.org

Modified: head/sys/conf/NOTES
==============================================================================
--- head/sys/conf/NOTES	Thu Mar 24 20:28:09 2011	(r219973)
+++ head/sys/conf/NOTES	Thu Mar 24 21:31:32 2011	(r219974)
@@ -163,6 +163,7 @@ options 	GEOM_PART_MBR		# MBR partitioni
 options 	GEOM_PART_PC98		# PC-9800 disk partitioning
 options 	GEOM_PART_VTOC8		# SMI VTOC8 disk label
 options 	GEOM_PC98		# NEC PC9800 partitioning
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_RAID3		# RAID3 functionality.
 options 	GEOM_SHSEC		# Shared secret.
 options 	GEOM_STRIPE		# Disk striping.

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Thu Mar 24 20:28:09 2011	(r219973)
+++ head/sys/conf/files	Thu Mar 24 21:31:32 2011	(r219974)
@@ -2115,6 +2115,19 @@ geom/part/g_part_gpt.c		optional geom_pa
 geom/part/g_part_mbr.c		optional geom_part_mbr
 geom/part/g_part_pc98.c		optional geom_part_pc98
 geom/part/g_part_vtoc8.c	optional geom_part_vtoc8
+geom/raid/g_raid.c		optional geom_raid
+geom/raid/g_raid_ctl.c		optional geom_raid
+geom/raid/g_raid_md_if.m	optional geom_raid
+geom/raid/g_raid_tr_if.m	optional geom_raid
+geom/raid/md_intel.c		optional geom_raid
+geom/raid/md_jmicron.c		optional geom_raid
+geom/raid/md_nvidia.c		optional geom_raid
+geom/raid/md_promise.c		optional geom_raid
+geom/raid/md_sii.c		optional geom_raid
+geom/raid/tr_concat.c		optional geom_raid
+geom/raid/tr_raid0.c		optional geom_raid
+geom/raid/tr_raid1.c		optional geom_raid
+geom/raid/tr_raid1e.c		optional geom_raid
 geom/raid3/g_raid3.c		optional geom_raid3
 geom/raid3/g_raid3_ctl.c	optional geom_raid3
 geom/shsec/g_shsec.c		optional geom_shsec

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options	Thu Mar 24 20:28:09 2011	(r219973)
+++ head/sys/conf/options	Thu Mar 24 21:31:32 2011	(r219974)
@@ -102,6 +102,7 @@ GEOM_PART_MBR	opt_geom.h
 GEOM_PART_PC98	opt_geom.h
 GEOM_PART_VTOC8	opt_geom.h
 GEOM_PC98	opt_geom.h
+GEOM_RAID	opt_geom.h
 GEOM_RAID3	opt_geom.h
 GEOM_SHSEC	opt_geom.h
 GEOM_STRIPE	opt_geom.h

Added: head/sys/geom/raid/g_raid.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/geom/raid/g_raid.c	Thu Mar 24 21:31:32 2011	(r219974)
@@ -0,0 +1,2340 @@
+/*-
+ * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/bio.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/eventhandler.h>
+#include <vm/uma.h>
+#include <geom/geom.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/sched.h>
+#include <geom/raid/g_raid.h>
+#include "g_raid_md_if.h"
+#include "g_raid_tr_if.h"
+
+static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
+
+SYSCTL_DECL(_kern_geom);
+SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
+u_int g_raid_aggressive_spare = 0;
+TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW,
+    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
+u_int g_raid_debug = 2;
+TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0,
+    "Debug level");
+int g_raid_read_err_thresh = 10;
+TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW,
+    &g_raid_read_err_thresh, 0,
+    "Number of read errors equated to disk failure");
+u_int g_raid_start_timeout = 30;
+TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW,
+    &g_raid_start_timeout, 0,
+    "Time to wait for all array components");
+static u_int g_raid_clean_time = 5;
+TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW,
+    &g_raid_clean_time, 0, "Mark volume as clean when idling");
+static u_int g_raid_disconnect_on_failure = 1;
+TUNABLE_INT("kern.geom.raid.disconnect_on_failure",
+    &g_raid_disconnect_on_failure);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
+    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
+static u_int g_raid_name_format = 0;
+TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
+    &g_raid_name_format, 0, "Providers name format.");
+static u_int g_raid_idle_threshold = 1000000;
+TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
+SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
+    &g_raid_idle_threshold, 1000000,
+    "Time in microseconds to consider a volume idle.");
+
+#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
+	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
+	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
+	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
+} while (0)
+
+LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
+    LIST_HEAD_INITIALIZER(g_raid_md_classes);
+
+LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
+    LIST_HEAD_INITIALIZER(g_raid_tr_classes);
+
+LIST_HEAD(, g_raid_volume) g_raid_volumes =
+    LIST_HEAD_INITIALIZER(g_raid_volumes);
+
+static eventhandler_tag g_raid_pre_sync = NULL;
+static int g_raid_started = 0;
+
+static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
+    struct g_geom *gp);
+static g_taste_t g_raid_taste;
+static void g_raid_init(struct g_class *mp);
+static void g_raid_fini(struct g_class *mp);
+
+struct g_class g_raid_class = {
+	.name = G_RAID_CLASS_NAME,
+	.version = G_VERSION,
+	.ctlreq = g_raid_ctl,
+	.taste = g_raid_taste,
+	.destroy_geom = g_raid_destroy_geom,
+	.init = g_raid_init,
+	.fini = g_raid_fini
+};
+
+static void g_raid_destroy_provider(struct g_raid_volume *vol);
+static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
+static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
+static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
+static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
+static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
+    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
+static void g_raid_start(struct bio *bp);
+static void g_raid_start_request(struct bio *bp);
+static void g_raid_disk_done(struct bio *bp);
+static void g_raid_poll(struct g_raid_softc *sc);
+
+static const char *
+g_raid_node_event2str(int event)
+{
+
+	switch (event) {
+	case G_RAID_NODE_E_WAKE:
+		return ("WAKE");
+	case G_RAID_NODE_E_START:
+		return ("START");
+	default:
+		return ("INVALID");
+	}
+}
+
+const char *
+g_raid_disk_state2str(int state)
+{
+
+	switch (state) {
+	case G_RAID_DISK_S_NONE:
+		return ("NONE");
+	case G_RAID_DISK_S_OFFLINE:
+		return ("OFFLINE");
+	case G_RAID_DISK_S_FAILED:
+		return ("FAILED");
+	case G_RAID_DISK_S_STALE_FAILED:
+		return ("STALE_FAILED");
+	case G_RAID_DISK_S_SPARE:
+		return ("SPARE");
+	case G_RAID_DISK_S_STALE:
+		return ("STALE");
+	case G_RAID_DISK_S_ACTIVE:
+		return ("ACTIVE");
+	default:
+		return ("INVALID");
+	}
+}
+
+static const char *
+g_raid_disk_event2str(int event)
+{
+
+	switch (event) {
+	case G_RAID_DISK_E_DISCONNECTED:
+		return ("DISCONNECTED");
+	default:
+		return ("INVALID");
+	}
+}
+
+const char *
+g_raid_subdisk_state2str(int state)
+{
+
+	switch (state) {
+	case G_RAID_SUBDISK_S_NONE:
+		return ("NONE");
+	case G_RAID_SUBDISK_S_FAILED:
+		return ("FAILED");
+	case G_RAID_SUBDISK_S_NEW:
+		return ("NEW");
+	case G_RAID_SUBDISK_S_REBUILD:
+		return ("REBUILD");
+	case G_RAID_SUBDISK_S_UNINITIALIZED:
+		return ("UNINITIALIZED");
+	case G_RAID_SUBDISK_S_STALE:
+		return ("STALE");
+	case G_RAID_SUBDISK_S_RESYNC:
+		return ("RESYNC");
+	case G_RAID_SUBDISK_S_ACTIVE:
+		return ("ACTIVE");
+	default:
+		return ("INVALID");
+	}
+}
+
+static const char *
+g_raid_subdisk_event2str(int event)
+{
+
+	switch (event) {
+	case G_RAID_SUBDISK_E_NEW:
+		return ("NEW");
+	case G_RAID_SUBDISK_E_DISCONNECTED:
+		return ("DISCONNECTED");
+	default:
+		return ("INVALID");
+	}
+}
+
+const char *
+g_raid_volume_state2str(int state)
+{
+
+	switch (state) {
+	case G_RAID_VOLUME_S_STARTING:
+		return ("STARTING");
+	case G_RAID_VOLUME_S_BROKEN:
+		return ("BROKEN");
+	case G_RAID_VOLUME_S_DEGRADED:
+		return ("DEGRADED");
+	case G_RAID_VOLUME_S_SUBOPTIMAL:
+		return ("SUBOPTIMAL");
+	case G_RAID_VOLUME_S_OPTIMAL:
+		return ("OPTIMAL");
+	case G_RAID_VOLUME_S_UNSUPPORTED:
+		return ("UNSUPPORTED");
+	case G_RAID_VOLUME_S_STOPPED:
+		return ("STOPPED");
+	default:
+		return ("INVALID");
+	}
+}
+
+static const char *
+g_raid_volume_event2str(int event)
+{
+
+	switch (event) {
+	case G_RAID_VOLUME_E_UP:
+		return ("UP");
+	case G_RAID_VOLUME_E_DOWN:
+		return ("DOWN");
+	case G_RAID_VOLUME_E_START:
+		return ("START");
+	case G_RAID_VOLUME_E_STARTMD:
+		return ("STARTMD");
+	default:
+		return ("INVALID");
+	}
+}
+
+const char *
+g_raid_volume_level2str(int level, int qual)
+{
+
+	switch (level) {
+	case G_RAID_VOLUME_RL_RAID0:
+		return ("RAID0");
+	case G_RAID_VOLUME_RL_RAID1:
+		return ("RAID1");
+	case G_RAID_VOLUME_RL_RAID3:
+		return ("RAID3");
+	case G_RAID_VOLUME_RL_RAID4:
+		return ("RAID4");
+	case G_RAID_VOLUME_RL_RAID5:
+		return ("RAID5");
+	case G_RAID_VOLUME_RL_RAID6:
+		return ("RAID6");
+	case G_RAID_VOLUME_RL_RAID1E:
+		return ("RAID1E");
+	case G_RAID_VOLUME_RL_SINGLE:
+		return ("SINGLE");
+	case G_RAID_VOLUME_RL_CONCAT:
+		return ("CONCAT");
+	case G_RAID_VOLUME_RL_RAID5E:
+		return ("RAID5E");
+	case G_RAID_VOLUME_RL_RAID5EE:
+		return ("RAID5EE");
+	default:
+		return ("UNKNOWN");
+	}
+}
+
+int
+g_raid_volume_str2level(const char *str, int *level, int *qual)
+{
+
+	*level = G_RAID_VOLUME_RL_UNKNOWN;
+	*qual = G_RAID_VOLUME_RLQ_NONE;
+	if (strcasecmp(str, "RAID0") == 0)
+		*level = G_RAID_VOLUME_RL_RAID0;
+	else if (strcasecmp(str, "RAID1") == 0)
+		*level = G_RAID_VOLUME_RL_RAID1;
+	else if (strcasecmp(str, "RAID3") == 0)
+		*level = G_RAID_VOLUME_RL_RAID3;
+	else if (strcasecmp(str, "RAID4") == 0)
+		*level = G_RAID_VOLUME_RL_RAID4;
+	else if (strcasecmp(str, "RAID5") == 0)
+		*level = G_RAID_VOLUME_RL_RAID5;
+	else if (strcasecmp(str, "RAID6") == 0)
+		*level = G_RAID_VOLUME_RL_RAID6;
+	else if (strcasecmp(str, "RAID10") == 0 ||
+		 strcasecmp(str, "RAID1E") == 0)
+		*level = G_RAID_VOLUME_RL_RAID1E;
+	else if (strcasecmp(str, "SINGLE") == 0)
+		*level = G_RAID_VOLUME_RL_SINGLE;
+	else if (strcasecmp(str, "CONCAT") == 0)
+		*level = G_RAID_VOLUME_RL_CONCAT;
+	else if (strcasecmp(str, "RAID5E") == 0)
+		*level = G_RAID_VOLUME_RL_RAID5E;
+	else if (strcasecmp(str, "RAID5EE") == 0)
+		*level = G_RAID_VOLUME_RL_RAID5EE;
+	else
+		return (-1);
+	return (0);
+}
+
+const char *
+g_raid_get_diskname(struct g_raid_disk *disk)
+{
+
+	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
+		return ("[unknown]");
+	return (disk->d_consumer->provider->name);
+}
+
+void
+g_raid_report_disk_state(struct g_raid_disk *disk)
+{
+	struct g_raid_subdisk *sd;
+	int len, state;
+	uint32_t s;
+
+	if (disk->d_consumer == NULL)
+		return;
+	if (disk->d_state == G_RAID_DISK_S_FAILED ||
+	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
+		s = G_STATE_FAILED;
+	} else {
+		state = G_RAID_SUBDISK_S_ACTIVE;
+		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
+			if (sd->sd_state < state)
+				state = sd->sd_state;
+		}
+		if (state == G_RAID_SUBDISK_S_FAILED)
+			s = G_STATE_FAILED;
+		else if (state == G_RAID_SUBDISK_S_NEW ||
+		    state == G_RAID_SUBDISK_S_REBUILD)
+			s = G_STATE_REBUILD;
+		else if (state == G_RAID_SUBDISK_S_STALE ||
+		    state == G_RAID_SUBDISK_S_RESYNC)
+			s = G_STATE_RESYNC;
+		else
+			s = G_STATE_ACTIVE;
+	}
+	len = sizeof(s);
+	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
+	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
+	    g_raid_get_diskname(disk), s);
+}
+
+void
+g_raid_change_disk_state(struct g_raid_disk *disk, int state)
+{
+
+	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
+	    g_raid_get_diskname(disk),
+	    g_raid_disk_state2str(disk->d_state),
+	    g_raid_disk_state2str(state));
+	disk->d_state = state;
+	g_raid_report_disk_state(disk);
+}
+
+void
+g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
+{
+
+	G_RAID_DEBUG1(0, sd->sd_softc,
+	    "Subdisk %s:%d-%s state changed from %s to %s.",
+	    sd->sd_volume->v_name, sd->sd_pos,
+	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
+	    g_raid_subdisk_state2str(sd->sd_state),
+	    g_raid_subdisk_state2str(state));
+	sd->sd_state = state;
+	if (sd->sd_disk)
+		g_raid_report_disk_state(sd->sd_disk);
+}
+
+void
+g_raid_change_volume_state(struct g_raid_volume *vol, int state)
+{
+
+	G_RAID_DEBUG1(0, vol->v_softc,
+	    "Volume %s state changed from %s to %s.",
+	    vol->v_name,
+	    g_raid_volume_state2str(vol->v_state),
+	    g_raid_volume_state2str(state));
+	vol->v_state = state;
+}
+
+/*
+ * --- Events handling functions ---
+ * Events in geom_raid are used to maintain subdisks and volumes status
+ * from one thread to simplify locking.
+ */
+static void
+g_raid_event_free(struct g_raid_event *ep)
+{
+
+	free(ep, M_RAID);
+}
+
+int
+g_raid_event_send(void *arg, int event, int flags)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_event *ep;
+	int error;
+
+	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
+		sc = ((struct g_raid_volume *)arg)->v_softc;
+	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
+		sc = ((struct g_raid_disk *)arg)->d_softc;
+	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
+		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
+	} else {
+		sc = arg;
+	}
+	ep = malloc(sizeof(*ep), M_RAID,
+	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
+	if (ep == NULL)
+		return (ENOMEM);
+	ep->e_tgt = arg;
+	ep->e_event = event;
+	ep->e_flags = flags;
+	ep->e_error = 0;
+	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
+	mtx_lock(&sc->sc_queue_mtx);
+	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
+	mtx_unlock(&sc->sc_queue_mtx);
+	wakeup(sc);
+
+	if ((flags & G_RAID_EVENT_WAIT) == 0)
+		return (0);
+
+	sx_assert(&sc->sc_lock, SX_XLOCKED);
+	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
+	sx_xunlock(&sc->sc_lock);
+	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
+		mtx_lock(&sc->sc_queue_mtx);
+		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
+		    hz * 5);
+	}
+	error = ep->e_error;
+	g_raid_event_free(ep);
+	sx_xlock(&sc->sc_lock);
+	return (error);
+}
+
+static void
+g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
+{
+	struct g_raid_event *ep, *tmpep;
+
+	sx_assert(&sc->sc_lock, SX_XLOCKED);
+
+	mtx_lock(&sc->sc_queue_mtx);
+	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
+		if (ep->e_tgt != tgt)
+			continue;
+		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
+		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
+			g_raid_event_free(ep);
+		else {
+			ep->e_error = ECANCELED;
+			wakeup(ep);
+		}
+	}
+	mtx_unlock(&sc->sc_queue_mtx);
+}
+
+static int
+g_raid_event_check(struct g_raid_softc *sc, void *tgt)
+{
+	struct g_raid_event *ep;
+	int	res = 0;
+
+	sx_assert(&sc->sc_lock, SX_XLOCKED);
+
+	mtx_lock(&sc->sc_queue_mtx);
+	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
+		if (ep->e_tgt != tgt)
+			continue;
+		res = 1;
+		break;
+	}
+	mtx_unlock(&sc->sc_queue_mtx);
+	return (res);
+}
+
+/*
+ * Return the number of disks in given state.
+ * If state is equal to -1, count all connected disks.
+ */
+u_int
+g_raid_ndisks(struct g_raid_softc *sc, int state)
+{
+	struct g_raid_disk *disk;
+	u_int n;
+
+	sx_assert(&sc->sc_lock, SX_LOCKED);
+
+	n = 0;
+	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
+		if (disk->d_state == state || state == -1)
+			n++;
+	}
+	return (n);
+}
+
+/*
+ * Return the number of subdisks in given state.
+ * If state is equal to -1, count all subdisks.
+ */
+u_int
+g_raid_nsubdisks(struct g_raid_volume *vol, int state)
+{
+	struct g_raid_subdisk *subdisk;
+	struct g_raid_softc *sc;
+	u_int i, n ;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201103242131.p2OLVWxS058123>