From owner-svn-src-head@FreeBSD.ORG  Wed Sep 28 13:08:52 2011
Return-Path: <owner-svn-src-head@FreeBSD.ORG>
Delivered-To: svn-src-head@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
	by hub.freebsd.org (Postfix) with ESMTP id 5B358106566B;
	Wed, 28 Sep 2011 13:08:52 +0000 (UTC) (envelope-from pjd@FreeBSD.org)
Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c])
	by mx1.freebsd.org (Postfix) with ESMTP id 4A18B8FC19;
	Wed, 28 Sep 2011 13:08:52 +0000 (UTC)
Received: from svn.freebsd.org (localhost [127.0.0.1])
	by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id p8SD8qvB074291;
	Wed, 28 Sep 2011 13:08:52 GMT (envelope-from pjd@svn.freebsd.org)
Received: (from pjd@localhost)
	by svn.freebsd.org (8.14.4/8.14.4/Submit) id p8SD8qGk074284;
	Wed, 28 Sep 2011 13:08:52 GMT (envelope-from pjd@svn.freebsd.org)
Message-Id: <201109281308.p8SD8qGk074284@svn.freebsd.org>
From: Pawel Jakub Dawidek <pjd@FreeBSD.org>
Date: Wed, 28 Sep 2011 13:08:52 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-head@freebsd.org
X-SVN-Group: head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Cc: 
Subject: svn commit: r225830 - head/sbin/hastd
X-BeenThere: svn-src-head@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: SVN commit messages for the src tree for head/-current
	<svn-src-head.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/svn-src-head>,
	<mailto:svn-src-head-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/svn-src-head>
List-Post: <mailto:svn-src-head@freebsd.org>
List-Help: <mailto:svn-src-head-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/svn-src-head>,
	<mailto:svn-src-head-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Wed, 28 Sep 2011 13:08:52 -0000

Author: pjd
Date: Wed Sep 28 13:08:51 2011
New Revision: 225830
URL: http://svn.freebsd.org/changeset/base/225830

Log:
  After every activemap change flush disk's write cache, so that write
  reordering won't cause the actual write to be committed before the
  corresponding extent is marked as dirty.
  
  It can be disabled in configuration file.
  
  If BIO_FLUSH is not supported by the underlying file system, we log a
  warning and never send BIO_FLUSH again to that GEOM provider.
  
  MFC after:	3 days

Modified:
  head/sbin/hastd/hast.conf.5
  head/sbin/hastd/hast.h
  head/sbin/hastd/hastd.c
  head/sbin/hastd/parse.y
  head/sbin/hastd/primary.c
  head/sbin/hastd/token.l

Modified: head/sbin/hastd/hast.conf.5
==============================================================================
--- head/sbin/hastd/hast.conf.5	Wed Sep 28 12:13:15 2011	(r225829)
+++ head/sbin/hastd/hast.conf.5	Wed Sep 28 13:08:51 2011	(r225830)
@@ -63,6 +63,7 @@ checksum <algorithm>
 compression <algorithm>
 timeout <seconds>
 exec <path>
+metaflush "on" | "off"
 
 on <node> {
 	# Node section
@@ -85,12 +86,14 @@ resource <name> {
 	local <path>
 	timeout <seconds>
 	exec <path>
+	metaflush "on" | "off"
 
 	on <node> {
 		# Resource-node section
 		name <name>
 		# Required
 		local <path>
+		metaflush "on" | "off"
 		# Required
 		remote <addr>
 		source <addr>
@@ -100,6 +103,7 @@ resource <name> {
 		name <name>
 		# Required
 		local <path>
+		metaflush "on" | "off"
 		# Required
 		remote <addr>
 		source <addr>
@@ -318,6 +322,25 @@ It can be one of:
 .Ar secondary ,
 .Ar primary .
 .Pp
+.It Ic metaflush on | off
+.Pp
+When set to
+.Va on ,
+flush write cache of the local provider after every metadata (activemap) update.
+Flushing write cache ensures that provider will not reorder writes and that
+metadata will be properly updated before real data is stored.
+If the local provider does not support flushing write cache (it returns
+.Er EOPNOTSUPP
+on the
+.Cm BIO_FLUSH
+request),
+.Nm hastd
+will disable
+.Ic metaflush
+automatically.
+The default value is
+.Va on .
+.Pp
 .It Ic name Aq name
 .Pp
 GEOM provider name that will appear as

Modified: head/sbin/hastd/hast.h
==============================================================================
--- head/sbin/hastd/hast.h	Wed Sep 28 12:13:15 2011	(r225829)
+++ head/sbin/hastd/hast.h	Wed Sep 28 13:08:51 2011	(r225830)
@@ -167,6 +167,8 @@ struct hast_resource {
 	off_t	hr_local_mediasize;
 	/* Sector size of local provider. */
 	unsigned int hr_local_sectorsize;
+	/* Flush write cache on metadata updates? */
+	int	hr_metaflush;
 
 	/* Descriptor for /dev/ggctl communication. */
 	int	hr_ggatefd;

Modified: head/sbin/hastd/hastd.c
==============================================================================
--- head/sbin/hastd/hastd.c	Wed Sep 28 12:13:15 2011	(r225829)
+++ head/sbin/hastd/hastd.c	Wed Sep 28 13:08:51 2011	(r225830)
@@ -386,6 +386,12 @@ resource_needs_restart(const struct hast
 			return (true);
 		if (strcmp(res0->hr_exec, res1->hr_exec) != 0)
 			return (true);
+		/*
+		 * When metaflush has changed we don't really need restart,
+		 * but it is just easier this way.
+		 */
+		if (res0->hr_metaflush != res1->hr_metaflush)
+			return (true);
 	}
 	return (false);
 }
@@ -416,6 +422,8 @@ resource_needs_reload(const struct hast_
 		return (true);
 	if (strcmp(res0->hr_exec, res1->hr_exec) != 0)
 		return (true);
+	if (res0->hr_metaflush != res1->hr_metaflush)
+		return (true);
 	return (false);
 }
 
@@ -436,6 +444,7 @@ resource_reload(const struct hast_resour
 	nv_add_int32(nvout, (int32_t)res->hr_compression, "compression");
 	nv_add_int32(nvout, (int32_t)res->hr_timeout, "timeout");
 	nv_add_string(nvout, res->hr_exec, "exec");
+	nv_add_int32(nvout, (int32_t)res->hr_metaflush, "metaflush");
 	if (nv_error(nvout) != 0) {
 		nv_free(nvout);
 		pjdlog_error("Unable to allocate header for reload message.");
@@ -591,12 +600,13 @@ hastd_reload(void)
 	 * recreating it.
 	 *
 	 * We do just reload (send SIGHUP to worker process) if we act as
-	 * PRIMARY, but only if remote address, replication mode, timeout or
-	 * execution path has changed. For those, there is no need to restart
-	 * worker process.
+	 * PRIMARY, but only if remote address, source address, replication
+	 * mode, timeout, execution path or metaflush has changed.
+	 * For those, there is no need to restart worker process.
 	 * If PRIMARY receives SIGHUP, it will reconnect if remote address or
-	 * replication mode has changed or simply set new timeout if only
-	 * timeout has changed.
+	 * source address has changed or it will set new timeout if only timeout
+	 * has changed or it will update metaflush if only metaflush has
+	 * changed.
 	 */
 	TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) {
 		TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) {
@@ -627,6 +637,7 @@ hastd_reload(void)
 			cres->hr_timeout = nres->hr_timeout;
 			strlcpy(cres->hr_exec, nres->hr_exec,
 			    sizeof(cres->hr_exec));
+			cres->hr_metaflush = nres->hr_metaflush;
 			if (cres->hr_workerpid != 0)
 				resource_reload(cres);
 		}

Modified: head/sbin/hastd/parse.y
==============================================================================
--- head/sbin/hastd/parse.y	Wed Sep 28 12:13:15 2011	(r225829)
+++ head/sbin/hastd/parse.y	Wed Sep 28 13:08:51 2011	(r225830)
@@ -68,9 +68,11 @@ static int depth0_checksum;
 static int depth0_compression;
 static int depth0_timeout;
 static char depth0_exec[PATH_MAX];
+static int depth0_metaflush;
 
 static char depth1_provname[PATH_MAX];
 static char depth1_localpath[PATH_MAX];
+static int depth1_metaflush;
 
 extern void yyrestart(FILE *);
 
@@ -197,6 +199,7 @@ yy_config_parse(const char *config, bool
 	strlcpy(depth0_listen_tcp6, HASTD_LISTEN_TCP6,
 	    sizeof(depth0_listen_tcp6));
 	depth0_exec[0] = '\0';
+	depth0_metaflush = 1;
 
 	lconfig = calloc(1, sizeof(*lconfig));
 	if (lconfig == NULL) {
@@ -328,6 +331,13 @@ yy_config_parse(const char *config, bool
 			strlcpy(curres->hr_exec, depth0_exec,
 			    sizeof(curres->hr_exec));
 		}
+		if (curres->hr_metaflush == -1) {
+			/*
+			 * Metaflush is not set at resource-level.
+			 * Use global or default setting.
+			 */
+			curres->hr_metaflush = depth0_metaflush;
+		}
 	}
 
 	return (lconfig);
@@ -355,8 +365,8 @@ yy_config_free(struct hastd_config *conf
 }
 %}
 
-%token CONTROL LISTEN PORT REPLICATION CHECKSUM COMPRESSION
-%token TIMEOUT EXEC EXTENTSIZE RESOURCE NAME LOCAL REMOTE SOURCE ON
+%token CONTROL LISTEN PORT REPLICATION CHECKSUM COMPRESSION METAFLUSH
+%token TIMEOUT EXEC EXTENTSIZE RESOURCE NAME LOCAL REMOTE SOURCE ON OFF
 %token FULLSYNC MEMSYNC ASYNC NONE CRC32 SHA256 HOLE LZF
 %token NUM STR OB CB
 
@@ -364,6 +374,7 @@ yy_config_free(struct hastd_config *conf
 %type <num> replication_type
 %type <num> checksum_type
 %type <num> compression_type
+%type <num> boolean
 
 %union
 {
@@ -396,6 +407,8 @@ statement:
 	|
 	exec_statement
 	|
+	metaflush_statement
+	|
 	node_statement
 	|
 	resource_statement
@@ -585,6 +598,34 @@ exec_statement:		EXEC STR
 	}
 	;
 
+metaflush_statement:	METAFLUSH boolean
+	{
+		switch (depth) {
+		case 0:
+			depth0_metaflush = $2;
+			break;
+		case 1:
+			PJDLOG_ASSERT(curres != NULL);
+			depth1_metaflush = $2;
+			break;
+		case 2:
+			if (!mynode)
+				break;
+			PJDLOG_ASSERT(curres != NULL);
+			curres->hr_metaflush = $2;
+			break;
+		default:
+			PJDLOG_ABORT("metaflush at wrong depth level");
+		}
+	}
+	;
+
+boolean:
+	ON		{ $$ = 1; }
+	|
+	OFF		{ $$ = 0; }
+	;
+
 node_statement:		ON node_start OB node_entries CB
 	{
 		mynode = false;
@@ -660,6 +701,13 @@ resource_statement:	RESOURCE resource_st
 				strlcpy(curres->hr_localpath, depth1_localpath,
 				    sizeof(curres->hr_localpath));
 			}
+			if (curres->hr_metaflush == -1 && depth1_metaflush != -1) {
+				/*
+				 * Metaflush is not set at node-level,
+				 * but is set at resource-level, use it.
+				 */
+				curres->hr_metaflush = depth1_metaflush;
+			}
 
 			/*
 			 * If provider name is not given, use resource name
@@ -713,6 +761,7 @@ resource_start:	STR
 		 */
 		depth1_provname[0] = '\0';
 		depth1_localpath[0] = '\0';
+		depth1_metaflush = -1;
 		hadmynode = false;
 
 		curres = calloc(1, sizeof(*curres));
@@ -739,6 +788,7 @@ resource_start:	STR
 		curres->hr_provname[0] = '\0';
 		curres->hr_localpath[0] = '\0';
 		curres->hr_localfd = -1;
+		curres->hr_metaflush = -1;
 		curres->hr_remoteaddr[0] = '\0';
 		curres->hr_sourceaddr[0] = '\0';
 		curres->hr_ggateunit = -1;
@@ -761,6 +811,8 @@ resource_entry:
 	|
 	exec_statement
 	|
+	metaflush_statement
+	|
 	name_statement
 	|
 	local_statement
@@ -869,6 +921,8 @@ resource_node_entry:
 	remote_statement
 	|
 	source_statement
+	|
+	metaflush_statement
 	;
 
 remote_statement:	REMOTE remote_str

Modified: head/sbin/hastd/primary.c
==============================================================================
--- head/sbin/hastd/primary.c	Wed Sep 28 12:13:15 2011	(r225829)
+++ head/sbin/hastd/primary.c	Wed Sep 28 13:08:51 2011	(r225830)
@@ -296,6 +296,17 @@ hast_activemap_flush(struct hast_resourc
 		pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk");
 		return (-1);
 	}
+	if (res->hr_metaflush == 1 && g_flush(res->hr_localfd) == -1) {
+		if (errno == EOPNOTSUPP) {
+			pjdlog_warning("The %s provider doesn't support flushing write cache. Disabling it.",
+			    res->hr_localpath);
+			res->hr_metaflush = 0;
+		} else {
+			pjdlog_errno(LOG_ERR,
+			    "Unable to flush disk cache on activemap update");
+			return (-1);
+		}
+	}
 	return (0);
 }
 
@@ -1999,6 +2010,7 @@ primary_config_reload(struct hast_resour
 	nv_assert(nv, "compression");
 	nv_assert(nv, "timeout");
 	nv_assert(nv, "exec");
+	nv_assert(nv, "metaflush");
 
 	ncomps = HAST_NCOMPONENTS;
 
@@ -2009,6 +2021,7 @@ primary_config_reload(struct hast_resour
 #define MODIFIED_COMPRESSION	0x10
 #define MODIFIED_TIMEOUT	0x20
 #define MODIFIED_EXEC		0x40
+#define MODIFIED_METAFLUSH	0x80
 	modified = 0;
 
 	vstr = nv_get_string(nv, "remoteaddr");
@@ -2050,6 +2063,11 @@ primary_config_reload(struct hast_resour
 		strlcpy(gres->hr_exec, vstr, sizeof(gres->hr_exec));
 		modified |= MODIFIED_EXEC;
 	}
+	vint = nv_get_int32(nv, "metaflush");
+	if (gres->hr_metaflush != vint) {
+		gres->hr_metaflush = vint;
+		modified |= MODIFIED_METAFLUSH;
+	}
 
 	/*
 	 * Change timeout for connected sockets.
@@ -2099,6 +2117,7 @@ primary_config_reload(struct hast_resour
 #undef	MODIFIED_COMPRESSION
 #undef	MODIFIED_TIMEOUT
 #undef	MODIFIED_EXEC
+#undef	MODIFIED_METAFLUSH
 
 	pjdlog_info("Configuration reloaded successfully.");
 }

Modified: head/sbin/hastd/token.l
==============================================================================
--- head/sbin/hastd/token.l	Wed Sep 28 12:13:15 2011	(r225829)
+++ head/sbin/hastd/token.l	Wed Sep 28 13:08:51 2011	(r225830)
@@ -53,12 +53,14 @@ checksum		{ DP; return CHECKSUM; }
 compression		{ DP; return COMPRESSION; }
 timeout			{ DP; return TIMEOUT; }
 exec			{ DP; return EXEC; }
+metaflush		{ DP; return METAFLUSH; }
 resource		{ DP; return RESOURCE; }
 name			{ DP; return NAME; }
 local			{ DP; return LOCAL; }
 remote			{ DP; return REMOTE; }
 source			{ DP; return SOURCE; }
 on			{ DP; return ON; }
+off			{ DP; return OFF; }
 fullsync		{ DP; return FULLSYNC; }
 memsync			{ DP; return MEMSYNC; }
 async			{ DP; return ASYNC; }