Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 27 Apr 2011 19:46:57 +0000 (UTC)
From:      Mikolaj Golub <trociny@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org
Subject:   svn commit: r221135 - in stable/8/sbin: hastctl hastd
Message-ID:  <201104271946.p3RJkv5Y086656@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: trociny
Date: Wed Apr 27 19:46:57 2011
New Revision: 221135
URL: http://svn.freebsd.org/changeset/base/221135

Log:
  MFC r220520, r220521, r220522, r220523, r220573, r220744, r220865,
    r220889, r220890, r220898, r220899:
  
  r220520:
  
  hastd(8) maintains a map of dirty extents, not hastctl(8). Fix this.
  
  r220521:
  
  Fix a typo in comments.
  
  r220522:
  
  In hast_proto_recv_data() check that the size of the data to be
  received does not exceed the buffer size.
  
  r220573 (pjd):
  
  The replication mode that is currently support is fullsync, not memsync.
  Correct this and print a warning if different replication mode is
  configured.
  
  r220523, r220744:
  
  Remove hast_proto_recv(). It was used only in one place, where
  hast_proto_recv_hdr() may be used.
  
  r220865 (pjd):
  
  Scenario:
  - We have two nodes connected and synchronized (local counters on both sides
    are 0).
  - We take secondary down and recreate it.
  - Primary connects to it and starts synchronization (but local counters are
    still 0).
  - We switch the roles.
  - Synchronization restarts but data is synchronized now from new primary
    (because local counters are 0) that doesn't have new data yet.
  
  This fix this issue we bump local counter on primary when we discover that
  connected secondary was recreated and has no data yet.
  
  Reported by:    trociny
  Discussed with: trociny
  Tested by:      trociny
  
  r220889 (pjd):
  
  Timeout must be positive.
  
  r220890 (pjd):
  
  If we act in different role than requested by the remote node, log it
  as a warning and not an error.
  
  MFC after:      1 week
  
  r220898 (pjd), r220899 (pjd):
  
  When we become primary, we connect to the remote and expect it to be in
  secondary role. It is possible that the remote node is primary, but only
  because there was a role change and it didn't finish cleaning up (unmounting
  file systems, etc.). If we detect such situation, wait for the remote node
  to switch the role to secondary before accepting I/Os. If we don't wait for
  it in that case, we will most likely cause split-brain.
  
  Approved by:	pjd (mentor)

Modified:
  stable/8/sbin/hastctl/hastctl.8
  stable/8/sbin/hastctl/hastctl.c
  stable/8/sbin/hastd/activemap.c
  stable/8/sbin/hastd/hast_proto.c
  stable/8/sbin/hastd/hast_proto.h
  stable/8/sbin/hastd/hastd.c
  stable/8/sbin/hastd/parse.y
  stable/8/sbin/hastd/primary.c
  stable/8/sbin/hastd/secondary.c
Directory Properties:
  stable/8/sbin/hastctl/   (props changed)
  stable/8/sbin/hastd/   (props changed)

Modified: stable/8/sbin/hastctl/hastctl.8
==============================================================================
--- stable/8/sbin/hastctl/hastctl.8	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastctl/hastctl.8	Wed Apr 27 19:46:57 2011	(r221135)
@@ -27,7 +27,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd March 13, 2011
+.Dd April 10, 2011
 .Dt HASTCTL 8
 .Os
 .Sh NAME
@@ -88,7 +88,7 @@ Additional options include:
 .It Fl e Ar extentsize
 Size of an extent.
 Extent is a block which is used for synchronization.
-.Nm
+.Xr hastd 8
 maintains a map of dirty extents and extent is the smallest region that
 can be marked as dirty.
 If any part of an extent is modified, entire extent will be synchronized

Modified: stable/8/sbin/hastctl/hastctl.c
==============================================================================
--- stable/8/sbin/hastctl/hastctl.c	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastctl/hastctl.c	Wed Apr 27 19:46:57 2011	(r221135)
@@ -492,7 +492,7 @@ main(int argc, char *argv[])
 	}
 	nv_free(nv);
 	/* ...and receive reply. */
-	if (hast_proto_recv(NULL, controlconn, &nv, NULL, 0) < 0) {
+	if (hast_proto_recv_hdr(controlconn, &nv) < 0) {
 		pjdlog_exit(EX_UNAVAILABLE,
 		    "cannot receive reply from hastd via %s",
 		    cfg->hc_controladdr);

Modified: stable/8/sbin/hastd/activemap.c
==============================================================================
--- stable/8/sbin/hastd/activemap.c	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastd/activemap.c	Wed Apr 27 19:46:57 2011	(r221135)
@@ -470,7 +470,7 @@ activemap_copyin(struct activemap *amp, 
 }
 
 /*
- * Function merges the given bitmap with existng one.
+ * Function merges the given bitmap with existing one.
  */
 void
 activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)

Modified: stable/8/sbin/hastd/hast_proto.c
==============================================================================
--- stable/8/sbin/hastd/hast_proto.c	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastd/hast_proto.c	Wed Apr 27 19:46:57 2011	(r221135)
@@ -189,9 +189,12 @@ hast_proto_recv_data(const struct hast_r
 	dptr = data;
 
 	dsize = nv_get_uint32(nv, "size");
-	if (dsize == 0)
+	if (dsize > size) {
+		errno = EINVAL;
+		goto end;
+	} else if (dsize == 0) {
 		(void)nv_set_error(nv, 0);
-	else {
+	} else {
 		if (proto_recv(conn, data, dsize) < 0)
 			goto end;
 		for (ii = sizeof(pipeline) / sizeof(pipeline[0]); ii > 0;
@@ -216,26 +219,3 @@ end:
 		free(dptr);
 	return (ret);
 }
-
-int
-hast_proto_recv(const struct hast_resource *res, struct proto_conn *conn,
-    struct nv **nvp, void *data, size_t size)
-{
-	struct nv *nv;
-	size_t dsize;
-	int ret;
-
-	ret = hast_proto_recv_hdr(conn, &nv);
-	if (ret < 0)
-		return (ret);
-	dsize = nv_get_uint32(nv, "size");
-	if (dsize == 0)
-		(void)nv_set_error(nv, 0);
-	else
-		ret = hast_proto_recv_data(res, conn, nv, data, size);
-	if (ret < 0)
-		nv_free(nv);
-	else
-		*nvp = nv;
-	return (ret);
-}

Modified: stable/8/sbin/hastd/hast_proto.h
==============================================================================
--- stable/8/sbin/hastd/hast_proto.h	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastd/hast_proto.h	Wed Apr 27 19:46:57 2011	(r221135)
@@ -39,8 +39,6 @@
 
 int hast_proto_send(const struct hast_resource *res, struct proto_conn *conn,
     struct nv *nv, const void *data, size_t size);
-int hast_proto_recv(const struct hast_resource *res, struct proto_conn *conn,
-    struct nv **nvp, void *data, size_t size);
 int hast_proto_recv_hdr(const struct proto_conn *conn, struct nv **nvp);
 int hast_proto_recv_data(const struct hast_resource *res,
     struct proto_conn *conn, struct nv *nv, void *data, size_t size);

Modified: stable/8/sbin/hastd/hastd.c
==============================================================================
--- stable/8/sbin/hastd/hastd.c	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastd/hastd.c	Wed Apr 27 19:46:57 2011	(r221135)
@@ -730,12 +730,19 @@ listen_accept(void)
 	}
 	/* Is the resource marked as secondary? */
 	if (res->hr_role != HAST_ROLE_SECONDARY) {
-		pjdlog_error("We act as %s for the resource and not as %s as requested by %s.",
+		pjdlog_warning("We act as %s for the resource and not as %s as requested by %s.",
 		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY),
 		    raddr);
 		nv_add_stringf(nverr, "errmsg",
 		    "Remote node acts as %s for the resource and not as %s.",
 		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
+		if (res->hr_role == HAST_ROLE_PRIMARY) {
+			/*
+			 * If we act as primary request the other side to wait
+			 * for us a bit, as we might be finishing cleanups.
+			 */
+			nv_add_uint8(nverr, 1, "wait");
+		}
 		goto fail;
 	}
 	/* Does token (if exists) match? */

Modified: stable/8/sbin/hastd/parse.y
==============================================================================
--- stable/8/sbin/hastd/parse.y	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastd/parse.y	Wed Apr 27 19:46:57 2011	(r221135)
@@ -169,7 +169,7 @@ yy_config_parse(const char *config, bool
 	lineno = 0;
 
 	depth0_timeout = HAST_TIMEOUT;
-	depth0_replication = HAST_REPLICATION_MEMSYNC;
+	depth0_replication = HAST_REPLICATION_FULLSYNC;
 	depth0_checksum = HAST_CHECKSUM_NONE;
 	depth0_compression = HAST_COMPRESSION_HOLE;
 	strlcpy(depth0_control, HAST_CONTROL, sizeof(depth0_control));
@@ -228,6 +228,13 @@ yy_config_parse(const char *config, bool
 			 */
 			curres->hr_replication = depth0_replication;
 		}
+		if (curres->hr_replication == HAST_REPLICATION_MEMSYNC ||
+		    curres->hr_replication == HAST_REPLICATION_ASYNC) {
+			pjdlog_warning("Replication mode \"%s\" is not implemented, falling back to \"%s\".",
+			    curres->hr_replication == HAST_REPLICATION_MEMSYNC ?
+			    "memsync" : "async", "fullsync");
+			curres->hr_replication = HAST_REPLICATION_FULLSYNC;
+		}
 		if (curres->hr_checksum == -1) {
 			/*
 			 * Checksum is not set at resource-level.
@@ -454,6 +461,10 @@ compression_type:
 
 timeout_statement:	TIMEOUT NUM
 	{
+		if ($2 <= 0) {
+			pjdlog_error("Negative or zero timeout.");
+			return (1);
+		}
 		switch (depth) {
 		case 0:
 			depth0_timeout = $2;

Modified: stable/8/sbin/hastd/primary.c
==============================================================================
--- stable/8/sbin/hastd/primary.c	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastd/primary.c	Wed Apr 27 19:46:57 2011	(r221135)
@@ -219,6 +219,7 @@ static pthread_cond_t range_regular_cond
 static struct rangelocks *range_sync;
 static bool range_sync_wait;
 static pthread_cond_t range_sync_cond;
+static bool fullystarted;
 
 static void *ggate_recv_thread(void *arg);
 static void *local_send_thread(void *arg);
@@ -524,7 +525,7 @@ primary_connect(struct hast_resource *re
 	return (0);
 }
 
-static bool
+static int
 init_remote(struct hast_resource *res, struct proto_conn **inp,
     struct proto_conn **outp)
 {
@@ -537,6 +538,7 @@ init_remote(struct hast_resource *res, s
 	int64_t datasize;
 	uint32_t mapsize;
 	size_t size;
+	int error;
 
 	PJDLOG_ASSERT((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL));
 	PJDLOG_ASSERT(real_remote(res));
@@ -545,7 +547,9 @@ init_remote(struct hast_resource *res, s
 	errmsg = NULL;
 
 	if (primary_connect(res, &out) == -1)
-		return (false);
+		return (ECONNREFUSED);
+
+	error = ECONNABORTED;
 
 	/*
 	 * First handshake step.
@@ -577,6 +581,8 @@ init_remote(struct hast_resource *res, s
 	errmsg = nv_get_string(nvin, "errmsg");
 	if (errmsg != NULL) {
 		pjdlog_warning("%s", errmsg);
+		if (nv_exists(nvin, "wait"))
+			error = EBUSY;
 		nv_free(nvin);
 		goto close;
 	}
@@ -667,6 +673,25 @@ init_remote(struct hast_resource *res, s
 	res->hr_secondary_localcnt = nv_get_uint64(nvin, "localcnt");
 	res->hr_secondary_remotecnt = nv_get_uint64(nvin, "remotecnt");
 	res->hr_syncsrc = nv_get_uint8(nvin, "syncsrc");
+	if (nv_exists(nvin, "virgin")) {
+		/*
+		 * Secondary was reinitialized, bump localcnt if it is 0 as
+		 * only we have the data.
+		 */
+		PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_PRIMARY);
+		PJDLOG_ASSERT(res->hr_secondary_localcnt == 0);
+
+		if (res->hr_primary_localcnt == 0) {
+			PJDLOG_ASSERT(res->hr_secondary_remotecnt == 0);
+
+			mtx_lock(&metadata_lock);
+			res->hr_primary_localcnt++;
+			pjdlog_debug(1, "Increasing localcnt to %ju.",
+			    (uintmax_t)res->hr_primary_localcnt);
+			(void)metadata_write(res);
+			mtx_unlock(&metadata_lock);
+		}
+	}
 	map = NULL;
 	mapsize = nv_get_uint32(nvin, "mapsize");
 	if (mapsize > 0) {
@@ -715,14 +740,14 @@ init_remote(struct hast_resource *res, s
 		res->hr_remoteout = out;
 	}
 	event_send(res, EVENT_CONNECT);
-	return (true);
+	return (0);
 close:
 	if (errmsg != NULL && strcmp(errmsg, "Split-brain condition!") == 0)
 		event_send(res, EVENT_SPLITBRAIN);
 	proto_close(out);
 	if (in != NULL)
 		proto_close(in);
-	return (false);
+	return (error);
 }
 
 static void
@@ -901,8 +926,30 @@ hastd_primary(struct hast_resource *res)
 	 */
 	error = pthread_create(&td, NULL, ctrl_thread, res);
 	PJDLOG_ASSERT(error == 0);
-	if (real_remote(res) && init_remote(res, NULL, NULL))
-		sync_start();
+	if (real_remote(res)) {
+		error = init_remote(res, NULL, NULL);
+		if (error == 0) {
+			sync_start();
+		} else if (error == EBUSY) {
+			time_t start = time(NULL);
+
+			pjdlog_warning("Waiting for remote node to become %s for %ds.",
+			    role2str(HAST_ROLE_SECONDARY),
+			    res->hr_timeout);
+			for (;;) {
+				sleep(1);
+				error = init_remote(res, NULL, NULL);
+				if (error != EBUSY)
+					break;
+				if (time(NULL) > start + res->hr_timeout)
+					break;
+			}
+			if (error == EBUSY) {
+				pjdlog_warning("Remote node is still %s, starting anyway.",
+				    role2str(HAST_ROLE_PRIMARY));
+			}
+		}
+	}
 	error = pthread_create(&td, NULL, ggate_recv_thread, res);
 	PJDLOG_ASSERT(error == 0);
 	error = pthread_create(&td, NULL, local_send_thread, res);
@@ -913,6 +960,7 @@ hastd_primary(struct hast_resource *res)
 	PJDLOG_ASSERT(error == 0);
 	error = pthread_create(&td, NULL, ggate_send_thread, res);
 	PJDLOG_ASSERT(error == 0);
+	fullystarted = true;
 	(void)sync_thread(res);
 }
 
@@ -2076,7 +2124,7 @@ guard_one(struct hast_resource *res, uns
 	pjdlog_debug(2, "remote_guard: Reconnecting to %s.",
 	    res->hr_remoteaddr);
 	in = out = NULL;
-	if (init_remote(res, &in, &out)) {
+	if (init_remote(res, &in, &out) == 0) {
 		rw_wlock(&hio_remote_lock[ncomp]);
 		PJDLOG_ASSERT(res->hr_remotein == NULL);
 		PJDLOG_ASSERT(res->hr_remoteout == NULL);
@@ -2134,12 +2182,19 @@ guard_thread(void *arg)
 			break;
 		}
 
-		pjdlog_debug(2, "remote_guard: Checking connections.");
-		now = time(NULL);
-		if (lastcheck + HAST_KEEPALIVE <= now) {
-			for (ii = 0; ii < ncomps; ii++)
-				guard_one(res, ii);
-			lastcheck = now;
+		/*
+		 * Don't check connections until we fully started,
+		 * as we may still be looping, waiting for remote node
+		 * to switch from primary to secondary.
+		 */
+		if (fullystarted) {
+			pjdlog_debug(2, "remote_guard: Checking connections.");
+			now = time(NULL);
+			if (lastcheck + HAST_KEEPALIVE <= now) {
+				for (ii = 0; ii < ncomps; ii++)
+					guard_one(res, ii);
+				lastcheck = now;
+			}
 		}
 		signo = sigtimedwait(&mask, NULL, &timeout);
 	}

Modified: stable/8/sbin/hastd/secondary.c
==============================================================================
--- stable/8/sbin/hastd/secondary.c	Wed Apr 27 19:36:35 2011	(r221134)
+++ stable/8/sbin/hastd/secondary.c	Wed Apr 27 19:46:57 2011	(r221135)
@@ -261,6 +261,7 @@ init_remote(struct hast_resource *res, s
 		} else {
 			memset(map, 0xff, mapsize);
 		}
+		nv_add_int8(nvout, 1, "virgin");
 		nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
 	} else if (res->hr_resuid != resuid) {
 		char errmsg[256];



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201104271946.p3RJkv5Y086656>