From owner-svn-src-head@FreeBSD.ORG Wed Apr 20 18:43:29 2011 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 1E333106566B; Wed, 20 Apr 2011 18:43:29 +0000 (UTC) (envelope-from pjd@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 0C25D8FC08; Wed, 20 Apr 2011 18:43:29 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id p3KIhS4G061781; Wed, 20 Apr 2011 18:43:28 GMT (envelope-from pjd@svn.freebsd.org) Received: (from pjd@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id p3KIhS1b061778; Wed, 20 Apr 2011 18:43:28 GMT (envelope-from pjd@svn.freebsd.org) Message-Id: <201104201843.p3KIhS1b061778@svn.freebsd.org> From: Pawel Jakub Dawidek Date: Wed, 20 Apr 2011 18:43:28 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r220898 - head/sbin/hastd X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 20 Apr 2011 18:43:29 -0000 Author: pjd Date: Wed Apr 20 18:43:28 2011 New Revision: 220898 URL: http://svn.freebsd.org/changeset/base/220898 Log: When we become primary, we connect to the remote and expect it to be in secondary role. It is possible that the remote node is primary, but only because there was a role change and it didn't finish cleaning up (unmounting file systems, etc.). If we detect such situation, wait for the remote node to switch the role to secondary before accepting I/Os. If we don't wait for it in that case, we will most likely cause split-brain. MFC after: 1 week Modified: head/sbin/hastd/hastd.c head/sbin/hastd/primary.c Modified: head/sbin/hastd/hastd.c ============================================================================== --- head/sbin/hastd/hastd.c Wed Apr 20 18:04:34 2011 (r220897) +++ head/sbin/hastd/hastd.c Wed Apr 20 18:43:28 2011 (r220898) @@ -736,6 +736,13 @@ listen_accept(void) nv_add_stringf(nverr, "errmsg", "Remote node acts as %s for the resource and not as %s.", role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY)); + if (res->hr_role == HAST_ROLE_PRIMARY) { + /* + * If we act as primary request the other side to wait + * for us for a bit, as may might be finishing cleanups. + */ + nv_add_uint8(nverr, 1, "wait"); + } goto fail; } /* Does token (if exists) match? */ Modified: head/sbin/hastd/primary.c ============================================================================== --- head/sbin/hastd/primary.c Wed Apr 20 18:04:34 2011 (r220897) +++ head/sbin/hastd/primary.c Wed Apr 20 18:43:28 2011 (r220898) @@ -219,6 +219,7 @@ static pthread_cond_t range_regular_cond static struct rangelocks *range_sync; static bool range_sync_wait; static pthread_cond_t range_sync_cond; +static bool fullystarted; static void *ggate_recv_thread(void *arg); static void *local_send_thread(void *arg); @@ -524,7 +525,7 @@ primary_connect(struct hast_resource *re return (0); } -static bool +static int init_remote(struct hast_resource *res, struct proto_conn **inp, struct proto_conn **outp) { @@ -537,6 +538,7 @@ init_remote(struct hast_resource *res, s int64_t datasize; uint32_t mapsize; size_t size; + int error; PJDLOG_ASSERT((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL)); PJDLOG_ASSERT(real_remote(res)); @@ -545,7 +547,9 @@ init_remote(struct hast_resource *res, s errmsg = NULL; if (primary_connect(res, &out) == -1) - return (false); + return (ECONNREFUSED); + + error = ECONNABORTED; /* * First handshake step. @@ -577,6 +581,8 @@ init_remote(struct hast_resource *res, s errmsg = nv_get_string(nvin, "errmsg"); if (errmsg != NULL) { pjdlog_warning("%s", errmsg); + if (nv_exists(nvin, "wait")) + error = EBUSY; nv_free(nvin); goto close; } @@ -734,14 +740,14 @@ init_remote(struct hast_resource *res, s res->hr_remoteout = out; } event_send(res, EVENT_CONNECT); - return (true); + return (0); close: if (errmsg != NULL && strcmp(errmsg, "Split-brain condition!") == 0) event_send(res, EVENT_SPLITBRAIN); proto_close(out); if (in != NULL) proto_close(in); - return (false); + return (error); } static void @@ -920,8 +926,30 @@ hastd_primary(struct hast_resource *res) */ error = pthread_create(&td, NULL, ctrl_thread, res); PJDLOG_ASSERT(error == 0); - if (real_remote(res) && init_remote(res, NULL, NULL)) - sync_start(); + if (real_remote(res)) { + error = init_remote(res, NULL, NULL); + if (error == 0) { + sync_start(); + } else if (error == EBUSY) { + time_t start = time(NULL); + + pjdlog_warning("Waiting for remote node to become %s for %ds.", + role2str(HAST_ROLE_SECONDARY), + res->hr_timeout); + for (;;) { + sleep(1); + error = init_remote(res, NULL, NULL); + if (error != EBUSY) + break; + if (time(NULL) > start + res->hr_timeout) + break; + } + if (error == EBUSY) { + pjdlog_warning("Remote node is still %s, starting anyway.", + role2str(HAST_ROLE_PRIMARY)); + } + } + } error = pthread_create(&td, NULL, ggate_recv_thread, res); PJDLOG_ASSERT(error == 0); error = pthread_create(&td, NULL, local_send_thread, res); @@ -932,6 +960,7 @@ hastd_primary(struct hast_resource *res) PJDLOG_ASSERT(error == 0); error = pthread_create(&td, NULL, ggate_send_thread, res); PJDLOG_ASSERT(error == 0); + fullystarted = true; (void)sync_thread(res); } @@ -2095,7 +2124,7 @@ guard_one(struct hast_resource *res, uns pjdlog_debug(2, "remote_guard: Reconnecting to %s.", res->hr_remoteaddr); in = out = NULL; - if (init_remote(res, &in, &out)) { + if (init_remote(res, &in, &out) == 0) { rw_wlock(&hio_remote_lock[ncomp]); PJDLOG_ASSERT(res->hr_remotein == NULL); PJDLOG_ASSERT(res->hr_remoteout == NULL); @@ -2153,12 +2182,19 @@ guard_thread(void *arg) break; } - pjdlog_debug(2, "remote_guard: Checking connections."); - now = time(NULL); - if (lastcheck + HAST_KEEPALIVE <= now) { - for (ii = 0; ii < ncomps; ii++) - guard_one(res, ii); - lastcheck = now; + /* + * Don't check connections until we fully started, + * as we may still be looping, waiting for remote node + * to switch from primary to secondary. + */ + if (fullystarted) { + pjdlog_debug(2, "remote_guard: Checking connections."); + now = time(NULL); + if (lastcheck + HAST_KEEPALIVE <= now) { + for (ii = 0; ii < ncomps; ii++) + guard_one(res, ii); + lastcheck = now; + } } signo = sigtimedwait(&mask, NULL, &timeout); }