From owner-svn-src-all@FreeBSD.ORG Fri Aug 27 14:26:38 2010 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 2DEC41065695; Fri, 27 Aug 2010 14:26:38 +0000 (UTC) (envelope-from pjd@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 1D09B8FC17; Fri, 27 Aug 2010 14:26:38 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id o7REQcIK080705; Fri, 27 Aug 2010 14:26:38 GMT (envelope-from pjd@svn.freebsd.org) Received: (from pjd@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id o7REQb1D080701; Fri, 27 Aug 2010 14:26:37 GMT (envelope-from pjd@svn.freebsd.org) Message-Id: <201008271426.o7REQb1D080701@svn.freebsd.org> From: Pawel Jakub Dawidek Date: Fri, 27 Aug 2010 14:26:37 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r211882 - head/sbin/hastd X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 27 Aug 2010 14:26:38 -0000 Author: pjd Date: Fri Aug 27 14:26:37 2010 New Revision: 211882 URL: http://svn.freebsd.org/changeset/base/211882 Log: Implement keepalive mechanism inside HAST protocol so we can detect secondary node failures quickly for HAST resources that are rarely modified. Remove XXX from a comment now that the guard thread never sleeps infinitely. MFC after: 2 weeks Obtained from: Wheel Systems Sp. z o.o. http://www.wheelsystems.com Modified: head/sbin/hastd/hast.h head/sbin/hastd/primary.c head/sbin/hastd/secondary.c Modified: head/sbin/hastd/hast.h ============================================================================== --- head/sbin/hastd/hast.h Fri Aug 27 14:12:53 2010 (r211881) +++ head/sbin/hastd/hast.h Fri Aug 27 14:26:37 2010 (r211882) @@ -48,7 +48,12 @@ #include "proto.h" -#define HAST_PROTO_VERSION 0 +/* + * Version history: + * 0 - initial version + * 1 - HIO_KEEPALIVE added + */ +#define HAST_PROTO_VERSION 1 #define EHAST_OK 0 #define EHAST_NOENTRY 1 @@ -74,6 +79,7 @@ #define HIO_WRITE 2 #define HIO_DELETE 3 #define HIO_FLUSH 4 +#define HIO_KEEPALIVE 5 #define HAST_TIMEOUT 5 #define HAST_CONFIG "/etc/hast.conf" Modified: head/sbin/hastd/primary.c ============================================================================== --- head/sbin/hastd/primary.c Fri Aug 27 14:12:53 2010 (r211881) +++ head/sbin/hastd/primary.c Fri Aug 27 14:26:37 2010 (r211882) @@ -151,7 +151,11 @@ static pthread_mutex_t metadata_lock; */ #define HAST_NCOMPONENTS 2 /* - * Number of seconds to sleep before next reconnect try. + * Number of seconds to sleep between keepalive packets. + */ +#define KEEPALIVE_SLEEP 10 +/* + * Number of seconds to sleep between reconnect retries. */ #define RECONNECT_SLEEP 5 @@ -886,11 +890,14 @@ remote_close(struct hast_resource *res, sync_stop(); /* - * Wake up guard thread, so it can immediately start reconnect. + * Wake up guard thread (if we are not called from within guard thread), + * so it can immediately start reconnect. */ - mtx_lock(&hio_guard_lock); - cv_signal(&hio_guard_cond); - mtx_unlock(&hio_guard_lock); + if (!mtx_owned(&hio_guard_lock)) { + mtx_lock(&hio_guard_lock); + cv_signal(&hio_guard_cond); + mtx_unlock(&hio_guard_lock); + } } /* @@ -1734,7 +1741,7 @@ sighandler(int sig) assert(!"invalid condition"); } /* - * XXX: Racy, but if we cannot obtain hio_guard_lock here, we don't + * Racy, but if we cannot obtain hio_guard_lock here, we don't * want to risk deadlock. */ unlock = mtx_trylock(&hio_guard_lock); @@ -1851,6 +1858,32 @@ failed: pjdlog_warning("Configuration not reloaded."); } +static void +keepalive_send(struct hast_resource *res, unsigned int ncomp) +{ + struct nv *nv; + + nv = nv_alloc(); + nv_add_uint8(nv, HIO_KEEPALIVE, "cmd"); + if (nv_error(nv) != 0) { + nv_free(nv); + pjdlog_debug(1, + "keepalive_send: Unable to prepare header to send."); + return; + } + if (hast_proto_send(res, res->hr_remoteout, nv, NULL, 0) < 0) { + pjdlog_common(LOG_DEBUG, 1, errno, + "keepalive_send: Unable to send request"); + nv_free(nv); + rw_unlock(&hio_remote_lock[ncomp]); + remote_close(res, ncomp); + rw_rlock(&hio_remote_lock[ncomp]); + return; + } + nv_free(nv); + pjdlog_debug(2, "keepalive_send: Request sent."); +} + /* * Thread guards remote connections and reconnects when needed, handles * signals, etc. @@ -1874,14 +1907,8 @@ guard_thread(void *arg) sighup_received = false; config_reload(); } - /* - * If all the connection will be fine, we will sleep until - * someone wakes us up. - * If any of the connections will be broken and we won't be - * able to connect, we will sleep only for RECONNECT_SLEEP - * seconds so we can retry soon. - */ - timeout = 0; + + timeout = KEEPALIVE_SLEEP; pjdlog_debug(2, "remote_guard: Checking connections."); mtx_lock(&hio_guard_lock); for (ii = 0; ii < ncomps; ii++) { @@ -1891,6 +1918,11 @@ guard_thread(void *arg) if (ISCONNECTED(res, ii)) { assert(res->hr_remotein != NULL); assert(res->hr_remoteout != NULL); + keepalive_send(res, ii); + } + if (ISCONNECTED(res, ii)) { + assert(res->hr_remotein != NULL); + assert(res->hr_remoteout != NULL); rw_unlock(&hio_remote_lock[ii]); pjdlog_debug(2, "remote_guard: Connection to %s is ok.", Modified: head/sbin/hastd/secondary.c ============================================================================== --- head/sbin/hastd/secondary.c Fri Aug 27 14:12:53 2010 (r211881) +++ head/sbin/hastd/secondary.c Fri Aug 27 14:26:37 2010 (r211882) @@ -413,6 +413,9 @@ reqlog(int loglevel, int debuglevel, int "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset, (uintmax_t)hio->hio_length); break; + case HIO_KEEPALIVE: + (void)snprintf(msg + len, sizeof(msg) - len, "KEEPALIVE."); + break; default: (void)snprintf(msg + len, sizeof(msg) - len, "UNKNOWN(%u).", (unsigned int)hio->hio_cmd); @@ -433,6 +436,8 @@ requnpack(struct hast_resource *res, str goto end; } switch (hio->hio_cmd) { + case HIO_KEEPALIVE: + break; case HIO_READ: case HIO_WRITE: case HIO_DELETE: @@ -517,7 +522,14 @@ recv_thread(void *arg) } reqlog(LOG_DEBUG, 2, -1, hio, "recv: (%p) Got request header: ", hio); - if (hio->hio_cmd == HIO_WRITE) { + if (hio->hio_cmd == HIO_KEEPALIVE) { + pjdlog_debug(2, + "recv: (%p) Moving request to the free queue.", + hio); + nv_free(hio->hio_nv); + QUEUE_INSERT(free, hio); + continue; + } else if (hio->hio_cmd == HIO_WRITE) { if (hast_proto_recv_data(res, res->hr_remotein, hio->hio_nv, hio->hio_data, MAXPHYS) < 0) { pjdlog_exit(EX_TEMPFAIL,