Date: Sat, 30 Oct 2010 15:25:56 +0300 From: Mikolaj Golub <to.my.trociny@gmail.com> To: Pawel Jakub Dawidek <pjd@FreeBSD.org> Cc: freebsd-stable@freebsd.org, Pete French <petefrench@ticketswitch.com> Subject: Re: hast vs ggate+gmirror sychrnoisation speed Message-ID: <86d3qr3m0b.fsf@kopusha.home.net> In-Reply-To: <86lj5i3zjt.fsf@kopusha.home.net> (Mikolaj Golub's message of "Thu, 28 Oct 2010 22:08:54 %2B0300") References: <E1PAlxN-000H5x-Eh@dilbert.ticketswitch.com> <86wrp3wj67.fsf@kopusha.home.net> <20101028163036.GA2347@garage.freebsd.pl> <86lj5i3zjt.fsf@kopusha.home.net>
next in thread | previous in thread | raw e-mail | index | archive | help
--=-=-= On Thu, 28 Oct 2010 22:08:54 +0300 Mikolaj Golub wrote to Pawel Jakub Dawidek: PJD>> I looked at the code and the keepalive packets are sent from another PJD>> thread. Could you try turning them off in primary.c and see if that PJD>> helps? MG> At first I set RETRY_SLEEP to 1 sec to have more keepalive packets. The errors MG> started to appear frequently: MG> Oct 28 21:35:53 bolek hastd[1709]: [storage] (secondary) Unable to receive request header: RPC version wrong. MG> Oct 28 21:35:54 bolek hastd[1632]: [storage] (secondary) Worker process exited ungracefully (pid=1709, exitcode=75). MG> Oct 28 21:36:12 bolek hastd[1722]: [storage] (secondary) Unable to receive request header: RPC version wrong. MG> Oct 28 21:36:12 bolek hastd[1632]: [storage] (secondary) Worker process exited ungracefully (pid=1722, exitcode=75). MG> ... MG> Now I have been running synchronization for more than half an hour with MG> keepalive_send disabled and have not seen any error. So :-) What do you think about sending keepalive in remote_send_thread() to avoid this problem and sending them only when a connection is idle (it looks like there is not much use in sending them all the time)? Something like in the patch below (it works for me). 
-- Mikolaj Golub --=-=-= Content-Type: text/x-patch Content-Disposition: attachment; filename=hastd.keepalive.patch Index: sbin/hastd/primary.c =================================================================== --- sbin/hastd/primary.c (revision 214550) +++ sbin/hastd/primary.c (working copy) @@ -190,6 +190,19 @@ static pthread_mutex_t metadata_lock; hio_next[(ncomp)]); \ mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \ } while (0) +#define QUEUE_TRY1(hio, name, ncomp) do { \ + mtx_lock(&hio_##name##_list_lock[(ncomp)]); \ + (hio) = TAILQ_FIRST(&hio_##name##_list[(ncomp)]); \ + if (hio == NULL) { \ + cv_timedwait(&hio_##name##_list_cond[(ncomp)], \ + &hio_##name##_list_lock[(ncomp)], RETRY_SLEEP); \ + hio = TAILQ_FIRST(&hio_##name##_list[(ncomp)]); \ + } \ + if (hio != NULL) \ + TAILQ_REMOVE(&hio_##name##_list[(ncomp)], hio, \ + hio_next[(ncomp)]); \ + mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \ +} while (0) #define QUEUE_TAKE2(hio, name) do { \ mtx_lock(&hio_##name##_list_lock); \ while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \ @@ -1176,6 +1189,38 @@ local_send_thread(void *arg) return (NULL); } +static void +keepalive_send(struct hast_resource *res, unsigned int ncomp) +{ + struct nv *nv; + + if (!ISCONNECTED(res, ncomp)) + return; + + assert(res->hr_remotein != NULL); + assert(res->hr_remoteout != NULL); + + nv = nv_alloc(); + nv_add_uint8(nv, HIO_KEEPALIVE, "cmd"); + if (nv_error(nv) != 0) { + nv_free(nv); + pjdlog_debug(1, + "keepalive_send: Unable to prepare header to send."); + return; + } + if (hast_proto_send(res, res->hr_remoteout, nv, NULL, 0) < 0) { + pjdlog_common(LOG_DEBUG, 1, errno, + "keepalive_send: Unable to send request"); + nv_free(nv); + rw_unlock(&hio_remote_lock[ncomp]); + remote_close(res, ncomp); + rw_rlock(&hio_remote_lock[ncomp]); + return; + } + nv_free(nv); + pjdlog_debug(2, "keepalive_send: Request sent."); +} + /* * Thread sends request to secondary node. 
*/ @@ -1184,6 +1229,7 @@ remote_send_thread(void *arg) { struct hast_resource *res = arg; struct g_gate_ctl_io *ggio; + time_t lastcheck, now; struct hio *hio; struct nv *nv; unsigned int ncomp; @@ -1194,10 +1240,19 @@ remote_send_thread(void *arg) /* Remote component is 1 for now. */ ncomp = 1; + lastcheck = time(NULL); for (;;) { pjdlog_debug(2, "remote_send: Taking request."); - QUEUE_TAKE1(hio, send, ncomp); + QUEUE_TRY1(hio, send, ncomp); + if (hio == NULL) { + now = time(NULL); + if (lastcheck + RETRY_SLEEP <= now) { + keepalive_send(res, ncomp); + lastcheck = now; + } + continue; + } pjdlog_debug(2, "remote_send: (%p) Got request.", hio); ggio = &hio->hio_ggio; switch (ggio->gctl_cmd) { @@ -1883,32 +1938,6 @@ failed: } static void -keepalive_send(struct hast_resource *res, unsigned int ncomp) -{ - struct nv *nv; - - nv = nv_alloc(); - nv_add_uint8(nv, HIO_KEEPALIVE, "cmd"); - if (nv_error(nv) != 0) { - nv_free(nv); - pjdlog_debug(1, - "keepalive_send: Unable to prepare header to send."); - return; - } - if (hast_proto_send(res, res->hr_remoteout, nv, NULL, 0) < 0) { - pjdlog_common(LOG_DEBUG, 1, errno, - "keepalive_send: Unable to send request"); - nv_free(nv); - rw_unlock(&hio_remote_lock[ncomp]); - remote_close(res, ncomp); - rw_rlock(&hio_remote_lock[ncomp]); - return; - } - nv_free(nv); - pjdlog_debug(2, "keepalive_send: Request sent."); -} - -static void guard_one(struct hast_resource *res, unsigned int ncomp) { struct proto_conn *in, *out; @@ -1926,12 +1955,6 @@ guard_one(struct hast_resource *res, unsigned int if (ISCONNECTED(res, ncomp)) { assert(res->hr_remotein != NULL); assert(res->hr_remoteout != NULL); - keepalive_send(res, ncomp); - } - - if (ISCONNECTED(res, ncomp)) { - assert(res->hr_remotein != NULL); - assert(res->hr_remoteout != NULL); rw_unlock(&hio_remote_lock[ncomp]); pjdlog_debug(2, "remote_guard: Connection to %s is ok.", res->hr_remoteaddr); --=-=-=--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?86d3qr3m0b.fsf>