From owner-svn-src-head@FreeBSD.ORG Tue Dec 10 19:56:27 2013 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 21DA81A9; Tue, 10 Dec 2013 19:56:27 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mx1.freebsd.org (Postfix) with ESMTPS id 018E11AA1; Tue, 10 Dec 2013 19:56:27 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.7/8.14.7) with ESMTP id rBAJuQhd093519; Tue, 10 Dec 2013 19:56:26 GMT (envelope-from trociny@svn.freebsd.org) Received: (from trociny@localhost) by svn.freebsd.org (8.14.7/8.14.7/Submit) id rBAJuQ26093518; Tue, 10 Dec 2013 19:56:26 GMT (envelope-from trociny@svn.freebsd.org) Message-Id: <201312101956.rBAJuQ26093518@svn.freebsd.org> From: Mikolaj Golub Date: Tue, 10 Dec 2013 19:56:26 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r259191 - head/sbin/hastd X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.17 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 10 Dec 2013 19:56:27 -0000 Author: trociny Date: Tue Dec 10 19:56:26 2013 New Revision: 259191 URL: http://svnweb.freebsd.org/changeset/base/259191 Log: For memsync replication, hio_countdown is used not only as an indication when a request can be moved to done queue, but also for detecting the current state of memsync request. This approach has problems, e.g. leaking a request if memsynk ack from the secondary failed, or racy usage of write_complete, which should be called only once per write request, but for memsync can be entered by local_send_thread and ggate_send_thread simultaneously. So the following approach is implemented instead: 1) Use hio_countdown only for counting components we waiting to complete, i.e. initially it is always 2 for any replication mode. 2) To distinguish between "memsync ack" and "memsync fin" responses from the secondary, add and use hio_memsyncacked field. 3) write_complete() in component threads is called only before releasing hio_countdown (i.e. before the hio may be returned to the done queue). 4) Add and use hio_writecount refcounter to detect when write_complete() can be called in memsync case. Reported by: Pete French petefrench ingresso.co.uk Tested by: Pete French petefrench ingresso.co.uk MFC after: 2 weeks Modified: head/sbin/hastd/primary.c Modified: head/sbin/hastd/primary.c ============================================================================== --- head/sbin/hastd/primary.c Tue Dec 10 19:48:48 2013 (r259190) +++ head/sbin/hastd/primary.c Tue Dec 10 19:56:26 2013 (r259191) @@ -94,6 +94,15 @@ struct hio { */ bool hio_done; /* + * Number of components we are still waiting before sending write + * completion ack to GEOM Gate. Used for memsync. + */ + refcnt_t hio_writecount; + /* + * Memsync request was acknowleged by remote. + */ + bool hio_memsyncacked; + /* * Remember replication from the time the request was initiated, * so we won't get confused when replication changes on reload. */ @@ -231,6 +240,9 @@ static pthread_mutex_t metadata_lock; #define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1) #define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0) #define ISSYNCREQDONE(hio) ((hio)->hio_ggio.gctl_unit == -2) +#define ISMEMSYNCWRITE(hio) \ + (((hio)->hio_replication == HAST_REPLICATION_MEMSYNC && \ + (hio)->hio_ggio.gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio))) static struct hast_resource *gres; @@ -1344,6 +1356,10 @@ ggate_recv_thread(void *arg) } else { mtx_unlock(&res->hr_amp_lock); } + if (hio->hio_replication == HAST_REPLICATION_MEMSYNC) { + hio->hio_memsyncacked = false; + refcnt_init(&hio->hio_writecount, ncomps); + } break; case BIO_DELETE: res->hr_stat_delete++; @@ -1354,13 +1370,7 @@ ggate_recv_thread(void *arg) } pjdlog_debug(2, "ggate_recv: (%p) Moving request to the send queues.", hio); - if (hio->hio_replication == HAST_REPLICATION_MEMSYNC && - ggio->gctl_cmd == BIO_WRITE) { - /* Each remote request needs two responses in memsync. */ - refcnt_init(&hio->hio_countdown, ncomps + 1); - } else { - refcnt_init(&hio->hio_countdown, ncomps); - } + refcnt_init(&hio->hio_countdown, ncomps); for (ii = ncomp; ii < ncomps; ii++) QUEUE_INSERT1(hio, send, ii); } @@ -1470,42 +1480,13 @@ local_send_thread(void *arg) } break; } - - if (hio->hio_replication != HAST_REPLICATION_MEMSYNC || - ggio->gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) { - if (refcnt_release(&hio->hio_countdown) > 0) - continue; - } else { - /* - * Depending on hio_countdown value, requests finished - * in the following order: - * 0: remote memsync, remote final, local write - * 1: remote memsync, local write, (remote final) - * 2: local write, (remote memsync), (remote final) - */ - switch (refcnt_release(&hio->hio_countdown)) { - case 0: - /* - * Local write finished as last. - */ - break; - case 1: - /* - * Local write finished after remote memsync - * reply arrvied. We can complete the write now. - */ - if (hio->hio_errors[0] == 0) - write_complete(res, hio); - continue; - case 2: - /* - * Local write finished as first. - */ - continue; - default: - PJDLOG_ABORT("Invalid hio_countdown."); + if (ISMEMSYNCWRITE(hio)) { + if (refcnt_release(&hio->hio_writecount) == 0) { + write_complete(res, hio); } } + if (refcnt_release(&hio->hio_countdown) > 0) + continue; if (ISSYNCREQ(hio)) { mtx_lock(&sync_lock); SYNCREQDONE(hio); @@ -1627,10 +1608,8 @@ remote_send_thread(void *arg) nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq"); nv_add_uint64(nv, offset, "offset"); nv_add_uint64(nv, length, "length"); - if (hio->hio_replication == HAST_REPLICATION_MEMSYNC && - ggio->gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)) { + if (ISMEMSYNCWRITE(hio)) nv_add_uint8(nv, 1, "memsync"); - } if (nv_error(nv) != 0) { hio->hio_errors[ncomp] = nv_error(nv); pjdlog_debug(2, @@ -1709,8 +1688,12 @@ done_queue: } else { mtx_unlock(&res->hr_amp_lock); } - if (hio->hio_replication == HAST_REPLICATION_MEMSYNC) - (void)refcnt_release(&hio->hio_countdown); + if (ISMEMSYNCWRITE(hio)) { + if (refcnt_release(&hio->hio_writecount) == 0) { + if (hio->hio_errors[0] == 0) + write_complete(res, hio); + } + } } if (refcnt_release(&hio->hio_countdown) > 0) continue; @@ -1768,6 +1751,7 @@ remote_recv_thread(void *arg) hio_next[ncomp]); hio_recv_list_size[ncomp]--; mtx_unlock(&hio_recv_list_lock[ncomp]); + hio->hio_errors[ncomp] = ENOTCONN; goto done_queue; } if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) { @@ -1841,82 +1825,34 @@ remote_recv_thread(void *arg) hio->hio_errors[ncomp] = 0; nv_free(nv); done_queue: - if (hio->hio_replication != HAST_REPLICATION_MEMSYNC || - hio->hio_ggio.gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) { - if (refcnt_release(&hio->hio_countdown) > 0) - continue; - } else { - /* - * Depending on hio_countdown value, requests finished - * in the following order: - * - * 0: local write, remote memsync, remote final - * or - * 0: remote memsync, local write, remote final - * - * 1: local write, remote memsync, (remote final) - * or - * 1: remote memsync, remote final, (local write) - * - * 2: remote memsync, (local write), (remote final) - * or - * 2: remote memsync, (remote final), (local write) - */ - switch (refcnt_release(&hio->hio_countdown)) { - case 0: - /* - * Remote final reply arrived. - */ - PJDLOG_ASSERT(!memsyncack); - break; - case 1: - if (memsyncack) { - /* - * Local request already finished, so we - * can complete the write. - */ + if (ISMEMSYNCWRITE(hio)) { + if (!hio->hio_memsyncacked) { + PJDLOG_ASSERT(memsyncack || + hio->hio_errors[ncomp] != 0); + /* Remote ack arrived. */ + if (refcnt_release(&hio->hio_writecount) == 0) { if (hio->hio_errors[0] == 0) write_complete(res, hio); - /* - * We still need to wait for final - * remote reply. - */ + } + hio->hio_memsyncacked = true; + if (hio->hio_errors[ncomp] == 0) { pjdlog_debug(2, - "remote_recv: (%p) Moving request back to the recv queue.", - hio); + "remote_recv: (%p) Moving request " + "back to the recv queue.", hio); mtx_lock(&hio_recv_list_lock[ncomp]); TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, hio_next[ncomp]); hio_recv_list_size[ncomp]++; mtx_unlock(&hio_recv_list_lock[ncomp]); - } else { - /* - * Remote final reply arrived before - * local write finished. - * Nothing to do in such case. - */ + continue; } - continue; - case 2: - /* - * We received remote memsync reply even before - * local write finished. - */ - PJDLOG_ASSERT(memsyncack); - - pjdlog_debug(2, - "remote_recv: (%p) Moving request back to the recv queue.", - hio); - mtx_lock(&hio_recv_list_lock[ncomp]); - TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, - hio_next[ncomp]); - hio_recv_list_size[ncomp]++; - mtx_unlock(&hio_recv_list_lock[ncomp]); - continue; - default: - PJDLOG_ABORT("Invalid hio_countdown."); + } else { + PJDLOG_ASSERT(!memsyncack); + /* Remote final reply arrived. */ } } + if (refcnt_release(&hio->hio_countdown) > 0) + continue; if (ISSYNCREQ(hio)) { mtx_lock(&sync_lock); SYNCREQDONE(hio);