Date:      Sun, 5 Feb 2012 12:43:46 +0100
From:      Pawel Jakub Dawidek <pjd@FreeBSD.org>
To:        Konstantin Belousov <kostikbel@gmail.com>
Cc:        arch@freebsd.org
Subject:   Re: Prefaulting for i/o buffers
Message-ID:  <20120205114345.GE30033@garage.freebsd.pl>
In-Reply-To: <20120203193719.GB3283@deviant.kiev.zoral.com.ua>
References:  <20120203193719.GB3283@deviant.kiev.zoral.com.ua>

On Fri, Feb 03, 2012 at 09:37:19PM +0200, Konstantin Belousov wrote:
> FreeBSD I/O infrastructure has well known issue with deadlock caused
> by vnode lock order reversal when buffers supplied to read(2) or
> write(2) syscalls are backed by mmaped file.
>
> I previously published the patches to convert i/o path to use VMIO,
> based on the Jeff Roberson proposal, see
> http://wiki.freebsd.org/VM6. As a side effect, the VM6 fixed the
> deadlock. Since that work is very intrusive and did not got any
> follow-up, it get stalled.
>
> Below is very lightweight patch which only goal is to fix deadlock in
> the least intrusive way. This is possible after FreeBSD got the
> vm_fault_quick_hold_pages(9) and vm_fault_disable_pagefaults(9) KPIs.
> http://people.freebsd.org/~kib/misc/vm1.3.patch

+struct rl_q_entry *
+rlqentry_alloc()

Missing 'void'.

+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+    const struct rl_q_entry *e2)
+{
+
+	if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+	    (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+		return (0);
+#define	IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end)
+	if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) ||
+	    IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1))
+		return (1);
+#undef	IN_RANGE
+	return (0);
+}

The checks here are a bit wrong and incomplete.

This is correct:

+	if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) ||

This is not:

+	    IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1))

After simplifying one of those checks we get:

	if (end1 >= start2 && end1 < end2)

This will be true if end1 == start2, but in this case it should be
false.

There are also some checks missing. If the first range covers the
entire second range or vice versa, you will return that the locks are
compatible, e.g.:

	start1-----start2-----end2-----end1
The following check should cover all the cases:

	if (e1->rl_q_start < e2->rl_q_end &&
	    e1->rl_q_end > e2->rl_q_start) {
		return (1);
	}

+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+	struct rl_q_entry *entry, *entry1, *whead;
+
+	if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+	    lock->rl_currdep != NULL)
+		lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+	for (entry = lock->rl_currdep; entry;
+	     entry = TAILQ_NEXT(entry, rl_q_link)) {
+		TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+			if (rangelock_incompatible(entry, entry1))
+				goto out;
+			if (entry1 == entry)
+				break;
+		}
+	}
+out:
+	lock->rl_currdep = entry;
+	TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+		if (whead == lock->rl_currdep)
+			break;
+		if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+			whead->rl_q_flags |= RL_LOCK_GRANTED;
+			wakeup(whead);
+		}
+	}
+}

This function doesn't look like it is going to scale. Searching for
incompatible locks looks very expensive.

This function seems to be responsible for waking up waiters. In case of
range locking this might be tricky: unlocking a read lock on a given
range doesn't mean the range can be exclusively locked by a waiter, as
part of this range might still be read-locked multiple times, or a
different, overlapping range might be read-locked which is also covered
by the waiting writer.
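
To make that concrete, here is a small standalone userland model (not
the patch code; the names are mine) of one such case: readers hold
[0, 100) and [50, 150), a writer waits for [0, 150), and the first
reader unlocks:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct range { long start, end; };	/* half-open [start, end) */

static bool
overlaps(struct range a, struct range b)
{
	return (a.start < b.end && a.end > b.start);
}

/* A waiting writer may only be granted if no active reader overlaps it. */
static bool
writer_grantable(struct range writer, const struct range *readers, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (overlaps(writer, readers[i]))
			return (false);
	return (true);
}

int
main(void)
{
	struct range writer = { 0, 150 };
	struct range still_active[] = { { 50, 150 } };	/* reader B */

	/* Reader A ([0, 100)) just unlocked, but B still blocks the writer. */
	assert(!writer_grantable(writer, still_active, 1));
	return (0);
}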

Could you elaborate a bit on how this all works together? Does the
function look through all the waiters and, for each waiter, go through
all the active locks?

Do you protect somehow against starving writers?

+	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX"));
+	KASSERT(entry->rl_q_start == base, ("XXX"));
+	KASSERT(entry->rl_q_end >= base + len, ("XXX"));

I expect those XXX will be replaced with real messages in the final
version? We could really use a simple ASSERT() without an obligatory
message in the kernel...

+void *
+rangelock_rlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk)
+{
+	struct rl_q_entry *entry;
+	struct thread *td;
+
+	td = curthread;
+	if (td->td_rlqe != NULL) {
+		entry = td->td_rlqe;
+		td->td_rlqe = NULL;
+	} else
+		entry = rlqentry_alloc();
+	entry->rl_q_flags = RL_LOCK_READ;
+	entry->rl_q_start = base;
+	entry->rl_q_end = base + len;
+	return (rangelock_enqueue(lock, entry, ilk));
+}
+
+void *
+rangelock_wlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk)
+{
+	struct rl_q_entry *entry;
+	struct thread *td;
+
+	td = curthread;
+	if (td->td_rlqe != NULL) {
+		entry = td->td_rlqe;
+		td->td_rlqe = NULL;
+	} else
+		entry = rlqentry_alloc();
+	entry->rl_q_flags = RL_LOCK_WRITE;
+	entry->rl_q_start = base;
+	entry->rl_q_end = base + len;
+	return (rangelock_enqueue(lock, entry, ilk));
+}

Those functions are nearly identical, maybe they can be converted to
something like this:

static void *
rangelock_lock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk,
    int flags)
{
	struct rl_q_entry *entry;
	struct thread *td;

	td = curthread;
	if (td->td_rlqe != NULL) {
		entry = td->td_rlqe;
		td->td_rlqe = NULL;
	} else
		entry = rlqentry_alloc();
	entry->rl_q_flags = flags;
	entry->rl_q_start = base;
	entry->rl_q_end = base + len;
	return (rangelock_enqueue(lock, entry, ilk));
}

void *
rangelock_rlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk)
{

	return (rangelock_lock(lock, base, len, ilk, RL_LOCK_READ));
}

void *
rangelock_wlock(struct rangelock *lock, off_t base, size_t len, struct mtx *ilk)
{

	return (rangelock_lock(lock, base, len, ilk, RL_LOCK_WRITE));
}

Or even merge rangelock_lock() with rangelock_enqueue()?

@@ -199,6 +200,7 @@ thread_init(void *mem, int size, int flags)
 
 	td->td_sleepqueue = sleepq_alloc();
 	td->td_turnstile = turnstile_alloc();
+	td->td_rlqe = rlqentry_alloc();

What is the purpose of the td_rlqe field? From what I see, a thread can
lock more than one range. Could you elaborate on that as well?

Do you assert somehow that the same thread is trying to write-lock the
range it already read-locked?

--- a/sys/kern/subr_syscall.c
+++ b/sys/kern/subr_syscall.c
@@ -177,6 +177,12 @@ syscallret(struct thread *td, int error, struct syscall_args *sa __unused)
 	KASSERT(td->td_locks == 0,
 	    ("System call %s returning with %d locks held",
 	     syscallname(p, sa->code), td->td_locks));
+	KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
+	    ("System call %s returning with pagefaults disabled",
+	     syscallname(p, sa->code)));
+	KASSERT((td->td_pflags & TDP_NOSLEEPING) == 0,
+	    ("System call %s returning with sleep disabled",
+	     syscallname(p, sa->code)));

We can commit those separately, right?

 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
-		if (rw == UIO_READ)
-			error = mac_vnode_check_read(active_cred, file_cred,
-			    vp);
-		else
+		if (rw == UIO_WRITE)
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}

I don't see the mac_vnode_check_read() call being moved anywhere.
Was that intended?

+ * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
+ * prevent the following deadlock:
+ *
+ * Assume that the thread A reads from the vnode vp1 info userspace

s/info/into/

+ * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
+ * currently not resident, then system ends up with the call chain
+ *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
+ *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
+ * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
+ * If, at the same time, thread B reads from vnode vp2 into buffer buf2
+ * backed by the pages of vnode vp1, and some page in buf2 is not
+ * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
+ *
+ * To prevent the lock order reversal and deadlock, vn_io_fault() does
+ * not allow page faults to happen during VOP_READ or VOP_WRITE().

s/VOP_READ/VOP_READ()/

+		prot = VM_PROT_READ;
+		if ((fp->f_flag & O_APPEND) || !(flags & FOF_OFFSET))
+			/* For appenders, punt and lock the whole range. */
+			rl_cookie = vn_rangelock_wlock(vp, 0, (size_t)-1);

On 32-bit archs size_t is still 32-bit, AFAIR, and the file size can be
bigger than that. I think we should also use off_t for the length (GEOM
does that).
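
Just to show the magnitude involved, a trivial standalone sketch (using
fixed-width types to stand in for a 32-bit size_t and a 64-bit off_t):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t len32 = (uint32_t)-1;		/* what (size_t)-1 is on 32-bit */
	int64_t filesize = INT64_C(10) << 30;	/* a 10 GB file */

	printf("range lock length %" PRIu32 ", file size %" PRId64 "\n",
	    len32, filesize);
	/* The "whole range" lock stops roughly 6 GB short of EOF here. */
	return (0);
}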

+		len = uio->uio_iov->iov_len;
+		addr = (vm_offset_t)uio->uio_iov->iov_base;
+		end = round_page(addr + len);
+		cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
+		if (cnt > io_hold_cnt) {
+			len = io_hold_cnt * PAGE_SIZE;
+again:
+			end = round_page(addr + len);
+			cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
+			if (cnt > io_hold_cnt) {
+				len -= PAGE_SIZE;
+				goto again;
+			}
+		}

This is a bit hard to understand immediately. Maybe something like the
following is a bit more readable (I assume this goto could happen only
once):

		len = MIN(uio->uio_iov->iov_len, io_hold_cnt * PAGE_SIZE);
		addr = (vm_offset_t)uio->uio_iov->iov_base;
		end = round_page(addr + len);
		if (howmany(end - trunc_page(addr), PAGE_SIZE) > io_hold_cnt) {
			len -= PAGE_SIZE;
			end = round_page(addr + len);
		}
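
As a sanity check, a standalone userland model of that simplification
(PAGE_SIZE, io_hold_cnt and the test addresses are made-up values, the
macros just mimic the kernel ones) stays within io_hold_cnt pages for
both aligned and unaligned buffers:

#include <assert.h>

#define	PAGE_SIZE	4096UL
#define	PAGE_MASK	(PAGE_SIZE - 1)
#define	trunc_page(x)	((x) & ~PAGE_MASK)
#define	round_page(x)	(((x) + PAGE_MASK) & ~PAGE_MASK)
#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static const unsigned long io_hold_cnt = 16;	/* made-up limit */

static unsigned long
clamp_len(unsigned long addr, unsigned long iov_len)
{
	unsigned long len, end;

	len = MIN(iov_len, io_hold_cnt * PAGE_SIZE);
	end = round_page(addr + len);
	if (howmany(end - trunc_page(addr), PAGE_SIZE) > io_hold_cnt) {
		len -= PAGE_SIZE;
		end = round_page(addr + len);
	}
	return (len);
}

int
main(void)
{
	unsigned long addrs[] = { 0x10000, 0x10123 };	/* aligned, unaligned */
	unsigned long lens[] = { 100, PAGE_SIZE, io_hold_cnt * PAGE_SIZE,
	    4 * io_hold_cnt * PAGE_SIZE };
	unsigned long a, l, span;
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < 4; j++) {
			a = addrs[i];
			l = clamp_len(a, lens[j]);
			span = round_page(a + l) - trunc_page(a);
			/* Never grows the request, never exceeds the limit. */
			assert(l <= lens[j]);
			assert(howmany(span, PAGE_SIZE) <= io_hold_cnt);
		}
	return (0);
}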

+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+    struct thread *td)
[...]
+	/*
+	 * Lock the range where the shortening take place. Increase of
+	 * file size does not need rangelock, but it is faster to lock
+	 * the range then call VOP_GETATTR to get the current size and
+	 * deal with races.
+	 */
+	rl_cookie = vn_rangelock_wlock(vp, length, -1);

Hmm. In case of increasing the file size, don't we need the range lock
because we might be racing with an appending write?

BTW. Comments in sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
might be a useful read.

-- 
Pawel Jakub Dawidek                       http://www.wheelsystems.com
FreeBSD committer                         http://www.FreeBSD.org
Am I Evil? Yes, I Am!                     http://tupytaj.pl
