From owner-svn-src-head@FreeBSD.ORG  Thu Jun 21 09:19:42 2012
Return-Path: <owner-svn-src-head@FreeBSD.ORG>
Delivered-To: svn-src-head@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52])
	by hub.freebsd.org (Postfix) with ESMTP id 9F3011065672;
	Thu, 21 Jun 2012 09:19:42 +0000 (UTC) (envelope-from kib@FreeBSD.org)
Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c])
	by mx1.freebsd.org (Postfix) with ESMTP id 898518FC0A;
	Thu, 21 Jun 2012 09:19:42 +0000 (UTC)
Received: from svn.freebsd.org (localhost [127.0.0.1])
	by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id q5L9JgWn001683;
	Thu, 21 Jun 2012 09:19:42 GMT (envelope-from kib@svn.freebsd.org)
Received: (from kib@localhost)
	by svn.freebsd.org (8.14.4/8.14.4/Submit) id q5L9JfoW001681;
	Thu, 21 Jun 2012 09:19:41 GMT (envelope-from kib@svn.freebsd.org)
Message-Id: <201206210919.q5L9JfoW001681@svn.freebsd.org>
From: Konstantin Belousov <kib@FreeBSD.org>
Date: Thu, 21 Jun 2012 09:19:41 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-head@freebsd.org
X-SVN-Group: head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Cc: 
Subject: svn commit: r237365 - head/sys/kern
X-BeenThere: svn-src-head@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: SVN commit messages for the src tree for head/-current
	<svn-src-head.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/svn-src-head>,
	<mailto:svn-src-head-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/svn-src-head>
List-Post: <mailto:svn-src-head@freebsd.org>
List-Help: <mailto:svn-src-head-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/svn-src-head>,
	<mailto:svn-src-head-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Thu, 21 Jun 2012 09:19:42 -0000

Author: kib
Date: Thu Jun 21 09:19:41 2012
New Revision: 237365
URL: http://svn.freebsd.org/changeset/base/237365

Log:
  Fix locking for f_offset, vn_read() and vn_write() cases only, for now.
  
  It seems that intended locking protocol for struct file f_offset field
  was as follows: f_offset should always be changed under the vnode lock
  (except fcntl(2) and lseek(2) did not followed the rules). Since
  read(2) uses shared vnode lock, FOFFSET_LOCKED block is additionally
  taken to serialize shared vnode lock owners.
  
  This was broken first by enabling shared lock on writes, then by
  fadvise changes, which moved f_offset assigned from under vnode lock,
  and last by vn_io_fault() doing chunked i/o. More, due to uio_offset
  not yet valid in vn_io_fault(), the range lock for reads was taken on
  the wrong region.
  
  Change the locking for f_offset to always use FOFFSET_LOCKED block,
  which is placed before rangelocks in the lock order.
  
  Extract foffset_lock() and foffset_unlock() functions which implements
  FOFFSET_LOCKED lock, and consistently lock f_offset with it in the
  vn_io_fault() both for reads and writes, even if MNTK_NO_IOPF flag is
  not set for the vnode mount. Indicate that f_offset is already valid
  for vn_read() and vn_write() calls from vn_io_fault() with FOF_OFFSET
  flag, and assert that all callers of vn_read() and vn_write() follow
  this protocol.
  
  Extract get_advice() function to calculate the POSIX_FADV_XXX value
  for the i/o region, and use it were appropriate.
  
  Reviewed by:	jhb
  Tested by:	pho
  MFC after:	2 weeks

Modified:
  head/sys/kern/vfs_vnops.c

Modified: head/sys/kern/vfs_vnops.c
==============================================================================
--- head/sys/kern/vfs_vnops.c	Thu Jun 21 09:12:33 2012	(r237364)
+++ head/sys/kern/vfs_vnops.c	Thu Jun 21 09:19:41 2012	(r237365)
@@ -527,6 +527,66 @@ vn_rdwr_inchunks(rw, vp, base, len, offs
 	return (error);
 }
 
+static void
+foffset_lock(struct file *fp, struct uio *uio, int flags)
+{
+	struct mtx *mtxp;
+
+	if ((flags & FOF_OFFSET) != 0)
+		return;
+
+	/*
+	 * According to McKusick the vn lock was protecting f_offset here.
+	 * It is now protected by the FOFFSET_LOCKED flag.
+	 */
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+		fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+		msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+		    "vnread offlock", 0);
+	}
+	fp->f_vnread_flags |= FOFFSET_LOCKED;
+	uio->uio_offset = fp->f_offset;
+	mtx_unlock(mtxp);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+	struct mtx *mtxp;
+	int ret;
+
+	ret = POSIX_FADV_NORMAL;
+	if (fp->f_advice == NULL)
+		return (ret);
+
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if (uio->uio_offset >= fp->f_advice->fa_start &&
+	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+		ret = fp->f_advice->fa_advice;
+	mtx_unlock(mtxp);
+	return (ret);
+}
+
+static void
+foffset_unlock(struct file *fp, struct uio *uio, int flags)
+{
+	struct mtx *mtxp;
+
+	if ((flags & FOF_OFFSET) != 0)
+		return;
+
+	fp->f_offset = uio->uio_offset;
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+		wakeup(&fp->f_vnread_flags);
+	fp->f_vnread_flags = 0;
+	mtx_unlock(mtxp);
+}
+
 /*
  * File table vnode read routine.
  */
@@ -539,44 +599,22 @@ vn_read(fp, uio, active_cred, flags, td)
 	struct thread *td;
 {
 	struct vnode *vp;
-	int error, ioflag;
 	struct mtx *mtxp;
+	int error, ioflag;
 	int advice, vfslocked;
 	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
-	mtxp = NULL;
+	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
-	advice = POSIX_FADV_NORMAL;
+	advice = get_advice(fp, uio);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	/*
-	 * According to McKusick the vn lock was protecting f_offset here.
-	 * It is now protected by the FOFFSET_LOCKED flag.
-	 */
-	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if ((flags & FOF_OFFSET) == 0) {
-			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
-				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-				msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-				    "vnread offlock", 0);
-			}
-			fp->f_vnread_flags |= FOFFSET_LOCKED;
-			uio->uio_offset = fp->f_offset;
-		}
-		if (fp->f_advice != NULL &&
-		    uio->uio_offset >= fp->f_advice->fa_start &&
-		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-			advice = fp->f_advice->fa_advice;
-		mtx_unlock(mtxp);
-	}
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
@@ -596,14 +634,6 @@ vn_read(fp, uio, active_cred, flags, td)
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0) {
-		fp->f_offset = uio->uio_offset;
-		mtx_lock(mtxp);
-		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
-			wakeup(&fp->f_vnread_flags);
-		fp->f_vnread_flags = 0;
-		mtx_unlock(mtxp);
-	}
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
@@ -625,6 +655,7 @@ vn_read(fp, uio, active_cred, flags, td)
 		 */
 		start = offset;
 		end = uio->uio_offset - 1;
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
 		if (fp->f_advice != NULL &&
 		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
@@ -656,13 +687,14 @@ vn_write(fp, uio, active_cred, flags, td
 {
 	struct vnode *vp;
 	struct mount *mp;
-	int error, ioflag, lock_flags;
 	struct mtx *mtxp;
+	int error, ioflag, lock_flags;
 	int advice, vfslocked;
 	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
+	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type == VREG)
@@ -681,6 +713,8 @@ vn_write(fp, uio, active_cred, flags, td
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
+
+	advice = get_advice(fp, uio);
  
 	if ((MNT_SHARED_WRITES(mp) ||
 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
@@ -691,19 +725,6 @@ vn_write(fp, uio, active_cred, flags, td
 	}
 
 	vn_lock(vp, lock_flags | LK_RETRY);
-	if ((flags & FOF_OFFSET) == 0)
-		uio->uio_offset = fp->f_offset;
-	advice = POSIX_FADV_NORMAL;
-	mtxp = NULL;
-	if (fp->f_advice != NULL) {
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if (fp->f_advice != NULL &&
-		    uio->uio_offset >= fp->f_advice->fa_start &&
-		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-			advice = fp->f_advice->fa_advice;
-		mtx_unlock(mtxp);
-	}
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
@@ -721,8 +742,6 @@ vn_write(fp, uio, active_cred, flags, td
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0)
-		fp->f_offset = uio->uio_offset;
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
@@ -761,6 +780,7 @@ vn_write(fp, uio, active_cred, flags, td
 		 */
 		start = offset;
 		end = uio->uio_offset - 1;
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
 		if (fp->f_advice != NULL &&
 		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
@@ -845,11 +865,15 @@ vn_io_fault(struct file *fp, struct uio 
 	else
 		doio = vn_write;
 	vp = fp->f_vnode;
+	foffset_lock(fp, uio, flags);
+
 	if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
 	    ((mp = vp->v_mount) != NULL &&
 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
-	    !vn_io_fault_enable)
-		return (doio(fp, uio, active_cred, flags, td));
+	    !vn_io_fault_enable) {
+		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+		goto out_last;
+	}
 
 	/*
 	 * The UFS follows IO_UNIT directive and replays back both
@@ -882,7 +906,7 @@ vn_io_fault(struct file *fp, struct uio 
 	}
 
 	save = vm_fault_disable_pagefaults();
-	error = doio(fp, uio, active_cred, flags, td);
+	error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 	if (error != EFAULT)
 		goto out;
 
@@ -933,7 +957,8 @@ vn_io_fault(struct file *fp, struct uio 
 		td->td_ma = ma;
 		td->td_ma_cnt = cnt;
 
-		error = doio(fp, &short_uio, active_cred, flags, td);
+		error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
+		    td);
 		vm_page_unhold_pages(ma, cnt);
 		adv = len - short_uio.uio_resid;
 
@@ -956,6 +981,8 @@ out:
 	vm_fault_enable_pagefaults(save);
 	vn_rangelock_unlock(vp, rl_cookie);
 	free(uio_clone, M_IOV);
+out_last:
+	foffset_unlock(fp, uio, flags);
 	return (error);
 }