Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 15 Aug 2019 23:21:41 +0000 (UTC)
From:      Rick Macklem <rmacklem@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r351106 - head/sys/kern
Message-ID:  <201908152321.x7FNLfQW055908@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rmacklem
Date: Thu Aug 15 23:21:41 2019
New Revision: 351106
URL: https://svnweb.freebsd.org/changeset/base/351106

Log:
  Fix copy_file_range(2) so that unneeded blocks are not allocated to the output file.
  
  When the byte range for copy_file_range(2) doesn't go to EOF on the
  output file and there is a hole in the input file, a hole must be
  "punched" in the output file. This is done by writing a block of bytes
  all set to 0.
  Without this patch, the write is done unconditionally which means that,
  if the output file already has a hole in that byte range, a unneeded data block
  of all 0 bytes would be allocated.
  This patch adds code to check for a hole in the output file, so that it can
  skip doing the write if there is already a hole in that byte range of
  the output file. This avoids unnecessary allocation of blocks to the
  output file.
  
  Reviewed by:	kib
  Differential Revision:	https://reviews.freebsd.org/D21155

Modified:
  head/sys/kern/vfs_vnops.c

Modified: head/sys/kern/vfs_vnops.c
==============================================================================
--- head/sys/kern/vfs_vnops.c	Thu Aug 15 22:55:59 2019	(r351105)
+++ head/sys/kern/vfs_vnops.c	Thu Aug 15 23:21:41 2019	(r351106)
@@ -2713,18 +2713,92 @@ mem_iszero(void *dat, int len)
 }
 
 /*
+ * Look for a hole in the output file and, if found, adjust *outoffp
+ * and *xferp to skip past the hole.
+ * *xferp is the entire hole length to be written and xfer2 is how many bytes
+ * to be written as 0's upon return.
+ */
+static off_t
+vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
+    off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
+{
+	int error;
+	off_t delta;
+
+	if (*holeoffp == 0 || *holeoffp <= *outoffp) {
+		*dataoffp = *outoffp;
+		error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
+		    curthread);
+		if (error == 0) {
+			*holeoffp = *dataoffp;
+			error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
+			    curthread);
+		}
+		if (error != 0 || *holeoffp == *dataoffp) {
+			/*
+			 * Since outvp is unlocked, it may be possible for
+			 * another thread to do a truncate(), lseek(), write()
+			 * creating a hole at startoff between the above
+			 * VOP_IOCTL() calls, if the other thread does not do
+			 * rangelocking.
+			 * If that happens, *holeoffp == *dataoffp and finding
+			 * the hole has failed, so disable vn_skip_hole().
+			 */
+			*holeoffp = -1;	/* Disable use of vn_skip_hole(). */
+			return (xfer2);
+		}
+		KASSERT(*dataoffp >= *outoffp,
+		    ("vn_skip_hole: dataoff=%jd < outoff=%jd",
+		    (intmax_t)*dataoffp, (intmax_t)*outoffp));
+		KASSERT(*holeoffp > *dataoffp,
+		    ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
+		    (intmax_t)*holeoffp, (intmax_t)*dataoffp));
+	}
+
+	/*
+	 * If there is a hole before the data starts, advance *outoffp and
+	 * *xferp past the hole.
+	 */
+	if (*dataoffp > *outoffp) {
+		delta = *dataoffp - *outoffp;
+		if (delta >= *xferp) {
+			/* Entire *xferp is a hole. */
+			*outoffp += *xferp;
+			*xferp = 0;
+			return (0);
+		}
+		*xferp -= delta;
+		*outoffp += delta;
+		xfer2 = MIN(xfer2, *xferp);
+	}
+
+	/*
+	 * If a hole starts before the end of this xfer2, reduce this xfer2 so
+	 * that the write ends at the start of the hole.
+	 * *holeoffp should always be greater than *outoffp, but for the
+	 * non-INVARIANTS case, check this to make sure xfer2 remains a sane
+	 * value.
+	 */
+	if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
+		xfer2 = *holeoffp - *outoffp;
+	return (xfer2);
+}
+
+/*
  * Write an xfer sized chunk to outvp in blksize blocks from dat.
  * dat is a maximum of blksize in length and can be written repeatedly in
  * the chunk.
  * If growfile == true, just grow the file via vn_truncate_locked() instead
  * of doing actual writes.
+ * If checkhole == true, a hole is being punched, so skip over any hole
+ * already in the output file.
  */
 static int
 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
-    u_long blksize, bool growfile, struct ucred *cred)
+    u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
 {
 	struct mount *mp;
-	off_t xfer2;
+	off_t dataoff, holeoff, xfer2;
 	int error, lckf;
 
 	/*
@@ -2733,7 +2807,24 @@ vn_write_outvp(struct vnode *outvp, char *dat, off_t o
 	 * done for each iteration, since the xfer argument can be very
 	 * large if there is a large hole to punch in the output file.
 	 */
+	error = 0;
+	holeoff = 0;
 	do {
+		xfer2 = MIN(xfer, blksize);
+		if (checkhole) {
+			/*
+			 * Punching a hole.  Skip writing if there is
+			 * already a hole in the output file.
+			 */
+			xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
+			    &dataoff, &holeoff, cred);
+			if (xfer == 0)
+				break;
+			if (holeoff < 0)
+				checkhole = false;
+			KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
+			    (intmax_t)xfer2));
+		}
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(outvp, &mp, V_WAIT);
@@ -2749,7 +2840,6 @@ vn_write_outvp(struct vnode *outvp, char *dat, off_t o
 				error = vn_truncate_locked(outvp, outoff + xfer,
 				    false, cred);
 			else {
-				xfer2 = MIN(xfer, blksize);
 				error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
 				    outoff, UIO_SYSSPACE, IO_NODELOCKED,
 				    curthread->td_ucred, cred, NULL, curthread);
@@ -2916,7 +3006,7 @@ vn_generic_copy_file_range(struct vnode *invp, off_t *
 					memset(dat, 0, MIN(xfer2, blksize));
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer2, blksize, false,
-					    outcred);
+					    holeout > 0, outcred);
 				}
 
 				if (error == 0 && *outoffp + xfer >
@@ -2924,7 +3014,7 @@ vn_generic_copy_file_range(struct vnode *invp, off_t *
 					/* Grow last block. */
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize, true,
-					    outcred);
+					    false, outcred);
 				if (error == 0) {
 					*inoffp += xfer;
 					*outoffp += xfer;
@@ -2980,7 +3070,8 @@ vn_generic_copy_file_range(struct vnode *invp, off_t *
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize,
 					    readzeros && xfer == len &&
-					    *outoffp >= va.va_size, outcred);
+					    *outoffp >= va.va_size, false,
+					    outcred);
 				if (error == 0) {
 					*inoffp += xfer;
 					startoff += xfer;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201908152321.x7FNLfQW055908>