Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 20 Nov 2014 22:19:14 -0500 (EST)
From:      Rick Macklem <rmacklem@uoguelph.ca>
To:        FreeBSD Filesystems <freebsd-fs@freebsd.org>
Subject:   RFC: patch to make d_fileno 64bits
Message-ID:  <539201047.4538834.1416539954794.JavaMail.root@uoguelph.ca>
In-Reply-To: <683927697.4538805.1416539949195.JavaMail.root@uoguelph.ca>

next in thread | previous in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
The attached patch covers the basics of a way to
convert the d_fileno field of "struct dirent" to
64 bits. This patch is incomplete and won't even
build, but I thought I'd post it in case anyone
wanted to take a look and comment on the approach
it uses.

The patch:
- renames the old/current structure to "struct dirent32"
- changes d_fileno to 64 bits in the new "struct dirent"
  and adds a 64-bit d_off field for the entry's offset
  in the underlying file system's directory
- defines a new VOP_READDIR() that returns the new
  "struct dirent", which the new getdirentries(2)
  uses by default
- has the old/current getdirentries(2) use the old
  VOP_READDIR32() by default.

For the case of a file system that supports both
the new and old VOP_READDIR(), they are used by
the corresponding new and old getdirentries(2)
syscalls.

For a file system that only supports one of
the VOP_READDIR()s, the "struct dirent32"
is copied to "struct dirent" (or vice versa).

At this point, all file systems would support
the old VOP_READDIR() (i.e. VOP_READDIR32()), and
I think the new VOP_READDIR() can easily be added
for NFS and ZFS. (OpenBSD already has UFS code for
what is essentially a new struct dirent, and
hopefully that code could be ported easily, too.)

Anyhow, any comments on this approach? rick

[-- Attachment #2 --]
--- sys/dirent.h.sav	2014-10-23 18:12:59.000000000 -0400
+++ sys/dirent.h	2014-11-19 19:13:12.000000000 -0500
@@ -38,16 +38,31 @@
 
 /*
  * The dirent structure defines the format of directory entries returned by
- * the getdirentries(2) system call.
+ * the getdirentries(2) system call and dirent32 for the getdirentries32(2)
+ * system call.
  *
- * A directory entry has a struct dirent at the front of it, containing its
+ * A directory entry has a struct dirent(32) at the front of it, containing its
  * inode number, the length of the entry, and the length of the name
- * contained in the entry.  These are followed by the name padded to a 4
+ * contained in the entry.  These are followed by the name padded to an 8(4)
  * byte boundary with null bytes.  All names are guaranteed null terminated.
  * The maximum length of a name in a directory is MAXNAMLEN.
  */
 
 struct dirent {
+	__uint64_t d_off;		/* dir offset for on-disk directory */
+	__uint64_t d_fileno;		/* file number of entry */
+	__uint16_t d_reclen;		/* length of this record */
+	__uint8_t  d_type; 		/* file type, see below */
+	__uint8_t  d_namlen;		/* length of string in d_name */
+#if __BSD_VISIBLE
+#define	MAXNAMLEN	255
+	char	d_name[MAXNAMLEN + 1];	/* name must be no longer than this */
+#else
+	char	d_name[255 + 1];	/* name must be no longer than this */
+#endif
+};
+
+struct dirent32 {
 	__uint32_t d_fileno;		/* file number of entry */
 	__uint16_t d_reclen;		/* length of this record */
 	__uint8_t  d_type; 		/* file type, see below */
@@ -81,20 +96,26 @@ struct dirent {
 #define	DTTOIF(dirtype)	((dirtype) << 12)
 
 /*
- * The _GENERIC_DIRSIZ macro gives the minimum record length which will hold
- * the directory entry.  This returns the amount of space in struct direct
- * without the d_name field, plus enough space for the name with a terminating
- * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary.
+ * The _GENERIC_xxx macros give the minimum record length which will
+ * hold the directory entry.  They return the amount of space in struct
+ * dirent(32) without the d_name field, plus enough space for the name with a
+ * terminating null byte (dp->d_namlen+1), rounded up to an 8(4) byte boundary.
+ * The _GENERIC_DIRVAL() case takes the name length instead of dp as the
+ * argument.
  *
  * XXX although this macro is in the implementation namespace, it requires
  * a manifest constant that is not.
  */
-#define	_GENERIC_DIRSIZ(dp) \
-    ((sizeof (struct dirent) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3))
+#define	_GENERIC_DIRVAL(namlen) \
+    ((sizeof(struct dirent) - (MAXNAMLEN + 1) + (namlen) + 1 + 7) & ~7)
+#define	_GENERIC_DIRSIZ(dp)	_GENERIC_DIRVAL((dp)->d_namlen)
+#define	_GENERIC_DIRSIZ32(dp) \
+    ((sizeof (struct dirent32) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3))
 #endif /* __BSD_VISIBLE */
 
 #ifdef _KERNEL
 #define	GENERIC_DIRSIZ(dp)	_GENERIC_DIRSIZ(dp)
+#define	GENERIC_DIRSIZ32(dp)	_GENERIC_DIRSIZ32(dp)
 #endif
 
 #endif /* !_SYS_DIRENT_H_ */
--- kern/vfs_syscalls.c.sav	2014-10-24 16:45:39.000000000 -0400
+++ kern/vfs_syscalls.c	2014-11-20 21:46:29.000000000 -0500
@@ -4006,10 +4006,11 @@ unionread:
 #endif /* COMPAT_43 */
 
 /*
- * Read a block of directory entries in a filesystem independent format.
+ * Read the old "struct dirent32" block of directory entries in a
+ * filesystem independent format.
  */
 #ifndef _SYS_SYSPROTO_H_
-struct getdirentries_args {
+struct getdirentries32_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
@@ -4017,9 +4018,9 @@ struct getdirentries_args {
 };
 #endif
 int
-sys_getdirentries(td, uap)
+sys_getdirentries32(td, uap)
 	struct thread *td;
-	register struct getdirentries_args /* {
+	register struct getdirentries32_args /* {
 		int fd;
 		char *buf;
 		u_int count;
@@ -4029,7 +4030,7 @@ sys_getdirentries(td, uap)
 	long base;
 	int error;
 
-	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+	error = kern_getdirentries32(td, uap->fd, uap->buf, uap->count, &base,
 	    NULL, UIO_USERSPACE);
 	if (error != 0)
 		return (error);
@@ -4039,7 +4040,7 @@ sys_getdirentries(td, uap)
 }
 
 int
-kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
+kern_getdirentries32(struct thread *td, int fd, char *buf, u_int count,
     long *basep, ssize_t *residp, enum uio_seg bufseg)
 {
 	struct vnode *vp;
@@ -4048,8 +4049,9 @@ kern_getdirentries(struct thread *td, in
 	struct iovec aiov;
 	cap_rights_t rights;
 	long loff;
-	int error, eofflag;
+	int copy_dir = 0, error, eofflag;
 	off_t foffset;
+	char *tbuf = NULL;
 
 	AUDIT_ARG_FD(fd);
 	if (count > IOSIZE_MAX)
@@ -4070,22 +4072,46 @@ unionread:
 		error = EINVAL;
 		goto fail;
 	}
-	aiov.iov_base = buf;
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+trynew:
+	/*
+	 * If this file system only returns the new struct dirent, allocate
+	 * a kernel buffer to be read into, so it can be copied/converted.
+	 */
+	if (copy_dir != 0 && bufseg == UIO_USERSPACE) {
+		if (tbuf == NULL)
+			tbuf = malloc(count, M_TEMP, M_WAITOK);
+		aiov.iov_base = tbuf;
+	} else
+		aiov.iov_base = buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
-	auio.uio_segflg = bufseg;
+	if (copy_dir != 0 && bufseg == UIO_USERSPACE)
+		auio.uio_segflg = UIO_SYSSPACE;
+	else
+		auio.uio_segflg = bufseg;
 	auio.uio_td = td;
-	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
-		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
-		    NULL);
+	{
+		if (copy_dir == 0) {
+			error = VOP_READDIR32(vp, &auio, fp->f_cred, &eofflag,
+			    NULL, NULL);
+			if (error == EOPNOTSUPP) {
+				copy_dir = 1;
+				error = 0;
+				goto trynew;
+			}
+		} else
+			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+			    NULL, NULL);
+	}
 	foffset = auio.uio_offset;
 	if (error != 0) {
 		VOP_UNLOCK(vp, 0);
@@ -4102,14 +4128,209 @@ unionread:
 		fp->f_data = vp;
 		foffset = 0;
 		vput(tvp);
+		copy_dir = 0;
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0);
+	if (copy_dir != 0 && count - auio.uio_resid > 0) {
+		if (bufseg == UIO_USERSPACE) {
+			copy_dirent32(tbuf, count - auio.uio_resid);
+			error = copyout(tbuf, buf, count - auio.uio_resid);
+			if (error != 0)
+				goto fail;
+		} else
+			copy_dirent32(buf, count - auio.uio_resid);
+	}
 	*basep = loff;
 	if (residp != NULL)
 		*residp = auio.uio_resid;
 	td->td_retval[0] = count - auio.uio_resid;
 fail:
+	if (tbuf != NULL)
+		free(tbuf, M_TEMP);
+	foffset_unlock(fp, foffset, 0);
+	fdrop(fp, td);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdents32_args {
+	int fd;
+	char *buf;
+	size_t count;
+};
+#endif
+int
+sys_getdents32(td, uap)
+	struct thread *td;
+	register struct getdents32_args /* {
+		int fd;
+		char *buf;
+		u_int count;
+	} */ *uap;
+{
+	struct getdirentries32_args ap;	/* repackaged arguments */
+
+	ap.fd = uap->fd;
+	ap.buf = uap->buf;
+	ap.count = uap->count;
+	ap.basep = NULL;		/* getdents has no basep argument */
+	return (sys_getdirentries32(td, &ap));
+}
+
+/*
+ * Read in the new "struct dirent" block of directory entries in a
+ * filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+	int	fd;
+	char	*buf;
+	u_int	count;
+	uint64_t *basep;
+};
+#endif
+int
+sys_getdirentries(td, uap)
+	struct thread *td;
+	register struct getdirentries_args /* {
+		int fd;
+		char *buf;
+		u_int count;
+		uint64_t *basep;
+	} */ *uap;
+{
+	uint64_t base;		/* set by kern_getdirentries() on success */
+	int error;
+
+	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+	    NULL, UIO_USERSPACE);
+	if (error != 0)
+		return (error);
+	if (uap->basep != NULL)	/* copy the base out only when requested */
+		error = copyout(&base, uap->basep, sizeof(uint64_t));
+	return (error);
+}
+
+int
+kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
+    uint64_t *basep, ssize_t *residp, enum uio_seg bufseg)
+{
+	struct vnode *vp;
+	struct file *fp;
+	struct uio auio;
+	struct iovec aiov;
+	cap_rights_t rights;
+	uint64_t loff;
+	int copy_dir = 0, error, eofflag;
+	off_t foffset;
+	char *ibuf = NULL, *obuf = NULL;
+	u_int obuflen;
+
+	AUDIT_ARG_FD(fd);
+	if (count > IOSIZE_MAX)
+		return (EINVAL);
+	auio.uio_resid = count;
+	error = getvnode(td->td_proc->p_fd, fd,
+	    cap_rights_init(&rights, CAP_READ), &fp);
+	if (error != 0)
+		return (error);
+	if ((fp->f_flag & FREAD) == 0) {
+		fdrop(fp, td);
+		return (EBADF);
+	}
+	vp = fp->f_vnode;
+	foffset = foffset_lock(fp, 0);
+unionread:
+	if (vp->v_type != VDIR) {
+		error = EINVAL;
+		goto fail;
+	}
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+tryold:
+	/*
+	 * If this file system only returns the old struct dirent, allocate
+	 * kernel buffers to be read and copied/converted into.
+	 */
+	if (copy_dir != 0) {
+		if (ibuf == NULL)
+			ibuf = malloc(count, M_TEMP, M_WAITOK);
+		if (obuf == NULL)
+			obuf = malloc(count, M_TEMP, M_WAITOK);
+		aiov.iov_base = ibuf;
+	} else
+		aiov.iov_base = buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_rw = UIO_READ;
+	if (copy_dir != 0)
+		auio.uio_segflg = UIO_SYSSPACE;
+	else
+		auio.uio_segflg = bufseg;
+	auio.uio_td = td;
+	AUDIT_ARG_VNODE1(vp);
+	loff = auio.uio_offset = foffset;
+#ifdef MAC
+	error = mac_vnode_check_readdir(td->td_ucred, vp);
+	if (error == 0)
+#endif
+	{
+		if (copy_dir == 0) {
+			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+			    NULL, NULL);
+			if (error == EOPNOTSUPP) {
+				copy_dir = 1;
+				error = 0;
+				goto tryold;
+			}
+			foffset = auio.uio_offset;
+		} else
+			error = VOP_READDIR32(vp, &auio, fp->f_cred, &eofflag,
+			    NULL, NULL);
+	}
+	if (error != 0) {
+		VOP_UNLOCK(vp, 0);
+		goto fail;
+	}
+	if (count == auio.uio_resid &&
+	    (vp->v_vflag & VV_ROOT) &&
+	    (vp->v_mount->mnt_flag & MNT_UNION)) {
+		struct vnode *tvp = vp;
+
+		vp = vp->v_mount->mnt_vnodecovered;
+		VREF(vp);
+		fp->f_vnode = vp;
+		fp->f_data = vp;
+		foffset = 0;
+		vput(tvp);
+		copy_dir = 0;
+		goto unionread;
+	}
+	VOP_UNLOCK(vp, 0);
+	if (copy_dir != 0 && count - auio.uio_resid > 0) {
+		obuflen = copy_dirent(ibuf, count - auio.uio_resid, obuf, count,
+		    &foffset);
+		if (bufseg == UIO_USERSPACE)
+			error = copyout(obuf, buf, obuflen);
+		else
+			bcopy(obuf, buf, obuflen);
+		if (error != 0)
+			goto fail;
+		if (residp != NULL)
+			*residp = count - obuflen;
+		td->td_retval[0] = obuflen;
+	} else {
+		if (residp != NULL)
+			*residp = auio.uio_resid;
+		td->td_retval[0] = count - auio.uio_resid;
+	}
+	*basep = loff;
+fail:
+	if (ibuf != NULL)
+		free(ibuf, M_TEMP);
+	if (obuf != NULL)
+		free(obuf, M_TEMP);
 	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	return (error);
@@ -4141,6 +4362,69 @@ sys_getdents(td, uap)
 }
 
 /*
+ * Copy the new struct dirent to the old struct dirent32 format.
+ */
+static void
+copy_dirent32(char *buf, u_int len)
+{
+	struct dirent *dp;
+	struct dirent32 *dp32;
+
+	while (len > 0) {
+		dp = (struct dirent *)buf;	/* new-format record */
+		dp32 = (struct dirent32 *)buf;	/* same bytes, rewritten */
+		dp32->d_fileno = dp->d_fileno;	/* 64 -> 32 bit assignment */
+		dp32->d_reclen = dp->d_reclen;	/* record length is kept */
+		dp32->d_type = dp->d_type;
+		dp32->d_namlen = dp->d_namlen;
+		bcopy(dp->d_name, dp32->d_name, dp32->d_namlen + 1);
+		buf += dp32->d_reclen;		/* step to the next record */
+		len -= dp32->d_reclen;
+	}
+}
+
+/*
+ * Copy the old struct dirent32 records in ibuf to new struct dirent records
+ * in obuf, advancing *offp; returns bytes written, never more than olen.
+ */
+static u_int
+copy_dirent(char *ibuf, u_int ilen, char *obuf, u_int olen, off_t *offp)
+{
+	struct dirent *dp = NULL;	/* last record written, if any */
+	struct dirent32 *dp32 = (struct dirent32 *)ibuf;
+	u_int left, ocnt = 0;
+
+	while (ilen > 0 && olen >= ocnt + _GENERIC_DIRVAL(dp32->d_namlen)) {
+		dp = (struct dirent *)obuf;
+		dp->d_off = *offp;
+		dp->d_fileno = dp32->d_fileno;
+		dp->d_type = dp32->d_type;
+		dp->d_namlen = dp32->d_namlen;
+		bcopy(dp32->d_name, dp->d_name, dp32->d_namlen + 1);
+		dp->d_reclen = _GENERIC_DIRSIZ(dp);
+		ibuf += dp32->d_reclen;
+		ilen -= dp32->d_reclen;
+		*offp += dp32->d_reclen;
+		obuf += dp->d_reclen;
+		ocnt += dp->d_reclen;
+		left = DEV_BSIZE - (ocnt & (DEV_BSIZE - 1)); /* to block end */
+		dp32 = (struct dirent32 *)ibuf;
+		if (ilen > 0 && left < _GENERIC_DIRVAL(dp32->d_namlen) &&
+		    left <= olen - ocnt) {	/* pad only within obuf */
+			dp->d_reclen += left;
+			obuf += left;
+			ocnt += left;
+		}
+	}
+	left = DEV_BSIZE - (ocnt & (DEV_BSIZE - 1));	/* tail padding */
+	if (dp != NULL && left <= olen - ocnt) {	/* only if it fits */
+		dp->d_reclen += left;
+		ocnt += left;
+	}
+	return (ocnt);
+}
+
+/*
  * Set the mode mask for creation of filesystem nodes.
  */
 #ifndef _SYS_SYSPROTO_H_

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?539201047.4538834.1416539954794.JavaMail.root>