Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 11 May 2010 16:13:45 -0700
From:      Tim Prouty <tim.prouty@isilon.com>
To:        freebsd-arch@freebsd.org
Cc:        Zachary Loafman <zachary.loafman@isilon.com>, Matthew Fleming <matthew.fleming@isilon.com>
Subject:   Re: [PATCH]/[RFC] Increase scalability of per-process file descriptor data structures
Message-ID:  <A57FE0F9-47B9-4036-8F1C-0D30FB545980@isilon.com>
In-Reply-To: <F2459D9D-4102-4D1D-BDCB-4F5AA8DE336D@isilon.com>
References:  <F2459D9D-4102-4D1D-BDCB-4F5AA8DE336D@isilon.com>

next in thread | previous in thread | raw e-mail | index | archive | help

--Apple-Mail-18--1040317416
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed
Content-Transfer-Encoding: 7bit

The patch was slightly truncated, I'm guessing because it was > 50K.
Attached is a slightly trimmed down patch.

-Tim


--Apple-Mail-18--1040317416
Content-Disposition: attachment;
	filename=fd_scalability.patch
Content-Type: application/octet-stream; x-unix-mode=0700;
	name="fd_scalability.patch"
Content-Transfer-Encoding: 7bit

diff --git a/src/sys/compat/linux/linux_stats.c b/src/sys/compat/linux/linux_stats.c
index 374ce39..905db20 100644
--- a/src/sys/compat/linux/linux_stats.c
+++ b/src/sys/compat/linux/linux_stats.c
@@ -129,7 +129,7 @@ translate_path_major_minor(struct thread *td, char *path, struct stat *buf)
 	fd = td->td_retval[0];
 	td->td_retval[0] = temp;
 	translate_fd_major_minor(td, fd, buf);
-	fdclose(fdp, fdp->fd_ofiles[fd], fd, td);
+	fdclose(fdp, ftable_get(fdp, fd), fd, td);
 }
 
 static int
diff --git a/src/sys/compat/svr4/svr4_filio.c b/src/sys/compat/svr4/svr4_filio.c
index 701bf15..82364ca 100644
--- a/src/sys/compat/svr4/svr4_filio.c
+++ b/src/sys/compat/svr4/svr4_filio.c
@@ -212,13 +212,13 @@ svr4_fil_ioctl(fp, td, retval, fd, cmd, data)
 	switch (cmd) {
 	case SVR4_FIOCLEX:
 		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+		ftable_set_cloexec(fdp, fd, 1);
 		FILEDESC_XUNLOCK(fdp);
 		return 0;
 
 	case SVR4_FIONCLEX:
 		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+		ftable_set_cloexec(fdp, fd, 0);
 		FILEDESC_XUNLOCK(fdp);
 		return 0;
 
diff --git a/src/sys/fs/fdescfs/fdesc_vfsops.c b/src/sys/fs/fdescfs/fdesc_vfsops.c
index 16fa4cf..fb2e45e 100644
--- a/src/sys/fs/fdescfs/fdesc_vfsops.c
+++ b/src/sys/fs/fdescfs/fdesc_vfsops.c
@@ -203,7 +203,7 @@ fdesc_statfs(mp, sbp, td)
 	last = min(fdp->fd_nfiles, lim);
 	freefd = 0;
 	for (i = fdp->fd_freefile; i < last; i++)
-		if (fdp->fd_ofiles[i] == NULL)
+		if (ftable_get(fdp, i) == NULL)
 			freefd++;
 
 	/*
diff --git a/src/sys/fs/fdescfs/fdesc_vnops.c b/src/sys/fs/fdescfs/fdesc_vnops.c
index f39c3a7..0ea6607 100644
--- a/src/sys/fs/fdescfs/fdesc_vnops.c
+++ b/src/sys/fs/fdescfs/fdesc_vnops.c
@@ -581,7 +581,7 @@ fdesc_readdir(ap)
 			dp->d_type = DT_DIR;
 			break;
 		default:
-			if (fdp->fd_ofiles[fcnt] == NULL) {
+			if (ftable_get(fdp, fcnt) == NULL) {
 				FILEDESC_SUNLOCK(fdp);
 				goto done;
 			}
diff --git a/src/sys/fs/nfsserver/nfs_nfsdport.c b/src/sys/fs/nfsserver/nfs_nfsdport.c
index 232e465..94fd81c 100644
--- a/src/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/src/sys/fs/nfsserver/nfs_nfsdport.c
@@ -3103,7 +3103,7 @@ fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
 
 	fdp = p->td_proc->p_fd;
 	if (fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL)
+	    (fp = ftable_get(fdp, fd)) == NULL)
 		return (EBADF);
 	*fpp = fp;
 	return (0);

diff --git a/src/sys/kern/kern_descrip.c b/src/sys/kern/kern_descrip.c
index 6ce0356..1a34987 100644
--- a/src/sys/kern/kern_descrip.c
+++ b/src/sys/kern/kern_descrip.c
@@ -112,9 +112,8 @@ enum dup_type { DUP_VARIABLE, DUP_FIXED };
 
 static int do_dup(struct thread *td, enum dup_type type, int old, int new,
     register_t *retval);
-static int	fd_first_free(struct filedesc *, int, int);
+static int	fd_first_free(struct filedesc *, int);
 static int	fd_last_used(struct filedesc *, int, int);
-static void	fdgrowtable(struct filedesc *, int);
 static int	fdrop_locked(struct file *fp, struct thread *td);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
@@ -134,10 +133,406 @@ static void	fdused(struct filedesc *fdp, int fd);
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
-/*
- * Storage required per open file descriptor.
+#define IDB_BLOCK_SIZE PAGE_SIZE
+#define IDB_ENT_SIZE sizeof(uintptr_t)
+#define IDB_ENTS_PER_BLOCK (IDB_BLOCK_SIZE/IDB_ENT_SIZE)
+
+/* May be a perf impact on 32-bit kernels. */
+CTASSERT(NDSLOTSIZE == IDB_ENT_SIZE);
+
+/**
+ * Return the index into the indirect table given an entry.
+ */
+static inline int
+idb_block_index(int ent)
+{
+
+	return (ent / IDB_ENTS_PER_BLOCK);
+}
+
+/**
+ * Return offset into an indirect block given an entry.
+ */
+static inline int
+idb_block_off(int ent)
+{
+
+	return (ent % IDB_ENTS_PER_BLOCK);
+}
+
+/**
+ * Return 1 if the indirect block table is flat, else 0.
+ */
+static inline int
+idb_is_flat(struct idb_table *idb)
+{
+
+	return (idb->idb_nents <= IDB_ENTS_PER_BLOCK);
+}
+
+/**
+ * Return a pointer to the block.  If the block is sparse or ent is outside
+ * the current size of the table, return NULL.
+ */
+static inline void *
+idb_block(struct idb_table *idb, int ent)
+{
+
+	return (ent >= idb->idb_nents ? NULL :
+	    idb->idb_tbl.indirect[idb_block_index(ent)]);
+}
+
+/**
+ * Initialize a new indirect table.  The caller is responsible for allocating
+ * the idb struct, and must provide an initial non-null flat table.
+ *
+ * @param idb		 Indirect table to initialize.
+ * @param idb_flat	 Initial non-null table.
+ * @param idb_nents	 Number of entries in the initial flat table.
+ */
+static void
+idb_init(struct idb_table *idb, void *idb_flat, int idb_nents)
+{
+
+	KASSERT(idb != NULL, ("idb table must be allocated by caller"));
+	KASSERT(idb_flat != NULL,
+	    ("idb flat table must be allocated by caller"));
+
+	idb->idb_tbl.flat = idb_flat;
+	idb->idb_nents = idb_nents;
+	idb->idb_orig_nents = idb_nents;
+}
+
+/**
+ * Free all blocks associated with the indirect table.
+ */
+static void
+idb_free(struct idb_table *idb)
+{
+	int indx;
+	void *block;
+
+	if (idb_is_flat(idb)) {
+		if (idb->idb_nents > idb->idb_orig_nents)
+			free(idb->idb_tbl.flat, M_FILEDESC);
+		return;
+	}
+
+	/* Free indirect leaves. */
+	for (indx = idb_block_index(0);
+	     indx < idb_block_index(idb->idb_nents);
+	     indx++) {
+		block = idb->idb_tbl.indirect[indx];
+		if (block != NULL)
+			free(block, M_FILEDESC);
+	}
+
+	/* Free indirect root. */
+	free(idb->idb_tbl.indirect, M_FILEDESC);
+}
+
+/**
+ * Return a pointer into the table/block given an index.
+ */
+static void *
+idb_get_entry(struct idb_table *idb, int ent)
+{
+	void *block;
+
+	if (ent > idb->idb_nents)
+		return (NULL);
+
+	if (idb_is_flat(idb))
+		return (((caddr_t)idb->idb_tbl.flat) + (ent * IDB_ENT_SIZE));
+
+	/* Indirect block.  Return NULL for sparse blocks. */
+	block = idb_block(idb, ent);
+	if (block == NULL)
+		return (NULL);
+
+	return (((caddr_t)block) + (idb_block_off(ent) * IDB_ENT_SIZE));
+}
+
+/**
+ * If the current table size doesn't accommodate the new number of entries,
+ * grow it to fit new_nents.  Mult is a multiplying factor used to check the
+ * number of entries in the table against new_nents which allows growing the
+ * flat table or the indirect table.  The current number of entries in the
+ * table must be a multiple of mult.
+ *
+ * @param idb		Table to grow.
+ * @param new_nents	Number of entries to grow the table to.
+ * @param mult		Multiplier for new_nents.
+ * @param sx		Exclusive lock that may be dropped/reacquired.
  */
-#define OFILESIZE (sizeof(struct file *) + sizeof(char))
+static void
+idb_grow_table(struct idb_table *idb, int new_nents, int mult, struct sx *sx)
+{
+	int old_nents;
+	void *ntable;
+
+	KASSERT(idb->idb_nents % mult == 0,
+	    ("%d is not a multiple of %d", idb->idb_nents, mult));
+
+	old_nents = idb->idb_nents / mult;
+
+	/* Do nothing if the table is already big enough. */
+	if (old_nents > new_nents)
+		return;
+
+	sx_xunlock(sx);
+	ntable = malloc(new_nents * IDB_ENT_SIZE, M_FILEDESC,
+	    M_ZERO | M_WAITOK);
+	sx_xlock(sx);
+
+	/* Done if table grew when the lock was dropped. */
+	if (idb->idb_nents / mult > new_nents) {
+		free(ntable, M_FILEDESC);
+		return;
+	}
+
+	/* Copy the data to the new table and fix up the pointers. */
+	bcopy(idb->idb_tbl.flat, ntable, old_nents * IDB_ENT_SIZE);
+	if (idb->idb_nents > idb->idb_orig_nents)
+		free(idb->idb_tbl.flat, M_FILEDESC);
+	idb->idb_tbl.flat = ntable;
+	idb->idb_nents = new_nents * mult;
+}
+
+/**
+ * Transition a flat table to an indirect block table.
+ *
+ * @param idb	Table to transition.
+ * @param sx	Exclusive lock that may be dropped/reacquired.
+ */
+static void
+idb_transition_to_indirect(struct idb_table *idb, struct sx *sx)
+{
+	void **ntable = NULL;
+
+	KASSERT(idb->idb_nents >= IDB_ENTS_PER_BLOCK,
+	    ("Insufficient size for indirect transition: %d", idb->idb_nents));
+
+	/* Done if the table has already transitioned. */
+	if (idb->idb_nents > IDB_ENTS_PER_BLOCK) {
+		return;
+	}
+
+	sx_xunlock(sx);
+	ntable = malloc(IDB_BLOCK_SIZE, M_FILEDESC,
+		    M_ZERO | M_WAITOK);
+	sx_xlock(sx);
+
+	/* Done if indirect transition done when the lock was dropped. */
+	if (idb->idb_nents > IDB_ENTS_PER_BLOCK) {
+		free(ntable, M_FILEDESC);
+		return;
+	}
+
+	/* Make indirect transition. */
+	ntable[0] = idb->idb_tbl.flat;
+	idb->idb_tbl.indirect = ntable;
+	idb->idb_nents = IDB_ENTS_PER_BLOCK * IDB_ENTS_PER_BLOCK;
+}
+
+/**
+ * Allocates an indirect block in the table if one doesn't already exist for
+ * new_ent.
+ *
+ * @param idb		Table to ensure new_ent has an indirect block in.
+ * @param new_ent	New entry index to create indirect block for.
+ * @param sx		Exclusive lock that may be dropped/reacquired.
+ */
+static void
+idb_ensure_indirect_block(struct idb_table *idb, int new_ent, struct sx *sx)
+{
+	void *nblock = NULL;
+
+	KASSERT(new_ent < idb->idb_nents,
+	    ("Table too small (%d) for indirect block at index %d",
+		idb->idb_nents, new_ent));
+
+	/* Done if the block is already allocated. */
+	if (idb_block(idb, new_ent) != NULL)
+		return;
+
+	sx_xunlock(sx);
+	nblock = malloc(IDB_BLOCK_SIZE, M_FILEDESC, M_ZERO | M_WAITOK);
+	sx_xlock(sx);
+
+	/* Done if block was allocated when the lock was dropped. */
+	if (idb_block(idb, new_ent) != NULL) {
+		free(nblock, M_FILEDESC);
+		return;
+	}
+
+	idb->idb_tbl.indirect[idb_block_index(new_ent)] = nblock;
+}
+
+/**
+ * idb_ensure_size() guarantees that:
+ *  1. If the table is flat, the table will be made large enough for new_ent,
+ *     possibly being transitioned to an indirect table.
+ *
+ *  2. If the table is indirect, the indirect table is large enough to have an
+ *     entry to point to the indirect block, and the indirect block itself is
+ *     allocated.
+ *
+ * The sx lock will be released if new memory needs to be allocated, but will
+ * be reacquired before returning.
+ *
+ * @param idb		Table to ensure new_ent fits in.
+ * @param new_ent	New entry index.
+ * @param maxsize	Max size of the table so excess memory isn't used.
+ * @param sx		Exclusive lock that may be dropped/reacquired.
+ */
+static void
+idb_ensure_size(struct idb_table *idb, int new_ent, int maxsize, struct sx *sx)
+{
+	KASSERT(idb->idb_nents > 0, ("zero-length idb table"));
+	KASSERT(new_ent < maxsize,
+	    ("new_ent(%d) >= maxsize(%d)", new_ent, maxsize));
+
+	sx_assert(sx, SX_XLOCKED | SX_NOTRECURSED);
+
+	/* Grow table 2x while it is flat. */
+	if (idb_is_flat(idb) && new_ent < IDB_ENTS_PER_BLOCK) {
+		if (new_ent >= idb->idb_nents) {
+			KASSERT(new_ent > 0, ("Negative new_ent %d", new_ent));
+			/* Round up to power of 2 to appease the allocator. */
+			idb_grow_table(idb, min(min(1 << (fls(new_ent)),
+				    IDB_ENTS_PER_BLOCK), maxsize), 1, sx);
+		}
+		return;
+	}
+
+	/* Transition flat table to indirect. */
+	if (idb_is_flat(idb) && new_ent >= IDB_ENTS_PER_BLOCK) {
+		idb_grow_table(idb, IDB_ENTS_PER_BLOCK, 1, sx);
+		idb_transition_to_indirect(idb, sx);
+	}
+
+	/* Grow size of indirect table. */
+	if (new_ent >= idb->idb_nents) {
+		int grow_factor, new_nents;
+		/* Need to grow the indirect table. */
+		for (grow_factor = 2;; grow_factor <<= 1) {
+			if (idb_block_index(idb->idb_nents) * grow_factor >
+			    idb_block_index(new_ent))
+				break;
+		}
+		new_nents = min(idb_block_index(idb->idb_nents) * grow_factor,
+		    idb_block_index(maxsize));
+		idb_grow_table(idb, new_nents, IDB_ENTS_PER_BLOCK, sx);
+	}
+
+	/* Ensure block is allocated in sparse table. */
+	idb_ensure_indirect_block(idb, new_ent, sx);
+}
+
+/**
+ * Get the file struct for an fd from the ftable.
+ *
+ * @return The file struct for a particular fd, or NULL.
+ */
+struct file *
+ftable_get(struct filedesc *fdp, int fd)
+{
+	struct file **fpp;
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	fpp = idb_get_entry(&fdp->fd_files, fd);
+	return (fpp != NULL ? *fpp : NULL);
+}
+
+/**
+ * Set an entry in the table to point to a struct file.  ftable_ensure_fd()
+ * must be first called to ensure the underlying data structure can support
+ * this entry.
+ */
+void
+ftable_set(struct filedesc *fdp, int fd, struct file *fp)
+{
+	struct file **fpp;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	fpp = idb_get_entry(&fdp->fd_files, fd);
+	KASSERT(fpp != NULL, ("Trying to set unallocated entry"));
+	*fpp = fp;
+}
+
+/**
+ * Get the close exec state of a file descriptor.
+ *
+ * @return 1 if close exec is set, otherwise 0.
+ */
+int
+ftable_get_cloexec(struct filedesc *fdp, int fd)
+{
+	NDSLOTTYPE *map;
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	map = idb_get_entry(&fdp->fd_cloexec, NDSLOT(fd));
+	if (map == NULL)
+		return (0);
+
+	return ((*map & NDBIT(fd)) != 0);
+}
+
+/**
+ * Set the close exec state of a file descriptor.
+ *
+ * @param on	1: close exec state will be turned on.
+ *		0: close exec state will be turned off.
+ */
+void
+ftable_set_cloexec(struct filedesc *fdp, int fd, int on)
+{
+	NDSLOTTYPE *map;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	map = idb_get_entry(&fdp->fd_cloexec, NDSLOT(fd));
+	KASSERT(map != NULL, ("trying to set cloexec on an unallocated file"));
+
+	if (on)
+		*map |= NDBIT(fd);
+	else
+		*map &= ~NDBIT(fd);
+}
+
+/**
+ * If the ftable is already large enough to store the fd, then simply return.
+ * Otherwise, allocate the necessary blocks to accommodate the new fd.  This
+ * allows for a sparse table.  May malloc new blocks requiring the fdp lock to
+ * be dropped and reacquired.
+ *
+ * @param nfd	File descriptor to possibly grow the table to fit.
+ * @param maxfd	Maximum fd so excess memory isn't used.
+ */
+static void
+ftable_ensure_fd(struct filedesc *fdp, int nfd, int maxfd)
+{
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	KASSERT(nfd <= maxfd, ("nfd(%d) > maxfd(%d)", nfd, maxfd));
+
+	idb_ensure_size(&fdp->fd_files, nfd, maxfd + 1, &fdp->fd_sx);
+	idb_ensure_size(&fdp->fd_map, NDSLOT(nfd), NDSLOT(maxfd) + 1,
+	    &fdp->fd_sx);
+	idb_ensure_size(&fdp->fd_cloexec, NDSLOT(nfd), NDSLOT(maxfd) + 1,
+	    &fdp->fd_sx);
+
+	/*
+	 * fd_map and fd_cloexec grow at the same rate, but fd_files grows at
+	 * a different rate, so advertise table size as the min.
+	 */
+	fdp->fd_nfiles = min(fdp->fd_files.idb_nents,
+	    fdp->fd_map.idb_nents * NDENTRIES);
+}
 
 /*
  * Basic allocation of descriptors:
@@ -150,8 +545,8 @@ struct filedesc0 {
 	 * <= NDFILE, and are then pointed to by the pointers above.
 	 */
 	struct	file *fd_dfiles[NDFILE];
-	char	fd_dfileflags[NDFILE];
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
+	NDSLOTTYPE fd_dcloexec[NDSLOTS(NDFILE)];
 };
 
 /*
@@ -166,14 +561,13 @@ void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 /* A mutex to protect the association between a proc and filedesc. */
 static struct mtx	fdesc_mtx;
 
-/*
- * Find the first zero bit in the given bitmap, starting at low and not
- * exceeding size - 1.
+/**
+ * Iterate a flat array searching for the first zero bit in the given bitmap,
+ * starting at low and not exceeding size - 1.
  */
 static int
-fd_first_free(struct filedesc *fdp, int low, int size)
+fd_first_free_block(NDSLOTTYPE *map, int low, int size)
 {
-	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, maxoff;
 
@@ -193,14 +587,61 @@ fd_first_free(struct filedesc *fdp, int low, int size)
 	return (size);
 }
 
+/**
+ * Iterate the indirect block table fd map searching for the first free fd,
+ * starting at low.  Return the current number of entries in the table if none
+ * are free.
+ */
+static int
+fd_first_free(struct filedesc *fdp, int low)
+{
+	struct idb_table *idb = &fdp->fd_map;
+	NDSLOTTYPE *block;
+	int indx;
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	/* Flat table. */
+	if (idb_is_flat(idb))
+		return (fd_first_free_block(idb->idb_tbl.flat, low,
+			idb->idb_nents * NDENTRIES));
+
+	/* Loop through the indirect blocks. */
+	for (indx = idb_block_index(NDSLOT(low));
+	     indx < idb_block_index(idb->idb_nents);
+	     indx++) {
+		int block_low, free_ent;
+
+		block = idb->idb_tbl.indirect[indx];
+		if (block == NULL) {
+			/* Unallocated block, so the first index is fine. */
+			free_ent = indx * IDB_ENTS_PER_BLOCK * NDENTRIES;
+			return (max(free_ent, low));
+		}
+
+		/* Scan block, starting mid-block if necessary. */
+		block_low = (indx == idb_block_index(NDSLOT(low))) ?
+		    idb_block_off(NDSLOT(low)) * NDENTRIES : 0;
+		free_ent = fd_first_free_block(block, block_low,
+		    IDB_ENTS_PER_BLOCK * NDENTRIES);
+
+		/* If there was a free fd, return it. */
+		if (free_ent < IDB_ENTS_PER_BLOCK * NDENTRIES)
+			return (indx * IDB_ENTS_PER_BLOCK * NDENTRIES +
+			    free_ent);
+	}
+
+	/* No free fds found. */
+	return (idb->idb_nents);
+}
+
 /*
  * Find the highest non-zero bit in the given bitmap, starting at low and
  * not exceeding size - 1.
  */
 static int
-fd_last_used(struct filedesc *fdp, int low, int size)
+fd_last_used_block(NDSLOTTYPE *map, int low, int size)
 {
-	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
@@ -220,12 +661,65 @@ fd_last_used(struct filedesc *fdp, int low, int size)
 	return (low - 1);
 }
 
+/**
+ * Iterate the indirect block table fd map searching for the highest non-zero
+ * bit, starting at low and not exceeding size - 1.  Return low - 1 if no fds
+ * >= low are used.
+ */
+static int
+fd_last_used(struct filedesc *fdp, int low, int size)
+{
+	struct idb_table *idb = &fdp->fd_map;
+	NDSLOTTYPE *block;
+	int indx;
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	/* Flat table. */
+	if (idb_is_flat(idb))
+		return (fd_last_used_block(idb->idb_tbl.flat, low, size));
+
+	/* Loop through the indirect blocks backwards. */
+	for (indx = idb_block_index(NDSLOT(size));
+	     indx >= idb_block_index(NDSLOT(low));
+	     indx--) {
+		int block_low, block_high, used_ent;
+
+		block = idb->idb_tbl.indirect[indx];
+		/* If the block is sparse, move onto the next one. */
+		if (block == NULL)
+			continue;
+
+		/* Scan block, starting/ending mid-block if necessary. */
+		block_low = (indx == idb_block_index(NDSLOT(low))) ?
+		    idb_block_off(NDSLOT(low)) * NDENTRIES : 0;
+		block_high = (indx == idb_block_index(NDSLOT(size))) ?
+		    idb_block_off(NDSLOT(size)) * NDENTRIES :
+		    IDB_ENTS_PER_BLOCK;
+		used_ent = fd_last_used_block(block, block_low, block_high);
+
+		/* If there was a used fd, return it. */
+		if (used_ent >= block_low)
+			return (indx * IDB_ENTS_PER_BLOCK * NDENTRIES +
+			    used_ent);
+	}
+
+	/* No used fds found. */
+	return (low - 1);
+}
+
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
+	NDSLOTTYPE *map;
+
+	FILEDESC_LOCK_ASSERT(fdp);
         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
-	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
+
+	map = idb_get_entry(&fdp->fd_map, NDSLOT(fd));
+
+	return (map && (*map & NDBIT(fd)) != 0);
 }
 
 /*
@@ -234,16 +728,19 @@ fdisused(struct filedesc *fdp, int fd)
 static void
 fdused(struct filedesc *fdp, int fd)
 {
+	NDSLOTTYPE *map;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
-	KASSERT(!fdisused(fdp, fd),
-	    ("fd already used"));
+	KASSERT(!fdisused(fdp, fd), ("fd already used"));
 
-	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
+	map = idb_get_entry(&fdp->fd_map, NDSLOT(fd));
+	KASSERT(map != NULL, ("Map block is NULL"));
+
+	*map |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
 	if (fd == fdp->fd_freefile)
-		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
+		fdp->fd_freefile = fd_first_free(fdp, fd);
 }
 
 /*
@@ -253,13 +750,19 @@ static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
+	NDSLOTTYPE *map;
+
 	FILEDESC_XLOCK_ASSERT(fdp);
 	KASSERT(fdisused(fdp, fd),
 	    ("fd is already unused"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
+	KASSERT(ftable_get(fdp, fd) == NULL,
 	    ("fd is still in use"));
 
-	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
+	map = idb_get_entry(&fdp->fd_map, NDSLOT(fd));
+	KASSERT(map != NULL, ("Map block is NULL"));
+
+	*map &= ~NDBIT(fd);
+
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
@@ -410,7 +913,7 @@ fdtofp(int fd, struct filedesc *fdp)
 
 	FILEDESC_LOCK_ASSERT(fdp);
 	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL)
+	    (fp = ftable_get(fdp, fd)) == NULL)
 		return (NULL);
 	return (fp);
 }
@@ -422,7 +925,6 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 	struct flock *flp;
 	struct file *fp;
 	struct proc *p;
-	char *pop;
 	struct vnode *vp;
 	u_int newmin;
 	int error, flg, tmp;
@@ -467,8 +969,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 			error = EBADF;
 			break;
 		}
-		pop = &fdp->fd_ofileflags[fd];
-		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+		td->td_retval[0] = ftable_get_cloexec(fdp, fd) ?
+		    FD_CLOEXEC :0;
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
@@ -479,9 +981,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 			error = EBADF;
 			break;
 		}
-		pop = &fdp->fd_ofileflags[fd];
-		*pop = (*pop &~ UF_EXCLOSE) |
-		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
+		ftable_set_cloexec(fdp, fd, arg & FD_CLOEXEC);
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
@@ -651,7 +1151,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 		/* Check for race with close */
 		FILEDESC_SLOCK(fdp);
 		if ((unsigned) fd >= fdp->fd_nfiles ||
-		    fp != fdp->fd_ofiles[fd]) {
+		    fp != ftable_get(fdp, fd)) {
 			FILEDESC_SUNLOCK(fdp);
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
@@ -750,7 +1250,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
 		return (EMFILE);
 
 	FILEDESC_XLOCK(fdp);
-	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+	if (old >= fdp->fd_nfiles || ftable_get(fdp, old) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
@@ -759,7 +1259,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
 		FILEDESC_XUNLOCK(fdp);
 		return (0);
 	}
-	fp = fdp->fd_ofiles[old];
+	fp = ftable_get(fdp, old);
 	fhold(fp);
 
 	/*
@@ -770,9 +1270,8 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
 	 * out for a race.
 	 */
 	if (type == DUP_FIXED) {
-		if (new >= fdp->fd_nfiles)
-			fdgrowtable(fdp, new + 1);
-		if (fdp->fd_ofiles[new] == NULL)
+		ftable_ensure_fd(fdp, new, maxfd);
+		if (ftable_get(fdp, new) == NULL)
 			fdused(fdp, new);
 	} else {
 		if ((error = fdalloc(td, new, &new)) != 0) {
@@ -787,9 +1286,9 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
 	 * bad file descriptor.  Userland should do its own locking to
 	 * avoid this case.
 	 */
-	if (fdp->fd_ofiles[old] != fp) {
+	if (ftable_get(fdp, old) != fp) {
 		/* we've allocated a descriptor which we won't use */
-		if (fdp->fd_ofiles[new] == NULL)
+		if (ftable_get(fdp, new) == NULL)
 			fdunused(fdp, new);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
@@ -805,7 +1304,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
 	 *
 	 * XXX this duplicates parts of close().
 	 */
-	delfp = fdp->fd_ofiles[new];
+	delfp = ftable_get(fdp, new);
 	holdleaders = 0;
 	if (delfp != NULL) {
 		if (td->td_proc->p_fdtol != NULL) {
@@ -821,8 +1320,8 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
 	/*
 	 * Duplicate the source descriptor
 	 */
-	fdp->fd_ofiles[new] = fp;
-	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
+	ftable_set(fdp, new, fp);
+	ftable_set_cloexec(fdp, new, 0);
 	if (new > fdp->fd_lastfile)
 		fdp->fd_lastfile = new;
 	*retval = new;
@@ -1111,12 +1610,12 @@ kern_close(td, fd)
 
 	FILEDESC_XLOCK(fdp);
 	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL) {
+	    (fp = ftable_get(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
-	fdp->fd_ofiles[fd] = NULL;
-	fdp->fd_ofileflags[fd] = 0;
+	ftable_set(fdp, fd, NULL);
+	ftable_set_cloexec(fdp, fd, 0);
 	fdunused(fdp, fd);
 	if (td->td_proc->p_fdtol != NULL) {
 		/*
@@ -1178,7 +1677,7 @@ closefrom(struct thread *td, struct closefrom_args *uap)
 		uap->lowfd = 0;
 	FILEDESC_SLOCK(fdp);
 	for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
-		if (fdp->fd_ofiles[fd] != NULL) {
+		if (ftable_get(fdp, fd) != NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			(void)kern_close(td, fd);
 			FILEDESC_SLOCK(fdp);
@@ -1806,70 +2305,6 @@ out:
 }
 
 /*
- * Grow the file table to accomodate (at least) nfd descriptors.  This may
- * block and drop the filedesc lock, but it will reacquire it before
- * returning.
- */
-static void
-fdgrowtable(struct filedesc *fdp, int nfd)
-{
-	struct file **ntable;
-	char *nfileflags;
-	int nnfiles, onfiles;
-	NDSLOTTYPE *nmap;
-
-	FILEDESC_XLOCK_ASSERT(fdp);
-
-	KASSERT(fdp->fd_nfiles > 0,
-	    ("zero-length file table"));
-
-	/* compute the size of the new table */
-	onfiles = fdp->fd_nfiles;
-	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
-	if (nnfiles <= onfiles)
-		/* the table is already large enough */
-		return;
-
-	/* allocate a new table and (if required) new bitmaps */
-	FILEDESC_XUNLOCK(fdp);
-	MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
-	    M_FILEDESC, M_ZERO | M_WAITOK);
-	nfileflags = (char *)&ntable[nnfiles];
-	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
-		MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE,
-		    M_FILEDESC, M_ZERO | M_WAITOK);
-	else
-		nmap = NULL;
-	FILEDESC_XLOCK(fdp);
-
-	/*
-	 * We now have new tables ready to go.  Since we dropped the
-	 * filedesc lock to call malloc(), watch out for a race.
-	 */
-	onfiles = fdp->fd_nfiles;
-	if (onfiles >= nnfiles) {
-		/* we lost the race, but that's OK */
-		free(ntable, M_FILEDESC);
-		if (nmap != NULL)
-			free(nmap, M_FILEDESC);
-		return;
-	}
-	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
-	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
-	if (onfiles > NDFILE)
-		free(fdp->fd_ofiles, M_FILEDESC);
-	fdp->fd_ofiles = ntable;
-	fdp->fd_ofileflags = nfileflags;
-	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
-		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
-		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
-			free(fdp->fd_map, M_FILEDESC);
-		fdp->fd_map = nmap;
-	}
-	fdp->fd_nfiles = nnfiles;
-}
-
-/*
  * Allocate a file descriptor for the process.
  */
 int
@@ -1891,16 +2326,18 @@ fdalloc(struct thread *td, int minfd, int *result)
 	/*
 	 * Search the bitmap for a free descriptor.  If none is found, try
 	 * to grow the file table.  Keep at it until we either get a file
-	 * descriptor or run into process or system limits; fdgrowtable()
+	 * descriptor or run into process or system limits; ftable_ensure_fd()
 	 * may drop the filedesc lock, so we're in a race.
 	 */
 	for (;;) {
-		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+		fd = fd_first_free(fdp, minfd);
 		if (fd >= maxfd)
 			return (EMFILE);
-		if (fd < fdp->fd_nfiles)
+		/* Grow if necessary. */
+		ftable_ensure_fd(fdp, fd, maxfd);
+		/* Required check since ftable_ensure_fd() can drop xlock. */
+		if (ftable_get(fdp, fd) == NULL)
 			break;
-		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
 	}
 
 	/*
@@ -1909,9 +2346,9 @@ fdalloc(struct thread *td, int minfd, int *result)
 	 */
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
+	KASSERT(ftable_get(fdp, fd) == NULL,
 	    ("free descriptor isn't"));
-	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
+	ftable_set_cloexec(fdp, fd, 0); /* XXX needed? */
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
@@ -1926,7 +2363,7 @@ fdavail(struct thread *td, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = td->td_proc->p_fd;
-	struct file **fpp;
+	struct file *fp;
 	int i, lim, last;
 
 	FILEDESC_LOCK_ASSERT(fdp);
@@ -1937,9 +2374,10 @@ fdavail(struct thread *td, int n)
 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
 		return (1);
 	last = min(fdp->fd_nfiles, lim);
-	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
-	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
-		if (*fpp == NULL && --n <= 0)
+	fp = ftable_get(fdp, fdp->fd_freefile);
+	for (i = last - fdp->fd_freefile; --i >= 0;
+	     fp = ftable_get(fdp, last - i)) {
+		if (fp == NULL && --n <= 0)
 			return (1);
 	}
 	return (0);
@@ -2017,7 +2455,7 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)
 	ifs_init_lockdata(fp);
 
 	FILEDESC_XLOCK(p->p_fd);
-	if ((fq = p->p_fd->fd_ofiles[0])) {
+	if ((fq = ftable_get(p->p_fd, 0))) {
 		LIST_INSERT_AFTER(fq, fp, f_list);
 	} else {
 		LIST_INSERT_HEAD(&filehead, fp, f_list);
@@ -2030,7 +2468,7 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)
 			fdrop(fp, td);
 		return (error);
 	}
-	p->p_fd->fd_ofiles[i] = fp;
+	ftable_set(p->p_fd, i, fp);
 	FILEDESC_XUNLOCK(p->p_fd);
 	if (resultfp)
 		*resultfp = fp;
@@ -2068,10 +2506,13 @@ fdinit(struct filedesc *fdp)
 	newfdp->fd_fd.fd_refcnt = 1;
 	newfdp->fd_fd.fd_holdcnt = 1;
 	newfdp->fd_fd.fd_cmask = CMASK;
-	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
-	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
 	newfdp->fd_fd.fd_nfiles = NDFILE;
-	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
+
+	idb_init(&newfdp->fd_fd.fd_files, &newfdp->fd_dfiles, NDFILE);
+	idb_init(&newfdp->fd_fd.fd_map, &newfdp->fd_dmap, NDSLOTS(NDFILE));
+	idb_init(&newfdp->fd_fd.fd_cloexec, &newfdp->fd_dcloexec,
+	    NDSLOTS(NDFILE));
+
 	newfdp->fd_fd.fd_lastfile = -1;
 	return (&newfdp->fd_fd);
 }
@@ -2144,6 +2585,7 @@ struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
+	struct file *fp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
@@ -2152,23 +2594,23 @@ fdcopy(struct filedesc *fdp)
 
 	newfdp = fdinit(fdp);
 	FILEDESC_SLOCK(fdp);
-	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
-		FILEDESC_SUNLOCK(fdp);
-		FILEDESC_XLOCK(newfdp);
-		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
-		FILEDESC_XUNLOCK(newfdp);
-		FILEDESC_SLOCK(fdp);
-	}
 	/* copy everything except kqueue descriptors */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
-		if (fdisused(fdp, i) &&
-		    fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE &&
-		    fdp->fd_ofiles[i]->f_ops != &badfileops) {
-			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
-			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
-			fhold(newfdp->fd_ofiles[i]);
+		if (fdisused(fdp, i) && (fp = ftable_get(fdp, i)) &&
+		    fp->f_type != DTYPE_KQUEUE && fp->f_ops != &badfileops) {
+			int cloexec = ftable_get_cloexec(fdp, i);
+			int maxfd = fdp->fd_lastfile;
+
+			FILEDESC_SUNLOCK(fdp);
+			FILEDESC_XLOCK(newfdp);
+			ftable_ensure_fd(newfdp, i, maxfd);
+			ftable_set(newfdp, i, fp);
+			ftable_set_cloexec(newfdp, i, cloexec);
 			newfdp->fd_lastfile = i;
+			FILEDESC_XUNLOCK(newfdp);
+			FILEDESC_SLOCK(fdp);
+			fhold(fp);
 		} else {
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
@@ -2177,7 +2619,7 @@ fdcopy(struct filedesc *fdp)
 	FILEDESC_SUNLOCK(fdp);
 	FILEDESC_XLOCK(newfdp);
 	for (i = 0; i <= newfdp->fd_lastfile; ++i)
-		if (newfdp->fd_ofiles[i] != NULL)
+		if (ftable_get(newfdp, i) != NULL)
 			fdused(newfdp, i);
 	FILEDESC_XUNLOCK(newfdp);
 	FILEDESC_SLOCK(fdp);
@@ -2195,7 +2637,6 @@ void
 fdfree(struct thread *td)
 {
 	struct filedesc *fdp;
-	struct file **fpp;
 	int i, locked;
 	struct filedesc_to_leader *fdtol;
 	struct file *fp;
@@ -2216,13 +2657,10 @@ fdfree(struct thread *td)
 			 fdtol->fdl_refcount));
 		if (fdtol->fdl_refcount == 1 &&
 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
-			for (i = 0, fpp = fdp->fd_ofiles;
-			     i <= fdp->fd_lastfile;
-			     i++, fpp++) {
-				if (*fpp == NULL ||
-				    (*fpp)->f_type != DTYPE_VNODE)
+			for (i = 0; i <= fdp->fd_lastfile; i++) {
+				fp = ftable_get(fdp, i);
+				if (fp == NULL || fp->f_type != DTYPE_VNODE)
 					continue;
-				fp = *fpp;
 				fhold(fp);
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
@@ -2240,7 +2678,6 @@ fdfree(struct thread *td)
 				VFS_UNLOCK_GIANT(locked);
 				FILEDESC_XLOCK(fdp);
 				fdrop(fp, td);
-				fpp = fdp->fd_ofiles + i;
 			}
 		}
 	retry:
@@ -2281,31 +2718,29 @@ fdfree(struct thread *td)
 	}
 	FILEDESC_XLOCK(fdp);
 	i = --fdp->fd_refcnt;
-	FILEDESC_XUNLOCK(fdp);
-	if (i > 0)
+	if (i > 0) {
+		FILEDESC_XUNLOCK(fdp);
 		return;
+	}
 
-	fpp = fdp->fd_ofiles;
-	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
-		if (*fpp) {
-			FILEDESC_XLOCK(fdp);
-			fp = *fpp;
-			*fpp = NULL;
+	for (i = fdp->fd_lastfile; i >= 0 ; i--) {
+		fp = ftable_get(fdp, i);
+		if (fp) {
+			ftable_set(fdp, i, NULL);
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
+			FILEDESC_XLOCK(fdp);
 		}
 	}
-	FILEDESC_XLOCK(fdp);
 
 	/* XXX This should happen earlier. */
 	mtx_lock(&fdesc_mtx);
 	td->td_proc->p_fd = NULL;
 	mtx_unlock(&fdesc_mtx);
 
-	if (fdp->fd_nfiles > NDFILE)
-		FREE(fdp->fd_ofiles, M_FILEDESC);
-	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
-		FREE(fdp->fd_map, M_FILEDESC);
+	idb_free(&fdp->fd_files);
+	idb_free(&fdp->fd_map);
+	idb_free(&fdp->fd_cloexec);
 
 	fdp->fd_nfiles = 0;
 
@@ -2377,19 +2812,20 @@ setugidsafety(struct thread *td)
 	 */
 	FILEDESC_XLOCK(fdp);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
+		struct file *fp;
+
 		if (i > 2)
 			break;
-		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
-			struct file *fp;
+		fp = ftable_get(fdp, i);
+		if (fp && is_unsafe(fp)) {
 
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
-			fp = fdp->fd_ofiles[i];
-			fdp->fd_ofiles[i] = NULL;
-			fdp->fd_ofileflags[i] = 0;
+			ftable_set(fdp, i, NULL);
+			ftable_set_cloexec(fdp, i, 0);
 			fdunused(fdp, i);
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
@@ -2411,8 +2847,8 @@ fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
 {
 
 	FILEDESC_XLOCK(fdp);
-	if (fdp->fd_ofiles[idx] == fp) {
-		fdp->fd_ofiles[idx] = NULL;
+	if (ftable_get(fdp, idx) == fp) {
+		ftable_set(fdp, idx, NULL);
 		fdunused(fdp, idx);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
@@ -2441,19 +2877,20 @@ fdcloseexec(struct thread *td)
 	 * may block and rip them out from under us.
 	 */
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
-		if (fdp->fd_ofiles[i] != NULL &&
-		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
-		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
-			struct file *fp;
+		struct file *fp;
+
+		fp = ftable_get(fdp, i);
+		if (fp != NULL &&
+		    (fp->f_type == DTYPE_MQUEUE ||
+			ftable_get_cloexec(fdp, i))) {
 
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
-			fp = fdp->fd_ofiles[i];
-			fdp->fd_ofiles[i] = NULL;
-			fdp->fd_ofileflags[i] = 0;
+			ftable_set(fdp, i, NULL);
+			ftable_set_cloexec(fdp, i, 0);
 			fdunused(fdp, i);
 			if (fp->f_type == DTYPE_MQUEUE)
 				mq_fdclose(td, i, fp);
@@ -2486,7 +2923,7 @@ fdcheckstd(struct thread *td)
 	devnull = -1;
 	error = 0;
 	for (i = 0; i < 3; i++) {
-		if (fdp->fd_ofiles[i] != NULL)
+		if (ftable_get(fdp, i) != NULL)
 			continue;
 		if (devnull < 0) {
 			save = td->td_retval[0];
@@ -2904,7 +3341,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
 	 */
 	FILEDESC_XLOCK(fdp);
 	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
-	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+	    (wfp = ftable_get(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
@@ -2931,9 +3368,9 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = wfp;
-		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+		fp = ftable_get(fdp, indx);
+		ftable_set(fdp, indx, wfp);
+		ftable_set_cloexec(fdp, indx, ftable_get_cloexec(fdp, dfd));
 		if (fp == NULL)
 			fdused(fdp, indx);
 		fhold_locked(wfp);
@@ -2951,11 +3388,11 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
-		fdp->fd_ofiles[dfd] = NULL;
-		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
-		fdp->fd_ofileflags[dfd] = 0;
+		fp = ftable_get(fdp, indx);
+		ftable_set(fdp, indx, ftable_get(fdp, dfd));
+		ftable_set(fdp, dfd, NULL);
+		ftable_set_cloexec(fdp, indx, ftable_get_cloexec(fdp, dfd));
+		ftable_set_cloexec(fdp, dfd, 0);
 		fdunused(fdp, dfd);
 		if (fp == NULL)
 			fdused(fdp, indx);
@@ -3103,7 +3540,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
-			if ((fp = fdp->fd_ofiles[n]) == NULL)
+			if ((fp = ftable_get(fdp, n)) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
@@ -3215,7 +3652,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 				fdp, req);
 	for (i = 0; i < fdp->fd_nfiles; i++) {
-		if ((fp = fdp->fd_ofiles[i]) == NULL)
+		if ((fp = ftable_get(fdp, i)) == NULL)
 			continue;
 		bzero(kif, sizeof(*kif));
 		kif->kf_structsize = sizeof(*kif);
@@ -3450,7 +3887,7 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 		export_vnode_for_sysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 				fdp, req);
 	for (i = 0; i < fdp->fd_nfiles; i++) {
-		if ((fp = fdp->fd_ofiles[i]) == NULL)
+		if ((fp = ftable_get(fdp, i)) == NULL)
 			continue;
 		bzero(kif, sizeof(*kif));
 		FILE_LOCK(fp);
@@ -3669,7 +4106,7 @@ file_to_first_proc(struct file *fp)
 		if (fdp == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; n++) {
-			if (fp == fdp->fd_ofiles[n])
+			if (fp == ftable_get(fdp, n))
 				return (p);
 		}
 	}
diff --git a/src/sys/kern/kern_lsof.c b/src/sys/kern/kern_lsof.c
index 04e5dd7..70a8286 100644
--- a/src/sys/kern/kern_lsof.c
+++ b/src/sys/kern/kern_lsof.c
@@ -260,7 +260,7 @@ lsof(struct thread *td, struct lsof_args *uap)
 
 		/* Ordinary descriptors for files, pipes, sockets: */
 		} else if (msg.fd < fdp->fd_nfiles) {
-			fp = fdp->fd_ofiles[msg.fd];
+			fp = ftable_get(fdp, msg.fd);
 			if (fp) {
 				switch (fp->f_type) {
 				case DTYPE_VNODE:
diff --git a/src/sys/kern/sys_generic.c b/src/sys/kern/sys_generic.c
index 1a9b061..8aba73d 100644
--- a/src/sys/kern/sys_generic.c
+++ b/src/sys/kern/sys_generic.c
@@ -596,12 +596,12 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
 	switch (com) {
 	case FIONCLEX:
 		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+		ftable_set_cloexec(fdp, fd, 0);
 		FILEDESC_XUNLOCK(fdp);
 		goto out;
 	case FIOCLEX:
 		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+		ftable_set_cloexec(fdp, fd, 1);
 		FILEDESC_XUNLOCK(fdp);
 		goto out;
 	case FIONBIO:
@@ -1043,7 +1043,7 @@ pollscan(td, fds, nfd)
 		} else if (fds->fd < 0) {
 			fds->revents = 0;
 		} else {
-			fp = fdp->fd_ofiles[fds->fd];
+			fp = ftable_get(fdp, fds->fd);
 			if (fp == NULL) {
 				fds->revents = POLLNVAL;
 				n++;
diff --git a/src/sys/kern/uipc_mqueue.c b/src/sys/kern/uipc_mqueue.c
index fb2ef6a..59c339a 100644
--- a/src/sys/kern/uipc_mqueue.c
+++ b/src/sys/kern/uipc_mqueue.c
@@ -2005,8 +2005,8 @@ kmq_open(struct thread *td, struct kmq_open_args *uap)
 	FILE_UNLOCK(fp);
 
 	FILEDESC_XLOCK(fdp);
-	if (fdp->fd_ofiles[fd] == fp)
-		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+	if (ftable_get(fdp, fd) == fp)
+		ftable_set_cloexec(fdp, fd, 1);
 	FILEDESC_XUNLOCK(fdp);
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
diff --git a/src/sys/kern/uipc_sem.c b/src/sys/kern/uipc_sem.c
index d5525a4..b1e6b62 100644
--- a/src/sys/kern/uipc_sem.c
+++ b/src/sys/kern/uipc_sem.c
@@ -488,8 +488,8 @@ ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
 	fp->f_ops = &ksem_ops;
 
 	FILEDESC_XLOCK(fdp);
-	if (fdp->fd_ofiles[fd] == fp)
-		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+	if (ftable_get(fdp, fd) == fp)
+		ftable_set_cloexec(fdp, fd, 1);
 	FILEDESC_XUNLOCK(fdp);
 	fdrop(fp, td);
 
diff --git a/src/sys/kern/uipc_usrreq.c b/src/sys/kern/uipc_usrreq.c
index b8255f0..1bfc037 100644
--- a/src/sys/kern/uipc_usrreq.c
+++ b/src/sys/kern/uipc_usrreq.c
@@ -1605,7 +1605,8 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp)
 				if (fdalloc(td, 0, &f))
 					panic("unp_externalize fdalloc failed");
 				fp = *rp++;
-				td->td_proc->p_fd->fd_ofiles[f] = fp;
+				ftable_set(td->td_proc->p_fd, f,
+				    fp);
 				FILE_LOCK(fp);
 				fp->f_msgcount--;
 				FILE_UNLOCK(fp);
@@ -1735,12 +1736,12 @@ unp_internalize(struct mbuf **controlp, struct thread *td)
 			for (i = 0; i < oldfds; i++) {
 				fd = *fdp++;
 				if ((unsigned)fd >= fdescp->fd_nfiles ||
-				    fdescp->fd_ofiles[fd] == NULL) {
+				    ftable_get(fdescp, fd) == NULL) {
 					FILEDESC_SUNLOCK(fdescp);
 					error = EBADF;
 					goto out;
 				}
-				fp = fdescp->fd_ofiles[fd];
+				fp = ftable_get(fdescp, fd);
 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
 					FILEDESC_SUNLOCK(fdescp);
 					error = EOPNOTSUPP;
@@ -1765,7 +1766,7 @@ unp_internalize(struct mbuf **controlp, struct thread *td)
 			rp = (struct file **)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			for (i = 0; i < oldfds; i++) {
-				fp = fdescp->fd_ofiles[*fdp++];
+				fp = ftable_get(fdescp, *fdp++);
 				*rp++ = fp;
 				FILE_LOCK(fp);
 				fp->f_count++;
diff --git a/src/sys/kern/vfs_syscalls.c b/src/sys/kern/vfs_syscalls.c
index 2f28263..fa0a5e6 100644
--- a/src/sys/kern/vfs_syscalls.c
+++ b/src/sys/kern/vfs_syscalls.c
@@ -4884,7 +4884,7 @@ getvnode(fdp, fd, fpp)
 	else {
 		FILEDESC_SLOCK(fdp);
 		if ((u_int)fd >= fdp->fd_nfiles ||
-		    (fp = fdp->fd_ofiles[fd]) == NULL)
+		    (fp = ftable_get(fdp, fd)) == NULL)
 			error = EBADF;
 		else if (fp->f_vnode == NULL) {
 			fp = NULL;
diff --git a/src/sys/netsmb/smb_dev.c b/src/sys/netsmb/smb_dev.c
index fd0dcbe..a0dd80c 100644
--- a/src/sys/netsmb/smb_dev.c
+++ b/src/sys/netsmb/smb_dev.c
@@ -370,7 +370,7 @@ nsmb_getfp(struct filedesc* fdp, int fd, int flag)
 
 	FILEDESC_SLOCK(fdp);
 	if (((u_int)fd) >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL ||
+	    (fp = ftable_get(fdp, fd)) == NULL ||
 	    (fp->f_flag & flag) == 0) {
 		FILEDESC_SUNLOCK(fdp);
 		return (NULL);
diff --git a/src/sys/sys/filedesc.h b/src/sys/sys/filedesc.h
index 1831e5c..e9ea56a 100644
--- a/src/sys/sys/filedesc.h
+++ b/src/sys/sys/filedesc.h
@@ -45,16 +45,26 @@
  * This structure is used for the management of descriptors.  It may be
  * shared by multiple processes.
  */
-#define NDSLOTTYPE	u_long
+#define NDSLOTTYPE	uintptr_t
+
+/* Generic indirect block table */
+struct idb_table {
+	union {
+		void *flat;
+		void **indirect;
+	} idb_tbl;
+	int idb_nents; /* Current max # of entries. */
+	int idb_orig_nents; /* Orig # of entries for the flat table. */
+};
 
 struct filedesc {
-	struct	file **fd_ofiles;	/* file structures for open files */
-	char	*fd_ofileflags;		/* per-process open file flags */
+	struct idb_table fd_files;     /* table of open file structs */
+	struct idb_table fd_map;	/* bitmap of free fds */
+	struct idb_table fd_cloexec;	/* bitmap of fd close exec state */
 	struct	vnode *fd_cdir;		/* current directory */
 	struct	vnode *fd_rdir;		/* root directory */
 	struct	vnode *fd_jdir;		/* jail root directory */
 	int	fd_nfiles;		/* number of open files allocated */
-	NDSLOTTYPE *fd_map;		/* bitmap of free fds */
 	int	fd_lastfile;		/* high-water mark of fd_ofiles */
 	int	fd_freefile;		/* approx. next free file */
 	u_short	fd_cmask;		/* mask for file creation */
@@ -130,12 +140,18 @@ struct filedesc_to_leader *
 int	getvnode(struct filedesc *fdp, int fd, struct file **fpp);
 void	mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
 void	setugidsafety(struct thread *td);
+struct	file *ftable_get(struct filedesc *fdp, int fd);
+void	ftable_set(struct filedesc *fdp, int fd, struct file *fp);
+int	ftable_get_cloexec(struct filedesc *fdp, int fd);
+void	ftable_set_cloexec(struct filedesc *fdp, int fd, int on);
+
 
 static __inline struct file *
 fget_locked(struct filedesc *fdp, int fd)
 {
 
-	return (fd < 0 || fd >= fdp->fd_nfiles ? NULL : fdp->fd_ofiles[fd]);
+	return (fd < 0 || fd >= fdp->fd_nfiles ? NULL :
+	    ftable_get(fdp, fd));
 }
 
 #endif /* _KERNEL */

--Apple-Mail-18--1040317416
Content-Type: text/plain;
	charset=US-ASCII;
	format=flowed;
	delsp=yes
Content-Transfer-Encoding: 7bit




On May 11, 2010, at 10:24 AM, Tim Prouty wrote:

> Hi,
>
> This is my first time sending a patch to the list, so let me know if  
> there
> are any conventions I missed.
>
> Attached is a patch that attempts to remove the data structure
> limitations on the number of open file descriptors in the system.  The
> patch is against our modified version of FreeBSD 7, so it probably
> won't apply cleanly against upstream, but I wanted to get this out
> there for discussion soon so if there is feedback, we can address it
> and then worry about porting a specific patch for upstream.
>
> We (Isilon) have been running this internally for a few months without
> any issues, although there is at least one known issue that I need to
> resolve, which is mentioned below.
>
> Motivation:
>
>  With the increasing amount of memory and processing power in modern
>  machines, there are certain userspace processes that are able to
>  handle much higher concurrent load than previously possible.  A
>  specific example is a single-process/multi-threaded SMB stack which
>  can handle thousands of connected clients, each with hundreds of
>  files open.  Once kernel sysctl limits are increased for max files,
>  the next limitation is in the actual file descriptor data
>  structures.
>
> Problem - Data Structure Limits:
>
>  The existing per-process data structures for the file descriptor are
>  flat tables, which are reallocated each time they need to grow.
>  This is inefficient as the amount of data to allocate and copy each
>  time increases, but the bigger issue is the potentially limited
>  amount of contiguous KVA memory as the table grows very large.  Over
>  time as the KVA memory becomes fragmented, malloc may be unable to
>  provide large enough blocks of contiguous memory.
>
>  In the current code the struct proc contains both an array of struct
>  file pointers and a bit field indicating which file descriptors are
>  in use.  The primary issue is how to handle these structures growing
>  beyond the kernel page size of 4K.
>
>  The array of file pointers will grow much faster than the bit field,
>  especially on a 64 bit kernel. The 4K block size will be hit at 512
>  files (64 bit kernel) for the file pointer array and 32,768 files
>  for the bit field.
>
> Solution:
>
> File Pointer Array
>
>  Focusing first on the file pointer array limitation, an indirect
>  block approach is used.  An indirect block size of 4K (equal to page
>  size) is used, allowing for 512 files per block.  To optimize for
>  the common case of low/normal fd usage, a flat array is initialized
>  to 20 entries and then grows at 2x each time until the block reaches
>  its maximum size. Once more than 512 files are opened, the array
>  will transition to a single level indirect block table.
>
> Fd Bitfield:
>
>  The fd bit field as it stands can represent 32K files when it grows
>  to the page size limit.  Using the same indirect system as the file
>  pointer array, it is able to grow beyond it's existing limits.
>
> Close Exec Field:
>
>  One complication of the old file pointer table is that for each file
>  pointer there was one byte of flags.  The memory was laid out such that
>  the file pointers are all in one contiguous array, followed by a
>  second array of chars where each char entry is a flags field that
>  corresponds to the file pointer at the same index.  Interestingly
>  there is actually only one flag that is used: UF_EXCLOSE, so it's
>  fairly wasteful to have an array of chars.  What linux does, and
>  what I have done is to just use a bitfield for all fds that should
>  be closed on exec.  This could be further optimized by doing some
>  pointer trickery to store the close exec bit in the struct file
>  pointer rather than keep a separate bitfield.
>
> Indirect Block Table:
>
>  Since there are three consumers of the indirect block table, I
>  generalized it so all of the consumers rely on the same code.  This
>  could eventually be refactored into a kernel library since it could
>  be generally useful in other areas.  The table uses a single level
>  of indirection, so the base table can still grow beyond the 4K.  As
>  a process uses more fds, the need to continue growing the base table
>  should be fairly limited, and a single realloc will significantly
>  increase the number of fds the process can allocate.
>
> Accessing the new data structures:
>
>  All consumers of the file pointer array and bitfield will now have
>  to use accessors rather than using direct access.
>
> Known Issues:
>
>  The new fdp locking in fdcopy needs to be reworked.
>
>
> Thank you for reviewing!
>
> -Tim
>
> <0001-Increase-scalabilty-of-per-process-file-descriptor-d.patch>
> _______________________________________________
> freebsd-arch@freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-arch
> To unsubscribe, send any mail to "freebsd-arch- 
> unsubscribe@freebsd.org"


--Apple-Mail-18--1040317416--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?A57FE0F9-47B9-4036-8F1C-0D30FB545980>