Date: Tue, 11 May 2010 16:13:45 -0700 From: Tim Prouty <tim.prouty@isilon.com> To: freebsd-arch@freebsd.org Cc: Zachary Loafman <zachary.loafman@isilon.com>, Matthew Fleming <matthew.fleming@isilon.com> Subject: Re: [PATCH]/[RFC] Increase scalability of per-process file descriptor data structures Message-ID: <A57FE0F9-47B9-4036-8F1C-0D30FB545980@isilon.com> In-Reply-To: <F2459D9D-4102-4D1D-BDCB-4F5AA8DE336D@isilon.com> References: <F2459D9D-4102-4D1D-BDCB-4F5AA8DE336D@isilon.com>
[-- Attachment #1 --]
The patch was slightly truncated, I'm guessing because it was > 50K.
Attached is a slightly trimmed down patch.
-Tim
[-- Attachment #2 --]
diff --git a/src/sys/compat/linux/linux_stats.c b/src/sys/compat/linux/linux_stats.c
index 374ce39..905db20 100644
--- a/src/sys/compat/linux/linux_stats.c
+++ b/src/sys/compat/linux/linux_stats.c
@@ -129,7 +129,7 @@ translate_path_major_minor(struct thread *td, char *path, struct stat *buf)
fd = td->td_retval[0];
td->td_retval[0] = temp;
translate_fd_major_minor(td, fd, buf);
- fdclose(fdp, fdp->fd_ofiles[fd], fd, td);
+ fdclose(fdp, ftable_get(fdp, fd), fd, td);
}
static int
diff --git a/src/sys/compat/svr4/svr4_filio.c b/src/sys/compat/svr4/svr4_filio.c
index 701bf15..82364ca 100644
--- a/src/sys/compat/svr4/svr4_filio.c
+++ b/src/sys/compat/svr4/svr4_filio.c
@@ -212,13 +212,13 @@ svr4_fil_ioctl(fp, td, retval, fd, cmd, data)
switch (cmd) {
case SVR4_FIOCLEX:
FILEDESC_XLOCK(fdp);
- fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+ ftable_set_cloexec(fdp, fd, 1);
FILEDESC_XUNLOCK(fdp);
return 0;
case SVR4_FIONCLEX:
FILEDESC_XLOCK(fdp);
- fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+ ftable_set_cloexec(fdp, fd, 0);
FILEDESC_XUNLOCK(fdp);
return 0;
diff --git a/src/sys/fs/fdescfs/fdesc_vfsops.c b/src/sys/fs/fdescfs/fdesc_vfsops.c
index 16fa4cf..fb2e45e 100644
--- a/src/sys/fs/fdescfs/fdesc_vfsops.c
+++ b/src/sys/fs/fdescfs/fdesc_vfsops.c
@@ -203,7 +203,7 @@ fdesc_statfs(mp, sbp, td)
last = min(fdp->fd_nfiles, lim);
freefd = 0;
for (i = fdp->fd_freefile; i < last; i++)
- if (fdp->fd_ofiles[i] == NULL)
+ if (ftable_get(fdp, i) == NULL)
freefd++;
/*
diff --git a/src/sys/fs/fdescfs/fdesc_vnops.c b/src/sys/fs/fdescfs/fdesc_vnops.c
index f39c3a7..0ea6607 100644
--- a/src/sys/fs/fdescfs/fdesc_vnops.c
+++ b/src/sys/fs/fdescfs/fdesc_vnops.c
@@ -581,7 +581,7 @@ fdesc_readdir(ap)
dp->d_type = DT_DIR;
break;
default:
- if (fdp->fd_ofiles[fcnt] == NULL) {
+ if (ftable_get(fdp, fcnt) == NULL) {
FILEDESC_SUNLOCK(fdp);
goto done;
}
diff --git a/src/sys/fs/nfsserver/nfs_nfsdport.c b/src/sys/fs/nfsserver/nfs_nfsdport.c
index 232e465..94fd81c 100644
--- a/src/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/src/sys/fs/nfsserver/nfs_nfsdport.c
@@ -3103,7 +3103,7 @@ fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
fdp = p->td_proc->p_fd;
if (fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ (fp = ftable_get(fdp, fd)) == NULL)
return (EBADF);
*fpp = fp;
return (0);
diff --git a/src/sys/kern/kern_descrip.c b/src/sys/kern/kern_descrip.c
index 6ce0356..1a34987 100644
--- a/src/sys/kern/kern_descrip.c
+++ b/src/sys/kern/kern_descrip.c
@@ -112,9 +112,8 @@ enum dup_type { DUP_VARIABLE, DUP_FIXED };
static int do_dup(struct thread *td, enum dup_type type, int old, int new,
register_t *retval);
-static int fd_first_free(struct filedesc *, int, int);
+static int fd_first_free(struct filedesc *, int);
static int fd_last_used(struct filedesc *, int, int);
-static void fdgrowtable(struct filedesc *, int);
static int fdrop_locked(struct file *fp, struct thread *td);
static void fdunused(struct filedesc *fdp, int fd);
static void fdused(struct filedesc *fdp, int fd);
@@ -134,10 +133,406 @@ static void fdused(struct filedesc *fdp, int fd);
#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
-/*
- * Storage required per open file descriptor.
+#define IDB_BLOCK_SIZE PAGE_SIZE
+#define IDB_ENT_SIZE sizeof(uintptr_t)
+#define IDB_ENTS_PER_BLOCK (IDB_BLOCK_SIZE/IDB_ENT_SIZE)
+
+/* May be a perf impact on 32-bit kernels. */
+CTASSERT(NDSLOTSIZE == IDB_ENT_SIZE);
+
+/**
+ * Return the index into the indirect table given an entry.
+ */
+static inline int
+idb_block_index(int ent)
+{
+
+ return (ent / IDB_ENTS_PER_BLOCK);
+}
+
+/**
+ * Return offset into an indirect block given an entry.
+ */
+static inline int
+idb_block_off(int ent)
+{
+
+ return (ent % IDB_ENTS_PER_BLOCK);
+}
+
+/**
+ * Return 1 if the indirect block table is flat, else 0.
+ */
+static inline int
+idb_is_flat(struct idb_table *idb)
+{
+
+ return (idb->idb_nents <= IDB_ENTS_PER_BLOCK);
+}
+
+/**
+ * Return a pointer to the block. If the block is sparse or ent is outside
+ * the current size of the table, return NULL.
+ */
+static inline void *
+idb_block(struct idb_table *idb, int ent)
+{
+
+ return (ent >= idb->idb_nents ? NULL :
+ idb->idb_tbl.indirect[idb_block_index(ent)]);
+}
+
+/**
+ * Initialize a new indirect table. The caller is responsible for allocating
+ * the idb struct, and must provide an initial non-null flat table.
+ *
+ * @param idb Indirect table to initialize.
+ * @param idb_flat Initial non-null table.
+ * @param idb_nents Number of entries in the initial flat table.
+ */
+static void
+idb_init(struct idb_table *idb, void *idb_flat, int idb_nents)
+{
+
+ KASSERT(idb != NULL, ("idb table must be allocated by caller"));
+ KASSERT(idb_flat != NULL,
+ ("idb flat table must be allocated by caller"));
+
+ idb->idb_tbl.flat = idb_flat;
+ idb->idb_nents = idb_nents;
+ idb->idb_orig_nents = idb_nents;
+}
+
+/**
+ * Free all blocks associated with the indirect table.
+ */
+static void
+idb_free(struct idb_table *idb)
+{
+ int indx;
+ void *block;
+
+ if (idb_is_flat(idb)) {
+ if (idb->idb_nents > idb->idb_orig_nents)
+ free(idb->idb_tbl.flat, M_FILEDESC);
+ return;
+ }
+
+ /* Free indirect leaves. */
+ for (indx = idb_block_index(0);
+ indx < idb_block_index(idb->idb_nents);
+ indx++) {
+ block = idb->idb_tbl.indirect[indx];
+ if (block != NULL)
+ free(block, M_FILEDESC);
+ }
+
+ /* Free indirect root. */
+ free(idb->idb_tbl.indirect, M_FILEDESC);
+}
+
+/**
+ * Return a pointer into the table/block given an index.
+ */
+static void *
+idb_get_entry(struct idb_table *idb, int ent)
+{
+ void *block;
+
+	if (ent >= idb->idb_nents)
+ return (NULL);
+
+ if (idb_is_flat(idb))
+ return (((caddr_t)idb->idb_tbl.flat) + (ent * IDB_ENT_SIZE));
+
+ /* Indirect block. Return NULL for sparse blocks. */
+ block = idb_block(idb, ent);
+ if (block == NULL)
+ return (NULL);
+
+ return (((caddr_t)block) + (idb_block_off(ent) * IDB_ENT_SIZE));
+}
+
+/**
+ * If the current table size doesn't accommodate the new number of entries,
+ * grow it to fit new_nents. Mult is a multiplying factor that scales the
+ * check of the table's entry count against new_nents, which allows this
+ * routine to grow either the flat table or the indirect table. The current
+ * number of entries in the table must be a multiple of mult.
+ *
+ * @param idb Table to grow.
+ * @param new_nents Number of entries to grow the table to.
+ * @param mult Multiplier for new_nents.
+ * @param sx Exclusive lock that may be dropped/reacquired.
*/
-#define OFILESIZE (sizeof(struct file *) + sizeof(char))
+static void
+idb_grow_table(struct idb_table *idb, int new_nents, int mult, struct sx *sx)
+{
+ int old_nents;
+ void *ntable;
+
+ KASSERT(idb->idb_nents % mult == 0,
+ ("%d is not a multiple of %d", idb->idb_nents, mult));
+
+ old_nents = idb->idb_nents / mult;
+
+ /* Do nothing if the table is already big enough. */
+	if (old_nents >= new_nents)
+ return;
+
+ sx_xunlock(sx);
+ ntable = malloc(new_nents * IDB_ENT_SIZE, M_FILEDESC,
+ M_ZERO | M_WAITOK);
+ sx_xlock(sx);
+
+ /* Done if table grew when the lock was dropped. */
+ if (idb->idb_nents / mult > new_nents) {
+ free(ntable, M_FILEDESC);
+ return;
+ }
+
+ /* Copy the data to the new table and fix up the pointers. */
+ bcopy(idb->idb_tbl.flat, ntable, old_nents * IDB_ENT_SIZE);
+ if (idb->idb_nents > idb->idb_orig_nents)
+ free(idb->idb_tbl.flat, M_FILEDESC);
+ idb->idb_tbl.flat = ntable;
+ idb->idb_nents = new_nents * mult;
+}
+
+/**
+ * Transition a flat table to an indirect block table.
+ *
+ * @param idb Table to transition.
+ * @param sx Exclusive lock that may be dropped/reacquired.
+ */
+static void
+idb_transition_to_indirect(struct idb_table *idb, struct sx *sx)
+{
+ void **ntable = NULL;
+
+ KASSERT(idb->idb_nents >= IDB_ENTS_PER_BLOCK,
+ ("Insufficient size for indirect transition: %d", idb->idb_nents));
+
+ /* Done if the table has already transitioned. */
+ if (idb->idb_nents > IDB_ENTS_PER_BLOCK) {
+ return;
+ }
+
+ sx_xunlock(sx);
+ ntable = malloc(IDB_BLOCK_SIZE, M_FILEDESC,
+ M_ZERO | M_WAITOK);
+ sx_xlock(sx);
+
+ /* Done if indirect transition done when the lock was dropped. */
+ if (idb->idb_nents > IDB_ENTS_PER_BLOCK) {
+ free(ntable, M_FILEDESC);
+ return;
+ }
+
+ /* Make indirect transition. */
+ ntable[0] = idb->idb_tbl.flat;
+ idb->idb_tbl.indirect = ntable;
+ idb->idb_nents = IDB_ENTS_PER_BLOCK * IDB_ENTS_PER_BLOCK;
+}
+
+/**
+ * Allocates an indirect block in the table if one doesn't already exist for
+ * new_ent.
+ *
+ * @param idb Table to ensure new_ent has an indirect block in.
+ * @param new_ent New entry index to create indirect block for.
+ * @param sx Exclusive lock that may be dropped/reacquired.
+ */
+static void
+idb_ensure_indirect_block(struct idb_table *idb, int new_ent, struct sx *sx)
+{
+ void *nblock = NULL;
+
+ KASSERT(new_ent < idb->idb_nents,
+ ("Table too small (%d) for indirect block at index %d",
+ idb->idb_nents, new_ent));
+
+ /* Done if the block is already allocated. */
+ if (idb_block(idb, new_ent) != NULL)
+ return;
+
+ sx_xunlock(sx);
+ nblock = malloc(IDB_BLOCK_SIZE, M_FILEDESC, M_ZERO | M_WAITOK);
+ sx_xlock(sx);
+
+ /* Done if block was allocated when the lock was dropped. */
+ if (idb_block(idb, new_ent) != NULL) {
+ free(nblock, M_FILEDESC);
+ return;
+ }
+
+ idb->idb_tbl.indirect[idb_block_index(new_ent)] = nblock;
+}
+
+/**
+ * idb_ensure_size() guarantees that:
+ * 1. If the table is flat, the table will be made large enough for new_ent,
+ * possibly being transitioned to an indirect table.
+ *
+ * 2. If the table is indirect, the indirect table is large enough to have an
+ * entry to point to the indirect block, and the indirect block itself is
+ * allocated.
+ *
+ * The sx lock will be released if new memory needs to be allocated, but will
+ * be reacquired before returning.
+ *
+ * @param idb Table to ensure new_ent fits in.
+ * @param new_ent New entry index.
+ * @param maxsize Max size of the table so excess memory isn't used.
+ * @param sx Exclusive lock that may be dropped/reacquired.
+ */
+static void
+idb_ensure_size(struct idb_table *idb, int new_ent, int maxsize, struct sx *sx)
+{
+ KASSERT(idb->idb_nents > 0, ("zero-length idb table"));
+ KASSERT(new_ent < maxsize,
+ ("new_ent(%d) >= maxsize(%d)", new_ent, maxsize));
+
+ sx_assert(sx, SX_XLOCKED | SX_NOTRECURSED);
+
+ /* Grow table 2x while it is flat. */
+ if (idb_is_flat(idb) && new_ent < IDB_ENTS_PER_BLOCK) {
+ if (new_ent >= idb->idb_nents) {
+ KASSERT(new_ent > 0, ("Negative new_ent %d", new_ent));
+ /* Round up to power of 2 to appease the allocator. */
+ idb_grow_table(idb, min(min(1 << (fls(new_ent)),
+ IDB_ENTS_PER_BLOCK), maxsize), 1, sx);
+ }
+ return;
+ }
+
+ /* Transition flat table to indirect. */
+ if (idb_is_flat(idb) && new_ent >= IDB_ENTS_PER_BLOCK) {
+ idb_grow_table(idb, IDB_ENTS_PER_BLOCK, 1, sx);
+ idb_transition_to_indirect(idb, sx);
+ }
+
+ /* Grow size of indirect table. */
+ if (new_ent >= idb->idb_nents) {
+ int grow_factor, new_nents;
+ /* Need to grow the indirect table. */
+ for (grow_factor = 2;; grow_factor <<= 1) {
+ if (idb_block_index(idb->idb_nents) * grow_factor >
+ idb_block_index(new_ent))
+ break;
+ }
+ new_nents = min(idb_block_index(idb->idb_nents) * grow_factor,
+ idb_block_index(maxsize));
+ idb_grow_table(idb, new_nents, IDB_ENTS_PER_BLOCK, sx);
+ }
+
+ /* Ensure block is allocated in sparse table. */
+ idb_ensure_indirect_block(idb, new_ent, sx);
+}
+
+/**
+ * Get the file struct for an fd from the ftable.
+ *
+ * @return The file struct for a particular fd, or NULL.
+ */
+struct file *
+ftable_get(struct filedesc *fdp, int fd)
+{
+ struct file **fpp;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ fpp = idb_get_entry(&fdp->fd_files, fd);
+ return (fpp != NULL ? *fpp : NULL);
+}
+
+/**
+ * Set an entry in the table to point to a struct file. ftable_ensure_fd()
+ * must be called first to ensure the underlying data structure can support
+ * this entry.
+ */
+void
+ftable_set(struct filedesc *fdp, int fd, struct file *fp)
+{
+ struct file **fpp;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ fpp = idb_get_entry(&fdp->fd_files, fd);
+ KASSERT(fpp != NULL, ("Trying to set unallocated entry"));
+ *fpp = fp;
+}
+
+/**
+ * Get the close-on-exec state of a file descriptor.
+ *
+ * @return 1 if close-on-exec is set, otherwise 0.
+ */
+int
+ftable_get_cloexec(struct filedesc *fdp, int fd)
+{
+ NDSLOTTYPE *map;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ map = idb_get_entry(&fdp->fd_cloexec, NDSLOT(fd));
+ if (map == NULL)
+ return (0);
+
+ return ((*map & NDBIT(fd)) != 0);
+}
+
+/**
+ * Set the close-on-exec state of a file descriptor.
+ *
+ * @param on 1: close-on-exec state will be turned on.
+ *           0: close-on-exec state will be turned off.
+ */
+void
+ftable_set_cloexec(struct filedesc *fdp, int fd, int on)
+{
+ NDSLOTTYPE *map;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ map = idb_get_entry(&fdp->fd_cloexec, NDSLOT(fd));
+ KASSERT(map != NULL, ("trying to set cloexec on an unallocated file"));
+
+ if (on)
+ *map |= NDBIT(fd);
+ else
+ *map &= ~NDBIT(fd);
+}
+
+/**
+ * If the ftable is already large enough to store the fd, then simply return.
+ * Otherwise, allocate the necessary blocks to accommodate the new fd. This
+ * allows for a sparse table. May malloc new blocks, requiring the fdp lock
+ * to be dropped and reacquired.
+ *
+ * @param nfd File descriptor to possibly grow the table to fit.
+ * @param maxfd Maximum fd so excess memory isn't used.
+ */
+static void
+ftable_ensure_fd(struct filedesc *fdp, int nfd, int maxfd)
+{
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(nfd <= maxfd, ("nfd(%d) > maxfd(%d)", nfd, maxfd));
+
+ idb_ensure_size(&fdp->fd_files, nfd, maxfd + 1, &fdp->fd_sx);
+ idb_ensure_size(&fdp->fd_map, NDSLOT(nfd), NDSLOT(maxfd) + 1,
+ &fdp->fd_sx);
+ idb_ensure_size(&fdp->fd_cloexec, NDSLOT(nfd), NDSLOT(maxfd) + 1,
+ &fdp->fd_sx);
+
+ /*
+	 * fd_map and fd_cloexec grow at the same rate, but fd_files grows
+	 * at a different rate, so advertise the table size as the min.
+ */
+ fdp->fd_nfiles = min(fdp->fd_files.idb_nents,
+ fdp->fd_map.idb_nents * NDENTRIES);
+}
/*
* Basic allocation of descriptors:
@@ -150,8 +545,8 @@ struct filedesc0 {
* <= NDFILE, and are then pointed to by the pointers above.
*/
struct file *fd_dfiles[NDFILE];
- char fd_dfileflags[NDFILE];
NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
+ NDSLOTTYPE fd_dcloexec[NDSLOTS(NDFILE)];
};
/*
@@ -166,14 +561,13 @@ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
/* A mutex to protect the association between a proc and filedesc. */
static struct mtx fdesc_mtx;
-/*
- * Find the first zero bit in the given bitmap, starting at low and not
- * exceeding size - 1.
+/**
+ * Iterate a flat array searching for the first zero bit in the given bitmap,
+ * starting at low and not exceeding size - 1.
*/
static int
-fd_first_free(struct filedesc *fdp, int low, int size)
+fd_first_free_block(NDSLOTTYPE *map, int low, int size)
{
- NDSLOTTYPE *map = fdp->fd_map;
NDSLOTTYPE mask;
int off, maxoff;
@@ -193,14 +587,61 @@ fd_first_free(struct filedesc *fdp, int low, int size)
return (size);
}
+/**
+ * Iterate the indirect block table fd map searching for the first free fd,
+ * starting at low. Return the table's current capacity in fds if none
+ * are free.
+ */
+static int
+fd_first_free(struct filedesc *fdp, int low)
+{
+ struct idb_table *idb = &fdp->fd_map;
+ NDSLOTTYPE *block;
+ int indx;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ /* Flat table. */
+ if (idb_is_flat(idb))
+ return (fd_first_free_block(idb->idb_tbl.flat, low,
+ idb->idb_nents * NDENTRIES));
+
+ /* Loop through the indirect blocks. */
+ for (indx = idb_block_index(NDSLOT(low));
+ indx < idb_block_index(idb->idb_nents);
+ indx++) {
+ int block_low, free_ent;
+
+ block = idb->idb_tbl.indirect[indx];
+ if (block == NULL) {
+ /* Unallocated block, so the first index is fine. */
+ free_ent = indx * IDB_ENTS_PER_BLOCK * NDENTRIES;
+ return (max(free_ent, low));
+ }
+
+ /* Scan block, starting mid-block if necessary. */
+ block_low = (indx == idb_block_index(NDSLOT(low))) ?
+ idb_block_off(NDSLOT(low)) * NDENTRIES : 0;
+ free_ent = fd_first_free_block(block, block_low,
+ IDB_ENTS_PER_BLOCK * NDENTRIES);
+
+ /* If there was a free fd, return it. */
+ if (free_ent < IDB_ENTS_PER_BLOCK * NDENTRIES)
+ return (indx * IDB_ENTS_PER_BLOCK * NDENTRIES +
+ free_ent);
+ }
+
+ /* No free fds found. */
+	return (idb->idb_nents * NDENTRIES);
+}
+
/*
* Find the highest non-zero bit in the given bitmap, starting at low and
* not exceeding size - 1.
*/
static int
-fd_last_used(struct filedesc *fdp, int low, int size)
+fd_last_used_block(NDSLOTTYPE *map, int low, int size)
{
- NDSLOTTYPE *map = fdp->fd_map;
NDSLOTTYPE mask;
int off, minoff;
@@ -220,12 +661,65 @@ fd_last_used(struct filedesc *fdp, int low, int size)
return (low - 1);
}
+/**
+ * Iterate the indirect block table fd map searching for the highest non-zero
+ * bit, starting at low and not exceeding size - 1. Return low - 1 if no fds
+ * >= low are used.
+ */
+static int
+fd_last_used(struct filedesc *fdp, int low, int size)
+{
+ struct idb_table *idb = &fdp->fd_map;
+ NDSLOTTYPE *block;
+ int indx;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ /* Flat table. */
+ if (idb_is_flat(idb))
+ return (fd_last_used_block(idb->idb_tbl.flat, low, size));
+
+ /* Loop through the indirect blocks backwards. */
+ for (indx = idb_block_index(NDSLOT(size));
+ indx >= idb_block_index(NDSLOT(low));
+ indx--) {
+ int block_low, block_high, used_ent;
+
+ block = idb->idb_tbl.indirect[indx];
+ /* If the block is sparse, move onto the next one. */
+ if (block == NULL)
+ continue;
+
+ /* Scan block, starting/ending mid-block if necessary. */
+ block_low = (indx == idb_block_index(NDSLOT(low))) ?
+ idb_block_off(NDSLOT(low)) * NDENTRIES : 0;
+ block_high = (indx == idb_block_index(NDSLOT(size))) ?
+ idb_block_off(NDSLOT(size)) * NDENTRIES :
+		    IDB_ENTS_PER_BLOCK * NDENTRIES;
+ used_ent = fd_last_used_block(block, block_low, block_high);
+
+ /* If there was a used fd, return it. */
+ if (used_ent >= block_low)
+ return (indx * IDB_ENTS_PER_BLOCK * NDENTRIES +
+ used_ent);
+ }
+
+ /* No used fds found. */
+ return (low - 1);
+}
+
static int
fdisused(struct filedesc *fdp, int fd)
{
+ NDSLOTTYPE *map;
+
+ FILEDESC_LOCK_ASSERT(fdp);
KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
- return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
+
+ map = idb_get_entry(&fdp->fd_map, NDSLOT(fd));
+
+ return (map && (*map & NDBIT(fd)) != 0);
}
/*
@@ -234,16 +728,19 @@ fdisused(struct filedesc *fdp, int fd)
static void
fdused(struct filedesc *fdp, int fd)
{
+ NDSLOTTYPE *map;
FILEDESC_XLOCK_ASSERT(fdp);
- KASSERT(!fdisused(fdp, fd),
- ("fd already used"));
+ KASSERT(!fdisused(fdp, fd), ("fd already used"));
- fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
+ map = idb_get_entry(&fdp->fd_map, NDSLOT(fd));
+ KASSERT(map != NULL, ("Map block is NULL"));
+
+ *map |= NDBIT(fd);
if (fd > fdp->fd_lastfile)
fdp->fd_lastfile = fd;
if (fd == fdp->fd_freefile)
- fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
+ fdp->fd_freefile = fd_first_free(fdp, fd);
}
/*
@@ -253,13 +750,19 @@ static void
fdunused(struct filedesc *fdp, int fd)
{
+ NDSLOTTYPE *map;
+
FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(fdisused(fdp, fd),
("fd is already unused"));
- KASSERT(fdp->fd_ofiles[fd] == NULL,
+ KASSERT(ftable_get(fdp, fd) == NULL,
("fd is still in use"));
- fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
+ map = idb_get_entry(&fdp->fd_map, NDSLOT(fd));
+ KASSERT(map != NULL, ("Map block is NULL"));
+
+ *map &= ~NDBIT(fd);
+
if (fd < fdp->fd_freefile)
fdp->fd_freefile = fd;
if (fd == fdp->fd_lastfile)
@@ -410,7 +913,7 @@ fdtofp(int fd, struct filedesc *fdp)
FILEDESC_LOCK_ASSERT(fdp);
if ((unsigned)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ (fp = ftable_get(fdp, fd)) == NULL)
return (NULL);
return (fp);
}
@@ -422,7 +925,6 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
struct flock *flp;
struct file *fp;
struct proc *p;
- char *pop;
struct vnode *vp;
u_int newmin;
int error, flg, tmp;
@@ -467,8 +969,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
error = EBADF;
break;
}
- pop = &fdp->fd_ofileflags[fd];
- td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+ td->td_retval[0] = ftable_get_cloexec(fdp, fd) ?
+			    FD_CLOEXEC : 0;
FILEDESC_SUNLOCK(fdp);
break;
@@ -479,9 +981,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
error = EBADF;
break;
}
- pop = &fdp->fd_ofileflags[fd];
- *pop = (*pop &~ UF_EXCLOSE) |
- (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
+ ftable_set_cloexec(fdp, fd, arg & FD_CLOEXEC);
FILEDESC_XUNLOCK(fdp);
break;
@@ -651,7 +1151,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
/* Check for race with close */
FILEDESC_SLOCK(fdp);
if ((unsigned) fd >= fdp->fd_nfiles ||
- fp != fdp->fd_ofiles[fd]) {
+ fp != ftable_get(fdp, fd)) {
FILEDESC_SUNLOCK(fdp);
flp->l_whence = SEEK_SET;
flp->l_start = 0;
@@ -750,7 +1250,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
return (EMFILE);
FILEDESC_XLOCK(fdp);
- if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+ if (old >= fdp->fd_nfiles || ftable_get(fdp, old) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
@@ -759,7 +1259,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
FILEDESC_XUNLOCK(fdp);
return (0);
}
- fp = fdp->fd_ofiles[old];
+ fp = ftable_get(fdp, old);
fhold(fp);
/*
@@ -770,9 +1270,8 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
* out for a race.
*/
if (type == DUP_FIXED) {
- if (new >= fdp->fd_nfiles)
- fdgrowtable(fdp, new + 1);
- if (fdp->fd_ofiles[new] == NULL)
+ ftable_ensure_fd(fdp, new, maxfd);
+ if (ftable_get(fdp, new) == NULL)
fdused(fdp, new);
} else {
if ((error = fdalloc(td, new, &new)) != 0) {
@@ -787,9 +1286,9 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
* bad file descriptor. Userland should do its own locking to
* avoid this case.
*/
- if (fdp->fd_ofiles[old] != fp) {
+ if (ftable_get(fdp, old) != fp) {
/* we've allocated a descriptor which we won't use */
- if (fdp->fd_ofiles[new] == NULL)
+ if (ftable_get(fdp, new) == NULL)
fdunused(fdp, new);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
@@ -805,7 +1304,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
*
* XXX this duplicates parts of close().
*/
- delfp = fdp->fd_ofiles[new];
+ delfp = ftable_get(fdp, new);
holdleaders = 0;
if (delfp != NULL) {
if (td->td_proc->p_fdtol != NULL) {
@@ -821,8 +1320,8 @@ do_dup(struct thread *td, enum dup_type type, int old, int new,
/*
* Duplicate the source descriptor
*/
- fdp->fd_ofiles[new] = fp;
- fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
+ ftable_set(fdp, new, fp);
+ ftable_set_cloexec(fdp, new, 0);
if (new > fdp->fd_lastfile)
fdp->fd_lastfile = new;
*retval = new;
@@ -1111,12 +1610,12 @@ kern_close(td, fd)
FILEDESC_XLOCK(fdp);
if ((unsigned)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL) {
+ (fp = ftable_get(fdp, fd)) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
- fdp->fd_ofiles[fd] = NULL;
- fdp->fd_ofileflags[fd] = 0;
+ ftable_set(fdp, fd, NULL);
+ ftable_set_cloexec(fdp, fd, 0);
fdunused(fdp, fd);
if (td->td_proc->p_fdtol != NULL) {
/*
@@ -1178,7 +1677,7 @@ closefrom(struct thread *td, struct closefrom_args *uap)
uap->lowfd = 0;
FILEDESC_SLOCK(fdp);
for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
- if (fdp->fd_ofiles[fd] != NULL) {
+ if (ftable_get(fdp, fd) != NULL) {
FILEDESC_SUNLOCK(fdp);
(void)kern_close(td, fd);
FILEDESC_SLOCK(fdp);
@@ -1806,70 +2305,6 @@ out:
}
/*
- * Grow the file table to accomodate (at least) nfd descriptors. This may
- * block and drop the filedesc lock, but it will reacquire it before
- * returning.
- */
-static void
-fdgrowtable(struct filedesc *fdp, int nfd)
-{
- struct file **ntable;
- char *nfileflags;
- int nnfiles, onfiles;
- NDSLOTTYPE *nmap;
-
- FILEDESC_XLOCK_ASSERT(fdp);
-
- KASSERT(fdp->fd_nfiles > 0,
- ("zero-length file table"));
-
- /* compute the size of the new table */
- onfiles = fdp->fd_nfiles;
- nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
- if (nnfiles <= onfiles)
- /* the table is already large enough */
- return;
-
- /* allocate a new table and (if required) new bitmaps */
- FILEDESC_XUNLOCK(fdp);
- MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
- M_FILEDESC, M_ZERO | M_WAITOK);
- nfileflags = (char *)&ntable[nnfiles];
- if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
- MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE,
- M_FILEDESC, M_ZERO | M_WAITOK);
- else
- nmap = NULL;
- FILEDESC_XLOCK(fdp);
-
- /*
- * We now have new tables ready to go. Since we dropped the
- * filedesc lock to call malloc(), watch out for a race.
- */
- onfiles = fdp->fd_nfiles;
- if (onfiles >= nnfiles) {
- /* we lost the race, but that's OK */
- free(ntable, M_FILEDESC);
- if (nmap != NULL)
- free(nmap, M_FILEDESC);
- return;
- }
- bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
- bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
- if (onfiles > NDFILE)
- free(fdp->fd_ofiles, M_FILEDESC);
- fdp->fd_ofiles = ntable;
- fdp->fd_ofileflags = nfileflags;
- if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
- bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
- if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
- free(fdp->fd_map, M_FILEDESC);
- fdp->fd_map = nmap;
- }
- fdp->fd_nfiles = nnfiles;
-}
-
-/*
* Allocate a file descriptor for the process.
*/
int
@@ -1891,16 +2326,18 @@ fdalloc(struct thread *td, int minfd, int *result)
/*
* Search the bitmap for a free descriptor. If none is found, try
* to grow the file table. Keep at it until we either get a file
- * descriptor or run into process or system limits; fdgrowtable()
+ * descriptor or run into process or system limits; ftable_ensure_fd()
* may drop the filedesc lock, so we're in a race.
*/
for (;;) {
- fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+ fd = fd_first_free(fdp, minfd);
if (fd >= maxfd)
return (EMFILE);
- if (fd < fdp->fd_nfiles)
+ /* Grow if necessary. */
+ ftable_ensure_fd(fdp, fd, maxfd);
+ /* Required check since ftable_ensure_fd() can drop xlock. */
+ if (ftable_get(fdp, fd) == NULL)
break;
- fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
}
/*
@@ -1909,9 +2346,9 @@ fdalloc(struct thread *td, int minfd, int *result)
*/
KASSERT(!fdisused(fdp, fd),
("fd_first_free() returned non-free descriptor"));
- KASSERT(fdp->fd_ofiles[fd] == NULL,
+ KASSERT(ftable_get(fdp, fd) == NULL,
("free descriptor isn't"));
- fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
+ ftable_set_cloexec(fdp, fd, 0); /* XXX needed? */
fdused(fdp, fd);
*result = fd;
return (0);
@@ -1926,7 +2363,7 @@ fdavail(struct thread *td, int n)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = td->td_proc->p_fd;
- struct file **fpp;
+ struct file *fp;
int i, lim, last;
FILEDESC_LOCK_ASSERT(fdp);
@@ -1937,9 +2374,10 @@ fdavail(struct thread *td, int n)
if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
return (1);
last = min(fdp->fd_nfiles, lim);
- fpp = &fdp->fd_ofiles[fdp->fd_freefile];
- for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
- if (*fpp == NULL && --n <= 0)
+ fp = ftable_get(fdp, fdp->fd_freefile);
+ for (i = last - fdp->fd_freefile; --i >= 0;
+ fp = ftable_get(fdp, last - i)) {
+ if (fp == NULL && --n <= 0)
return (1);
}
return (0);
@@ -2017,7 +2455,7 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)
ifs_init_lockdata(fp);
FILEDESC_XLOCK(p->p_fd);
- if ((fq = p->p_fd->fd_ofiles[0])) {
+ if ((fq = ftable_get(p->p_fd, 0))) {
LIST_INSERT_AFTER(fq, fp, f_list);
} else {
LIST_INSERT_HEAD(&filehead, fp, f_list);
@@ -2030,7 +2468,7 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd)
fdrop(fp, td);
return (error);
}
- p->p_fd->fd_ofiles[i] = fp;
+ ftable_set(p->p_fd, i, fp);
FILEDESC_XUNLOCK(p->p_fd);
if (resultfp)
*resultfp = fp;
@@ -2068,10 +2506,13 @@ fdinit(struct filedesc *fdp)
newfdp->fd_fd.fd_refcnt = 1;
newfdp->fd_fd.fd_holdcnt = 1;
newfdp->fd_fd.fd_cmask = CMASK;
- newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
- newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
newfdp->fd_fd.fd_nfiles = NDFILE;
- newfdp->fd_fd.fd_map = newfdp->fd_dmap;
+
+ idb_init(&newfdp->fd_fd.fd_files, &newfdp->fd_dfiles, NDFILE);
+ idb_init(&newfdp->fd_fd.fd_map, &newfdp->fd_dmap, NDSLOTS(NDFILE));
+ idb_init(&newfdp->fd_fd.fd_cloexec, &newfdp->fd_dcloexec,
+ NDSLOTS(NDFILE));
+
newfdp->fd_fd.fd_lastfile = -1;
return (&newfdp->fd_fd);
}
@@ -2144,6 +2585,7 @@ struct filedesc *
fdcopy(struct filedesc *fdp)
{
struct filedesc *newfdp;
+ struct file *fp;
int i;
/* Certain daemons might not have file descriptors. */
@@ -2152,23 +2594,23 @@ fdcopy(struct filedesc *fdp)
newfdp = fdinit(fdp);
FILEDESC_SLOCK(fdp);
- while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
- FILEDESC_SUNLOCK(fdp);
- FILEDESC_XLOCK(newfdp);
- fdgrowtable(newfdp, fdp->fd_lastfile + 1);
- FILEDESC_XUNLOCK(newfdp);
- FILEDESC_SLOCK(fdp);
- }
/* copy everything except kqueue descriptors */
newfdp->fd_freefile = -1;
for (i = 0; i <= fdp->fd_lastfile; ++i) {
- if (fdisused(fdp, i) &&
- fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE &&
- fdp->fd_ofiles[i]->f_ops != &badfileops) {
- newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
- newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
- fhold(newfdp->fd_ofiles[i]);
+ if (fdisused(fdp, i) && (fp = ftable_get(fdp, i)) &&
+ fp->f_type != DTYPE_KQUEUE && fp->f_ops != &badfileops) {
+ int cloexec = ftable_get_cloexec(fdp, i);
+ int maxfd = fdp->fd_lastfile;
+
+ FILEDESC_SUNLOCK(fdp);
+ FILEDESC_XLOCK(newfdp);
+ ftable_ensure_fd(newfdp, i, maxfd);
+ ftable_set(newfdp, i, fp);
+ ftable_set_cloexec(newfdp, i, cloexec);
newfdp->fd_lastfile = i;
+ FILEDESC_XUNLOCK(newfdp);
+ FILEDESC_SLOCK(fdp);
+ fhold(fp);
} else {
if (newfdp->fd_freefile == -1)
newfdp->fd_freefile = i;
@@ -2177,7 +2619,7 @@ fdcopy(struct filedesc *fdp)
FILEDESC_SUNLOCK(fdp);
FILEDESC_XLOCK(newfdp);
for (i = 0; i <= newfdp->fd_lastfile; ++i)
- if (newfdp->fd_ofiles[i] != NULL)
+ if (ftable_get(newfdp, i) != NULL)
fdused(newfdp, i);
FILEDESC_XUNLOCK(newfdp);
FILEDESC_SLOCK(fdp);
@@ -2195,7 +2637,6 @@ void
fdfree(struct thread *td)
{
struct filedesc *fdp;
- struct file **fpp;
int i, locked;
struct filedesc_to_leader *fdtol;
struct file *fp;
@@ -2216,13 +2657,10 @@ fdfree(struct thread *td)
fdtol->fdl_refcount));
if (fdtol->fdl_refcount == 1 &&
(td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
- for (i = 0, fpp = fdp->fd_ofiles;
- i <= fdp->fd_lastfile;
- i++, fpp++) {
- if (*fpp == NULL ||
- (*fpp)->f_type != DTYPE_VNODE)
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = ftable_get(fdp, i);
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
continue;
- fp = *fpp;
fhold(fp);
FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
@@ -2240,7 +2678,6 @@ fdfree(struct thread *td)
VFS_UNLOCK_GIANT(locked);
FILEDESC_XLOCK(fdp);
fdrop(fp, td);
- fpp = fdp->fd_ofiles + i;
}
}
retry:
@@ -2281,31 +2718,29 @@ fdfree(struct thread *td)
}
FILEDESC_XLOCK(fdp);
i = --fdp->fd_refcnt;
- FILEDESC_XUNLOCK(fdp);
- if (i > 0)
+ if (i > 0) {
+ FILEDESC_XUNLOCK(fdp);
return;
+ }
- fpp = fdp->fd_ofiles;
- for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
- if (*fpp) {
- FILEDESC_XLOCK(fdp);
- fp = *fpp;
- *fpp = NULL;
+ for (i = fdp->fd_lastfile; i >= 0 ; i--) {
+ fp = ftable_get(fdp, i);
+ if (fp) {
+ ftable_set(fdp, i, NULL);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
+ FILEDESC_XLOCK(fdp);
}
}
- FILEDESC_XLOCK(fdp);
/* XXX This should happen earlier. */
mtx_lock(&fdesc_mtx);
td->td_proc->p_fd = NULL;
mtx_unlock(&fdesc_mtx);
- if (fdp->fd_nfiles > NDFILE)
- FREE(fdp->fd_ofiles, M_FILEDESC);
- if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
- FREE(fdp->fd_map, M_FILEDESC);
+ idb_free(&fdp->fd_files);
+ idb_free(&fdp->fd_map);
+ idb_free(&fdp->fd_cloexec);
fdp->fd_nfiles = 0;
@@ -2377,19 +2812,20 @@ setugidsafety(struct thread *td)
*/
FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
+ struct file *fp;
+
if (i > 2)
break;
- if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
- struct file *fp;
+ fp = ftable_get(fdp, i);
+ if (fp && is_unsafe(fp)) {
knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
*/
- fp = fdp->fd_ofiles[i];
- fdp->fd_ofiles[i] = NULL;
- fdp->fd_ofileflags[i] = 0;
+ ftable_set(fdp, i, NULL);
+ ftable_set_cloexec(fdp, i, 0);
fdunused(fdp, i);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
@@ -2411,8 +2847,8 @@ fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
{
FILEDESC_XLOCK(fdp);
- if (fdp->fd_ofiles[idx] == fp) {
- fdp->fd_ofiles[idx] = NULL;
+ if (ftable_get(fdp, idx) == fp) {
+ ftable_set(fdp, idx, NULL);
fdunused(fdp, idx);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
@@ -2441,19 +2877,20 @@ fdcloseexec(struct thread *td)
* may block and rip them out from under us.
*/
for (i = 0; i <= fdp->fd_lastfile; i++) {
- if (fdp->fd_ofiles[i] != NULL &&
- (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
- (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
- struct file *fp;
+ struct file *fp;
+
+ fp = ftable_get(fdp, i);
+ if (fp != NULL &&
+ (fp->f_type == DTYPE_MQUEUE ||
+ ftable_get_cloexec(fdp, i))) {
knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
*/
- fp = fdp->fd_ofiles[i];
- fdp->fd_ofiles[i] = NULL;
- fdp->fd_ofileflags[i] = 0;
+ ftable_set(fdp, i, NULL);
+ ftable_set_cloexec(fdp, i, 0);
fdunused(fdp, i);
if (fp->f_type == DTYPE_MQUEUE)
mq_fdclose(td, i, fp);
@@ -2486,7 +2923,7 @@ fdcheckstd(struct thread *td)
devnull = -1;
error = 0;
for (i = 0; i < 3; i++) {
- if (fdp->fd_ofiles[i] != NULL)
+ if (ftable_get(fdp, i) != NULL)
continue;
if (devnull < 0) {
save = td->td_retval[0];
@@ -2904,7 +3341,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
*/
FILEDESC_XLOCK(fdp);
if (dfd < 0 || dfd >= fdp->fd_nfiles ||
- (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+ (wfp = ftable_get(fdp, dfd)) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
@@ -2931,9 +3368,9 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
FILEDESC_XUNLOCK(fdp);
return (EACCES);
}
- fp = fdp->fd_ofiles[indx];
- fdp->fd_ofiles[indx] = wfp;
- fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
+ fp = ftable_get(fdp, indx);
+ ftable_set(fdp, indx, wfp);
+ ftable_set_cloexec(fdp, indx, ftable_get_cloexec(fdp, dfd));
if (fp == NULL)
fdused(fdp, indx);
fhold_locked(wfp);
@@ -2951,11 +3388,11 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode,
/*
* Steal away the file pointer from dfd and stuff it into indx.
*/
- fp = fdp->fd_ofiles[indx];
- fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
- fdp->fd_ofiles[dfd] = NULL;
- fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
- fdp->fd_ofileflags[dfd] = 0;
+ fp = ftable_get(fdp, indx);
+ ftable_set(fdp, indx, ftable_get(fdp, dfd));
+ ftable_set(fdp, dfd, NULL);
+ ftable_set_cloexec(fdp, indx, ftable_get_cloexec(fdp, dfd));
+ ftable_set_cloexec(fdp, dfd, 0);
fdunused(fdp, dfd);
if (fp == NULL)
fdused(fdp, indx);
@@ -3103,7 +3540,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)
continue;
FILEDESC_SLOCK(fdp);
for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
- if ((fp = fdp->fd_ofiles[n]) == NULL)
+ if ((fp = ftable_get(fdp, n)) == NULL)
continue;
xf.xf_fd = n;
xf.xf_file = fp;
@@ -3215,7 +3652,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
fdp, req);
for (i = 0; i < fdp->fd_nfiles; i++) {
- if ((fp = fdp->fd_ofiles[i]) == NULL)
+ if ((fp = ftable_get(fdp, i)) == NULL)
continue;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
@@ -3450,7 +3887,7 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
export_vnode_for_sysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
fdp, req);
for (i = 0; i < fdp->fd_nfiles; i++) {
- if ((fp = fdp->fd_ofiles[i]) == NULL)
+ if ((fp = ftable_get(fdp, i)) == NULL)
continue;
bzero(kif, sizeof(*kif));
FILE_LOCK(fp);
@@ -3669,7 +4106,7 @@ file_to_first_proc(struct file *fp)
if (fdp == NULL)
continue;
for (n = 0; n < fdp->fd_nfiles; n++) {
- if (fp == fdp->fd_ofiles[n])
+ if (fp == ftable_get(fdp, n))
return (p);
}
}
diff --git a/src/sys/kern/kern_lsof.c b/src/sys/kern/kern_lsof.c
index 04e5dd7..70a8286 100644
--- a/src/sys/kern/kern_lsof.c
+++ b/src/sys/kern/kern_lsof.c
@@ -260,7 +260,7 @@ lsof(struct thread *td, struct lsof_args *uap)
/* Ordinary descriptors for files, pipes, sockets: */
} else if (msg.fd < fdp->fd_nfiles) {
- fp = fdp->fd_ofiles[msg.fd];
+ fp = ftable_get(fdp, msg.fd);
if (fp) {
switch (fp->f_type) {
case DTYPE_VNODE:
diff --git a/src/sys/kern/sys_generic.c b/src/sys/kern/sys_generic.c
index 1a9b061..8aba73d 100644
--- a/src/sys/kern/sys_generic.c
+++ b/src/sys/kern/sys_generic.c
@@ -596,12 +596,12 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
switch (com) {
case FIONCLEX:
FILEDESC_XLOCK(fdp);
- fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+ ftable_set_cloexec(fdp, fd, 0);
FILEDESC_XUNLOCK(fdp);
goto out;
case FIOCLEX:
FILEDESC_XLOCK(fdp);
- fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+ ftable_set_cloexec(fdp, fd, 1);
FILEDESC_XUNLOCK(fdp);
goto out;
case FIONBIO:
@@ -1043,7 +1043,7 @@ pollscan(td, fds, nfd)
} else if (fds->fd < 0) {
fds->revents = 0;
} else {
- fp = fdp->fd_ofiles[fds->fd];
+ fp = ftable_get(fdp, fds->fd);
if (fp == NULL) {
fds->revents = POLLNVAL;
n++;
diff --git a/src/sys/kern/uipc_mqueue.c b/src/sys/kern/uipc_mqueue.c
index fb2ef6a..59c339a 100644
--- a/src/sys/kern/uipc_mqueue.c
+++ b/src/sys/kern/uipc_mqueue.c
@@ -2005,8 +2005,8 @@ kmq_open(struct thread *td, struct kmq_open_args *uap)
FILE_UNLOCK(fp);
FILEDESC_XLOCK(fdp);
- if (fdp->fd_ofiles[fd] == fp)
- fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+ if (ftable_get(fdp, fd) == fp)
+ ftable_set_cloexec(fdp, fd, 1);
FILEDESC_XUNLOCK(fdp);
td->td_retval[0] = fd;
fdrop(fp, td);
diff --git a/src/sys/kern/uipc_sem.c b/src/sys/kern/uipc_sem.c
index d5525a4..b1e6b62 100644
--- a/src/sys/kern/uipc_sem.c
+++ b/src/sys/kern/uipc_sem.c
@@ -488,8 +488,8 @@ ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
fp->f_ops = &ksem_ops;
FILEDESC_XLOCK(fdp);
- if (fdp->fd_ofiles[fd] == fp)
- fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+ if (ftable_get(fdp, fd) == fp)
+ ftable_set_cloexec(fdp, fd, 1);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
diff --git a/src/sys/kern/uipc_usrreq.c b/src/sys/kern/uipc_usrreq.c
index b8255f0..1bfc037 100644
--- a/src/sys/kern/uipc_usrreq.c
+++ b/src/sys/kern/uipc_usrreq.c
@@ -1605,7 +1605,8 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp)
if (fdalloc(td, 0, &f))
panic("unp_externalize fdalloc failed");
fp = *rp++;
- td->td_proc->p_fd->fd_ofiles[f] = fp;
+ ftable_set(td->td_proc->p_fd, f,
+ fp);
FILE_LOCK(fp);
fp->f_msgcount--;
FILE_UNLOCK(fp);
@@ -1735,12 +1736,12 @@ unp_internalize(struct mbuf **controlp, struct thread *td)
for (i = 0; i < oldfds; i++) {
fd = *fdp++;
if ((unsigned)fd >= fdescp->fd_nfiles ||
- fdescp->fd_ofiles[fd] == NULL) {
+ ftable_get(fdescp, fd) == NULL) {
FILEDESC_SUNLOCK(fdescp);
error = EBADF;
goto out;
}
- fp = fdescp->fd_ofiles[fd];
+ fp = ftable_get(fdescp, fd);
if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
FILEDESC_SUNLOCK(fdescp);
error = EOPNOTSUPP;
@@ -1765,7 +1766,7 @@ unp_internalize(struct mbuf **controlp, struct thread *td)
rp = (struct file **)
CMSG_DATA(mtod(*controlp, struct cmsghdr *));
for (i = 0; i < oldfds; i++) {
- fp = fdescp->fd_ofiles[*fdp++];
+ fp = ftable_get(fdescp, *fdp++);
*rp++ = fp;
FILE_LOCK(fp);
fp->f_count++;
diff --git a/src/sys/kern/vfs_syscalls.c b/src/sys/kern/vfs_syscalls.c
index 2f28263..fa0a5e6 100644
--- a/src/sys/kern/vfs_syscalls.c
+++ b/src/sys/kern/vfs_syscalls.c
@@ -4884,7 +4884,7 @@ getvnode(fdp, fd, fpp)
else {
FILEDESC_SLOCK(fdp);
if ((u_int)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ (fp = ftable_get(fdp, fd)) == NULL)
error = EBADF;
else if (fp->f_vnode == NULL) {
fp = NULL;
diff --git a/src/sys/netsmb/smb_dev.c b/src/sys/netsmb/smb_dev.c
index fd0dcbe..a0dd80c 100644
--- a/src/sys/netsmb/smb_dev.c
+++ b/src/sys/netsmb/smb_dev.c
@@ -370,7 +370,7 @@ nsmb_getfp(struct filedesc* fdp, int fd, int flag)
FILEDESC_SLOCK(fdp);
if (((u_int)fd) >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL ||
+ (fp = ftable_get(fdp, fd)) == NULL ||
(fp->f_flag & flag) == 0) {
FILEDESC_SUNLOCK(fdp);
return (NULL);
diff --git a/src/sys/sys/filedesc.h b/src/sys/sys/filedesc.h
index 1831e5c..e9ea56a 100644
--- a/src/sys/sys/filedesc.h
+++ b/src/sys/sys/filedesc.h
@@ -45,16 +45,26 @@
* This structure is used for the management of descriptors. It may be
* shared by multiple processes.
*/
-#define NDSLOTTYPE u_long
+#define NDSLOTTYPE uintptr_t
+
+/* Generic indirect block table */
+struct idb_table {
+ union {
+ void *flat;
+ void **indirect;
+ } idb_tbl;
+ int idb_nents; /* Current max # of entries. */
+ int idb_orig_nents; /* Orig # of entries for the flat table. */
+};
struct filedesc {
- struct file **fd_ofiles; /* file structures for open files */
- char *fd_ofileflags; /* per-process open file flags */
+ struct idb_table fd_files; /* table of open file structs */
+ struct idb_table fd_map; /* bitmap of free fds */
+ struct idb_table fd_cloexec; /* bitmap of fd close exec state */
struct vnode *fd_cdir; /* current directory */
struct vnode *fd_rdir; /* root directory */
struct vnode *fd_jdir; /* jail root directory */
int fd_nfiles; /* number of open files allocated */
- NDSLOTTYPE *fd_map; /* bitmap of free fds */
int fd_lastfile; /* high-water mark of fd_ofiles */
int fd_freefile; /* approx. next free file */
u_short fd_cmask; /* mask for file creation */
@@ -130,12 +140,18 @@ struct filedesc_to_leader *
int getvnode(struct filedesc *fdp, int fd, struct file **fpp);
void mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
void setugidsafety(struct thread *td);
+struct file *ftable_get(struct filedesc *fdp, int fd);
+void ftable_set(struct filedesc *fdp, int fd, struct file *fp);
+int ftable_get_cloexec(struct filedesc *fdp, int fd);
+void ftable_set_cloexec(struct filedesc *fdp, int fd, int on);
+
static __inline struct file *
fget_locked(struct filedesc *fdp, int fd)
{
- return (fd < 0 || fd >= fdp->fd_nfiles ? NULL : fdp->fd_ofiles[fd]);
+ return (fd < 0 || fd >= fdp->fd_nfiles ? NULL :
+ ftable_get(fdp, fd));
}
#endif /* _KERNEL */
[-- Attachment #3 --]
On May 11, 2010, at 10:24 AM, Tim Prouty wrote:
> Hi,
>
> This is my first time sending a patch to the list, so let me know if
> there
> are any conventions I missed.
>
> Attached is a patch that attempts to remove the data structure
> limitations on the number of open file descriptors in the system. The
> patch is against our modified version of FreeBSD 7, so it probably
> won't apply cleanly against upstream, but I wanted to get this out
> there for discussion soon so if there is feedback, we can address it
> and then worry about porting a specific patch for upstream.
>
> We (Isilon) have been running this internally for a few months without
> any issues, although there is at least one known issue that I need to
> resolve, which is mentioned below.
>
> Motivation:
>
> With the increasing amount of memory and processing power in modern
> machines, there are certain userspace processes that are able to
> handle much higher concurrent load than previously possible. A
> specific example is a single-process/multi-threaded SMB stack which
> can handle thousands of connected clients, each with hundreds of
> files open. Once kernel sysctl limits are increased for max files,
> the next limitation is in the actual actual file descriptor data
> structures.
>
> Problem - Data Structure Limits:
>
> The existing per-process data structures for the file descriptor are
> flat tables, which are reallocated each time they need to grow.
> This is inefficient as the amount of data to allocate and copy each
> time increases, but the bigger issue is the potentially limited
> amount of contiguous KVA memory as the table grows very large. Over
> time as the KVA memory becomes fragmented, malloc may be unable to
> provide large enough blocks of contiguous memory.
>
> In the current code the struct proc contains both an array of struct
> file pointers and a bit field indicating which file descriptors are
> in use. The primary issue is how to handle these structures growing
> beyond the kernel page size of 4K.
>
> The array of file pointers will grow much faster than the bit field,
> especially on a 64 bit kernel. The 4K block size will be hit at 512
> files (64 bit kernel) for the file pointer array and 32,768 files
> for the bit field.
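>
> To make the arithmetic concrete, here is a minimal userspace sketch
> (assuming a 4K page and 8-byte pointers; illustration only, not part
> of the patch):
>
>     #include <stdint.h>
>     #include <stdio.h>
>
>     int
>     main(void)
>     {
>             int page = 4096;        /* PAGE_SIZE */
>
>             /* One struct file pointer per fd: 4096 / 8 = 512 fds. */
>             printf("fd array: %zu fds per page\n",
>                 page / sizeof(uintptr_t));
>             /* One bit per fd in the bit field: 4096 * 8 = 32768 fds. */
>             printf("fd bit field: %d fds per page\n", page * 8);
>             return (0);
>     }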
>
> Solution:
>
> File Pointer Array
>
> Focusing first on the file pointer array limitation, an indirect
> block approach is used. An indirect block size of 4K (equal to page
> size) is used, allowing for 512 files per block. To optimize for
> the common case of low/normal fd usage, a flat array is initialized
> to 20 entries and then grows at 2x each time until the block reaches
> its maximum size. Once more than 512 files are opened, the array
> will transition to a single level indirect block table.
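>
> A rough standalone model of the lookup path once the table has gone
> indirect (names loosely follow the idb_* helpers in the patch; the
> 512 comes from PAGE_SIZE / sizeof(uintptr_t) on a 64-bit kernel):
>
>     #include <stddef.h>
>
>     #define ENTS_PER_BLOCK 512      /* PAGE_SIZE / sizeof(uintptr_t) */
>
>     /*
>      * Two-level lookup: root[fd / 512] points at a page-sized leaf
>      * holding 512 entries; a NULL leaf is a sparse (unallocated)
>      * region of the table.
>      */
>     static void *
>     idb_lookup(void **root, int fd)
>     {
>             void **leaf = root[fd / ENTS_PER_BLOCK];
>
>             if (leaf == NULL)
>                     return (NULL);
>             return (leaf[fd % ENTS_PER_BLOCK]);
>     }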
>
> Fd Bitfield:
>
> The fd bit field as it stands can represent 32K files when it grows
> to the page size limit. Using the same indirect system as the file
> pointer array, it is able to grow beyond its existing limits.
>
> Close Exec Field:
>
> One complication of the old file pointer table is that for each file
> pointer there was a 1-byte flags field. The memory was laid out such
> that the file pointers are all in one contiguous array, followed by a
> second array of chars where each char entry is a flags field that
> corresponds to the file pointer at the same index. Interestingly
> there is actually only one flag that is used: UF_EXCLOSE, so it's
> fairly wasteful to have an array of chars. What Linux does, and
> what I have done is to just use a bitfield for all fds that should
> be closed on exec. This could be further optimized by doing some
> pointer trickery to store the close exec bit in the struct file
> pointer rather than keep a separate bitfield.
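>
> The bit manipulation itself is the same NDSLOT()/NDBIT() style already
> used for the fd map; a sketch of the flat case (userspace model, not
> the patch's ftable_set_cloexec()/ftable_get_cloexec() verbatim):
>
>     #include <stdint.h>
>
>     typedef uintptr_t slot_t;               /* NDSLOTTYPE */
>     #define NBITS (sizeof(slot_t) * 8)      /* NDENTRIES */
>
>     /* Turn the close-on-exec bit for fd on or off. */
>     static void
>     set_cloexec(slot_t *map, int fd, int on)
>     {
>             if (on)
>                     map[fd / NBITS] |= (slot_t)1 << (fd % NBITS);
>             else
>                     map[fd / NBITS] &= ~((slot_t)1 << (fd % NBITS));
>     }
>
>     /* Return 1 if the close-on-exec bit for fd is set. */
>     static int
>     get_cloexec(slot_t *map, int fd)
>     {
>             return ((map[fd / NBITS] &
>                 ((slot_t)1 << (fd % NBITS))) != 0);
>     }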
>
> Indirect Block Table:
>
> Since there are three consumers of the indirect block table, I
> generalized it so all of the consumers rely on the same code. This
> could eventually be refactored into a kernel library since it could
> be generally useful in other areas. The table uses a single level
> of indirection, so the base table can still grow beyond the 4K limit. As
> a process uses more fds, the need to continue growing the base table
> should be fairly limited, and a single realloc will significantly
> increase the number of fds the process can allocate.
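>
> The flat-to-indirect transition in particular is cheap since no
> entries move: the full flat block simply becomes leaf 0 of a newly
> allocated root. A simplified model (the patch infers flatness from
> idb_nents rather than storing a flag, and its malloc(M_WAITOK)
> allocations cannot fail):
>
>     #include <stdlib.h>
>
>     #define ENTS_PER_BLOCK 512
>
>     struct idb {
>             int nents;      /* current capacity in entries */
>             int is_flat;    /* explicit here; implicit in the patch */
>             void **tbl;     /* flat entries, or leaf pointers */
>     };
>
>     /* Promote a full flat block to leaf 0 of a page-sized root. */
>     static int
>     idb_go_indirect(struct idb *idb)
>     {
>             void **root;
>
>             root = calloc(ENTS_PER_BLOCK, sizeof(void *));
>             if (root == NULL)
>                     return (-1);
>             root[0] = idb->tbl;     /* old flat table is now leaf 0 */
>             idb->tbl = root;
>             idb->is_flat = 0;
>             idb->nents = ENTS_PER_BLOCK * ENTS_PER_BLOCK;
>             return (0);
>     }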
>
> Accessing the new data structures:
>
> All consumers of the file pointer array and bitfield will now have
> to use accessors rather than using direct access.
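>
> For example, a consumer that used to dereference fdp->fd_ofiles[fd]
> directly now follows this pattern under the filedesc lock (fragment
> only, mirroring the converted call sites in the patch):
>
>     FILEDESC_SLOCK(fdp);
>     if ((unsigned)fd >= fdp->fd_nfiles ||
>         (fp = ftable_get(fdp, fd)) == NULL) {
>             FILEDESC_SUNLOCK(fdp);
>             return (EBADF);
>     }
>     /* ... use fp while the lock is held ... */
>     FILEDESC_SUNLOCK(fdp);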
>
> Known Issues:
>
> The new fdp locking in fdcopy needs to be reworked.
>
>
> Thank you for reviewing!
>
> -Tim
>
> <0001-Increase-scalabilty-of-per-process-file-descriptor-d.patch>
> _______________________________________________
> freebsd-arch@freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-arch
> To unsubscribe, send any mail to "freebsd-arch-
> unsubscribe@freebsd.org"
