Date: Tue, 11 May 2010 16:13:45 -0700
From: Tim Prouty <tim.prouty@isilon.com>
To: freebsd-arch@freebsd.org
Cc: Zachary Loafman <zachary.loafman@isilon.com>, Matthew Fleming <matthew.fleming@isilon.com>
Subject: Re: [PATCH]/[RFC] Increase scalability of per-process file descriptor data structures
Message-ID: <A57FE0F9-47B9-4036-8F1C-0D30FB545980@isilon.com>
In-Reply-To: <F2459D9D-4102-4D1D-BDCB-4F5AA8DE336D@isilon.com>
References: <F2459D9D-4102-4D1D-BDCB-4F5AA8DE336D@isilon.com>
The patch was slightly truncated, I'm guessing because it was > 50K.
Attached is a slightly trimmed-down patch.

-Tim

[Attachment: fd_scalability.patch]

diff --git a/src/sys/compat/linux/linux_stats.c b/src/sys/compat/linux/linux_stats.c
index 374ce39..905db20 100644
--- a/src/sys/compat/linux/linux_stats.c
+++ b/src/sys/compat/linux/linux_stats.c
@@ -129,7 +129,7 @@ translate_path_major_minor(struct thread *td, char *path, struct stat *buf)
 	fd = td->td_retval[0];
 	td->td_retval[0] = temp;
 	translate_fd_major_minor(td, fd, buf);
-	fdclose(fdp, fdp->fd_ofiles[fd], fd, td);
+	fdclose(fdp, ftable_get(fdp, fd), fd, td);
 }
 
 static int
diff --git a/src/sys/compat/svr4/svr4_filio.c b/src/sys/compat/svr4/svr4_filio.c
index 701bf15..82364ca 100644
--- a/src/sys/compat/svr4/svr4_filio.c
+++ b/src/sys/compat/svr4/svr4_filio.c
@@ -212,13 +212,13 @@ svr4_fil_ioctl(fp, td, retval, fd, cmd, data)
 	switch (cmd) {
 	case SVR4_FIOCLEX:
 		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+		ftable_set_cloexec(fdp, fd, 1);
 		FILEDESC_XUNLOCK(fdp);
 		return 0;
 
 	case SVR4_FIONCLEX:
 		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+		ftable_set_cloexec(fdp, fd, 0);
 		FILEDESC_XUNLOCK(fdp);
 		return 0;
 
diff --git a/src/sys/fs/fdescfs/fdesc_vfsops.c b/src/sys/fs/fdescfs/fdesc_vfsops.c
index 16fa4cf..fb2e45e 100644
--- a/src/sys/fs/fdescfs/fdesc_vfsops.c
+++ b/src/sys/fs/fdescfs/fdesc_vfsops.c
@@ -203,7 +203,7 @@ fdesc_statfs(mp, sbp, td)
 	last = min(fdp->fd_nfiles, lim);
 	freefd = 0;
 	for (i = fdp->fd_freefile; i < last; i++)
-		if (fdp->fd_ofiles[i] == NULL)
+		if (ftable_get(fdp, i) == NULL)
 			freefd++;
 
 	/*
diff --git a/src/sys/fs/fdescfs/fdesc_vnops.c b/src/sys/fs/fdescfs/fdesc_vnops.c
index f39c3a7..0ea6607 100644
--- a/src/sys/fs/fdescfs/fdesc_vnops.c
+++ b/src/sys/fs/fdescfs/fdesc_vnops.c
@@ -581,7 +581,7 @@ fdesc_readdir(ap)
 			dp->d_type = DT_DIR;
 			break;
 		default:
-			if (fdp->fd_ofiles[fcnt] == NULL) {
+			if (ftable_get(fdp, fcnt) == NULL) {
 				FILEDESC_SUNLOCK(fdp);
 				goto done;
 			}
diff --git a/src/sys/fs/nfsserver/nfs_nfsdport.c b/src/sys/fs/nfsserver/nfs_nfsdport.c
index 232e465..94fd81c 100644
--- a/src/sys/fs/nfsserver/nfs_nfsdport.c
+++ b/src/sys/fs/nfsserver/nfs_nfsdport.c
@@ -3103,7 +3103,7 @@ fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
 	fdp = p->td_proc->p_fd;
 	if (fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL)
+	    (fp = ftable_get(fdp, fd)) == NULL)
 		return (EBADF);
 	*fpp = fp;
 	return (0);
diff --git a/src/sys/kern/kern_descrip.c b/src/sys/kern/kern_descrip.c
index 6ce0356..1a34987 100644
--- a/src/sys/kern/kern_descrip.c
+++ b/src/sys/kern/kern_descrip.c
@@ -112,9 +112,8 @@ enum dup_type { DUP_VARIABLE, DUP_FIXED };
 
 static int do_dup(struct thread *td, enum dup_type type, int old, int new,
     register_t *retval);
-static int fd_first_free(struct filedesc *, int, int);
+static int fd_first_free(struct filedesc *, int);
 static int fd_last_used(struct filedesc *, int, int);
-static void fdgrowtable(struct filedesc *, int);
 static int fdrop_locked(struct file *fp, struct thread *td);
 static void fdunused(struct filedesc *fdp, int fd);
 static void fdused(struct filedesc *fdp, int fd);
@@ -134,10 +133,406 @@ static void fdused(struct filedesc *fdp, int fd);
 #define NDBIT(x)	((NDSLOTTYPE)1 <<
((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) -/* - * Storage required per open file descriptor. +#define IDB_BLOCK_SIZE PAGE_SIZE +#define IDB_ENT_SIZE sizeof(uintptr_t) +#define IDB_ENTS_PER_BLOCK (IDB_BLOCK_SIZE/IDB_ENT_SIZE) + +/* May be a perf impact on 32-bit kernels. */ +CTASSERT(NDSLOTSIZE == IDB_ENT_SIZE); + +/** + * Return the index into the indirect table given an entry. + */ +static inline int +idb_block_index(int ent) +{ + + return (ent / IDB_ENTS_PER_BLOCK); +} + +/** + * Return offset into an indirect block given an entry. + */ +static inline int +idb_block_off(int ent) +{ + + return (ent % IDB_ENTS_PER_BLOCK); +} + +/** + * Return 1 if the indirect block table is flat, else 0. + */ +static inline int +idb_is_flat(struct idb_table *idb) +{ + + return (idb->idb_nents <= IDB_ENTS_PER_BLOCK); +} + +/** + * Return a pointer to the block. If the block is sparse or ent is outside + * the current size of the table, return NULL. + */ +static inline void * +idb_block(struct idb_table *idb, int ent) +{ + + return (ent >= idb->idb_nents ? NULL : + idb->idb_tbl.indirect[idb_block_index(ent)]); +} + +/** + * Initialize a new indirect table. The caller is responsible for allocating + * the idb struct, and must provide an initial non-null flat table. + * + * @param idb Indirect table to initialize. + * @param idb_flat Initial non-null table. + * @param idb_nents Number of entries in the initial flat table. + */ +static void +idb_init(struct idb_table *idb, void *idb_flat, int idb_nents) +{ + + KASSERT(idb != NULL, ("idb table must be allocated by caller")); + KASSERT(idb_flat != NULL, + ("idb flat table must be allocated by caller")); + + idb->idb_tbl.flat = idb_flat; + idb->idb_nents = idb_nents; + idb->idb_orig_nents = idb_nents; +} + +/** + * Free all blocks associated with the indirect table. + */ +static void +idb_free(struct idb_table *idb) +{ + int indx; + void *block; + + if (idb_is_flat(idb)) { + if (idb->idb_nents > idb->idb_orig_nents) + free(idb->idb_tbl.flat, M_FILEDESC); + return; + } + + /* Free indirect leaves. */ + for (indx = idb_block_index(0); + indx < idb_block_index(idb->idb_nents); + indx++) { + block = idb->idb_tbl.indirect[indx]; + if (block != NULL) + free(block, M_FILEDESC); + } + + /* Free indirect root. */ + free(idb->idb_tbl.indirect, M_FILEDESC); +} + +/** + * Return a pointer into the table/block given an index. + */ +static void * +idb_get_entry(struct idb_table *idb, int ent) +{ + void *block; + + if (ent > idb->idb_nents) + return (NULL); + + if (idb_is_flat(idb)) + return (((caddr_t)idb->idb_tbl.flat) + (ent * IDB_ENT_SIZE)); + + /* Indirect block. Return NULL for sparse blocks. */ + block = idb_block(idb, ent); + if (block == NULL) + return (NULL); + + return (((caddr_t)block) + (idb_block_off(ent) * IDB_ENT_SIZE)); +} + +/** + * If the current table size doesn't accomodate the new number of entries, + * grow it to fit new_nents. Mult is a multiplying factor used to check the + * number of entries in the table against new_nents which allows growing the + * flat table or the indirect table. The current number of entries in the + * table must be a multiple of mult. + * + * @param idb Table to grow. + * @param new_nents Number of entries to grow the table to. + * @param mult Multiplier for new_nents. + * @param sx Exclusive lock that may be dropped/reqacquired. 
*/ -#define OFILESIZE (sizeof(struct file *) + sizeof(char)) +static void +idb_grow_table(struct idb_table *idb, int new_nents, int mult, struct sx *sx) +{ + int old_nents; + void *ntable; + + KASSERT(idb->idb_nents % mult == 0, + ("%d is not a multiple of %d", idb->idb_nents, mult)); + + old_nents = idb->idb_nents / mult; + + /* Do nothing if the table is already big enough. */ + if (old_nents > new_nents) + return; + + sx_xunlock(sx); + ntable = malloc(new_nents * IDB_ENT_SIZE, M_FILEDESC, + M_ZERO | M_WAITOK); + sx_xlock(sx); + + /* Done if table grew when the lock was dropped. */ + if (idb->idb_nents / mult > new_nents) { + free(ntable, M_FILEDESC); + return; + } + + /* Copy the data to the new table and fix up the pointers. */ + bcopy(idb->idb_tbl.flat, ntable, old_nents * IDB_ENT_SIZE); + if (idb->idb_nents > idb->idb_orig_nents) + free(idb->idb_tbl.flat, M_FILEDESC); + idb->idb_tbl.flat = ntable; + idb->idb_nents = new_nents * mult; +} + +/** + * Transition a flat table to an indirect block table. + * + * @param idb Table to transition. + * @param sx Exclusive lock that may be dropped/reqacquired. + */ +static void +idb_transition_to_indirect(struct idb_table *idb, struct sx *sx) +{ + void **ntable = NULL; + + KASSERT(idb->idb_nents >= IDB_ENTS_PER_BLOCK, + ("Insufficient size for indirect transition: %d", idb->idb_nents)); + + /* Done if the table has already transitioned. */ + if (idb->idb_nents > IDB_ENTS_PER_BLOCK) { + return; + } + + sx_xunlock(sx); + ntable = malloc(IDB_BLOCK_SIZE, M_FILEDESC, + M_ZERO | M_WAITOK); + sx_xlock(sx); + + /* Done if indirect transition done when the lock was dropped. */ + if (idb->idb_nents > IDB_ENTS_PER_BLOCK) { + free(ntable, M_FILEDESC); + return; + } + + /* Make indirect transition. */ + ntable[0] = idb->idb_tbl.flat; + idb->idb_tbl.indirect = ntable; + idb->idb_nents = IDB_ENTS_PER_BLOCK * IDB_ENTS_PER_BLOCK; +} + +/** + * Allocates an indirect block in the table if one doesn't already exist for + * new_ent. + * + * @param idb Table to ensure new_ent has an indirect block in. + * @param new_ent New entry index to create indirect block for. + * @param sx Exclusive lock that may be dropped/reqacquired. + */ +static void +idb_ensure_indirect_block(struct idb_table *idb, int new_ent, struct sx *sx) +{ + void *nblock = NULL; + + KASSERT(new_ent < idb->idb_nents, + ("Table too small (%d) for indirect block at index %d", + idb->idb_nents, new_ent)); + + /* Done if the block is already allocated. */ + if (idb_block(idb, new_ent) != NULL) + return; + + sx_xunlock(sx); + nblock = malloc(IDB_BLOCK_SIZE, M_FILEDESC, M_ZERO | M_WAITOK); + sx_xlock(sx); + + /* Done if block was allocated when the lock was dropped. */ + if (idb_block(idb, new_ent) != NULL) { + free(nblock, M_FILEDESC); + return; + } + + idb->idb_tbl.indirect[idb_block_index(new_ent)] = nblock; +} + +/** + * idb_ensure_size() guarantees that: + * 1. If the table is flat, the table will be made large enough for new_ent, + * possibly being transitioned to an indirect table. + * + * 2. If the table is indirect, the indirect table is large enough to have an + * entry to point to the indirect block, and the indirect block itself is + * allocated. + * + * The sx lock will be released if new memory needs to be allocated, but will + * be reacquired before returning. + * + * @param idb Table to ensure new_ent fits in. + * @param new_ent New entry index. + * @param maxsize Max size of the table so excess memory isn't used. + * @param sx Exclusive lock that may be dropped/reqacquired. 
+ */ +static void +idb_ensure_size(struct idb_table *idb, int new_ent, int maxsize, struct sx *sx) +{ + KASSERT(idb->idb_nents > 0, ("zero-length idb table")); + KASSERT(new_ent < maxsize, + ("new_ent(%d) >= maxsize(%d)", new_ent, maxsize)); + + sx_assert(sx, SX_XLOCKED | SX_NOTRECURSED); + + /* Grow table 2x while it is flat. */ + if (idb_is_flat(idb) && new_ent < IDB_ENTS_PER_BLOCK) { + if (new_ent >= idb->idb_nents) { + KASSERT(new_ent > 0, ("Negative new_ent %d", new_ent)); + /* Round up to power of 2 to appease the allocator. */ + idb_grow_table(idb, min(min(1 << (fls(new_ent)), + IDB_ENTS_PER_BLOCK), maxsize), 1, sx); + } + return; + } + + /* Transition flat table to indirect. */ + if (idb_is_flat(idb) && new_ent >= IDB_ENTS_PER_BLOCK) { + idb_grow_table(idb, IDB_ENTS_PER_BLOCK, 1, sx); + idb_transition_to_indirect(idb, sx); + } + + /* Grow size of indirect table. */ + if (new_ent >= idb->idb_nents) { + int grow_factor, new_nents; + /* Need to grow the indirect table. */ + for (grow_factor = 2;; grow_factor <<= 1) { + if (idb_block_index(idb->idb_nents) * grow_factor > + idb_block_index(new_ent)) + break; + } + new_nents = min(idb_block_index(idb->idb_nents) * grow_factor, + idb_block_index(maxsize)); + idb_grow_table(idb, new_nents, IDB_ENTS_PER_BLOCK, sx); + } + + /* Ensure block is allocated in sparse table. */ + idb_ensure_indirect_block(idb, new_ent, sx); +} + +/** + * Get the file struct for an fd from the ftable. + * + * @return The file struct for a particular or NULL. + */ +struct file * +ftable_get(struct filedesc *fdp, int fd) +{ + struct file **fpp; + + FILEDESC_LOCK_ASSERT(fdp); + + fpp = idb_get_entry(&fdp->fd_files, fd); + return (fpp != NULL ? *fpp : NULL); +} + +/** + * Set an entry in the table to point to a struct file. ftable_ensure_fd() + * must be first called to ensure the underlying data structure can support + * this entry. + */ +void +ftable_set(struct filedesc *fdp, int fd, struct file *fp) +{ + struct file **fpp; + + FILEDESC_XLOCK_ASSERT(fdp); + + fpp = idb_get_entry(&fdp->fd_files, fd); + KASSERT(fpp != NULL, ("Trying to set unallocated entry")); + *fpp = fp; +} + +/** + * Get the close exec state of a file descriptor. + * + * @return 1 if close exec is set, otherwise 0. + */ +int +ftable_get_cloexec(struct filedesc *fdp, int fd) +{ + NDSLOTTYPE *map; + + FILEDESC_LOCK_ASSERT(fdp); + + map = idb_get_entry(&fdp->fd_cloexec, NDSLOT(fd)); + if (map == NULL) + return (0); + + return ((*map & NDBIT(fd)) != 0); +} + +/** + * Set the close exec state of a file descriptor. + * + * @param on 1: close exec state will be turned on. + * 0: close exec state will be turned off. + */ +void +ftable_set_cloexec(struct filedesc *fdp, int fd, int on) +{ + NDSLOTTYPE *map; + + FILEDESC_XLOCK_ASSERT(fdp); + + map = idb_get_entry(&fdp->fd_cloexec, NDSLOT(fd)); + KASSERT(map != NULL, ("trying to set cloexec on an unallocated file")); + + if (on) + *map |= NDBIT(fd); + else + *map &= ~NDBIT(fd); +} + +/** + * If the ftable is already large enough to store the fd, then simply return. + * Otherwise, allocate the necessary blocks to accomodate the new fd. This + * allows for a sparse table. May malloc new blocks requiring the fdp lock to + * be dropped and reacquired. + * + * @param nfd File descriptor to possilbly grow the table to fit. + * @param maxfd Maximum fd so excess memory isn't used. 
+ */ +static void +ftable_ensure_fd(struct filedesc *fdp, int nfd, int maxfd) +{ + FILEDESC_XLOCK_ASSERT(fdp); + + KASSERT(nfd <= maxfd, ("nfd(%d) > maxfd(%d)", nfd, maxfd)); + + idb_ensure_size(&fdp->fd_files, nfd, maxfd + 1, &fdp->fd_sx); + idb_ensure_size(&fdp->fd_map, NDSLOT(nfd), NDSLOT(maxfd) + 1, + &fdp->fd_sx); + idb_ensure_size(&fdp->fd_cloexec, NDSLOT(nfd), NDSLOT(maxfd) + 1, + &fdp->fd_sx); + + /* + * ft_map and ft_cloexec grow at the same rate, but ft_files grows at + * a different rate, so advertise table size as the min. + */ + fdp->fd_nfiles = min(fdp->fd_files.idb_nents, + fdp->fd_map.idb_nents * NDENTRIES); +} /* * Basic allocation of descriptors: @@ -150,8 +545,8 @@ struct filedesc0 { * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; - char fd_dfileflags[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; + NDSLOTTYPE fd_dcloexec[NDSLOTS(NDFILE)]; }; /* @@ -166,14 +561,13 @@ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* A mutex to protect the association between a proc and filedesc. */ static struct mtx fdesc_mtx; -/* - * Find the first zero bit in the given bitmap, starting at low and not - * exceeding size - 1. +/** + * Iterate a flat array searching for the first zero bit in the given bitmap, + * starting at low and not exceeding size - 1. */ static int -fd_first_free(struct filedesc *fdp, int low, int size) +fd_first_free_block(NDSLOTTYPE *map, int low, int size) { - NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; @@ -193,14 +587,61 @@ fd_first_free(struct filedesc *fdp, int low, int size) return (size); } +/** + * Iterate the indirect block table fd map searching for the first free fd, + * starting at low. Return the current number of entries in the table if none + * are free. + */ +static int +fd_first_free(struct filedesc *fdp, int low) +{ + struct idb_table *idb = &fdp->fd_map; + NDSLOTTYPE *block; + int indx; + + FILEDESC_LOCK_ASSERT(fdp); + + /* Flat table. */ + if (idb_is_flat(idb)) + return (fd_first_free_block(idb->idb_tbl.flat, low, + idb->idb_nents * NDENTRIES)); + + /* Loop through the indirect blocks. */ + for (indx = idb_block_index(NDSLOT(low)); + indx < idb_block_index(idb->idb_nents); + indx++) { + int block_low, free_ent; + + block = idb->idb_tbl.indirect[indx]; + if (block == NULL) { + /* Unallocated block, so the first index is fine. */ + free_ent = indx * IDB_ENTS_PER_BLOCK * NDENTRIES; + return (max(free_ent, low)); + } + + /* Scan block, starting mid-block if necessary. */ + block_low = (indx == idb_block_index(NDSLOT(low))) ? + idb_block_off(NDSLOT(low)) * NDENTRIES : 0; + free_ent = fd_first_free_block(block, block_low, + IDB_ENTS_PER_BLOCK * NDENTRIES); + + /* If there was a free fd, return it. */ + if (free_ent < IDB_ENTS_PER_BLOCK * NDENTRIES) + return (indx * IDB_ENTS_PER_BLOCK * NDENTRIES + + free_ent); + } + + /* No free fds found. */ + return (idb->idb_nents); +} + /* * Find the highest non-zero bit in the given bitmap, starting at low and * not exceeding size - 1. */ static int -fd_last_used(struct filedesc *fdp, int low, int size) +fd_last_used_block(NDSLOTTYPE *map, int low, int size) { - NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; @@ -220,12 +661,65 @@ fd_last_used(struct filedesc *fdp, int low, int size) return (low - 1); } +/** + * Iterate the indirect block table fd map searching for the highest non-zero + * bit, starting at low and not exceeding size - 1. Return low -1 if no fds + * >= low are used. 
+ */ +static int +fd_last_used(struct filedesc *fdp, int low, int size) +{ + struct idb_table *idb = &fdp->fd_map; + NDSLOTTYPE *block; + int indx; + + FILEDESC_LOCK_ASSERT(fdp); + + /* Flat table. */ + if (idb_is_flat(idb)) + return (fd_last_used_block(idb->idb_tbl.flat, low, size)); + + /* Loop through the indirect blocks backwards. */ + for (indx = idb_block_index(NDSLOT(size)); + indx >= idb_block_index(NDSLOT(low)); + indx--) { + int block_low, block_high, used_ent; + + block = idb->idb_tbl.indirect[indx]; + /* If the block is sparse, move onto the next one. */ + if (block == NULL) + continue; + + /* Scan block, starting/ending mid-block if necessary. */ + block_low = (indx == idb_block_index(NDSLOT(low))) ? + idb_block_off(NDSLOT(low)) * NDENTRIES : 0; + block_high = (indx == idb_block_index(NDSLOT(size))) ? + idb_block_off(NDSLOT(size)) * NDENTRIES : + IDB_ENTS_PER_BLOCK; + used_ent = fd_last_used_block(block, block_low, block_high); + + /* If there was a used fd, return it. */ + if (used_ent >= block_low) + return (indx * IDB_ENTS_PER_BLOCK * NDENTRIES + + used_ent); + } + + /* No used fds found. */ + return (low - 1); +} + static int fdisused(struct filedesc *fdp, int fd) { + NDSLOTTYPE *map; + + FILEDESC_LOCK_ASSERT(fdp); KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); - return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); + + map = idb_get_entry(&fdp->fd_map, NDSLOT(fd)); + + return (map && (*map & NDBIT(fd)) != 0); } /* @@ -234,16 +728,19 @@ fdisused(struct filedesc *fdp, int fd) static void fdused(struct filedesc *fdp, int fd) { + NDSLOTTYPE *map; FILEDESC_XLOCK_ASSERT(fdp); - KASSERT(!fdisused(fdp, fd), - ("fd already used")); + KASSERT(!fdisused(fdp, fd), ("fd already used")); - fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); + map = idb_get_entry(&fdp->fd_map, NDSLOT(fd)); + KASSERT(map != NULL, ("Map block is NULL")); + + *map |= NDBIT(fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) - fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); + fdp->fd_freefile = fd_first_free(fdp, fd); } /* @@ -253,13 +750,19 @@ static void fdunused(struct filedesc *fdp, int fd) { + NDSLOTTYPE *map; + FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd is already unused")); - KASSERT(fdp->fd_ofiles[fd] == NULL, + KASSERT(ftable_get(fdp, fd) == NULL, ("fd is still in use")); - fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); + map = idb_get_entry(&fdp->fd_map, NDSLOT(fd)); + KASSERT(map != NULL, ("Map block is NULL")); + + *map &= ~NDBIT(fd); + if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) @@ -410,7 +913,7 @@ fdtofp(int fd, struct filedesc *fdp) FILEDESC_LOCK_ASSERT(fdp); if ((unsigned)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + (fp = ftable_get(fdp, fd)) == NULL) return (NULL); return (fp); } @@ -422,7 +925,6 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) struct flock *flp; struct file *fp; struct proc *p; - char *pop; struct vnode *vp; u_int newmin; int error, flg, tmp; @@ -467,8 +969,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) error = EBADF; break; } - pop = &fdp->fd_ofileflags[fd]; - td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; + td->td_retval[0] = ftable_get_cloexec(fdp, fd) ? 
+ FD_CLOEXEC :0; FILEDESC_SUNLOCK(fdp); break; @@ -479,9 +981,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) error = EBADF; break; } - pop = &fdp->fd_ofileflags[fd]; - *pop = (*pop &~ UF_EXCLOSE) | - (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); + ftable_set_cloexec(fdp, fd, arg & FD_CLOEXEC); FILEDESC_XUNLOCK(fdp); break; @@ -651,7 +1151,7 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) /* Check for race with close */ FILEDESC_SLOCK(fdp); if ((unsigned) fd >= fdp->fd_nfiles || - fp != fdp->fd_ofiles[fd]) { + fp != ftable_get(fdp, fd)) { FILEDESC_SUNLOCK(fdp); flp->l_whence = SEEK_SET; flp->l_start = 0; @@ -750,7 +1250,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new, return (EMFILE); FILEDESC_XLOCK(fdp); - if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { + if (old >= fdp->fd_nfiles || ftable_get(fdp, old) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } @@ -759,7 +1259,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new, FILEDESC_XUNLOCK(fdp); return (0); } - fp = fdp->fd_ofiles[old]; + fp = ftable_get(fdp, old); fhold(fp); /* @@ -770,9 +1270,8 @@ do_dup(struct thread *td, enum dup_type type, int old, int new, * out for a race. */ if (type == DUP_FIXED) { - if (new >= fdp->fd_nfiles) - fdgrowtable(fdp, new + 1); - if (fdp->fd_ofiles[new] == NULL) + ftable_ensure_fd(fdp, new, maxfd); + if (ftable_get(fdp, new) == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { @@ -787,9 +1286,9 @@ do_dup(struct thread *td, enum dup_type type, int old, int new, * bad file descriptor. Userland should do its own locking to * avoid this case. */ - if (fdp->fd_ofiles[old] != fp) { + if (ftable_get(fdp, old) != fp) { /* we've allocated a descriptor which we won't use */ - if (fdp->fd_ofiles[new] == NULL) + if (ftable_get(fdp, new) == NULL) fdunused(fdp, new); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); @@ -805,7 +1304,7 @@ do_dup(struct thread *td, enum dup_type type, int old, int new, * * XXX this duplicates parts of close(). */ - delfp = fdp->fd_ofiles[new]; + delfp = ftable_get(fdp, new); holdleaders = 0; if (delfp != NULL) { if (td->td_proc->p_fdtol != NULL) { @@ -821,8 +1320,8 @@ do_dup(struct thread *td, enum dup_type type, int old, int new, /* * Duplicate the source descriptor */ - fdp->fd_ofiles[new] = fp; - fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; + ftable_set(fdp, new, fp); + ftable_set_cloexec(fdp, new, 0); if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; @@ -1111,12 +1610,12 @@ kern_close(td, fd) FILEDESC_XLOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) { + (fp = ftable_get(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } - fdp->fd_ofiles[fd] = NULL; - fdp->fd_ofileflags[fd] = 0; + ftable_set(fdp, fd, NULL); + ftable_set_cloexec(fdp, fd, 0); fdunused(fdp, fd); if (td->td_proc->p_fdtol != NULL) { /* @@ -1178,7 +1677,7 @@ closefrom(struct thread *td, struct closefrom_args *uap) uap->lowfd = 0; FILEDESC_SLOCK(fdp); for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) { - if (fdp->fd_ofiles[fd] != NULL) { + if (ftable_get(fdp, fd) != NULL) { FILEDESC_SUNLOCK(fdp); (void)kern_close(td, fd); FILEDESC_SLOCK(fdp); @@ -1806,70 +2305,6 @@ out: } /* - * Grow the file table to accomodate (at least) nfd descriptors. This may - * block and drop the filedesc lock, but it will reacquire it before - * returning. 
- */ -static void -fdgrowtable(struct filedesc *fdp, int nfd) -{ - struct file **ntable; - char *nfileflags; - int nnfiles, onfiles; - NDSLOTTYPE *nmap; - - FILEDESC_XLOCK_ASSERT(fdp); - - KASSERT(fdp->fd_nfiles > 0, - ("zero-length file table")); - - /* compute the size of the new table */ - onfiles = fdp->fd_nfiles; - nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ - if (nnfiles <= onfiles) - /* the table is already large enough */ - return; - - /* allocate a new table and (if required) new bitmaps */ - FILEDESC_XUNLOCK(fdp); - MALLOC(ntable, struct file **, nnfiles * OFILESIZE, - M_FILEDESC, M_ZERO | M_WAITOK); - nfileflags = (char *)&ntable[nnfiles]; - if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) - MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE, - M_FILEDESC, M_ZERO | M_WAITOK); - else - nmap = NULL; - FILEDESC_XLOCK(fdp); - - /* - * We now have new tables ready to go. Since we dropped the - * filedesc lock to call malloc(), watch out for a race. - */ - onfiles = fdp->fd_nfiles; - if (onfiles >= nnfiles) { - /* we lost the race, but that's OK */ - free(ntable, M_FILEDESC); - if (nmap != NULL) - free(nmap, M_FILEDESC); - return; - } - bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); - bcopy(fdp->fd_ofileflags, nfileflags, onfiles); - if (onfiles > NDFILE) - free(fdp->fd_ofiles, M_FILEDESC); - fdp->fd_ofiles = ntable; - fdp->fd_ofileflags = nfileflags; - if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { - bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); - if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) - free(fdp->fd_map, M_FILEDESC); - fdp->fd_map = nmap; - } - fdp->fd_nfiles = nnfiles; -} - -/* * Allocate a file descriptor for the process. */ int @@ -1891,16 +2326,18 @@ fdalloc(struct thread *td, int minfd, int *result) /* * Search the bitmap for a free descriptor. If none is found, try * to grow the file table. Keep at it until we either get a file - * descriptor or run into process or system limits; fdgrowtable() + * descriptor or run into process or system limits; ftable_ensure_fd() * may drop the filedesc lock, so we're in a race. */ for (;;) { - fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); + fd = fd_first_free(fdp, minfd); if (fd >= maxfd) return (EMFILE); - if (fd < fdp->fd_nfiles) + /* Grow if necessary. */ + ftable_ensure_fd(fdp, fd, maxfd); + /* Required check since ftable_ensure_fd() can drop xlock. */ + if (ftable_get(fdp, fd) == NULL) break; - fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } /* @@ -1909,9 +2346,9 @@ fdalloc(struct thread *td, int minfd, int *result) */ KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); - KASSERT(fdp->fd_ofiles[fd] == NULL, + KASSERT(ftable_get(fdp, fd) == NULL, ("free descriptor isn't")); - fdp->fd_ofileflags[fd] = 0; /* XXX needed? */ + ftable_set_cloexec(fdp, fd, 0); /* XXX needed? 
*/ fdused(fdp, fd); *result = fd; return (0); @@ -1926,7 +2363,7 @@ fdavail(struct thread *td, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = td->td_proc->p_fd; - struct file **fpp; + struct file *fp; int i, lim, last; FILEDESC_LOCK_ASSERT(fdp); @@ -1937,9 +2374,10 @@ fdavail(struct thread *td, int n) if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); - fpp = &fdp->fd_ofiles[fdp->fd_freefile]; - for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { - if (*fpp == NULL && --n <= 0) + fp = ftable_get(fdp, fdp->fd_freefile); + for (i = last - fdp->fd_freefile; --i >= 0; + fp = ftable_get(fdp, last - i)) { + if (fp == NULL && --n <= 0) return (1); } return (0); @@ -2017,7 +2455,7 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd) ifs_init_lockdata(fp); FILEDESC_XLOCK(p->p_fd); - if ((fq = p->p_fd->fd_ofiles[0])) { + if ((fq = ftable_get(p->p_fd, 0))) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); @@ -2030,7 +2468,7 @@ falloc(struct thread *td, struct file **resultfp, int *resultfd) fdrop(fp, td); return (error); } - p->p_fd->fd_ofiles[i] = fp; + ftable_set(p->p_fd, i, fp); FILEDESC_XUNLOCK(p->p_fd); if (resultfp) *resultfp = fp; @@ -2068,10 +2506,13 @@ fdinit(struct filedesc *fdp) newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_holdcnt = 1; newfdp->fd_fd.fd_cmask = CMASK; - newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; - newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; - newfdp->fd_fd.fd_map = newfdp->fd_dmap; + + idb_init(&newfdp->fd_fd.fd_files, &newfdp->fd_dfiles, NDFILE); + idb_init(&newfdp->fd_fd.fd_map, &newfdp->fd_dmap, NDSLOTS(NDFILE)); + idb_init(&newfdp->fd_fd.fd_cloexec, &newfdp->fd_dcloexec, + NDSLOTS(NDFILE)); + newfdp->fd_fd.fd_lastfile = -1; return (&newfdp->fd_fd); } @@ -2144,6 +2585,7 @@ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; + struct file *fp; int i; /* Certain daemons might not have file descriptors. 
*/ @@ -2152,23 +2594,23 @@ fdcopy(struct filedesc *fdp) newfdp = fdinit(fdp); FILEDESC_SLOCK(fdp); - while (fdp->fd_lastfile >= newfdp->fd_nfiles) { - FILEDESC_SUNLOCK(fdp); - FILEDESC_XLOCK(newfdp); - fdgrowtable(newfdp, fdp->fd_lastfile + 1); - FILEDESC_XUNLOCK(newfdp); - FILEDESC_SLOCK(fdp); - } /* copy everything except kqueue descriptors */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { - if (fdisused(fdp, i) && - fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE && - fdp->fd_ofiles[i]->f_ops != &badfileops) { - newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; - newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; - fhold(newfdp->fd_ofiles[i]); + if (fdisused(fdp, i) && (fp = ftable_get(fdp, i)) && + fp->f_type != DTYPE_KQUEUE && fp->f_ops != &badfileops) { + int cloexec = ftable_get_cloexec(fdp, i); + int maxfd = fdp->fd_lastfile; + + FILEDESC_SUNLOCK(fdp); + FILEDESC_XLOCK(newfdp); + ftable_ensure_fd(newfdp, i, maxfd); + ftable_set(newfdp, i, fp); + ftable_set_cloexec(newfdp, i, cloexec); newfdp->fd_lastfile = i; + FILEDESC_XUNLOCK(newfdp); + FILEDESC_SLOCK(fdp); + fhold(fp); } else { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; @@ -2177,7 +2619,7 @@ fdcopy(struct filedesc *fdp) FILEDESC_SUNLOCK(fdp); FILEDESC_XLOCK(newfdp); for (i = 0; i <= newfdp->fd_lastfile; ++i) - if (newfdp->fd_ofiles[i] != NULL) + if (ftable_get(newfdp, i) != NULL) fdused(newfdp, i); FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); @@ -2195,7 +2637,6 @@ void fdfree(struct thread *td) { struct filedesc *fdp; - struct file **fpp; int i, locked; struct filedesc_to_leader *fdtol; struct file *fp; @@ -2216,13 +2657,10 @@ fdfree(struct thread *td) fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { - for (i = 0, fpp = fdp->fd_ofiles; - i <= fdp->fd_lastfile; - i++, fpp++) { - if (*fpp == NULL || - (*fpp)->f_type != DTYPE_VNODE) + for (i = 0; i <= fdp->fd_lastfile; i++) { + fp = ftable_get(fdp, i); + if (fp == NULL || fp->f_type != DTYPE_VNODE) continue; - fp = *fpp; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; @@ -2240,7 +2678,6 @@ fdfree(struct thread *td) VFS_UNLOCK_GIANT(locked); FILEDESC_XLOCK(fdp); fdrop(fp, td); - fpp = fdp->fd_ofiles + i; } } retry: @@ -2281,31 +2718,29 @@ fdfree(struct thread *td) } FILEDESC_XLOCK(fdp); i = --fdp->fd_refcnt; - FILEDESC_XUNLOCK(fdp); - if (i > 0) + if (i > 0) { + FILEDESC_XUNLOCK(fdp); return; + } - fpp = fdp->fd_ofiles; - for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { - if (*fpp) { - FILEDESC_XLOCK(fdp); - fp = *fpp; - *fpp = NULL; + for (i = fdp->fd_lastfile; i >= 0 ; i--) { + fp = ftable_get(fdp, i); + if (fp) { + ftable_set(fdp, i, NULL); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); + FILEDESC_XLOCK(fdp); } } - FILEDESC_XLOCK(fdp); /* XXX This should happen earlier. */ mtx_lock(&fdesc_mtx); td->td_proc->p_fd = NULL; mtx_unlock(&fdesc_mtx); - if (fdp->fd_nfiles > NDFILE) - FREE(fdp->fd_ofiles, M_FILEDESC); - if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) - FREE(fdp->fd_map, M_FILEDESC); + idb_free(&fdp->fd_files); + idb_free(&fdp->fd_map); + idb_free(&fdp->fd_cloexec); fdp->fd_nfiles = 0; @@ -2377,19 +2812,20 @@ setugidsafety(struct thread *td) */ FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { + struct file *fp; + if (i > 2) break; - if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { - struct file *fp; + fp = ftable_get(fdp, i); + if (fp && is_unsafe(fp)) { knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. 
*/ - fp = fdp->fd_ofiles[i]; - fdp->fd_ofiles[i] = NULL; - fdp->fd_ofileflags[i] = 0; + ftable_set(fdp, i, NULL); + ftable_set_cloexec(fdp, i, 0); fdunused(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); @@ -2411,8 +2847,8 @@ fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) { FILEDESC_XLOCK(fdp); - if (fdp->fd_ofiles[idx] == fp) { - fdp->fd_ofiles[idx] = NULL; + if (ftable_get(fdp, idx) == fp) { + ftable_set(fdp, idx, NULL); fdunused(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); @@ -2441,19 +2877,20 @@ fdcloseexec(struct thread *td) * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { - if (fdp->fd_ofiles[i] != NULL && - (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || - (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { - struct file *fp; + struct file *fp; + + fp = ftable_get(fdp, i); + if (fp != NULL && + (fp->f_type == DTYPE_MQUEUE || + ftable_get_cloexec(fdp, i))) { knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ - fp = fdp->fd_ofiles[i]; - fdp->fd_ofiles[i] = NULL; - fdp->fd_ofileflags[i] = 0; + ftable_set(fdp, i, NULL); + ftable_set_cloexec(fdp, i, 0); fdunused(fdp, i); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, i, fp); @@ -2486,7 +2923,7 @@ fdcheckstd(struct thread *td) devnull = -1; error = 0; for (i = 0; i < 3; i++) { - if (fdp->fd_ofiles[i] != NULL) + if (ftable_get(fdp, i) != NULL) continue; if (devnull < 0) { save = td->td_retval[0]; @@ -2904,7 +3341,7 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, */ FILEDESC_XLOCK(fdp); if (dfd < 0 || dfd >= fdp->fd_nfiles || - (wfp = fdp->fd_ofiles[dfd]) == NULL) { + (wfp = ftable_get(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } @@ -2931,9 +3368,9 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, FILEDESC_XUNLOCK(fdp); return (EACCES); } - fp = fdp->fd_ofiles[indx]; - fdp->fd_ofiles[indx] = wfp; - fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fp = ftable_get(fdp, indx); + ftable_set(fdp, indx, wfp); + ftable_set_cloexec(fdp, indx, ftable_get_cloexec(fdp, dfd)); if (fp == NULL) fdused(fdp, indx); fhold_locked(wfp); @@ -2951,11 +3388,11 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, /* * Steal away the file pointer from dfd and stuff it into indx. 
*/ - fp = fdp->fd_ofiles[indx]; - fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; - fdp->fd_ofiles[dfd] = NULL; - fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; - fdp->fd_ofileflags[dfd] = 0; + fp = ftable_get(fdp, indx); + ftable_set(fdp, indx, ftable_get(fdp, dfd)); + ftable_set(fdp, dfd, NULL); + ftable_set_cloexec(fdp, indx, ftable_get_cloexec(fdp, dfd)); + ftable_set_cloexec(fdp, dfd, 0); fdunused(fdp, dfd); if (fp == NULL) fdused(fdp, indx); @@ -3103,7 +3540,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { - if ((fp = fdp->fd_ofiles[n]) == NULL) + if ((fp = ftable_get(fdp, n)) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; @@ -3215,7 +3652,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, fdp, req); for (i = 0; i < fdp->fd_nfiles; i++) { - if ((fp = fdp->fd_ofiles[i]) == NULL) + if ((fp = ftable_get(fdp, i)) == NULL) continue; bzero(kif, sizeof(*kif)); kif->kf_structsize = sizeof(*kif); @@ -3450,7 +3887,7 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) export_vnode_for_sysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, fdp, req); for (i = 0; i < fdp->fd_nfiles; i++) { - if ((fp = fdp->fd_ofiles[i]) == NULL) + if ((fp = ftable_get(fdp, i)) == NULL) continue; bzero(kif, sizeof(*kif)); FILE_LOCK(fp); @@ -3669,7 +4106,7 @@ file_to_first_proc(struct file *fp) if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { - if (fp == fdp->fd_ofiles[n]) + if (fp == ftable_get(fdp, n)) return (p); } } diff --git a/src/sys/kern/kern_lsof.c b/src/sys/kern/kern_lsof.c index 04e5dd7..70a8286 100644 --- a/src/sys/kern/kern_lsof.c +++ b/src/sys/kern/kern_lsof.c @@ -260,7 +260,7 @@ lsof(struct thread *td, struct lsof_args *uap) /* Ordinary descriptors for files, pipes, sockets: */ } else if (msg.fd < fdp->fd_nfiles) { - fp = fdp->fd_ofiles[msg.fd]; + fp = ftable_get(fdp, msg.fd); if (fp) { switch (fp->f_type) { case DTYPE_VNODE: diff --git a/src/sys/kern/sys_generic.c b/src/sys/kern/sys_generic.c index 1a9b061..8aba73d 100644 --- a/src/sys/kern/sys_generic.c +++ b/src/sys/kern/sys_generic.c @@ -596,12 +596,12 @@ kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) switch (com) { case FIONCLEX: FILEDESC_XLOCK(fdp); - fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; + ftable_set_cloexec(fdp, fd, 0); FILEDESC_XUNLOCK(fdp); goto out; case FIOCLEX: FILEDESC_XLOCK(fdp); - fdp->fd_ofileflags[fd] |= UF_EXCLOSE; + ftable_set_cloexec(fdp, fd, 1); FILEDESC_XUNLOCK(fdp); goto out; case FIONBIO: @@ -1043,7 +1043,7 @@ pollscan(td, fds, nfd) } else if (fds->fd < 0) { fds->revents = 0; } else { - fp = fdp->fd_ofiles[fds->fd]; + fp = ftable_get(fdp, fds->fd); if (fp == NULL) { fds->revents = POLLNVAL; n++; diff --git a/src/sys/kern/uipc_mqueue.c b/src/sys/kern/uipc_mqueue.c index fb2ef6a..59c339a 100644 --- a/src/sys/kern/uipc_mqueue.c +++ b/src/sys/kern/uipc_mqueue.c @@ -2005,8 +2005,8 @@ kmq_open(struct thread *td, struct kmq_open_args *uap) FILE_UNLOCK(fp); FILEDESC_XLOCK(fdp); - if (fdp->fd_ofiles[fd] == fp) - fdp->fd_ofileflags[fd] |= UF_EXCLOSE; + if (ftable_get(fdp, fd) == fp) + ftable_set_cloexec(fdp, fd, 1); FILEDESC_XUNLOCK(fdp); td->td_retval[0] = fd; fdrop(fp, td); diff --git a/src/sys/kern/uipc_sem.c b/src/sys/kern/uipc_sem.c index d5525a4..b1e6b62 100644 --- a/src/sys/kern/uipc_sem.c +++ b/src/sys/kern/uipc_sem.c @@ -488,8 +488,8 @@ ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, fp->f_ops = &ksem_ops; 
FILEDESC_XLOCK(fdp); - if (fdp->fd_ofiles[fd] == fp) - fdp->fd_ofileflags[fd] |= UF_EXCLOSE; + if (ftable_get(fdp, fd) == fp) + ftable_set_cloexec(fdp, fd, 1); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); diff --git a/src/sys/kern/uipc_usrreq.c b/src/sys/kern/uipc_usrreq.c index b8255f0..1bfc037 100644 --- a/src/sys/kern/uipc_usrreq.c +++ b/src/sys/kern/uipc_usrreq.c @@ -1605,7 +1605,8 @@ unp_externalize(struct mbuf *control, struct mbuf **controlp) if (fdalloc(td, 0, &f)) panic("unp_externalize fdalloc failed"); fp = *rp++; - td->td_proc->p_fd->fd_ofiles[f] = fp; + ftable_set(td->td_proc->p_fd, f, + fp); FILE_LOCK(fp); fp->f_msgcount--; FILE_UNLOCK(fp); @@ -1735,12 +1736,12 @@ unp_internalize(struct mbuf **controlp, struct thread *td) for (i = 0; i < oldfds; i++) { fd = *fdp++; if ((unsigned)fd >= fdescp->fd_nfiles || - fdescp->fd_ofiles[fd] == NULL) { + ftable_get(fdescp, fd) == NULL) { FILEDESC_SUNLOCK(fdescp); error = EBADF; goto out; } - fp = fdescp->fd_ofiles[fd]; + fp = ftable_get(fdescp, fd); if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_SUNLOCK(fdescp); error = EOPNOTSUPP; @@ -1765,7 +1766,7 @@ unp_internalize(struct mbuf **controlp, struct thread *td) rp = (struct file **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < oldfds; i++) { - fp = fdescp->fd_ofiles[*fdp++]; + fp = ftable_get(fdescp, *fdp++); *rp++ = fp; FILE_LOCK(fp); fp->f_count++; diff --git a/src/sys/kern/vfs_syscalls.c b/src/sys/kern/vfs_syscalls.c index 2f28263..fa0a5e6 100644 --- a/src/sys/kern/vfs_syscalls.c +++ b/src/sys/kern/vfs_syscalls.c @@ -4884,7 +4884,7 @@ getvnode(fdp, fd, fpp) else { FILEDESC_SLOCK(fdp); if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + (fp = ftable_get(fdp, fd)) == NULL) error = EBADF; else if (fp->f_vnode == NULL) { fp = NULL; diff --git a/src/sys/netsmb/smb_dev.c b/src/sys/netsmb/smb_dev.c index fd0dcbe..a0dd80c 100644 --- a/src/sys/netsmb/smb_dev.c +++ b/src/sys/netsmb/smb_dev.c @@ -370,7 +370,7 @@ nsmb_getfp(struct filedesc* fdp, int fd, int flag) FILEDESC_SLOCK(fdp); if (((u_int)fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL || + (fp = ftable_get(fdp, fd)) == NULL || (fp->f_flag & flag) == 0) { FILEDESC_SUNLOCK(fdp); return (NULL); diff --git a/src/sys/sys/filedesc.h b/src/sys/sys/filedesc.h index 1831e5c..e9ea56a 100644 --- a/src/sys/sys/filedesc.h +++ b/src/sys/sys/filedesc.h @@ -45,16 +45,26 @@ * This structure is used for the management of descriptors. It may be * shared by multiple processes. */ -#define NDSLOTTYPE u_long +#define NDSLOTTYPE uintptr_t + +/* Generic indirect block table */ +struct idb_table { + union { + void *flat; + void **indirect; + } idb_tbl; + int idb_nents; /* Current max # of entries. */ + int idb_orig_nents; /* Orig # of entries for the flat table. */ +}; struct filedesc { - struct file **fd_ofiles; /* file structures for open files */ - char *fd_ofileflags; /* per-process open file flags */ + struct idb_table fd_files; /* table of open file structs */ + struct idb_table fd_map; /* bitmap of free fds */ + struct idb_table fd_cloexec; /* bitmap of fd close exec state */ struct vnode *fd_cdir; /* current directory */ struct vnode *fd_rdir; /* root directory */ struct vnode *fd_jdir; /* jail root directory */ int fd_nfiles; /* number of open files allocated */ - NDSLOTTYPE *fd_map; /* bitmap of free fds */ int fd_lastfile; /* high-water mark of fd_ofiles */ int fd_freefile; /* approx. 
next free file */
 	u_short	fd_cmask;		/* mask for file creation */
@@ -130,12 +140,18 @@ struct filedesc_to_leader *
 int	getvnode(struct filedesc *fdp, int fd, struct file **fpp);
 void	mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
 void	setugidsafety(struct thread *td);
+struct file	*ftable_get(struct filedesc *fdp, int fd);
+void	ftable_set(struct filedesc *fdp, int fd, struct file *fp);
+int	ftable_get_cloexec(struct filedesc *fdp, int fd);
+void	ftable_set_cloexec(struct filedesc *fdp, int fd, int on);
+
 static __inline struct file *
 fget_locked(struct filedesc *fdp, int fd)
 {
 
-	return (fd < 0 || fd >= fdp->fd_nfiles ? NULL : fdp->fd_ofiles[fd]);
+	return (fd < 0 || fd >= fdp->fd_nfiles ? NULL :
+	    ftable_get(fdp, fd));
 }
 
 #endif /* _KERNEL */

On May 11, 2010, at 10:24 AM, Tim Prouty wrote:

> Hi,
>
> This is my first time sending a patch to the list, so let me know if
> there are any conventions I missed.
>
> Attached is a patch that attempts to remove the data structure
> limitations on the number of open file descriptors in the system. The
> patch is against our modified version of FreeBSD 7, so it probably
> won't apply cleanly against upstream, but I wanted to get this out
> there for discussion soon so if there is feedback, we can address it
> and then worry about porting a specific patch for upstream.
>
> We (Isilon) have been running this internally for a few months without
> any issues, although there is at least one known issue that I need to
> resolve, which is mentioned below.
>
> Motivation:
>
> With the increasing amount of memory and processing power in modern
> machines, there are certain userspace processes that are able to
> handle much higher concurrent load than previously possible. A
> specific example is a single-process/multi-threaded SMB stack which
> can handle thousands of connected clients, each with hundreds of
> files open. Once kernel sysctl limits are increased for max files,
> the next limitation is in the actual file descriptor data
> structures.
>
> Problem - Data Structure Limits:
>
> The existing per-process data structures for file descriptors are
> flat tables, which are reallocated each time they need to grow.
> This is inefficient as the amount of data to allocate and copy each
> time increases, but the bigger issue is the potentially limited
> amount of contiguous KVA memory as the table grows very large. Over
> time, as the KVA memory becomes fragmented, malloc may be unable to
> provide large enough blocks of contiguous memory.
>
> In the current code the struct proc contains both an array of struct
> file pointers and a bit field indicating which file descriptors are
> in use. The primary issue is how to handle these structures growing
> beyond the kernel page size of 4K.
>
> The array of file pointers will grow much faster than the bit field,
> especially on a 64-bit kernel. The 4K block size will be hit at 512
> files (64-bit kernel) for the file pointer array and at 32,768 files
> for the bit field.
>
> Solution:
>
> File Pointer Array:
>
> Focusing first on the file pointer array limitation, an indirect
> block approach is used. An indirect block size of 4K (equal to page
> size) is used, allowing for 512 files per block. To optimize for
> the common case of low/normal fd usage, a flat array is initialized
> to 20 entries and then grows at 2x each time until it reaches its
> maximum size. Once more than 512 files are opened, the array will
> transition to a single-level indirect block table.
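For reference, the indirect lookup described above reduces to integer division and modulus on the entry index. Below is a minimal userspace sketch of the idea, assuming 4K blocks of pointer-sized entries (512 per block on a 64-bit machine); the struct and function names and the malloc-based allocation are illustrative, not the patch's kernel code:

	#include <stdlib.h>

	#define IDB_ENTS_PER_BLOCK 512	/* 4K block / pointer-sized (8-byte) entries */

	/* Simplified table: a root array of block pointers; NULL means sparse. */
	struct idb_sketch {
		void **blocks;		/* root array of indirect block pointers */
		int nblocks;		/* number of slots in the root array */
	};

	/* Map an entry index to (block, offset) and fetch the stored pointer. */
	static void *
	idb_sketch_get(struct idb_sketch *t, int ent)
	{
		int blk = ent / IDB_ENTS_PER_BLOCK;	/* which indirect block */
		int off = ent % IDB_ENTS_PER_BLOCK;	/* slot within that block */
		void **block;

		if (blk >= t->nblocks || (block = t->blocks[blk]) == NULL)
			return (NULL);			/* out of range, or sparse block */
		return (block[off]);
	}

	/* Store a pointer, allocating the leaf block on first use. */
	static int
	idb_sketch_set(struct idb_sketch *t, int ent, void *p)
	{
		int blk = ent / IDB_ENTS_PER_BLOCK;
		void **block;

		if (blk >= t->nblocks)
			return (-1);		/* root must be grown first */
		if ((block = t->blocks[blk]) == NULL) {
			block = calloc(IDB_ENTS_PER_BLOCK, sizeof(void *));
			if (block == NULL)
				return (-1);
			t->blocks[blk] = block;
		}
		block[ent % IDB_ENTS_PER_BLOCK] = p;
		return (0);
	}

The kernel version in the patch additionally keeps a flat mode for tables that still fit in one block, and drops the filedesc sx lock around each allocation, rechecking for a racing grower after reacquiring it.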
> Fd Bitfield:
>
> The fd bit field as it stands can represent 32K files when it grows
> to the page size limit. Using the same indirect system as the file
> pointer array, it is able to grow beyond its existing limits.
>
> Close Exec Field:
>
> One complication of the old file pointer table is that for each file
> pointer there was a one-byte flags field. The memory was laid out
> such that the file pointers are all in one contiguous array, followed
> by a second array of chars where each char entry is a flags field
> that corresponds to the file pointer at the same index. Interestingly,
> there is actually only one flag that is used: UF_EXCLOSE, so it's
> fairly wasteful to have an array of chars. What Linux does, and
> what I have done, is to just use a bitfield for all fds that should
> be closed on exec. This could be further optimized by doing some
> pointer trickery to store the close exec bit in the struct file
> pointer rather than keeping a separate bitfield.
>
> Indirect Block Table:
>
> Since there are three consumers of the indirect block table, I
> generalized it so all of the consumers rely on the same code. This
> could eventually be refactored into a kernel library since it could
> be generally useful in other areas. The table uses a single level
> of indirection, so the base table can still grow beyond 4K. As
> a process uses more fds, the need to continue growing the base table
> should be fairly limited, and a single realloc will significantly
> increase the number of fds the process can allocate.
>
> Accessing the new data structures:
>
> All consumers of the file pointer array and bitfield will now have
> to use accessors rather than direct access.
>
> Known Issues:
>
> The new fdp locking in fdcopy needs to be reworked.
>
> Thank you for reviewing!
>
> -Tim
>
> <0001-Increase-scalabilty-of-per-process-file-descriptor-d.patch>
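To make the close-on-exec bitfield concrete: it stores one bit per descriptor, NDENTRIES bits per word. A small standalone sketch, assuming 64-bit words (the NDSLOT/NDBIT names mirror the patch's macros; the free-standing functions are illustrative only):

	#include <stdint.h>

	#define NDENTRIES	64			/* bits per 64-bit word */
	#define NDSLOT(fd)	((fd) / NDENTRIES)	/* word holding fd's bit */
	#define NDBIT(fd)	((uint64_t)1 << ((fd) % NDENTRIES))

	/* Set or clear the close-on-exec bit for fd in map[]. */
	static void
	cloexec_set(uint64_t *map, int fd, int on)
	{
		if (on)
			map[NDSLOT(fd)] |= NDBIT(fd);
		else
			map[NDSLOT(fd)] &= ~NDBIT(fd);
	}

	/* Return nonzero if fd is marked close-on-exec. */
	static int
	cloexec_get(const uint64_t *map, int fd)
	{
		return ((map[NDSLOT(fd)] & NDBIT(fd)) != 0);
	}

At one bit per descriptor, a single 4K block covers 32,768 fds, versus 4,096 with the old one-byte-per-fd flags array.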
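Finally, the flat-table sizing policy the message describes (a 20-entry start, doubling, then transitioning to indirect) can be sketched as follows; the function name is hypothetical, and the power-of-two rounding matches the patch's use of fls() to appease the allocator:

	#define IDB_ENTS_PER_BLOCK 512	/* 4K block / 8-byte entries */

	/*
	 * Next capacity for a flat table that must hold index `ent`:
	 * round up to a power of two, never shrink, cap at one block.
	 */
	static int
	next_flat_capacity(int cur, int ent)
	{
		int n;

		for (n = 1; n <= ent; n <<= 1)	/* smallest power of two > ent */
			;
		if (n < cur)
			n = cur;
		return (n < IDB_ENTS_PER_BLOCK ? n : IDB_ENTS_PER_BLOCK);
	}

Starting from 20 entries this yields 32, 64, 128, 256, 512; once a descriptor past 511 is needed, the full 512-entry flat array becomes leaf 0 of the new indirect root (as idb_transition_to_indirect() does in the patch), so the existing pointers never have to be copied again.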