Date: Sun, 12 May 2013 04:05:02 +0000 (UTC)
From: Jeff Roberson <jeff@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r250551 - in head/sys: conf kern sys
Message-ID: <201305120405.r4C452fr018368@svn.freebsd.org>
Author: jeff
Date: Sun May 12 04:05:01 2013
New Revision: 250551
URL: http://svnweb.freebsd.org/changeset/base/250551

Log:
   - Add a new general purpose path-compressed radix trie which can be
     used with any structure containing a uint64_t index.  The tree code
     auto-generates type safe wrappers.
   - Eliminate the buf splay and replace it with pctrie.  This is not only
     significantly faster with large files but also allows for the
     possibility of shared locking.
  
  Reviewed by:	alc, attilio
  Sponsored by:	EMC / Isilon Storage Division

Added:
  head/sys/kern/subr_pctrie.c   (contents, props changed)
  head/sys/sys/_pctrie.h
     - copied, changed from r249323, head/sys/vm/_vm_radix.h
  head/sys/sys/pctrie.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/kern/vfs_subr.c
  head/sys/sys/buf.h
  head/sys/sys/bufobj.h

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Sun May 12 03:36:28 2013	(r250550)
+++ head/sys/conf/files	Sun May 12 04:05:01 2013	(r250551)
@@ -2760,6 +2760,7 @@ kern/subr_module.c		standard
 kern/subr_msgbuf.c		standard
 kern/subr_param.c		standard
 kern/subr_pcpu.c		standard
+kern/subr_pctrie.c		standard
 kern/subr_power.c		standard
 kern/subr_prf.c			standard
 kern/subr_prof.c		standard

Added: head/sys/kern/subr_pctrie.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/kern/subr_pctrie.c	Sun May 12 04:05:01 2013	(r250551)
@@ -0,0 +1,705 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Path-compressed radix trie implementation.
+ *
+ * The implementation takes into account the following rationale:
+ * - Size of the nodes should be as small as possible but still big enough
+ *   to avoid a large maximum depth for the trie.  This is a balance
+ *   between the necessity to not wire too much physical memory for the nodes
+ *   and the necessity to avoid too much cache pollution during the trie
+ *   operations.
+ * - There is not a huge bias toward the number of lookup operations over
+ *   the number of insert and remove operations.  This basically implies
+ *   that optimizations supposedly helping one operation but hurting the
+ *   other might be carefully evaluated.
+ * - On average not many nodes are expected to be fully populated, hence
+ *   level compression may just complicate things.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/pctrie.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * These widths should allow the pointers to a node's children to fit within
+ * a single cache line.  The extra levels from a narrow width should not be
+ * a problem thanks to path compression.
+ */
+#ifdef __LP64__
+#define	PCTRIE_WIDTH	4
+#else
+#define	PCTRIE_WIDTH	3
+#endif
+
+#define	PCTRIE_COUNT	(1 << PCTRIE_WIDTH)
+#define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
+#define	PCTRIE_LIMIT	(howmany((sizeof(uint64_t) * NBBY), PCTRIE_WIDTH) - 1)
+
+/* Flag bits stored in node pointers. */
+#define	PCTRIE_ISLEAF	0x1
+#define	PCTRIE_FLAGS	0x1
+#define	PCTRIE_PAD	PCTRIE_FLAGS
+
+/* Returns one unit associated with specified level. */
+#define	PCTRIE_UNITLEVEL(lev)						\
+	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
+
+struct pctrie_node {
+	uint64_t	 pn_owner;			/* Owner of record. */
+	uint16_t	 pn_count;			/* Valid children. */
+	uint16_t	 pn_clev;			/* Current level. */
+	void		*pn_child[PCTRIE_COUNT];	/* Child nodes. */
+};
+
+/*
+ * Allocate a node.  Pre-allocation should ensure that the request
+ * will always be satisfied.
+ */
+static __inline struct pctrie_node *
+pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
+    uint16_t count, uint16_t clevel)
+{
+	struct pctrie_node *node;
+
+	node = allocfn(ptree);
+	if (node == NULL)
+		return (NULL);
+	node->pn_owner = owner;
+	node->pn_count = count;
+	node->pn_clev = clevel;
+
+	return (node);
+}
+
+/*
+ * Free radix node.
+ */
+static __inline void
+pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
+    pctrie_free_t freefn)
+{
+#ifdef INVARIANTS
+	int slot;
+
+	KASSERT(node->pn_count == 0,
+	    ("pctrie_node_put: node %p has %d children", node,
+	    node->pn_count));
+	for (slot = 0; slot < PCTRIE_COUNT; slot++)
+		KASSERT(node->pn_child[slot] == NULL,
+		    ("pctrie_node_put: node %p has a child", node));
+#endif
+	freefn(ptree, node);
+}
+
+/*
+ * Return the position in the array for a given level.
+ */
+static __inline int
+pctrie_slot(uint64_t index, uint16_t level)
+{
+
+	return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
+}
+
+/* Trims the key after the specified level. */
+static __inline uint64_t
+pctrie_trimkey(uint64_t index, uint16_t level)
+{
+	uint64_t ret;
+
+	ret = index;
+	if (level > 0) {
+		ret >>= level * PCTRIE_WIDTH;
+		ret <<= level * PCTRIE_WIDTH;
+	}
+	return (ret);
+}
+
+/*
+ * Get the root node for a tree.
+ */
+static __inline struct pctrie_node *
+pctrie_getroot(struct pctrie *ptree)
+{
+
+	return ((struct pctrie_node *)ptree->pt_root);
+}
+
+/*
+ * Set the root node for a tree.
+ */
+static __inline void
+pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
+{
+
+	ptree->pt_root = (uintptr_t)node;
+}
+
+/*
+ * Returns TRUE if the specified node is a leaf and FALSE otherwise.
+ */
+static __inline boolean_t
+pctrie_isleaf(struct pctrie_node *node)
+{
+
+	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
+}
+
+/*
+ * Returns the associated val extracted from node.
+ */
+static __inline uint64_t *
+pctrie_toval(struct pctrie_node *node)
+{
+
+	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
+}
+
+/*
+ * Adds the val as a child of the provided node.
+ */
+static __inline void
+pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
+    uint64_t *val)
+{
+	int slot;
+
+	slot = pctrie_slot(index, clev);
+	node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
+}
+
+/*
+ * Returns the slot where two keys differ.
+ * It cannot accept 2 equal keys.
+ */
+static __inline uint16_t
+pctrie_keydiff(uint64_t index1, uint64_t index2)
+{
+	uint16_t clev;
+
+	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
+	    __func__, (uintmax_t)index1));
+
+	index1 ^= index2;
+	for (clev = PCTRIE_LIMIT;; clev--)
+		if (pctrie_slot(index1, clev) != 0)
+			return (clev);
+}
+
+/*
+ * Returns TRUE if it can be determined that key does not belong to the
+ * specified node.  Otherwise, returns FALSE.
+ */
+static __inline boolean_t
+pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
+{
+
+	if (node->pn_clev < PCTRIE_LIMIT) {
+		idx = pctrie_trimkey(idx, node->pn_clev + 1);
+		return (idx != node->pn_owner);
+	}
+	return (FALSE);
+}
+
+/*
+ * Internal helper for pctrie_reclaim_allnodes().
+ * This function is recursive.
+ */
+static void
+pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
+    pctrie_free_t freefn)
+{
+	int slot;
+
+	KASSERT(node->pn_count <= PCTRIE_COUNT,
+	    ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
+	for (slot = 0; node->pn_count != 0; slot++) {
+		if (node->pn_child[slot] == NULL)
+			continue;
+		if (!pctrie_isleaf(node->pn_child[slot]))
+			pctrie_reclaim_allnodes_int(ptree,
+			    node->pn_child[slot], freefn);
+		node->pn_child[slot] = NULL;
+		node->pn_count--;
+	}
+	pctrie_node_put(ptree, node, freefn);
+}
+
+/*
+ * pctrie node zone initializer.
+ */
+int
+pctrie_zone_init(void *mem, int size __unused, int flags __unused)
+{
+	struct pctrie_node *node;
+
+	node = mem;
+	memset(node->pn_child, 0, sizeof(node->pn_child));
+	return (0);
+}
+
+size_t
+pctrie_node_size(void)
+{
+
+	return (sizeof(struct pctrie_node));
+}
+
+/*
+ * Inserts the key-value pair into the trie.
+ * Panics if the key already exists.
+ */
+int
+pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
+{
+	uint64_t index, newind;
+	void **parentp;
+	struct pctrie_node *node, *tmp;
+	uint64_t *m;
+	int slot;
+	uint16_t clev;
+
+	index = *val;
+
+	/*
+	 * The owner of record for root is not really important because it
+	 * will never be used.
+	 */
+	node = pctrie_getroot(ptree);
+	if (node == NULL) {
+		ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
+		return (0);
+	}
+	parentp = (void **)&ptree->pt_root;
+	for (;;) {
+		if (pctrie_isleaf(node)) {
+			m = pctrie_toval(node);
+			if (*m == index)
+				panic("%s: key %jx is already present",
+				    __func__, (uintmax_t)index);
+			clev = pctrie_keydiff(*m, index);
+			tmp = pctrie_node_get(ptree, allocfn,
+			    pctrie_trimkey(index, clev + 1), 2, clev);
+			if (tmp == NULL)
+				return (ENOMEM);
+			*parentp = tmp;
+			pctrie_addval(tmp, index, clev, val);
+			pctrie_addval(tmp, *m, clev, m);
+			return (0);
+		} else if (pctrie_keybarr(node, index))
+			break;
+		slot = pctrie_slot(index, node->pn_clev);
+		if (node->pn_child[slot] == NULL) {
+			node->pn_count++;
+			pctrie_addval(node, index, node->pn_clev, val);
+			return (0);
+		}
+		parentp = &node->pn_child[slot];
+		node = node->pn_child[slot];
+	}
+
+	/*
+	 * A new node is needed because the right insertion level is reached.
+	 * Setup the new intermediate node and add the 2 children: the
+	 * new object and the older edge.
+	 */
+	newind = node->pn_owner;
+	clev = pctrie_keydiff(newind, index);
+	tmp = pctrie_node_get(ptree, allocfn,
+	    pctrie_trimkey(index, clev + 1), 2, clev);
+	if (tmp == NULL)
+		return (ENOMEM);
+	*parentp = tmp;
+	pctrie_addval(tmp, index, clev, val);
+	slot = pctrie_slot(newind, clev);
+	tmp->pn_child[slot] = node;
+
+	return (0);
+}
+
+/*
+ * Returns the value stored at the index.  If the index is not present,
+ * NULL is returned.
+ */
+uint64_t *
+pctrie_lookup(struct pctrie *ptree, uint64_t index)
+{
+	struct pctrie_node *node;
+	uint64_t *m;
+	int slot;
+
+	node = pctrie_getroot(ptree);
+	while (node != NULL) {
+		if (pctrie_isleaf(node)) {
+			m = pctrie_toval(node);
+			if (*m == index)
+				return (m);
+			else
+				break;
+		} else if (pctrie_keybarr(node, index))
+			break;
+		slot = pctrie_slot(index, node->pn_clev);
+		node = node->pn_child[slot];
+	}
+	return (NULL);
+}
+
+/*
+ * Look up the nearest entry at a position bigger than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
+{
+	struct pctrie_node *stack[PCTRIE_LIMIT];
+	uint64_t inc;
+	uint64_t *m;
+	struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+	int loops = 0;
+#endif
+	int slot, tos;
+
+	node = pctrie_getroot(ptree);
+	if (node == NULL)
+		return (NULL);
+	else if (pctrie_isleaf(node)) {
+		m = pctrie_toval(node);
+		if (*m >= index)
+			return (m);
+		else
+			return (NULL);
+	}
+	tos = 0;
+	for (;;) {
+		/*
+		 * If the keys differ before the current bisection node,
+		 * then the search key might rollback to the earliest
+		 * available bisection node or to the smallest key
+		 * in the current node (if the owner is bigger than the
+		 * search key).
+		 */
+		if (pctrie_keybarr(node, index)) {
+			if (index > node->pn_owner) {
+ascend:
+				KASSERT(++loops < 1000,
+				    ("pctrie_lookup_ge: too many loops"));
+
+				/*
+				 * Pop nodes from the stack until either the
+				 * stack is empty or a node that could have a
+				 * matching descendant is found.
+				 */
+				do {
+					if (tos == 0)
+						return (NULL);
+					node = stack[--tos];
+				} while (pctrie_slot(index,
+				    node->pn_clev) == (PCTRIE_COUNT - 1));
+
+				/*
+				 * The following computation cannot overflow
+				 * because index's slot at the current level
+				 * is less than PCTRIE_COUNT - 1.
+				 */
+				index = pctrie_trimkey(index,
+				    node->pn_clev);
+				index += PCTRIE_UNITLEVEL(node->pn_clev);
+			} else
+				index = node->pn_owner;
+			KASSERT(!pctrie_keybarr(node, index),
+			    ("pctrie_lookup_ge: keybarr failed"));
+		}
+		slot = pctrie_slot(index, node->pn_clev);
+		child = node->pn_child[slot];
+		if (pctrie_isleaf(child)) {
+			m = pctrie_toval(child);
+			if (*m >= index)
+				return (m);
+		} else if (child != NULL)
+			goto descend;
+
+		/*
+		 * Look for an available edge or val within the current
+		 * bisection node.
+		 */
+		if (slot < (PCTRIE_COUNT - 1)) {
+			inc = PCTRIE_UNITLEVEL(node->pn_clev);
+			index = pctrie_trimkey(index, node->pn_clev);
+			do {
+				index += inc;
+				slot++;
+				child = node->pn_child[slot];
+				if (pctrie_isleaf(child)) {
+					m = pctrie_toval(child);
+					if (*m >= index)
+						return (m);
+				} else if (child != NULL)
+					goto descend;
+			} while (slot < (PCTRIE_COUNT - 1));
+		}
+		KASSERT(child == NULL || pctrie_isleaf(child),
+		    ("pctrie_lookup_ge: child is radix node"));
+
+		/*
+		 * If a value or edge bigger than the search slot is not found
+		 * in the current node, ascend to the next higher-level node.
+		 */
+		goto ascend;
+descend:
+		KASSERT(node->pn_clev > 0,
+		    ("pctrie_lookup_ge: pushing leaf's parent"));
+		KASSERT(tos < PCTRIE_LIMIT,
+		    ("pctrie_lookup_ge: stack overflow"));
+		stack[tos++] = node;
+		node = child;
+	}
+}
+
+/*
+ * Look up the nearest entry at a position less than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
+{
+	struct pctrie_node *stack[PCTRIE_LIMIT];
+	uint64_t inc;
+	uint64_t *m;
+	struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+	int loops = 0;
+#endif
+	int slot, tos;
+
+	node = pctrie_getroot(ptree);
+	if (node == NULL)
+		return (NULL);
+	else if (pctrie_isleaf(node)) {
+		m = pctrie_toval(node);
+		if (*m <= index)
+			return (m);
+		else
+			return (NULL);
+	}
+	tos = 0;
+	for (;;) {
+		/*
+		 * If the keys differ before the current bisection node,
+		 * then the search key might rollback to the earliest
+		 * available bisection node or to the largest key
+		 * in the current node (if the owner is smaller than the
+		 * search key).
+		 */
+		if (pctrie_keybarr(node, index)) {
+			if (index > node->pn_owner) {
+				index = node->pn_owner + PCTRIE_COUNT *
+				    PCTRIE_UNITLEVEL(node->pn_clev);
+			} else {
+ascend:
+				KASSERT(++loops < 1000,
+				    ("pctrie_lookup_le: too many loops"));
+
+				/*
+				 * Pop nodes from the stack until either the
+				 * stack is empty or a node that could have a
+				 * matching descendant is found.
+				 */
+				do {
+					if (tos == 0)
+						return (NULL);
+					node = stack[--tos];
+				} while (pctrie_slot(index,
+				    node->pn_clev) == 0);
+
+				/*
+				 * The following computation cannot overflow
+				 * because index's slot at the current level
+				 * is greater than 0.
+				 */
+				index = pctrie_trimkey(index,
+				    node->pn_clev);
+			}
+			index--;
+			KASSERT(!pctrie_keybarr(node, index),
+			    ("pctrie_lookup_le: keybarr failed"));
+		}
+		slot = pctrie_slot(index, node->pn_clev);
+		child = node->pn_child[slot];
+		if (pctrie_isleaf(child)) {
+			m = pctrie_toval(child);
+			if (*m <= index)
+				return (m);
+		} else if (child != NULL)
+			goto descend;
+
+		/*
+		 * Look for an available edge or value within the current
+		 * bisection node.
+		 */
+		if (slot > 0) {
+			inc = PCTRIE_UNITLEVEL(node->pn_clev);
+			index |= inc - 1;
+			do {
+				index -= inc;
+				slot--;
+				child = node->pn_child[slot];
+				if (pctrie_isleaf(child)) {
+					m = pctrie_toval(child);
+					if (*m <= index)
+						return (m);
+				} else if (child != NULL)
+					goto descend;
+			} while (slot > 0);
+		}
+		KASSERT(child == NULL || pctrie_isleaf(child),
+		    ("pctrie_lookup_le: child is radix node"));
+
+		/*
+		 * If a value or edge smaller than the search slot is not found
+		 * in the current node, ascend to the next higher-level node.
+		 */
+		goto ascend;
+descend:
+		KASSERT(node->pn_clev > 0,
+		    ("pctrie_lookup_le: pushing leaf's parent"));
+		KASSERT(tos < PCTRIE_LIMIT,
+		    ("pctrie_lookup_le: stack overflow"));
+		stack[tos++] = node;
+		node = child;
+	}
+}
+
+/*
+ * Remove the specified index from the tree.
+ * Panics if the key is not present.
+ */
+void
+pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
+{
+	struct pctrie_node *node, *parent;
+	uint64_t *m;
+	int i, slot;
+
+	node = pctrie_getroot(ptree);
+	if (pctrie_isleaf(node)) {
+		m = pctrie_toval(node);
+		if (*m != index)
+			panic("%s: invalid key found", __func__);
+		pctrie_setroot(ptree, NULL);
+		return;
+	}
+	parent = NULL;
+	for (;;) {
+		if (node == NULL)
+			panic("pctrie_remove: impossible to locate the key");
+		slot = pctrie_slot(index, node->pn_clev);
+		if (pctrie_isleaf(node->pn_child[slot])) {
+			m = pctrie_toval(node->pn_child[slot]);
+			if (*m != index)
+				panic("%s: invalid key found", __func__);
+			node->pn_child[slot] = NULL;
+			node->pn_count--;
+			if (node->pn_count > 1)
+				break;
+			for (i = 0; i < PCTRIE_COUNT; i++)
+				if (node->pn_child[i] != NULL)
+					break;
+			KASSERT(i != PCTRIE_COUNT,
+			    ("%s: invalid node configuration", __func__));
+			if (parent == NULL)
+				pctrie_setroot(ptree, node->pn_child[i]);
+			else {
+				slot = pctrie_slot(index, parent->pn_clev);
+				KASSERT(parent->pn_child[slot] == node,
+				    ("%s: invalid child value", __func__));
+				parent->pn_child[slot] = node->pn_child[i];
+			}
+			node->pn_count--;
+			node->pn_child[i] = NULL;
+			pctrie_node_put(ptree, node, freefn);
+			break;
+		}
+		parent = node;
+		node = node->pn_child[slot];
+	}
+}
+
+/*
+ * Remove and free all the nodes from the tree.
+ * This function is recursive but there is a tight control on it as the
+ * maximum depth of the tree is fixed.
+ */
+void
+pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
+{
+	struct pctrie_node *root;
+
+	root = pctrie_getroot(ptree);
+	if (root == NULL)
+		return;
+	pctrie_setroot(ptree, NULL);
+	if (!pctrie_isleaf(root))
+		pctrie_reclaim_allnodes_int(ptree, root, freefn);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given node.
+ */
+DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
+{
+	struct pctrie_node *node;
+	int i;
+
+	if (!have_addr)
+		return;
+	node = (struct pctrie_node *)addr;
+	db_printf("node %p, owner %jx, children count %u, level %u:\n",
+	    (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
+	    node->pn_clev);
+	for (i = 0; i < PCTRIE_COUNT; i++)
+		if (node->pn_child[i] != NULL)
+			db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
+			    i, (void *)node->pn_child[i],
+			    pctrie_isleaf(node->pn_child[i]) ?
+			    pctrie_toval(node->pn_child[i]) : NULL,
+			    node->pn_clev);
+}
+#endif /* DDB */

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c	Sun May 12 03:36:28 2013	(r250550)
+++ head/sys/kern/vfs_subr.c	Sun May 12 04:05:01 2013	(r250551)
@@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
+#include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
@@ -184,6 +185,8 @@ static struct mtx vnode_free_list_mtx;
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
+static uma_zone_t buf_trie_zone;
+
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
@@ -284,6 +287,24 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhe
 static int vnsz2log;
 
 /*
+ * Support for the bufobj clean & dirty pctrie.
+ */
+static void *
+buf_trie_alloc(struct pctrie *ptree)
+{
+
+	return uma_zalloc(buf_trie_zone, M_NOWAIT);
+}
+
+static void
+buf_trie_free(struct pctrie *ptree, void *node)
+{
+
+	uma_zfree(buf_trie_zone, node);
+}
+PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
+
+/*
  * Initialize the vnode management data structures.
  *
  * Reevaluate the following cap on the number of vnodes after the physical
@@ -329,6 +350,15 @@ vntblinit(void *dummy __unused)
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
+	 * Preallocate enough nodes to support one-per buf so that
+	 * we can not fail an insert.  reassignbuf() callers can not
+	 * tolerate the insertion failure.
+	 */
+	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
+	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
+	uma_prealloc(buf_trie_zone, nbuf);
+	/*
 	 * Initialize the filesystem syncer.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
@@ -1476,75 +1506,9 @@ restartsync:
 	return (0);
 }
 
-/*
- * buf_splay() - splay tree core for the clean/dirty list of buffers in
- * a vnode.
- *
- * NOTE: We have to deal with the special case of a background bitmap
- * buffer, a situation where two buffers will have the same logical
- * block offset.  We want (1) only the foreground buffer to be accessed
- * in a lookup and (2) must differentiate between the foreground and
- * background buffer in the splay tree algorithm because the splay
- * tree cannot normally handle multiple entities with the same 'index'.
- * We accomplish this by adding differentiating flags to the splay tree's
- * numerical domain.
- */
-static
-struct buf *
-buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
-{
-	struct buf dummy;
-	struct buf *lefttreemax, *righttreemin, *y;
-
-	if (root == NULL)
-		return (NULL);
-	lefttreemax = righttreemin = &dummy;
-	for (;;) {
-		if (lblkno < root->b_lblkno) {
-			if ((y = root->b_left) == NULL)
-				break;
-			if (lblkno < y->b_lblkno) {
-				/* Rotate right. */
-				root->b_left = y->b_right;
-				y->b_right = root;
-				root = y;
-				if ((y = root->b_left) == NULL)
-					break;
-			}
-			/* Link into the new root's right tree. */
-			righttreemin->b_left = root;
-			righttreemin = root;
-		} else if (lblkno > root->b_lblkno) {
-			if ((y = root->b_right) == NULL)
-				break;
-			if (lblkno > y->b_lblkno) {
-				/* Rotate left. */
-				root->b_right = y->b_left;
-				y->b_left = root;
-				root = y;
-				if ((y = root->b_right) == NULL)
-					break;
-			}
-			/* Link into the new root's left tree. */
-			lefttreemax->b_right = root;
-			lefttreemax = root;
-		} else {
-			break;
-		}
-		root = y;
-	}
-	/* Assemble the new root. */
-	lefttreemax->b_right = root->b_left;
-	righttreemin->b_left = root->b_right;
-	root->b_left = dummy.b_right;
-	root->b_right = dummy.b_left;
-	return (root);
-}
-
 static void
 buf_vlist_remove(struct buf *bp)
 {
-	struct buf *root;
 	struct bufv *bv;
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
@@ -1556,33 +1520,23 @@ buf_vlist_remove(struct buf *bp)
 		bv = &bp->b_bufobj->bo_dirty;
 	else
 		bv = &bp->b_bufobj->bo_clean;
-	if (bp != bv->bv_root) {
-		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
-		KASSERT(root == bp, ("splay lookup failed in remove"));
-	}
-	if (bp->b_left == NULL) {
-		root = bp->b_right;
-	} else {
-		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
-		root->b_right = bp->b_right;
-	}
-	bv->bv_root = root;
+	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 	bv->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
- * Add the buffer to the sorted clean or dirty block list using a
- * splay tree algorithm.
+ * Add the buffer to the sorted clean or dirty block list.
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
-	struct buf *root;
 	struct bufv *bv;
+	struct buf *n;
+	int error;
 
 	ASSERT_BO_LOCKED(bo);
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
@@ -1593,24 +1547,22 @@ buf_vlist_add(struct buf *bp, struct buf
 	else
 		bv = &bo->bo_clean;
 
-	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
-	if (root == NULL) {
-		bp->b_left = NULL;
-		bp->b_right = NULL;
+	/*
+	 * Keep the list ordered.  Optimize empty list insertion.  Assume
+	 * we tend to grow at the tail so lookup_le should usually be cheaper
+	 * than _ge.
+	 */
+	if (bv->bv_cnt == 0 ||
+	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
-	} else if (bp->b_lblkno < root->b_lblkno) {
-		bp->b_left = root->b_left;
-		bp->b_right = root;
-		root->b_left = NULL;
-		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
-	} else {
-		bp->b_right = root->b_right;
-		bp->b_left = root;
-		root->b_right = NULL;
-		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
-	}
+	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
+		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
+	else
+		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
+	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
+	if (error)
+		panic("buf_vlist_add: Preallocated nodes insufficient.");
 	bv->bv_cnt++;
-	bv->bv_root = bp;
 }
 
 /*
@@ -1631,21 +1583,10 @@ gbincore(struct bufobj *bo, daddr_t lblk
 	struct buf *bp;
 
 	ASSERT_BO_LOCKED(bo);
-	if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno)
+	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
+	if (bp != NULL)
 		return (bp);
-	if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno)
-		return (bp);
-	if ((bp = bo->bo_clean.bv_root) != NULL) {
-		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
-		if (bp->b_lblkno == lblkno)
-			return (bp);
-	}
-	if ((bp = bo->bo_dirty.bv_root) != NULL) {
-		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
-		if (bp->b_lblkno == lblkno)
-			return (bp);
-	}
-	return (NULL);
+	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
 }
 
 /*
@@ -2460,9 +2401,11 @@ vdropl(struct vnode *vp)
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
-	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
+	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
+	    ("clean blk trie not empty"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
-	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
+	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
+	    ("dirty blk trie not empty"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));

Copied and modified: head/sys/sys/_pctrie.h (from r249323, head/sys/vm/_vm_radix.h)
==============================================================================
--- head/sys/vm/_vm_radix.h	Wed Apr 10 02:40:03 2013	(r249323, copy source)
+++ head/sys/sys/_pctrie.h	Sun May 12 04:05:01 2013	(r250551)
@@ -28,24 +28,24 @@
  * $FreeBSD$
  */
 
-#ifndef __VM_RADIX_H_
-#define __VM_RADIX_H_
+#ifndef __SYS_PCTRIE_H_
+#define __SYS_PCTRIE_H_
 
 /*
  * Radix tree root.
  */
-struct vm_radix {
-	uintptr_t	rt_root;
+struct pctrie {
+	uintptr_t	pt_root;
 };
 
 #ifdef _KERNEL
 static __inline boolean_t
-vm_radix_is_empty(struct vm_radix *rtree)
+pctrie_is_empty(struct pctrie *ptree)
 {
 
-	return (rtree->rt_root == 0);
+	return (ptree->pt_root == 0);
 }

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***