Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 25 Jul 2017 04:13:43 +0000 (UTC)
From:      Alan Cox <alc@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject:   svn commit: r321459 - in stable/10/sys: kern sys
Message-ID:  <201707250413.v6P4DhQR070299@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: alc
Date: Tue Jul 25 04:13:43 2017
New Revision: 321459
URL: https://svnweb.freebsd.org/changeset/base/321459

Log:
  MFC r320077
    Change blist_alloc()'s allocation policy from first-fit to next-fit so
    that disk writes are more likely to be sequential.  This change is
    beneficial on both the solid state and mechanical disks that I've
    tested.  (A similar change in allocation policy was made by DragonFly
    BSD in 2013 to speed up Poudriere with "stressful memory parameters".)
  
    Increase the width of blst_meta_alloc()'s parameter "skip" and the local
    variables whose values are derived from it to 64 bits.  (This matches the
    width of the field "skip" that is stored in the structure "blist" and
    passed to blst_meta_alloc().)
  
    Eliminate a pointless check for a NULL blist_t.
  
    Simplify blst_meta_alloc()'s handling of the ALL-FREE case.
  
    Address nearby style errors.
  
  MFC r320417
    Address the remaining integer overflow issues with the "skip" parameters
    and "next_skip" variables.  The "skip" value in struct blist has long been
    a 64-bit quantity but various functions have implicitly truncated this
    value to 32 bits.  Now, all arithmetic involving the "skip" value is 64
    bits wide.  (This should allow us to relax the size limit on a swap device
    in the swap pager.)
  
    Maintain the ability to test this allocator as a user-space application by
    including <stdbool.h>.
  
    Remove an unused variable from blst_radix_print().
  
  MFC r320527
    Change blst_leaf_alloc() to handle a cursor argument, and to improve
    performance.
  
    To find in the leaf bitmap all ranges of sufficient length, use a doubling
    strategy with shift-and-and until each bit still set represents a bit
    sequence of length 'count', or until the bitmask is zero.  In the latter
    case, update the hint based on the first bit sequence length not found to
    be available.  For example, seeking an interval of length 12, the set bits
    of the bitmap would represent intervals of length 1, then 2, then 3, then
    6, then 12.  If no bits are set at the point when each bit represents an
    interval of length 6, then the hint can be updated to 5 and the search
    terminated.
  
    If long-enough intervals are found, discard those before the cursor.  If
    any remain, use binary search to find the position of the first of them,
    and allocate that interval.

Modified:
  stable/10/sys/kern/subr_blist.c
  stable/10/sys/sys/blist.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/kern/subr_blist.c
==============================================================================
--- stable/10/sys/kern/subr_blist.c	Tue Jul 25 03:59:35 2017	(r321458)
+++ stable/10/sys/kern/subr_blist.c	Tue Jul 25 04:13:43 2017	(r321459)
@@ -105,6 +105,7 @@ __FBSDID("$FreeBSD$");
 #include <string.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdbool.h>
 
 #define	bitcount64(x)	__bitcount64((uint64_t)(x))
 #define malloc(a,b,c)	calloc(a, 1)
@@ -120,22 +121,23 @@ void panic(const char *ctl, ...);
  * static support functions
  */
 
-static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
-static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, 
-				daddr_t count, daddr_t radix, int skip);
+static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
+		    daddr_t cursor);
+static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
+		    daddr_t radix, daddr_t skip, daddr_t cursor);
 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 
-					daddr_t radix, int skip, daddr_t blk);
+		    daddr_t radix, daddr_t skip, daddr_t blk);
 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 
 				daddr_t skip, blist_t dest, daddr_t count);
 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
-				daddr_t radix, int skip, daddr_t blk);
-static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, 
-						int skip, daddr_t count);
+		    daddr_t radix, daddr_t skip, daddr_t blk);
+static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip,
+		    daddr_t count);
 #ifndef _KERNEL
-static void	blst_radix_print(blmeta_t *scan, daddr_t blk, 
-					daddr_t radix, int skip, int tab);
+static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
+		    daddr_t skip, int tab);
 #endif
 
 #ifdef _KERNEL
@@ -157,18 +159,18 @@ blist_t 
 blist_create(daddr_t blocks, int flags)
 {
 	blist_t bl;
-	daddr_t nodes, radix;
-	int skip = 0;
+	daddr_t nodes, radix, skip;
 
 	/*
 	 * Calculate radix and skip field used for scanning.
 	 */
 	radix = BLIST_BMAP_RADIX;
-
+	skip = 0;
 	while (radix < blocks) {
 		radix *= BLIST_META_RADIX;
 		skip = (skip + 1) * BLIST_META_RADIX;
 	}
+	nodes = 1 + blst_radix_init(NULL, radix, skip, blocks);
 
 	bl = malloc(sizeof(struct blist), M_SWAP, flags);
 	if (bl == NULL)
@@ -177,13 +179,13 @@ blist_create(daddr_t blocks, int flags)
 	bl->bl_blocks = blocks;
 	bl->bl_radix = radix;
 	bl->bl_skip = skip;
-	nodes = 1 + blst_radix_init(NULL, radix, bl->bl_skip, blocks);
+	bl->bl_cursor = 0;
 	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
 	if (bl->bl_root == NULL) {
 		free(bl, M_SWAP);
 		return (NULL);
 	}
-	blst_radix_init(bl->bl_root, radix, bl->bl_skip, blocks);
+	blst_radix_init(bl->bl_root, radix, skip, blocks);
 
 #if defined(BLIST_DEBUG)
 	printf(
@@ -218,13 +220,24 @@ blist_alloc(blist_t bl, daddr_t count)
 {
 	daddr_t blk;
 
-	if (bl != NULL && count <= bl->bl_root->bm_bighint) {
+	/*
+	 * This loop iterates at most twice.  An allocation failure in the
+	 * first iteration leads to a second iteration only if the cursor was
+	 * non-zero.  When the cursor is zero, an allocation failure will
+	 * reduce the hint, stopping further iterations.
+	 */
+	while (count <= bl->bl_root->bm_bighint) {
 		if (bl->bl_radix == BLIST_BMAP_RADIX)
-			blk = blst_leaf_alloc(bl->bl_root, 0, count);
+			blk = blst_leaf_alloc(bl->bl_root, 0, count,
+			    bl->bl_cursor);
 		else
 			blk = blst_meta_alloc(bl->bl_root, 0, count,
-			    bl->bl_radix, bl->bl_skip);
-		return (blk);
+			    bl->bl_radix, bl->bl_skip, bl->bl_cursor);
+		if (blk != SWAPBLK_NONE) {
+			bl->bl_cursor = blk + count;
+			return (blk);
+		} else if (bl->bl_cursor != 0)
+			bl->bl_cursor = 0;
 	}
 	return (SWAPBLK_NONE);
 }
@@ -341,77 +354,92 @@ blist_print(blist_t bl)
 /*
  * blist_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
  *
- *	This is the core of the allocator and is optimized for the 1 block
- *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
- *	somewhat slower.  The 1 block allocation case is log2 and extremely
- *	quick.
+ *	This is the core of the allocator and is optimized for the
+ *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
+ *	time is proportional to log2(count) + log2(BLIST_BMAP_RADIX).
  */
 
 static daddr_t
-blst_leaf_alloc(
-	blmeta_t *scan,
-	daddr_t blk,
-	int count
-) {
-	u_daddr_t orig = scan->u.bmu_bitmap;
+blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
+{
+	u_daddr_t mask;
+	int count1, hi, lo, mid, num_shifts, range1, range_ext;
 
-	if (orig == 0) {
+	if (count == BLIST_BMAP_RADIX) {
 		/*
-		 * Optimize bitmap all-allocated case.  Also, count = 1
-		 * case assumes at least 1 bit is free in the bitmap, so
-		 * we have to take care of this case here.
+		 * Optimize allocation of BLIST_BMAP_RADIX bits.  If this wasn't
+		 * a special case, then forming the final value of 'mask' below
+		 * would require special handling to avoid an invalid left shift
+		 * when count equals the number of bits in mask.
 		 */
+		if (~scan->u.bmu_bitmap != 0) {
+			scan->bm_bighint = BLIST_BMAP_RADIX - 1;
+			return (SWAPBLK_NONE);
+		}
+		if (cursor != blk)
+			return (SWAPBLK_NONE);
+		scan->u.bmu_bitmap = 0;
 		scan->bm_bighint = 0;
-		return(SWAPBLK_NONE);
+		return (blk);
 	}
-	if (count == 1) {
+	range1 = 0;
+	count1 = count - 1;
+	num_shifts = fls(count1);
+	mask = scan->u.bmu_bitmap;
+	while (mask != 0 && num_shifts > 0) {
 		/*
-		 * Optimized code to allocate one bit out of the bitmap
+		 * If bit i is set in mask, then bits in [i, i+range1] are set
+		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
+		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
+		 * while preserving these invariants.  The updates to mask leave
+		 * fewer bits set, but each bit that remains set represents a
+		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
 		 */
-		u_daddr_t mask;
-		int j = BLIST_BMAP_RADIX/2;
-		int r = 0;
-
-		mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
-
-		while (j) {
-			if ((orig & mask) == 0) {
-			    r += j;
-			    orig >>= j;
-			}
-			j >>= 1;
-			mask >>= j;
-		}
-		scan->u.bmu_bitmap &= ~((u_daddr_t)1 << r);
-		return(blk + r);
+		num_shifts--;
+		range_ext = range1 + ((count1 >> num_shifts) & 1);
+		mask &= mask >> range_ext;
+		range1 += range_ext;
 	}
-	if (count <= BLIST_BMAP_RADIX) {
+	if (mask == 0) {
 		/*
-		 * non-optimized code to allocate N bits out of the bitmap.
-		 * The more bits, the faster the code runs.  It will run
-		 * the slowest allocating 2 bits, but since there aren't any
-		 * memory ops in the core loop (or shouldn't be, anyway),
-		 * you probably won't notice the difference.
+		 * Update bighint.  There is no allocation bigger than range1
+		 * available in this leaf.
 		 */
-		int j;
-		int n = BLIST_BMAP_RADIX - count;
-		u_daddr_t mask;
+		scan->bm_bighint = range1;
+		return (SWAPBLK_NONE);
+	}
 
-		mask = (u_daddr_t)-1 >> n;
+	/*
+	 * Discard any candidates that appear before the cursor.
+	 */
+	lo = cursor - blk;
+	mask &= ~(u_daddr_t)0 << lo;
 
-		for (j = 0; j <= n; ++j) {
-			if ((orig & mask) == mask) {
-				scan->u.bmu_bitmap &= ~mask;
-				return(blk + j);
-			}
-			mask = (mask << 1);
-		}
+	if (mask == 0)
+		return (SWAPBLK_NONE);
+
+	/*
+	 * The least significant set bit in mask marks the start of the first
+	 * available range of sufficient size.  Clear all the bits but that one,
+	 * and then perform a binary search to find its position.
+	 */
+	mask &= -mask;
+	hi = BLIST_BMAP_RADIX - count1;
+	while (lo + 1 < hi) {
+		mid = (lo + hi) >> 1;
+		if ((mask >> mid) != 0)
+			lo = mid;
+		else
+			hi = mid;
 	}
+
 	/*
-	 * We couldn't allocate count in this subtree, update bighint.
+	 * Set in mask exactly the bits being allocated, and clear them from
+	 * the set of available bits.
 	 */
-	scan->bm_bighint = count - 1;
-	return(SWAPBLK_NONE);
+	mask = (mask << count) - mask;
+	scan->u.bmu_bitmap &= ~mask;
+	return (blk + lo);
 }
 
 /*
@@ -424,16 +452,12 @@ blst_leaf_alloc(
  */
 
 static daddr_t
-blst_meta_alloc(
-	blmeta_t *scan, 
-	daddr_t blk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip
-) {
-	daddr_t r;
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
+blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t cursor)
+{
+	daddr_t i, next_skip, r;
+	int child;
+	bool scan_from_start;
 
 	if (scan->u.bmu_avail < count) {
 		/*
@@ -444,6 +468,7 @@ blst_meta_alloc(
 		scan->bm_bighint = scan->u.bmu_avail;
 		return (SWAPBLK_NONE);
 	}
+	next_skip = skip / BLIST_META_RADIX;
 
 	/*
 	 * An ALL-FREE meta node requires special handling before allocating
@@ -457,13 +482,11 @@ blst_meta_alloc(
 		 * meta node cannot have a terminator in any subtree.
 		 */
 		for (i = 1; i <= skip; i += next_skip) {
-			if (next_skip == 1) {
+			if (next_skip == 1)
 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
-				scan[i].bm_bighint = BLIST_BMAP_RADIX;
-			} else {
-				scan[i].bm_bighint = radix;
+			else
 				scan[i].u.bmu_avail = radix;
-			}
+			scan[i].bm_bighint = radix;
 		}
 	} else {
 		radix /= BLIST_META_RADIX;
@@ -476,16 +499,21 @@ blst_meta_alloc(
 		 */
 		panic("allocation too large");
 	}
-	for (i = 1; i <= skip; i += next_skip) {
+	scan_from_start = cursor == blk;
+	child = (cursor - blk) / radix;
+	blk += child * radix;
+	for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
 		if (count <= scan[i].bm_bighint) {
 			/*
 			 * The allocation might fit in the i'th subtree.
 			 */
 			if (next_skip == 1) {
-				r = blst_leaf_alloc(&scan[i], blk, count);
+				r = blst_leaf_alloc(&scan[i], blk, count,
+				    cursor > blk ? cursor : blk);
 			} else {
 				r = blst_meta_alloc(&scan[i], blk, count,
-				    radix, next_skip - 1);
+				    radix, next_skip - 1, cursor > blk ?
+				    cursor : blk);
 			}
 			if (r != SWAPBLK_NONE) {
 				scan->u.bmu_avail -= count;
@@ -503,9 +531,10 @@ blst_meta_alloc(
 	/*
 	 * We couldn't allocate count in this subtree, update bighint.
 	 */
-	if (scan->bm_bighint >= count)
+	if (scan_from_start && scan->bm_bighint >= count)
 		scan->bm_bighint = count - 1;
-	return(SWAPBLK_NONE);
+
+	return (SWAPBLK_NONE);
 }
 
 /*
@@ -558,16 +587,11 @@ blst_leaf_free(
  */
 
 static void 
-blst_meta_free(
-	blmeta_t *scan, 
-	daddr_t freeBlk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip,
-	daddr_t blk
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
+blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t blk)
+{
+	daddr_t i, next_skip, v;
+	int child;
 
 #if 0
 	printf("free (%llx,%lld) FROM (%llx,%lld)\n",
@@ -575,6 +599,7 @@ blst_meta_free(
 	    (long long)blk, (long long)radix
 	);
 #endif
+	next_skip = skip / BLIST_META_RADIX;
 
 	if (scan->u.bmu_avail == 0) {
 		/*
@@ -619,13 +644,10 @@ blst_meta_free(
 
 	radix /= BLIST_META_RADIX;
 
-	i = (freeBlk - blk) / radix;
-	blk += i * radix;
-	i = i * next_skip + 1;
-
+	child = (freeBlk - blk) / radix;
+	blk += child * radix;
+	i = 1 + child * next_skip;
 	while (i <= skip && blk < freeBlk + count) {
-		daddr_t v;
-
 		v = blk + radix - freeBlk;
 		if (v > count)
 			v = count;
@@ -662,8 +684,7 @@ static void blst_copy(
 	blist_t dest,
 	daddr_t count
 ) {
-	int next_skip;
-	int i;
+	daddr_t i, next_skip;
 
 	/*
 	 * Leaf node
@@ -708,7 +729,7 @@ static void blst_copy(
 
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 
 	for (i = 1; count && i <= skip; i += next_skip) {
 		if (scan[i].bm_bighint == (daddr_t)-1)
@@ -775,17 +796,11 @@ blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
  *	number of blocks allocated by the call.
  */
 static daddr_t
-blst_meta_fill(
-	blmeta_t *scan,
-	daddr_t allocBlk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip,
-	daddr_t blk
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
-	daddr_t nblks = 0;
+blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t blk)
+{
+	daddr_t i, nblks, next_skip, v;
+	int child;
 
 	if (count > radix) {
 		/*
@@ -803,6 +818,7 @@ blst_meta_fill(
 		scan->bm_bighint = 0;
 		return nblks;
 	}
+	next_skip = skip / BLIST_META_RADIX;
 
 	/*
 	 * An ALL-FREE meta node requires special handling before allocating
@@ -828,13 +844,11 @@ blst_meta_fill(
 		radix /= BLIST_META_RADIX;
 	}
 
-	i = (allocBlk - blk) / radix;
-	blk += i * radix;
-	i = i * next_skip + 1;
-
+	nblks = 0;
+	child = (allocBlk - blk) / radix;
+	blk += child * radix;
+	i = 1 + child * next_skip;
 	while (i <= skip && blk < allocBlk + count) {
-		daddr_t v;
-
 		v = blk + radix - allocBlk;
 		if (v > count)
 			v = count;
@@ -867,12 +881,12 @@ blst_meta_fill(
  */
 
 static daddr_t	
-blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
 {
-	int i;
-	int next_skip;
-	daddr_t memindex = 0;
+	daddr_t i, memindex, next_skip;
 
+	memindex = 0;
+
 	/*
 	 * Leaf node
 	 */
@@ -897,7 +911,7 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int ski
 	}
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 
 	for (i = 1; i <= skip; i += next_skip) {
 		if (count >= radix) {
@@ -939,11 +953,10 @@ blst_radix_init(blmeta_t *scan, daddr_t radix, int ski
 #ifdef BLIST_DEBUG
 
 static void	
-blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
+    int tab)
 {
-	int i;
-	int next_skip;
-	int lastState = 0;
+	daddr_t i, next_skip;
 
 	if (radix == BLIST_BMAP_RADIX) {
 		printf(
@@ -985,7 +998,7 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t 
 	);
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 	tab += 4;
 
 	for (i = 1; i <= skip; i += next_skip) {
@@ -995,7 +1008,6 @@ blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t 
 			    tab, tab, "",
 			    (long long)blk, (long long)radix
 			);
-			lastState = 0;
 			break;
 		}
 		blst_radix_print(

Modified: stable/10/sys/sys/blist.h
==============================================================================
--- stable/10/sys/sys/blist.h	Tue Jul 25 03:59:35 2017	(r321458)
+++ stable/10/sys/sys/blist.h	Tue Jul 25 04:13:43 2017	(r321459)
@@ -82,6 +82,7 @@ typedef struct blist {
 	daddr_t		bl_blocks;	/* area of coverage		*/
 	daddr_t		bl_radix;	/* coverage radix		*/
 	daddr_t		bl_skip;	/* starting skip		*/
+	daddr_t		bl_cursor;	/* next-fit search starts at	*/
 	blmeta_t	*bl_root;	/* root of radix tree		*/
 } *blist_t;
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201707250413.v6P4DhQR070299>