Skip site navigation (1) | Skip section navigation (2)
Date:      Mon, 7 Dec 2009 16:03:30 -0800
From:      Matt Reimer <mattjreimer@gmail.com>
To:        freebsd-fs <freebsd-fs@freebsd.org>
Subject:   PATCH: more efficient raidz memory usage for (gpt)zfsboot
Message-ID:  <f383264b0912071603j3bd0034cu7f3b1e72f4a220c4@mail.gmail.com>

next in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
Teach the (gpt)zfsboot raidz code to use its buffers more efficiently.

Before this patch, in the worst case, memory use would increase
exponentially with the number of drives in the raidz vdev.

Sponsored by: VPOP Technologies, Inc.

Matt Reimer

[-- Attachment #2 --]
--- /sys/cddl/boot/zfs/zfssubr.c.ORIG	2009-11-14 08:14:51.000000000 -0800
+++ /sys/cddl/boot/zfs/zfssubr.c	2009-12-07 15:27:49.000000000 -0800
@@ -454,7 +454,7 @@
 
 static void
 vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
-    int x, int y)
+    int x, int y, void *temp_p, void *temp_q)
 {
 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 	void *pdata, *qdata;
@@ -478,10 +478,8 @@
 	xsize = cols[x].rc_size;
 	ysize = cols[y].rc_size;
 
-	cols[VDEV_RAIDZ_P].rc_data =
-		zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
-	cols[VDEV_RAIDZ_Q].rc_data =
-		zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
+	cols[VDEV_RAIDZ_P].rc_data = temp_p;
+	cols[VDEV_RAIDZ_Q].rc_data = temp_q;
 	cols[x].rc_size = 0;
 	cols[y].rc_size = 0;
 
@@ -551,9 +549,12 @@
 	uint64_t f = b % dcols;
 	uint64_t o = (b / dcols) << unit_shift;
 	uint64_t q, r, coff;
-	int c, c1, bc, col, acols, devidx, asize, n;
+	int c, c1, bc, col, acols, devidx, asize, n, max_rc_size;
 	static raidz_col_t cols[16];
 	raidz_col_t *rc, *rc1;
+	void *orig, *orig1, *temp_p, *temp_q;
+
+	orig = orig1 = temp_p = temp_q = NULL;
 
 	q = s / (dcols - nparity);
 	r = s - q * (dcols - nparity);
@@ -561,6 +562,7 @@
 
 	acols = (q == 0 ? bc : dcols);
 	asize = 0;
+	max_rc_size = 0;
 	
 	for (c = 0; c < acols; c++) {
 		col = f + c;
@@ -577,6 +579,8 @@
 		cols[c].rc_tried = 0;
 		cols[c].rc_skipped = 0;
 		asize += cols[c].rc_size;
+		if (cols[c].rc_size > max_rc_size)
+			max_rc_size = cols[c].rc_size;
 	}
 
 	asize = roundup(asize, (nparity + 1) << unit_shift);
@@ -777,8 +781,13 @@
 			//ASSERT(c != acols);
 			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
 
+			if (!temp_p)
+				temp_p = zfs_alloc_temp(max_rc_size);
+			if (!temp_q)
+				temp_q = zfs_alloc_temp(max_rc_size);
+
 			vdev_raidz_reconstruct_pq(cols, nparity, acols,
-			    c1, c);
+			    c1, c, temp_p, temp_q);
 
 			if (zio_checksum_error(bp, buf) == 0)
 				return (0);
@@ -845,18 +854,12 @@
 		return (EIO);
 	}
 
-	asize = 0;
-	for (c = 0; c < acols; c++) {
-		rc = &cols[c];
-		if (rc->rc_size > asize)
-			asize = rc->rc_size;
-	}
 	if (cols[VDEV_RAIDZ_P].rc_error == 0) {
 		/*
 		 * Attempt to reconstruct the data from parity P.
 		 */
-		void *orig;
-		orig = zfs_alloc_temp(asize);
+		if (!orig)
+			orig = zfs_alloc_temp(max_rc_size);
 		for (c = nparity; c < acols; c++) {
 			rc = &cols[c];
 
@@ -874,8 +877,8 @@
 		/*
 		 * Attempt to reconstruct the data from parity Q.
 		 */
-		void *orig;
-		orig = zfs_alloc_temp(asize);
+		if (!orig)
+			orig = zfs_alloc_temp(max_rc_size);
 		for (c = nparity; c < acols; c++) {
 			rc = &cols[c];
 
@@ -895,9 +898,14 @@
 		/*
 		 * Attempt to reconstruct the data from both P and Q.
 		 */
-		void *orig, *orig1;
-		orig = zfs_alloc_temp(asize);
-		orig1 = zfs_alloc_temp(asize);
+		if (!orig)
+			orig = zfs_alloc_temp(max_rc_size);
+		if (!orig1)
+			orig1 = zfs_alloc_temp(max_rc_size);
+		if (!temp_p)
+			temp_p = zfs_alloc_temp(max_rc_size);
+		if (!temp_q)
+			temp_q = zfs_alloc_temp(max_rc_size);
 		for (c = nparity; c < acols - 1; c++) {
 			rc = &cols[c];
 
@@ -909,7 +917,7 @@
 				memcpy(orig1, rc1->rc_data, rc1->rc_size);
 
 				vdev_raidz_reconstruct_pq(cols, nparity,
-				    acols, c, c1);
+				    acols, c, c1, temp_p, temp_q);
 
 				if (zio_checksum_error(bp, buf) == 0)
 					return (0);

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?f383264b0912071603j3bd0034cu7f3b1e72f4a220c4>