changeset 10105:17811c723fb4

6854612 triple-parity RAID-Z 6854621 RAID-Z should mind the gap on writes
author Adam Leventhal <adam.leventhal@sun.com>
date Thu, 16 Jul 2009 15:51:38 -0700
parents 915ce9b192fd
children b235491976d3
files usr/src/cmd/zpool/zpool_vdev.c usr/src/cmd/ztest/ztest.c usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h usr/src/uts/common/fs/zfs/sys/zio.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_label.c usr/src/uts/common/fs/zfs/vdev_queue.c usr/src/uts/common/fs/zfs/vdev_raidz.c usr/src/uts/common/sys/fs/zfs.h
diffstat 9 files changed, 1167 insertions(+), 325 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/zpool/zpool_vdev.c	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/cmd/zpool/zpool_vdev.c	Thu Jul 16 15:51:38 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -67,6 +67,7 @@
 #include <libdiskmgt.h>
 #include <libintl.h>
 #include <libnvpair.h>
+#include <limits.h>
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
@@ -1093,19 +1094,34 @@
 }
 
 static const char *
-is_grouping(const char *type, int *mindev)
+is_grouping(const char *type, int *mindev, int *maxdev)
 {
-	if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) {
+	if (strncmp(type, "raidz", 5) == 0) {
+		const char *p = type + 5;
+		char *end;
+		long nparity;
+
+		if (*p == '\0') {
+			nparity = 1;
+		} else if (*p == '0') {
+			return (NULL); /* no zero prefixes allowed */
+		} else {
+			errno = 0;
+			nparity = strtol(p, &end, 10);
+			if (errno != 0 || nparity < 1 || nparity >= 255 ||
+			    *end != '\0')
+				return (NULL);
+		}
+
 		if (mindev != NULL)
-			*mindev = 2;
+			*mindev = nparity + 1;
+		if (maxdev != NULL)
+			*maxdev = 255;
 		return (VDEV_TYPE_RAIDZ);
 	}
 
-	if (strcmp(type, "raidz2") == 0) {
-		if (mindev != NULL)
-			*mindev = 3;
-		return (VDEV_TYPE_RAIDZ);
-	}
+	if (maxdev != NULL)
+		*maxdev = INT_MAX;
 
 	if (strcmp(type, "mirror") == 0) {
 		if (mindev != NULL)
@@ -1144,7 +1160,7 @@
 construct_spec(int argc, char **argv)
 {
 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
-	int t, toplevels, mindev, nspares, nlogs, nl2cache;
+	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
 	const char *type;
 	uint64_t is_log;
 	boolean_t seen_logs;
@@ -1166,7 +1182,7 @@
 		 * If it's a mirror or raidz, the subsequent arguments are
 		 * its leaves -- until we encounter the next mirror or raidz.
 		 */
-		if ((type = is_grouping(argv[0], &mindev)) != NULL) {
+		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
 			nvlist_t **child = NULL;
 			int c, children = 0;
 
@@ -1223,7 +1239,7 @@
 			}
 
 			for (c = 1; c < argc; c++) {
-				if (is_grouping(argv[c], NULL) != NULL)
+				if (is_grouping(argv[c], NULL, NULL) != NULL)
 					break;
 				children++;
 				child = realloc(child,
@@ -1243,6 +1259,13 @@
 				return (NULL);
 			}
 
+			if (children > maxdev) {
+				(void) fprintf(stderr, gettext("invalid vdev "
+				    "specification: %s supports no more than "
+				    "%d devices\n"), argv[0], maxdev);
+				return (NULL);
+			}
+
 			argc -= c;
 			argv += c;
 
--- a/usr/src/cmd/ztest/ztest.c	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Thu Jul 16 15:51:38 2009 -0700
@@ -479,7 +479,7 @@
 			zopt_raidz = MAX(1, value);
 			break;
 		case 'R':
-			zopt_raidz_parity = MIN(MAX(value, 1), 2);
+			zopt_raidz_parity = MIN(MAX(value, 1), 3);
 			break;
 		case 'd':
 			zopt_datasets = MAX(1, value);
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Thu Jul 16 15:51:38 2009 -0700
@@ -27,7 +27,7 @@
 /*
  * On-disk version number.
  */
-#define	SPA_VERSION			16ULL
+#define	SPA_VERSION			17ULL
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Thu Jul 16 15:51:38 2009 -0700
@@ -143,6 +143,8 @@
 #define	ZIO_FLAG_GODFATHER		0x080000
 
 #define	ZIO_FLAG_TRYHARD		0x100000
+#define	ZIO_FLAG_NODATA			0x200000
+#define	ZIO_FLAG_OPTIONAL		0x400000
 
 #define	ZIO_FLAG_GANG_INHERIT		\
 	(ZIO_FLAG_CANFAIL |		\
@@ -161,7 +163,9 @@
 	ZIO_FLAG_IO_REPAIR |		\
 	ZIO_FLAG_IO_RETRY |		\
 	ZIO_FLAG_PROBE |		\
-	ZIO_FLAG_TRYHARD)
+	ZIO_FLAG_TRYHARD |		\
+	ZIO_FLAG_NODATA |		\
+	ZIO_FLAG_OPTIONAL)
 
 #define	ZIO_FLAG_AGG_INHERIT		\
 	(ZIO_FLAG_DONT_AGGREGATE |	\
--- a/usr/src/uts/common/fs/zfs/vdev.c	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Thu Jul 16 15:51:38 2009 -0700
@@ -405,22 +405,26 @@
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 		    &nparity) == 0) {
 			/*
-			 * Currently, we can only support 2 parity devices.
+			 * Currently, we can only support 3 parity devices.
 			 */
-			if (nparity == 0 || nparity > 2)
+			if (nparity == 0 || nparity > 3)
 				return (EINVAL);
 			/*
-			 * Older versions can only support 1 parity device.
+			 * Previous versions could only support 1 or 2 parity
+			 * device.
 			 */
-			if (nparity == 2 &&
-			    spa_version(spa) < SPA_VERSION_RAID6)
+			if (nparity > 1 &&
+			    spa_version(spa) < SPA_VERSION_RAIDZ2)
+				return (ENOTSUP);
+			if (nparity > 2 &&
+			    spa_version(spa) < SPA_VERSION_RAIDZ3)
 				return (ENOTSUP);
 		} else {
 			/*
 			 * We require the parity to be specified for SPAs that
 			 * support multiple parity levels.
 			 */
-			if (spa_version(spa) >= SPA_VERSION_RAID6)
+			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 				return (EINVAL);
 			/*
 			 * Otherwise, we default to 1 parity device for RAID-Z.
--- a/usr/src/uts/common/fs/zfs/vdev_label.c	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c	Thu Jul 16 15:51:38 2009 -0700
@@ -246,8 +246,10 @@
 		 * into a crufty old storage pool.
 		 */
 		ASSERT(vd->vdev_nparity == 1 ||
-		    (vd->vdev_nparity == 2 &&
-		    spa_version(spa) >= SPA_VERSION_RAID6));
+		    (vd->vdev_nparity <= 2 &&
+		    spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+		    (vd->vdev_nparity <= 3 &&
+		    spa_version(spa) >= SPA_VERSION_RAIDZ3));
 
 		/*
 		 * Note that we'll add the nparity tag even on storage pools
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Thu Jul 16 15:51:38 2009 -0700
@@ -24,7 +24,7 @@
  */
 
 #include <sys/zfs_context.h>
-#include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
@@ -48,11 +48,14 @@
 int zfs_vdev_ramp_rate = 2;
 
 /*
- * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
- * For read i/os, we also aggregate across small adjacency gaps.
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
  */
 int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
 
 /*
  * Virtual device vector for disk I/O scheduling.
@@ -172,12 +175,14 @@
 static zio_t *
 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 {
-	zio_t *fio, *lio, *aio, *dio, *nio;
+	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
 	avl_tree_t *t;
 	int flags;
 	uint64_t maxspan = zfs_vdev_aggregation_limit;
 	uint64_t maxgap;
+	int stretch;
 
+again:
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
 	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
@@ -192,21 +197,88 @@
 
 	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
 		/*
-		 * We can aggregate I/Os that are adjacent and of the
-		 * same flavor, as expressed by the AGG_INHERIT flags.
-		 * The latter is necessary so that certain attributes
-		 * of the I/O, such as whether it's a normal I/O or a
-		 * scrub/resilver, can be preserved in the aggregate.
+		 * We can aggregate I/Os that are sufficiently adjacent and of
+		 * the same flavor, as expressed by the AGG_INHERIT flags.
+		 * The latter requirement is necessary so that certain
+		 * attributes of the I/O, such as whether it's a normal I/O
+		 * or a scrub/resilver, can be preserved in the aggregate.
+		 * We can include optional I/Os, but don't allow them
+		 * to begin a range as they add no benefit in that situation.
+		 */
+
+		/*
+		 * We keep track of the last non-optional I/O.
+		 */
+		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+
+		/*
+		 * Walk backwards through sufficiently contiguous I/Os
+		 * recording the last non-option I/O.
 		 */
 		while ((dio = AVL_PREV(t, fio)) != NULL &&
 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-		    IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
+		    IO_SPAN(dio, lio) <= maxspan &&
+		    IO_GAP(dio, fio) <= maxgap) {
 			fio = dio;
+			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
+				mio = fio;
+		}
 
+		/*
+		 * Skip any initial optional I/Os.
+		 */
+		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
+			fio = AVL_NEXT(t, fio);
+			ASSERT(fio != NULL);
+		}
+
+		/*
+		 * Walk forward through sufficiently contiguous I/Os.
+		 */
 		while ((dio = AVL_NEXT(t, lio)) != NULL &&
 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-		    IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
+		    IO_SPAN(fio, dio) <= maxspan &&
+		    IO_GAP(lio, dio) <= maxgap) {
 			lio = dio;
+			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
+				mio = lio;
+		}
+
+		/*
+		 * Now that we've established the range of the I/O aggregation
+		 * we must decide what to do with trailing optional I/Os.
+		 * For reads, there's nothing to do. While we are unable to
+		 * aggregate further, it's possible that a trailing optional
+		 * I/O would allow the underlying device to aggregate with
+		 * subsequent I/Os. We must therefore determine if the next
+		 * non-optional I/O is close enough to make aggregation
+		 * worthwhile.
+		 */
+		stretch = B_FALSE;
+		if (t != &vq->vq_read_tree && mio != NULL) {
+			nio = lio;
+			while ((dio = AVL_NEXT(t, nio)) != NULL &&
+			    IO_GAP(nio, dio) == 0 &&
+			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
+				nio = dio;
+				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+					stretch = B_TRUE;
+					break;
+				}
+			}
+		}
+
+		if (stretch) {
+			/* This may be a no-op. */
+			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
+			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+		} else {
+			while (lio != mio && lio != fio) {
+				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
+				lio = AVL_PREV(t, lio);
+				ASSERT(lio != NULL);
+			}
+		}
 	}
 
 	if (fio != lio) {
@@ -225,10 +297,15 @@
 			ASSERT(dio->io_type == aio->io_type);
 			ASSERT(dio->io_vdev_tree == t);
 
-			if (dio->io_type == ZIO_TYPE_WRITE)
+			if (dio->io_flags & ZIO_FLAG_NODATA) {
+				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
+				bzero((char *)aio->io_data + (dio->io_offset -
+				    aio->io_offset), dio->io_size);
+			} else if (dio->io_type == ZIO_TYPE_WRITE) {
 				bcopy(dio->io_data, (char *)aio->io_data +
 				    (dio->io_offset - aio->io_offset),
 				    dio->io_size);
+			}
 
 			zio_add_child(dio, aio);
 			vdev_queue_io_remove(vq, dio);
@@ -244,6 +321,20 @@
 	ASSERT(fio->io_vdev_tree == t);
 	vdev_queue_io_remove(vq, fio);
 
+	/*
+	 * If the I/O is or was optional and therefore has no data, we need to
+	 * simply discard it. We need to drop the vdev queue's lock to avoid a
+	 * deadlock that we could encounter since this I/O will complete
+	 * immediately.
+	 */
+	if (fio->io_flags & ZIO_FLAG_NODATA) {
+		mutex_exit(&vq->vq_lock);
+		zio_vdev_io_bypass(fio);
+		zio_execute(fio);
+		mutex_enter(&vq->vq_lock);
+		goto again;
+	}
+
 	avl_add(&vq->vq_pending_tree, fio);
 
 	return (fio);
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Thu Jul 16 15:51:38 2009 -0700
@@ -35,12 +35,27 @@
 /*
  * Virtual device vector for RAID-Z.
  *
- * This vdev supports both single and double parity. For single parity, we
- * use a simple XOR of all the data columns. For double parity, we use both
- * the simple XOR as well as a technique described in "The mathematics of
- * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
- * over the integers expressable in a single byte. Briefly, the operations on
- * the field are defined as follows:
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field, defined over the
+ * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
@@ -55,22 +70,32 @@
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
- * than field addition). The inverse of a field element A (A^-1) is A^254.
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
  *
- * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
- * can be expressed by field operations:
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
- * See the reconstruction code below for how P and Q can used individually or
- * in concert to recover missing data columns.
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can used individually
+ * or in concert to recover missing data columns.
  */
 
 typedef struct raidz_col {
@@ -84,21 +109,49 @@
 } raidz_col_t;
 
 typedef struct raidz_map {
-	uint64_t rm_cols;		/* Column count */
+	uint64_t rm_cols;		/* Regular column count */
+	uint64_t rm_scols;		/* Count including skipped columns */
 	uint64_t rm_bigcols;		/* Number of oversized columns */
 	uint64_t rm_asize;		/* Actual total I/O size */
 	uint64_t rm_missingdata;	/* Count of missing data devices */
 	uint64_t rm_missingparity;	/* Count of missing parity devices */
 	uint64_t rm_firstdatacol;	/* First data column/parity count */
+	uint64_t rm_skipped;		/* Skipped sectors for padding */
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;
 
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
+#define	VDEV_RAIDZ_R		2
+#define	VDEV_RAIDZ_MAXPARITY	3
 
-#define	VDEV_RAIDZ_MAXPARITY	2
+#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 
-#define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define	VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+	(mask) = (x) & 0x8080808080808080ULL; \
+	(mask) = ((mask) << 1) - ((mask) >> 7); \
+	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
+}
+
+#define	VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+	VDEV_RAIDZ_64MUL_2((x), mask); \
+	VDEV_RAIDZ_64MUL_2((x), mask); \
+}
+
+/*
+ * Force reconstruction to use the general purpose method.
+ */
+int vdev_raidz_default_to_general;
 
 /*
  * These two tables represent powers and logs of 2 in the Galois field defined
@@ -201,7 +254,7 @@
 	for (c = 0; c < rm->rm_firstdatacol; c++)
 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 
-	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
+	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
 
 static raidz_map_t *
@@ -213,24 +266,35 @@
 	uint64_t s = zio->io_size >> unit_shift;
 	uint64_t f = b % dcols;
 	uint64_t o = (b / dcols) << unit_shift;
-	uint64_t q, r, c, bc, col, acols, coff, devidx;
+	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
 	q = s / (dcols - nparity);
 	r = s - q * (dcols - nparity);
 	bc = (r == 0 ? 0 : r + nparity);
+	tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
-	acols = (q == 0 ? bc : dcols);
+	if (q == 0) {
+		acols = bc;
+		scols = MIN(dcols, roundup(bc, nparity + 1));
+	} else {
+		acols = dcols;
+		scols = dcols;
+	}
 
-	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+	ASSERT3U(acols, <=, scols);
+
+	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
 
 	rm->rm_cols = acols;
+	rm->rm_scols = scols;
 	rm->rm_bigcols = bc;
-	rm->rm_asize = 0;
 	rm->rm_missingdata = 0;
 	rm->rm_missingparity = 0;
 	rm->rm_firstdatacol = nparity;
 
-	for (c = 0; c < acols; c++) {
+	asize = 0;
+
+	for (c = 0; c < scols; c++) {
 		col = f + c;
 		coff = o;
 		if (col >= dcols) {
@@ -239,15 +303,26 @@
 		}
 		rm->rm_col[c].rc_devidx = col;
 		rm->rm_col[c].rc_offset = coff;
-		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
 		rm->rm_col[c].rc_data = NULL;
 		rm->rm_col[c].rc_error = 0;
 		rm->rm_col[c].rc_tried = 0;
 		rm->rm_col[c].rc_skipped = 0;
-		rm->rm_asize += rm->rm_col[c].rc_size;
+
+		if (c >= acols)
+			rm->rm_col[c].rc_size = 0;
+		else if (c < bc)
+			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+		else
+			rm->rm_col[c].rc_size = q << unit_shift;
+
+		asize += rm->rm_col[c].rc_size;
 	}
 
-	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
+	ASSERT3U(asize, ==, tot << unit_shift);
+	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+	rm->rm_skipped = roundup(tot, nparity + 1) - tot;
+	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
+	ASSERT3U(rm->rm_skipped, <=, nparity);
 
 	for (c = 0; c < rm->rm_firstdatacol; c++)
 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
@@ -305,12 +380,12 @@
 
 		if (c == rm->rm_firstdatacol) {
 			ASSERT(ccount == pcount);
-			for (i = 0; i < ccount; i++, p++, src++) {
+			for (i = 0; i < ccount; i++, src++, p++) {
 				*p = *src;
 			}
 		} else {
 			ASSERT(ccount <= pcount);
-			for (i = 0; i < ccount; i++, p++, src++) {
+			for (i = 0; i < ccount; i++, src++, p++) {
 				*p ^= *src;
 			}
 		}
@@ -320,10 +395,10 @@
 static void
 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 {
-	uint64_t *q, *p, *src, pcount, ccount, mask, i;
+	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
 	int c;
 
-	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 
@@ -331,55 +406,138 @@
 		src = rm->rm_col[c].rc_data;
 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
-		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
 
 		if (c == rm->rm_firstdatacol) {
-			ASSERT(ccount == pcount || ccount == 0);
-			for (i = 0; i < ccount; i++, p++, q++, src++) {
-				*q = *src;
+			ASSERT(ccnt == pcnt || ccnt == 0);
+			for (i = 0; i < ccnt; i++, src++, p++, q++) {
 				*p = *src;
+				*q = *src;
 			}
-			for (; i < pcount; i++, p++, q++, src++) {
+			for (; i < pcnt; i++, src++, p++, q++) {
+				*p = 0;
 				*q = 0;
-				*p = 0;
 			}
 		} else {
-			ASSERT(ccount <= pcount);
+			ASSERT(ccnt <= pcnt);
 
 			/*
-			 * Rather than multiplying each byte individually (as
-			 * described above), we are able to handle 8 at once
-			 * by generating a mask based on the high bit in each
-			 * byte and using that to conditionally XOR in 0x1d.
+			 * Apply the algorithm described above by multiplying
+			 * the previous result and adding in the new value.
 			 */
-			for (i = 0; i < ccount; i++, p++, q++, src++) {
-				mask = *q & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+			for (i = 0; i < ccnt; i++, src++, p++, q++) {
+				*p ^= *src;
+
+				VDEV_RAIDZ_64MUL_2(*q, mask);
 				*q ^= *src;
-				*p ^= *src;
 			}
 
 			/*
 			 * Treat short columns as though they are full of 0s.
+			 * Note that there's therefore nothing needed for P.
 			 */
-			for (; i < pcount; i++, q++) {
-				mask = *q & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+			for (; i < pcnt; i++, q++) {
+				VDEV_RAIDZ_64MUL_2(*q, mask);
 			}
 		}
 	}
 }
 
 static void
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
+vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+{
+	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+	int c;
+
+	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
+	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
+
+	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+		src = rm->rm_col[c].rc_data;
+		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
+		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+
+		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+
+		if (c == rm->rm_firstdatacol) {
+			ASSERT(ccnt == pcnt || ccnt == 0);
+			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+				*p = *src;
+				*q = *src;
+				*r = *src;
+			}
+			for (; i < pcnt; i++, src++, p++, q++, r++) {
+				*p = 0;
+				*q = 0;
+				*r = 0;
+			}
+		} else {
+			ASSERT(ccnt <= pcnt);
+
+			/*
+			 * Apply the algorithm described above by multiplying
+			 * the previous result and adding in the new value.
+			 */
+			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
+				*p ^= *src;
+
+				VDEV_RAIDZ_64MUL_2(*q, mask);
+				*q ^= *src;
+
+				VDEV_RAIDZ_64MUL_4(*r, mask);
+				*r ^= *src;
+			}
+
+			/*
+			 * Treat short columns as though they are full of 0s.
+			 * Note that there's therefore nothing needed for P.
+			 */
+			for (; i < pcnt; i++, q++, r++) {
+				VDEV_RAIDZ_64MUL_2(*q, mask);
+				VDEV_RAIDZ_64MUL_4(*r, mask);
+			}
+		}
+	}
+}
+
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
+static void
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+	switch (rm->rm_firstdatacol) {
+	case 1:
+		vdev_raidz_generate_parity_p(rm);
+		break;
+	case 2:
+		vdev_raidz_generate_parity_pq(rm);
+		break;
+	case 3:
+		vdev_raidz_generate_parity_pqr(rm);
+		break;
+	default:
+		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+	}
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *dst, *src, xcount, ccount, count, i;
+	int x = tgts[0];
 	int c;
 
+	ASSERT(ntgts == 1);
+	ASSERT(x >= rm->rm_firstdatacol);
+	ASSERT(x < rm->rm_cols);
+
 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
 	ASSERT(xcount > 0);
@@ -404,15 +562,20 @@
 			*dst ^= *src;
 		}
 	}
+
+	return (1 << VDEV_RAIDZ_P);
 }
 
-static void
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
+static int
+vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
 	uint8_t *b;
+	int x = tgts[0];
 	int c, j, exp;
 
+	ASSERT(ntgts == 1);
+
 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 
@@ -436,23 +599,13 @@
 			}
 
 		} else {
-			/*
-			 * For an explanation of this, see the comment in
-			 * vdev_raidz_generate_parity_pq() above.
-			 */
 			for (i = 0; i < count; i++, dst++, src++) {
-				mask = *dst & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+				VDEV_RAIDZ_64MUL_2(*dst, mask);
 				*dst ^= *src;
 			}
 
 			for (; i < xcount; i++, dst++) {
-				mask = *dst & 0x8080808080808080ULL;
-				mask = (mask << 1) - (mask >> 7);
-				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
-				    (mask & 0x1d1d1d1d1d1d1d1dULL);
+				VDEV_RAIDZ_64MUL_2(*dst, mask);
 			}
 		}
 	}
@@ -467,15 +620,20 @@
 			*b = vdev_raidz_exp2(*b, exp);
 		}
 	}
+
+	return (1 << VDEV_RAIDZ_Q);
 }
 
-static void
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
+static int
+vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
 	void *pdata, *qdata;
 	uint64_t xsize, ysize, i;
+	int x = tgts[0];
+	int y = tgts[1];
 
+	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rm->rm_firstdatacol);
 	ASSERT(y < rm->rm_cols);
@@ -553,13 +711,554 @@
 	 */
 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+
+	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coeffecients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ *            __   __                     __     __
+ *            |     |         __     __   |  p_0  |
+ *            |  V  |         |  D_0  |   | p_m-1 |
+ *            |     |    x    |   :   | = |  d_0  |
+ *            |  I  |         | D_n-1 |   |   :   |
+ *            |     |         ~~     ~~   | d_n-1 |
+ *            ~~   ~~                     ~~     ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a vandermonde
+ * matrix defined by the coeffecients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
+ * computation as well as linear separability.
+ *
+ *      __               __               __     __
+ *      |   1   ..  1 1 1 |               |  p_0  |
+ *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
+ *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
+ *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
+ *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
+ *      |   :       : : : |   |   :   |   |  d_2  |
+ *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
+ *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
+ *      |   0   ..  0 0 1 |               | d_n-1 |
+ *      ~~               ~~               ~~     ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
+ *           |  19 205 116  29  64  16  4   1  |      / /
+ *           |  1   0   0   0   0   0   0   0  |     / /
+ *           |  0   1   0   0   0   0   0   0  | <--' /
+ *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *           __                               __
+ *           |  1   1   1   1   1   1   1   1  |
+ *           | 128  64  32  16  8   4   2   1  |
+ *           |  19 205 116  29  64  16  4   1  |
+ *           |  1   0   0   0   0   0   0   0  |
+ *           |  0   1   0   0   0   0   0   0  |
+ *  (V|I)' = |  0   0   1   0   0   0   0   0  |
+ *           |  0   0   0   1   0   0   0   0  |
+ *           |  0   0   0   0   1   0   0   0  |
+ *           |  0   0   0   0   0   1   0   0  |
+ *           |  0   0   0   0   0   0   1   0  |
+ *           |  0   0   0   0   0   0   0   1  |
+ *           ~~                               ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __                                                                 __
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
+ * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ * __                                                                 __
+ * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
+ * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
+ * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
+ * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
+ * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
+ * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
+ * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
+ * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
+ * ~~                                                                 ~~
+ *                   __                               __
+ *                   |  0   0   1   0   0   0   0   0  |
+ *                   | 167 100  5   41 159 169 217 208 |
+ *                   | 166 100  4   40 158 168 216 209 |
+ *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
+ *                   |  0   0   0   0   1   0   0   0  |
+ *                   |  0   0   0   0   0   1   0   0  |
+ *                   |  0   0   0   0   0   0   1   0  |
+ *                   |  0   0   0   0   0   0   0   1  |
+ *                   ~~                               ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
+static void
+vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+    uint8_t **rows)
+{
+	int i, j;
+	int pow;
+
+	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+
+	/*
+	 * Fill in the missing rows of interest.
+	 */
+	for (i = 0; i < nmap; i++) {
+		ASSERT3S(0, <=, map[i]);
+		ASSERT3S(map[i], <=, 2);
+
+		pow = map[i] * n;
+		if (pow > 255)
+			pow -= 255;
+		ASSERT(pow <= 255);
+
+		for (j = 0; j < n; j++) {
+			pow -= map[i];
+			if (pow < 0)
+				pow += 255;
+			rows[i][j] = vdev_raidz_pow2[pow];
+		}
+	}
 }
 
+static void
+vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+	int i, j, ii, jj;
+	uint8_t log;
+
+	/*
+	 * Assert that the first nmissing entries from the array of used
+	 * columns correspond to parity columns and that subsequent entries
+	 * correspond to data columns.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		ASSERT3S(used[i], <, rm->rm_firstdatacol);
+	}
+	for (; i < n; i++) {
+		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+	}
+
+	/*
+	 * First initialize the storage where we'll compute the inverse rows.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < n; j++) {
+			invrows[i][j] = (i == j) ? 1 : 0;
+		}
+	}
+
+	/*
+	 * Subtract all trivial rows from the rows of consequence.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = nmissing; j < n; j++) {
+			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
+			jj = used[j] - rm->rm_firstdatacol;
+			ASSERT3S(jj, <, n);
+			invrows[i][j] = rows[i][jj];
+			rows[i][jj] = 0;
+		}
+	}
+
+	/*
+	 * For each of the rows of interest, we must normalize it and subtract
+	 * a multiple of it from the other rows.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < missing[i]; j++) {
+			ASSERT3U(rows[i][j], ==, 0);
+		}
+		ASSERT3U(rows[i][missing[i]], !=, 0);
+
+		/*
+		 * Compute the inverse of the first element and multiply each
+		 * element in the row by that value.
+		 */
+		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+		for (j = 0; j < n; j++) {
+			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+		}
+
+		for (ii = 0; ii < nmissing; ii++) {
+			if (i == ii)
+				continue;
+
+			ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+			log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+			for (j = 0; j < n; j++) {
+				rows[ii][j] ^=
+				    vdev_raidz_exp2(rows[i][j], log);
+				invrows[ii][j] ^=
+				    vdev_raidz_exp2(invrows[i][j], log);
+			}
+		}
+	}
+
+	/*
+	 * Verify that the data that is left in the rows are properly part of
+	 * an identity matrix.
+	 */
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < n; j++) {
+			if (j == missing[i]) {
+				ASSERT3U(rows[i][j], ==, 1);
+			} else {
+				ASSERT3U(rows[i][j], ==, 0);
+			}
+		}
+	}
+}
+
+static void
+vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+    int *missing, uint8_t **invrows, const uint8_t *used)
+{
+	int i, j, x, cc, c;
+	uint8_t *src;
+	uint64_t ccount;
+	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
+	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+	uint8_t log, val;
+	int ll;
+	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+	uint8_t *p, *pp;
+	size_t psize;
+
+	psize = sizeof (invlog[0][0]) * n * nmissing;
+	p = kmem_alloc(psize, KM_SLEEP);
+
+	for (pp = p, i = 0; i < nmissing; i++) {
+		invlog[i] = pp;
+		pp += n;
+	}
+
+	for (i = 0; i < nmissing; i++) {
+		for (j = 0; j < n; j++) {
+			ASSERT3U(invrows[i][j], !=, 0);
+			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+		}
+	}
+
+	for (i = 0; i < n; i++) {
+		c = used[i];
+		ASSERT3U(c, <, rm->rm_cols);
+
+		src = rm->rm_col[c].rc_data;
+		ccount = rm->rm_col[c].rc_size;
+		for (j = 0; j < nmissing; j++) {
+			cc = missing[j] + rm->rm_firstdatacol;
+			ASSERT3U(cc, >=, rm->rm_firstdatacol);
+			ASSERT3U(cc, <, rm->rm_cols);
+			ASSERT3U(cc, !=, c);
+
+			dst[j] = rm->rm_col[cc].rc_data;
+			dcount[j] = rm->rm_col[cc].rc_size;
+		}
+
+		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
+
+		for (x = 0; x < ccount; x++, src++) {
+			if (*src != 0)
+				log = vdev_raidz_log2[*src];
+
+			for (cc = 0; cc < nmissing; cc++) {
+				if (x >= dcount[cc])
+					continue;
+
+				if (*src == 0) {
+					val = 0;
+				} else {
+					if ((ll = log + invlog[cc][i]) >= 255)
+						ll -= 255;
+					val = vdev_raidz_pow2[ll];
+				}
+
+				if (i == 0)
+					dst[cc][x] = val;
+				else
+					dst[cc][x] ^= val;
+			}
+		}
+	}
+
+	kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+{
+	int n, i, c, t, tt;
+	int nmissing_rows;
+	int missing_rows[VDEV_RAIDZ_MAXPARITY];
+	int parity_map[VDEV_RAIDZ_MAXPARITY];
+
+	uint8_t *p, *pp;
+	size_t psize;
+
+	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+	uint8_t *used;
+
+	int code = 0;
+
+
+	n = rm->rm_cols - rm->rm_firstdatacol;
+
+	/*
+	 * Figure out which data columns are missing.
+	 */
+	nmissing_rows = 0;
+	for (t = 0; t < ntgts; t++) {
+		if (tgts[t] >= rm->rm_firstdatacol) {
+			missing_rows[nmissing_rows++] =
+			    tgts[t] - rm->rm_firstdatacol;
+		}
+	}
+
+	/*
+	 * Figure out which parity columns to use to help generate the missing
+	 * data columns.
+	 */
+	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+		ASSERT(tt < ntgts);
+		ASSERT(c < rm->rm_firstdatacol);
+
+		/*
+		 * Skip any targeted parity columns.
+		 */
+		if (c == tgts[tt]) {
+			tt++;
+			continue;
+		}
+
+		code |= 1 << c;
+
+		parity_map[i] = c;
+		i++;
+	}
+
+	ASSERT(code != 0);
+	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+	    nmissing_rows * n + sizeof (used[0]) * n;
+	p = kmem_alloc(psize, KM_SLEEP);
+
+	for (pp = p, i = 0; i < nmissing_rows; i++) {
+		rows[i] = pp;
+		pp += n;
+		invrows[i] = pp;
+		pp += n;
+	}
+	used = pp;
+
+	for (i = 0; i < nmissing_rows; i++) {
+		used[i] = parity_map[i];
+	}
+
+	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+		if (tt < nmissing_rows &&
+		    c == missing_rows[tt] + rm->rm_firstdatacol) {
+			tt++;
+			continue;
+		}
+
+		ASSERT3S(i, <, n);
+		used[i] = c;
+		i++;
+	}
+
+	/*
+	 * Initialize the interesting rows of the matrix.
+	 */
+	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+
+	/*
+	 * Invert the matrix.
+	 */
+	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+	    invrows, used);
+
+	/*
+	 * Reconstruct the missing data using the generated matrix.
+	 */
+	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+	    invrows, used);
+
+	kmem_free(p, psize);
+
+	return (code);
+}
+
+static int
+vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+{
+	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+	int ntgts;
+	int i, c;
+	int code;
+	int nbadparity, nbaddata;
+	int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+	/*
+	 * The tgts list must already be sorted.
+	 */
+	for (i = 1; i < nt; i++) {
+		ASSERT(t[i] > t[i - 1]);
+	}
+
+	nbadparity = rm->rm_firstdatacol;
+	nbaddata = rm->rm_cols - nbadparity;
+	ntgts = 0;
+	for (i = 0, c = 0; c < rm->rm_cols; c++) {
+		if (c < rm->rm_firstdatacol)
+			parity_valid[c] = B_FALSE;
+
+		if (i < nt && c == t[i]) {
+			tgts[ntgts++] = c;
+			i++;
+		} else if (rm->rm_col[c].rc_error != 0) {
+			tgts[ntgts++] = c;
+		} else if (c >= rm->rm_firstdatacol) {
+			nbaddata--;
+		} else {
+			parity_valid[c] = B_TRUE;
+			nbadparity--;
+		}
+	}
+
+	ASSERT(ntgts >= nt);
+	ASSERT(nbaddata >= 0);
+	ASSERT(nbaddata + nbadparity == ntgts);
+
+	dt = &tgts[nbadparity];
+
+	/*
+	 * See if we can use any of our optimized reconstruction routines.
+	 */
+	if (!vdev_raidz_default_to_general) {
+		switch (nbaddata) {
+		case 1:
+			if (parity_valid[VDEV_RAIDZ_P])
+				return (vdev_raidz_reconstruct_p(rm, dt, 1));
+
+			ASSERT(rm->rm_firstdatacol > 1);
+
+			if (parity_valid[VDEV_RAIDZ_Q])
+				return (vdev_raidz_reconstruct_q(rm, dt, 1));
+
+			ASSERT(rm->rm_firstdatacol > 2);
+			break;
+
+		case 2:
+			ASSERT(rm->rm_firstdatacol > 1);
+
+			if (parity_valid[VDEV_RAIDZ_P] &&
+			    parity_valid[VDEV_RAIDZ_Q])
+				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+
+			ASSERT(rm->rm_firstdatacol > 2);
+
+			break;
+		}
+	}
+
+	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+	ASSERT(code > 0);
+	return (code);
+}
 
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 {
+	vdev_t *cvd;
 	uint64_t nparity = vd->vdev_nparity;
+	int c;
 	int lasterror = 0;
 	int numerrors = 0;
 
@@ -573,10 +1272,10 @@
 
 	vdev_open_children(vd);
 
-	for (int c = 0; c < vd->vdev_children; c++) {
-		vdev_t *cvd = vd->vdev_child[c];
+	for (c = 0; c < vd->vdev_children; c++) {
+		cvd = vd->vdev_child[c];
 
-		if (cvd->vdev_open_error) {
+		if (cvd->vdev_open_error != 0) {
 			lasterror = cvd->vdev_open_error;
 			numerrors++;
 			continue;
@@ -599,7 +1298,9 @@
 static void
 vdev_raidz_close(vdev_t *vd)
 {
-	for (int c = 0; c < vd->vdev_children; c++)
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
 		vdev_close(vd->vdev_child[c]);
 }
 
@@ -637,7 +1338,7 @@
 	blkptr_t *bp = zio->io_bp;
 	raidz_map_t *rm;
 	raidz_col_t *rc;
-	int c;
+	int c, i;
 
 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
 	    vd->vdev_nparity);
@@ -645,13 +1346,7 @@
 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
-		/*
-		 * Generate RAID parity in the first virtual columns.
-		 */
-		if (rm->rm_firstdatacol == 1)
-			vdev_raidz_generate_parity_p(rm);
-		else
-			vdev_raidz_generate_parity_pq(rm);
+		vdev_raidz_generate_parity(rm);
 
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
@@ -662,6 +1357,23 @@
 			    vdev_raidz_child_done, rc));
 		}
 
+		/*
+		 * Generate optional I/Os for any skipped sectors to improve
+		 * aggregation contiguity.
+		 */
+		for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
+			ASSERT(c <= rm->rm_scols);
+			if (c == rm->rm_scols)
+				c = 0;
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_devidx];
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset + rc->rc_size, NULL,
+			    1 << tvd->vdev_ashift,
+			    zio->io_type, zio->io_priority,
+			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+		}
+
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
@@ -669,8 +1381,7 @@
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
-	 * last -- any errors along the way will force us to read the parity
-	 * data.
+	 * last -- any errors along the way will force us to read the parity.
 	 */
 	for (c = rm->rm_cols - 1; c >= 0; c--) {
 		rc = &rm->rm_col[c];
@@ -746,10 +1457,7 @@
 		bcopy(rc->rc_data, orig[c], rc->rc_size);
 	}
 
-	if (rm->rm_firstdatacol == 1)
-		vdev_raidz_generate_parity_p(rm);
-	else
-		vdev_raidz_generate_parity_pq(rm);
+	vdev_raidz_generate_parity(rm);
 
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		rc = &rm->rm_col[c];
@@ -766,9 +1474,10 @@
 	return (ret);
 }
 
-static uint64_t raidz_corrected_p;
-static uint64_t raidz_corrected_q;
-static uint64_t raidz_corrected_pq;
+/*
+ * Keep statistics on all the ways that we used parity to correct data.
+ */
+static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
 
 static int
 vdev_raidz_worst_error(raidz_map_t *rm)
@@ -781,19 +1490,176 @@
 	return (error);
 }
 
+/*
+ * Iterate over all combinations of bad data and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+{
+	raidz_map_t *rm = zio->io_vsd;
+	raidz_col_t *rc;
+	void *orig[VDEV_RAIDZ_MAXPARITY];
+	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+	int *tgts = &tstore[1];
+	int current, next, i, c, n;
+	int code, ret = 0;
+
+	ASSERT(total_errors < rm->rm_firstdatacol);
+
+	/*
+	 * This simplifies one edge condition.
+	 */
+	tgts[-1] = -1;
+
+	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+		/*
+		 * Initialize the targets array by finding the first n columns
+		 * that contain no error.
+		 *
+		 * If there were no data errors, we need to ensure that we're
+		 * always explicitly attempting to reconstruct at least one
+		 * data column. To do this, we simply push the highest target
+		 * up into the data columns.
+		 */
+		for (c = 0, i = 0; i < n; i++) {
+			if (i == n - 1 && data_errors == 0 &&
+			    c < rm->rm_firstdatacol) {
+				c = rm->rm_firstdatacol;
+			}
+
+			while (rm->rm_col[c].rc_error != 0) {
+				c++;
+				ASSERT3S(c, <, rm->rm_cols);
+			}
+
+			tgts[i] = c++;
+		}
+
+		/*
+		 * Setting tgts[n] simplifies the other edge condition.
+		 */
+		tgts[n] = rm->rm_cols;
+
+		/*
+		 * These buffers were allocated in previous iterations.
+		 */
+		for (i = 0; i < n - 1; i++) {
+			ASSERT(orig[i] != NULL);
+		}
+
+		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+
+		current = 0;
+		next = tgts[current];
+
+		while (current != n) {
+			tgts[current] = next;
+			current = 0;
+
+			/*
+			 * Save off the original data that we're going to
+			 * attempt to reconstruct.
+			 */
+			for (i = 0; i < n; i++) {
+				ASSERT(orig[i] != NULL);
+				c = tgts[i];
+				ASSERT3S(c, >=, 0);
+				ASSERT3S(c, <, rm->rm_cols);
+				rc = &rm->rm_col[c];
+				bcopy(rc->rc_data, orig[i], rc->rc_size);
+			}
+
+			/*
+			 * Attempt a reconstruction and exit the outer loop on
+			 * success.
+			 */
+			code = vdev_raidz_reconstruct(rm, tgts, n);
+			if (zio_checksum_error(zio) == 0) {
+				atomic_inc_64(&raidz_corrected[code]);
+
+				for (i = 0; i < n; i++) {
+					c = tgts[i];
+					rc = &rm->rm_col[c];
+					ASSERT(rc->rc_error == 0);
+					if (rc->rc_tried)
+						raidz_checksum_error(zio, rc);
+					rc->rc_error = ECKSUM;
+				}
+
+				ret = code;
+				goto done;
+			}
+
+			/*
+			 * Restore the original data.
+			 */
+			for (i = 0; i < n; i++) {
+				c = tgts[i];
+				rc = &rm->rm_col[c];
+				bcopy(orig[i], rc->rc_data, rc->rc_size);
+			}
+
+			do {
+				/*
+				 * Find the next valid column after the current
+				 * position..
+				 */
+				for (next = tgts[current] + 1;
+				    next < rm->rm_cols &&
+				    rm->rm_col[next].rc_error != 0; next++)
+					continue;
+
+				ASSERT(next <= tgts[current + 1]);
+
+				/*
+				 * If that spot is available, we're done here.
+				 */
+				if (next != tgts[current + 1])
+					break;
+
+				/*
+				 * Otherwise, find the next valid column after
+				 * the previous position.
+				 */
+				for (c = tgts[current - 1] + 1;
+				    rm->rm_col[c].rc_error != 0; c++)
+					continue;
+
+				tgts[current] = c;
+				current++;
+
+			} while (current != n);
+		}
+	}
+	n--;
+done:
+	for (i = 0; i < n; i++) {
+		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
+	}
+
+	return (ret);
+}
+
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *cvd;
 	raidz_map_t *rm = zio->io_vsd;
-	raidz_col_t *rc, *rc1;
+	raidz_col_t *rc;
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
-	int n, c, c1;
+	int n, c;
+	int tgts[VDEV_RAIDZ_MAXPARITY];
+	int code;
 
 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
@@ -857,8 +1723,7 @@
 	 * any errors.
 	 */
 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
-		switch (data_errors) {
-		case 0:
+		if (data_errors == 0) {
 			if (zio_checksum_error(zio) == 0) {
 				/*
 				 * If we read parity information (unnecessarily
@@ -878,9 +1743,7 @@
 				}
 				goto done;
 			}
-			break;
-
-		case 1:
+		} else {
 			/*
 			 * We either attempt to read all the parity columns or
 			 * none of them. If we didn't try to read parity, we
@@ -892,45 +1755,38 @@
 			ASSERT(parity_errors < rm->rm_firstdatacol);
 
 			/*
-			 * Find the column that reported the error.
+			 * Identify the data columns that reported an error.
 			 */
+			n = 0;
 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 				rc = &rm->rm_col[c];
-				if (rc->rc_error != 0)
-					break;
+				if (rc->rc_error != 0) {
+					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+					tgts[n++] = c;
+				}
 			}
-			ASSERT(c != rm->rm_cols);
-			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-			    rc->rc_error == ESTALE);
 
-			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
-				vdev_raidz_reconstruct_p(rm, c);
-			} else {
-				ASSERT(rm->rm_firstdatacol > 1);
-				vdev_raidz_reconstruct_q(rm, c);
-			}
+			ASSERT(rm->rm_firstdatacol >= n);
+
+			code = vdev_raidz_reconstruct(rm, tgts, n);
 
 			if (zio_checksum_error(zio) == 0) {
-				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
-					atomic_inc_64(&raidz_corrected_p);
-				else
-					atomic_inc_64(&raidz_corrected_q);
+				atomic_inc_64(&raidz_corrected[code]);
 
 				/*
-				 * If there's more than one parity disk that
-				 * was successfully read, confirm that the
-				 * other parity disk produced the correct data.
-				 * This routine is suboptimal in that it
-				 * regenerates both the parity we wish to test
-				 * as well as the parity we just used to
-				 * perform the reconstruction, but this should
-				 * be a relatively uncommon case, and can be
-				 * optimized if it becomes a problem.
-				 * We also regenerate parity when resilvering
-				 * so we can write it out to the failed device
-				 * later.
+				 * If we read more parity disks than were used
+				 * for reconstruction, confirm that the other
+				 * parity disks produced correct data. This
+				 * routine is suboptimal in that it regenerates
+				 * the parity that we already used in addition
+				 * to the parity that we're attempting to
+				 * verify, but this should be a relatively
+				 * uncommon case, and can be optimized if it
+				 * becomes a problem. Note that we regenerate
+				 * parity when resilvering so we can write it
+				 * out to failed devices later.
 				 */
-				if (parity_errors < rm->rm_firstdatacol - 1 ||
+				if (parity_errors < rm->rm_firstdatacol - n ||
 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 					n = raidz_parity_verify(zio, rm);
 					unexpected_errors += n;
@@ -940,46 +1796,6 @@
 
 				goto done;
 			}
-			break;
-
-		case 2:
-			/*
-			 * Two data column errors require double parity.
-			 */
-			ASSERT(rm->rm_firstdatacol == 2);
-
-			/*
-			 * Find the two columns that reported errors.
-			 */
-			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-				rc = &rm->rm_col[c];
-				if (rc->rc_error != 0)
-					break;
-			}
-			ASSERT(c != rm->rm_cols);
-			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-			    rc->rc_error == ESTALE);
-
-			for (c1 = c++; c < rm->rm_cols; c++) {
-				rc = &rm->rm_col[c];
-				if (rc->rc_error != 0)
-					break;
-			}
-			ASSERT(c != rm->rm_cols);
-			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
-			    rc->rc_error == ESTALE);
-
-			vdev_raidz_reconstruct_pq(rm, c1, c);
-
-			if (zio_checksum_error(zio) == 0) {
-				atomic_inc_64(&raidz_corrected_pq);
-				goto done;
-			}
-			break;
-
-		default:
-			ASSERT(rm->rm_firstdatacol <= 2);
-			ASSERT(0);
 		}
 	}
 
@@ -1018,8 +1834,10 @@
 	 * errors we detected, and we've attempted to read all columns. There
 	 * must, therefore, be one or more additional problems -- silent errors
 	 * resulting in invalid data rather than explicit I/O errors resulting
-	 * in absent data. Before we attempt combinatorial reconstruction make
-	 * sure we have a chance of coming up with the right answer.
+	 * in absent data. We check if there is enough additional data to
+	 * possibly reconstruct the data and then perform combinatorial
+	 * reconstruction over all possible combinations. If that fails,
+	 * we're cooked.
 	 */
 	if (total_errors >= rm->rm_firstdatacol) {
 		zio->io_error = vdev_raidz_worst_error(rm);
@@ -1030,133 +1848,30 @@
 		 */
 		if (total_errors == rm->rm_firstdatacol)
 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
-		goto done;
-	}
-
-	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
-		/*
-		 * Attempt to reconstruct the data from parity P.
-		 */
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-			void *orig;
-			rc = &rm->rm_col[c];
-
-			orig = zio_buf_alloc(rc->rc_size);
-			bcopy(rc->rc_data, orig, rc->rc_size);
-			vdev_raidz_reconstruct_p(rm, c);
-
-			if (zio_checksum_error(zio) == 0) {
-				zio_buf_free(orig, rc->rc_size);
-				atomic_inc_64(&raidz_corrected_p);
-
-				/*
-				 * If this child didn't know that it returned
-				 * bad data, inform it.
-				 */
-				if (rc->rc_tried && rc->rc_error == 0)
-					raidz_checksum_error(zio, rc);
-				rc->rc_error = ECKSUM;
-				goto done;
-			}
-
-			bcopy(orig, rc->rc_data, rc->rc_size);
-			zio_buf_free(orig, rc->rc_size);
-		}
-	}
-
-	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
-		/*
-		 * Attempt to reconstruct the data from parity Q.
-		 */
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-			void *orig;
-			rc = &rm->rm_col[c];
-
-			orig = zio_buf_alloc(rc->rc_size);
-			bcopy(rc->rc_data, orig, rc->rc_size);
-			vdev_raidz_reconstruct_q(rm, c);
-
-			if (zio_checksum_error(zio) == 0) {
-				zio_buf_free(orig, rc->rc_size);
-				atomic_inc_64(&raidz_corrected_q);
-
-				/*
-				 * If this child didn't know that it returned
-				 * bad data, inform it.
-				 */
-				if (rc->rc_tried && rc->rc_error == 0)
-					raidz_checksum_error(zio, rc);
-				rc->rc_error = ECKSUM;
-				goto done;
-			}
 
-			bcopy(orig, rc->rc_data, rc->rc_size);
-			zio_buf_free(orig, rc->rc_size);
-		}
-	}
-
-	if (rm->rm_firstdatacol > 1 &&
-	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
-	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
+	} else if ((code = vdev_raidz_combrec(zio, total_errors,
+	    data_errors)) != 0) {
 		/*
-		 * Attempt to reconstruct the data from both P and Q.
+		 * If we didn't use all the available parity for the
+		 * combinatorial reconstruction, verify that the remaining
+		 * parity is correct.
 		 */
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
-			void *orig, *orig1;
-			rc = &rm->rm_col[c];
-
-			orig = zio_buf_alloc(rc->rc_size);
-			bcopy(rc->rc_data, orig, rc->rc_size);
-
-			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
-				rc1 = &rm->rm_col[c1];
-
-				orig1 = zio_buf_alloc(rc1->rc_size);
-				bcopy(rc1->rc_data, orig1, rc1->rc_size);
-
-				vdev_raidz_reconstruct_pq(rm, c, c1);
-
-				if (zio_checksum_error(zio) == 0) {
-					zio_buf_free(orig, rc->rc_size);
-					zio_buf_free(orig1, rc1->rc_size);
-					atomic_inc_64(&raidz_corrected_pq);
+		if (code != (1 << rm->rm_firstdatacol) - 1)
+			(void) raidz_parity_verify(zio, rm);
+	} else {
+		/*
+		 * All combinations failed to checksum. Generate checksum
+		 * ereports for all children.
+		 */
+		zio->io_error = ECKSUM;
 
-					/*
-					 * If these children didn't know they
-					 * returned bad data, inform them.
-					 */
-					if (rc->rc_tried && rc->rc_error == 0)
-						raidz_checksum_error(zio, rc);
-					if (rc1->rc_tried && rc1->rc_error == 0)
-						raidz_checksum_error(zio, rc1);
-
-					rc->rc_error = ECKSUM;
-					rc1->rc_error = ECKSUM;
-
-					goto done;
-				}
-
-				bcopy(orig1, rc1->rc_data, rc1->rc_size);
-				zio_buf_free(orig1, rc1->rc_size);
+		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+			for (c = 0; c < rm->rm_cols; c++) {
+				rc = &rm->rm_col[c];
+				zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
+				    zio, rc->rc_offset, rc->rc_size);
 			}
-
-			bcopy(orig, rc->rc_data, rc->rc_size);
-			zio_buf_free(orig, rc->rc_size);
-		}
-	}
-
-	/*
-	 * All combinations failed to checksum. Generate checksum ereports for
-	 * all children.
-	 */
-	zio->io_error = ECKSUM;
-
-	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-		for (c = 0; c < rm->rm_cols; c++) {
-			rc = &rm->rm_col[c];
-			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
-			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
-			    rc->rc_offset, rc->rc_size);
 		}
 	}
 
--- a/usr/src/uts/common/sys/fs/zfs.h	Thu Jul 16 14:12:07 2009 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Thu Jul 16 15:51:38 2009 -0700
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -280,14 +281,15 @@
 #define	SPA_VERSION_14			14ULL
 #define	SPA_VERSION_15			15ULL
 #define	SPA_VERSION_16			16ULL
+#define	SPA_VERSION_17			17ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define	SPA_VERSION			SPA_VERSION_16
-#define	SPA_VERSION_STRING		"16"
+#define	SPA_VERSION			SPA_VERSION_17
+#define	SPA_VERSION_STRING		"17"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -303,7 +305,7 @@
 #define	SPA_VERSION_INITIAL		SPA_VERSION_1
 #define	SPA_VERSION_DITTO_BLOCKS	SPA_VERSION_2
 #define	SPA_VERSION_SPARES		SPA_VERSION_3
-#define	SPA_VERSION_RAID6		SPA_VERSION_3
+#define	SPA_VERSION_RAIDZ2		SPA_VERSION_3
 #define	SPA_VERSION_BPLIST_ACCOUNT	SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ_DEFLATE	SPA_VERSION_3
 #define	SPA_VERSION_DNODE_BYTES		SPA_VERSION_3
@@ -325,6 +327,7 @@
 #define	SPA_VERSION_PASSTHROUGH_X	SPA_VERSION_14
 #define	SPA_VERSION_USERSPACE		SPA_VERSION_15
 #define	SPA_VERSION_STMF_PROP		SPA_VERSION_16
+#define	SPA_VERSION_RAIDZ3		SPA_VERSION_17
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change