changeset 24358:b0bb68020907

[illumos-gate merge]

commit d9172ac4b58b7d14d2dc462871e5a554135283bf
    12004 format: err_check is duplicate of efi_err_check
commit edbad4fe075c5178507eaf0bfb85aa903b66bf91
    11997 format: cstyle cleanup
commit 5ac07b12fb4c39cb2415c0997f7c5b4dd5209f96
    12058 loader.efi: use libi386/comconsole with x86
commit 4d7988d6050abba5c1ff60e7fd196e95c22e20f4
    11971 Reduce loaded range tree memory usage
author Jerry Jelinek <jerry.jelinek@joyent.com>
date Mon, 09 Dec 2019 14:15:34 +0000
parents 4361c1ef6e00 (current diff) 07deccf83e23 (diff)
children 15ed84de7de9
files usr/src/boot/sys/boot/common/mb_header.S usr/src/boot/sys/boot/common/multiboot.S usr/src/boot/sys/boot/efi/loader/Makefile.com usr/src/boot/sys/boot/efi/loader/comconsole.c usr/src/boot/sys/boot/efi/loader/efiserialio.c usr/src/boot/sys/boot/i386/gptzfsboot/Makefile usr/src/cmd/mdb/common/modules/zfs/zfs.c usr/src/cmd/zdb/zdb.c usr/src/lib/libzfs/common/libzfs_dataset.c usr/src/lib/libzfs/common/libzfs_iter.c usr/src/lib/libzfs/common/libzfs_sendrecv.c usr/src/lib/libzpool/common/sys/zfs_context.h usr/src/uts/common/Makefile.files usr/src/uts/common/fs/zfs/arc.c usr/src/uts/common/fs/zfs/dmu_objset.c usr/src/uts/common/fs/zfs/dmu_recv.c usr/src/uts/common/fs/zfs/dnode.c usr/src/uts/common/fs/zfs/dsl_deleg.c usr/src/uts/common/fs/zfs/dsl_scan.c usr/src/uts/common/fs/zfs/metaslab.c usr/src/uts/common/fs/zfs/sa.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/metaslab.h usr/src/uts/common/fs/zfs/sys/metaslab_impl.h usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/vdev_impl.h usr/src/uts/common/fs/zfs/sys/zfs_context.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/vdev_cache.c usr/src/uts/common/fs/zfs/vdev_initialize.c usr/src/uts/common/fs/zfs/vdev_queue.c usr/src/uts/common/fs/zfs/vdev_raidz.c usr/src/uts/common/fs/zfs/vdev_trim.c usr/src/uts/common/fs/zfs/zil.c
diffstat 67 files changed, 5120 insertions(+), 1993 deletions(-)
--- a/usr/src/boot/Makefile.version	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/Makefile.version	Mon Dec 09 14:15:34 2019 +0000
@@ -33,4 +33,4 @@
 # Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes.
 # The version is processed from left to right, the version number can only
 # be increased.
-BOOT_VERSION = $(LOADER_VERSION)-2019.12.03.1
+BOOT_VERSION = $(LOADER_VERSION)-2019.12.05.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/boot/sys/boot/common/mb_header.S	Mon Dec 09 14:15:34 2019 +0000
@@ -0,0 +1,43 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Toomas Soome <tsoome@me.com>
+ */
+
+	.file	"mb_header.S"
+
+/*
+ * Provide a fake multiboot header to support versioning and the partition
+ * start. The fake MB header is used by the versioning code located in
+ * usr/src/cmd/boot/common. Since the BIOS bootblock is stored on the raw
+ * disk, this fake header is used to store the location of the version info
+ * block. Additionally, we use it to store the partition start_sector, so we
+ * can identify our root file system partition. Note we are using LBA64 here.
+ */
+
+#define	ASM_FILE
+#include <sys/multiboot.h>
+
+		.globl mb_header, start_sector
+		.text
+
+		.align 4
+mb_header:
+		.long	MULTIBOOT_HEADER_MAGIC
+		.long	MULTIBOOT_AOUT_KLUDGE
+		.long	-(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_AOUT_KLUDGE)
+		.long	0                       /* header_addr */
+		.long	0                       /* load_addr */
+		.long	0                       /* load_end_addr */
+start_sector:	.long	0			/* partition LBA */
+		.long	0
+
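The fake header is located by scanning rather than by symbol: the ldscript
hunks below place mb_header.o(.text) at the very start of the program, which
keeps the header inside the first 8 KiB of the image, where Multiboot 1
requires it to sit longword-aligned. A minimal sketch of how a versioning
tool can find it in a raw bootblock image (hypothetical helper, not the
actual usr/src/cmd/boot/common code; assumes the image buffer is 4-byte
aligned):

	#include <stddef.h>
	#include <stdint.h>

	#define	MB_MAGIC	0x1BADB002u	/* MULTIBOOT_HEADER_MAGIC */

	/*
	 * Scan the first 8 KiB for a longword-aligned Multiboot header;
	 * the magic, flags and checksum words must sum to zero (mod 2^32).
	 * On a match, words 6 and 7 hold the LBA64 start_sector defined
	 * above.
	 */
	static const uint32_t *
	find_mb_header(const uint8_t *img, size_t len)
	{
		size_t off, lim = (len < 8192) ? len : 8192;

		for (off = 0; off + 8 * sizeof (uint32_t) <= lim; off += 4) {
			const uint32_t *h =
			    (const uint32_t *)(const void *)(img + off);

			if (h[0] == MB_MAGIC && h[0] + h[1] + h[2] == 0)
				return (h);
		}
		return (NULL);
	}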
--- a/usr/src/boot/sys/boot/common/multiboot.S	Fri Dec 06 12:00:18 2019 -0600
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source.  A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2018 Toomas Soome <tsoome@me.com>
- */
-
-	.file	"multiboot.s"
-
-/*
- * Provide fake multiboot header to support versioning and partition
- * start. The fake MB header is used by versioning code located in
- * usr/src/cmd/boot/common. Since the BIOS bootblock is stored on raw disk,
- * this fake header is used to store the location of the version info block.
- * Additionally we use it to store partition start_sector, so we can identify
- * our root file system partition. Note we are using LBA64 here.
- */
-
-#define	ASM_FILE
-#include <sys/multiboot.h>
-
-		.globl mb_header, start_sector
-		.text
-
-		.align 4
-mb_header:
-		.long	MULTIBOOT_HEADER_MAGIC
-		.long	MULTIBOOT_AOUT_KLUDGE
-		.long	-(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_AOUT_KLUDGE)
-		.long	0                       /* header_addr */
-		.long	0                       /* load_addr */
-		.long	0                       /* load_end_addr */
-start_sector:	.long	0			/* partition LBA */
-		.long	0
-
--- a/usr/src/boot/sys/boot/efi/loader/Makefile.com	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/efi/loader/Makefile.com	Mon Dec 09 14:15:34 2019 +0000
@@ -34,7 +34,7 @@
 	framebuffer.c \
 	main.c \
 	memmap.c \
-	multiboot.S \
+	mb_header.S \
 	multiboot2.c \
 	self_reloc.c \
 	smbios.c \
@@ -53,7 +53,7 @@
 	framebuffer.o \
 	main.o \
 	memmap.o \
-	multiboot.o \
+	mb_header.o \
 	multiboot2.o \
 	self_reloc.o \
 	smbios.o \
@@ -186,9 +186,6 @@
 %.o:	../../../common/linenoise/%.c
 	$(COMPILE.c) $<
 
-%.o: ../../../i386/libi386/%.c
-	$(COMPILE.c) $<
-
 %.o: $(SRC)/common/font/%.c
 	$(COMPILE.c) $<
 
--- a/usr/src/boot/sys/boot/efi/loader/amd64/Makefile	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/efi/loader/amd64/Makefile	Mon Dec 09 14:15:34 2019 +0000
@@ -31,3 +31,6 @@
 CLEANFILES +=	machine x86 $(EFIPROG)
 
 $(OBJS):	machine x86
+
+%.o: ../../../i386/libi386/%.c
+	$(COMPILE.c) $<
--- a/usr/src/boot/sys/boot/efi/loader/arch/amd64/ldscript.amd64	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/efi/loader/arch/amd64/ldscript.amd64	Mon Dec 09 14:15:34 2019 +0000
@@ -9,7 +9,7 @@
   .hash : { *(.hash) }  /* this MUST come first! */
   . = ALIGN(4096);
   .text		: {
-    multiboot.o(.text)
+    mb_header.o(.text)
     *(.text .stub .text.* .gnu.linkonce.t.*)
     /* .gnu.warning sections are handled specially by elf32.em. */
     *(.gnu.warning)
--- a/usr/src/boot/sys/boot/efi/loader/arch/i386/ldscript.i386	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/efi/loader/arch/i386/ldscript.i386	Mon Dec 09 14:15:34 2019 +0000
@@ -9,7 +9,7 @@
   . = SIZEOF_HEADERS;
   . = ALIGN(4096);
   .text		: {
-    multiboot.o(.text)
+    mb_header.o(.text)
     *(.text .stub .text.* .gnu.linkonce.t.*)
     /* .gnu.warning sections are handled specially by elf32.em. */
     *(.gnu.warning)
--- a/usr/src/boot/sys/boot/efi/loader/comconsole.c	Fri Dec 06 12:00:18 2019 -0600
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,694 +0,0 @@
-/*
- * Copyright (c) 1998 Michael Smith (msmith@freebsd.org)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-
-#include <stand.h>
-#include <sys/errno.h>
-#include <bootstrap.h>
-#include <stdbool.h>
-
-#include <efi.h>
-#include <efilib.h>
-
-#include "loader_efi.h"
-
-static EFI_GUID serial = SERIAL_IO_PROTOCOL;
-
-#define	COMC_TXWAIT	0x40000		/* transmit timeout */
-
-#ifndef	COMSPEED
-#define	COMSPEED	9600
-#endif
-
-#define	PNP0501		0x501		/* 16550A-compatible COM port */
-
-struct serial {
-	uint64_t	baudrate;
-	uint8_t		databits;
-	EFI_PARITY_TYPE	parity;
-	EFI_STOP_BITS_TYPE stopbits;
-	uint8_t		ignore_cd;	/* boolean */
-	uint8_t		rtsdtr_off;	/* boolean */
-	int		ioaddr;		/* index in handles array */
-	SERIAL_IO_INTERFACE *sio;
-};
-
-static void	comc_probe(struct console *);
-static int	comc_init(struct console *, int);
-static void	comc_putchar(struct console *, int);
-static int	comc_getchar(struct console *);
-static int	comc_ischar(struct console *);
-static int	comc_ioctl(struct console *, int, void *);
-static void	comc_devinfo(struct console *);
-static bool	comc_setup(struct console *);
-static char	*comc_asprint_mode(struct serial *);
-static int	comc_parse_mode(struct serial *, const char *);
-static int	comc_mode_set(struct env_var *, int, const void *);
-static int	comc_cd_set(struct env_var *, int, const void *);
-static int	comc_rtsdtr_set(struct env_var *, int, const void *);
-
-struct console ttya = {
-	.c_name = "ttya",
-	.c_desc = "serial port a",
-	.c_flags = 0,
-	.c_probe = comc_probe,
-	.c_init = comc_init,
-	.c_out = comc_putchar,
-	.c_in = comc_getchar,
-	.c_ready = comc_ischar,
-	.c_ioctl = comc_ioctl,
-	.c_devinfo = comc_devinfo,
-	.c_private = NULL
-};
-
-struct console ttyb = {
-	.c_name = "ttyb",
-	.c_desc = "serial port b",
-	.c_flags = 0,
-	.c_probe = comc_probe,
-	.c_init = comc_init,
-	.c_out = comc_putchar,
-	.c_in = comc_getchar,
-	.c_ready = comc_ischar,
-	.c_ioctl = comc_ioctl,
-	.c_devinfo = comc_devinfo,
-	.c_private = NULL
-};
-
-struct console ttyc = {
-	.c_name = "ttyc",
-	.c_desc = "serial port c",
-	.c_flags = 0,
-	.c_probe = comc_probe,
-	.c_init = comc_init,
-	.c_out = comc_putchar,
-	.c_in = comc_getchar,
-	.c_ready = comc_ischar,
-	.c_ioctl = comc_ioctl,
-	.c_devinfo = comc_devinfo,
-	.c_private = NULL
-};
-
-struct console ttyd = {
-	.c_name = "ttyd",
-	.c_desc = "serial port d",
-	.c_flags = 0,
-	.c_probe = comc_probe,
-	.c_init = comc_init,
-	.c_out = comc_putchar,
-	.c_in = comc_getchar,
-	.c_ready = comc_ischar,
-	.c_ioctl = comc_ioctl,
-	.c_devinfo = comc_devinfo,
-	.c_private = NULL
-};
-
-static EFI_STATUS
-efi_serial_init(EFI_HANDLE **handlep, int *nhandles)
-{
-	UINTN bufsz = 0;
-	EFI_STATUS status;
-	EFI_HANDLE *handles;
-
-	/*
-	 * get buffer size
-	 */
-	*nhandles = 0;
-	handles = NULL;
-	status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles);
-	if (status != EFI_BUFFER_TOO_SMALL)
-		return (status);
-
-	if ((handles = malloc(bufsz)) == NULL)
-		return (ENOMEM);
-
-	*nhandles = (int)(bufsz / sizeof (EFI_HANDLE));
-	/*
-	 * get handle array
-	 */
-	status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles);
-	if (EFI_ERROR(status)) {
-		free(handles);
-		*nhandles = 0;
-	} else
-		*handlep = handles;
-	return (status);
-}
-
-/*
- * Find serial device number from device path.
- * Return -1 if not found.
- */
-static int
-efi_serial_get_index(EFI_DEVICE_PATH *devpath)
-{
-	ACPI_HID_DEVICE_PATH  *acpi;
-
-	while (!IsDevicePathEnd(devpath)) {
-		if (DevicePathType(devpath) == ACPI_DEVICE_PATH &&
-		    DevicePathSubType(devpath) == ACPI_DP) {
-
-			acpi = (ACPI_HID_DEVICE_PATH *)devpath;
-			if (acpi->HID == EISA_PNP_ID(PNP0501)) {
-				return (acpi->UID);
-			}
-		}
-
-		devpath = NextDevicePathNode(devpath);
-	}
-	return (-1);
-}
-
-/*
- * The order of handles from LocateHandle() is not known, we need to
- * iterate handles, pick device path for handle, and check the device
- * number.
- */
-static EFI_HANDLE
-efi_serial_get_handle(int port)
-{
-	EFI_STATUS status;
-	EFI_HANDLE *handles, handle;
-	EFI_DEVICE_PATH *devpath;
-	int index, nhandles;
-
-	if (port == -1)
-		return (NULL);
-
-	handles = NULL;
-	nhandles = 0;
-	status = efi_serial_init(&handles, &nhandles);
-	if (EFI_ERROR(status))
-		return (NULL);
-
-	handle = NULL;
-	for (index = 0; index < nhandles; index++) {
-		devpath = efi_lookup_devpath(handles[index]);
-		if (port == efi_serial_get_index(devpath)) {
-			handle = (handles[index]);
-			break;
-		}
-	}
-
-	/*
-	 * In case we did fail to identify the device by path, use port as
-	 * array index. Note, we did check port == -1 above.
-	 */
-	if (port < nhandles && handle == NULL)
-		handle = handles[port];
-
-	free(handles);
-	return (handle);
-}
-
-static void
-comc_probe(struct console *cp)
-{
-	EFI_STATUS status;
-	EFI_HANDLE handle;
-	struct serial *port;
-	char name[20];
-	char value[20];
-	char *env;
-
-	/* are we already set up? */
-	if (cp->c_private != NULL)
-		return;
-
-	cp->c_private = malloc(sizeof (struct serial));
-	port = cp->c_private;
-	port->baudrate = COMSPEED;
-
-	port->ioaddr = -1;	/* invalid port */
-	if (strcmp(cp->c_name, "ttya") == 0)
-		port->ioaddr = 0;
-	else if (strcmp(cp->c_name, "ttyb") == 0)
-		port->ioaddr = 1;
-	else if (strcmp(cp->c_name, "ttyc") == 0)
-		port->ioaddr = 2;
-	else if (strcmp(cp->c_name, "ttyd") == 0)
-		port->ioaddr = 3;
-
-	port->databits = 8;		/* 8,n,1 */
-	port->parity = NoParity;	/* 8,n,1 */
-	port->stopbits = OneStopBit;	/* 8,n,1 */
-	port->ignore_cd = 1;		/* ignore cd */
-	port->rtsdtr_off = 0;		/* rts-dtr is on */
-	port->sio = NULL;
-
-	handle = efi_serial_get_handle(port->ioaddr);
-
-	if (handle != NULL) {
-		status = BS->OpenProtocol(handle, &serial,
-		    (void**)&port->sio, IH, NULL,
-		    EFI_OPEN_PROTOCOL_GET_PROTOCOL);
-
-		if (EFI_ERROR(status))
-			port->sio = NULL;
-	}
-
-	snprintf(name, sizeof (name), "%s-mode", cp->c_name);
-	env = getenv(name);
-
-	if (env != NULL)
-		(void) comc_parse_mode(port, env);
-
-	env = comc_asprint_mode(port);
-
-	if (env != NULL) {
-		unsetenv(name);
-		env_setenv(name, EV_VOLATILE, env, comc_mode_set, env_nounset);
-		free(env);
-	}
-
-	snprintf(name, sizeof (name), "%s-ignore-cd", cp->c_name);
-	env = getenv(name);
-	if (env != NULL) {
-		if (strcmp(env, "true") == 0)
-			port->ignore_cd = 1;
-		else if (strcmp(env, "false") == 0)
-			port->ignore_cd = 0;
-	}
-
-	snprintf(value, sizeof (value), "%s",
-	    port->ignore_cd? "true" : "false");
-	unsetenv(name);
-	env_setenv(name, EV_VOLATILE, value, comc_cd_set, env_nounset);
-
-	snprintf(name, sizeof (name), "%s-rts-dtr-off", cp->c_name);
-	env = getenv(name);
-	if (env != NULL) {
-		if (strcmp(env, "true") == 0)
-			port->rtsdtr_off = 1;
-		else if (strcmp(env, "false") == 0)
-			port->rtsdtr_off = 0;
-	}
-
-	snprintf(value, sizeof (value), "%s",
-	    port->rtsdtr_off? "true" : "false");
-	unsetenv(name);
-	env_setenv(name, EV_VOLATILE, value, comc_rtsdtr_set, env_nounset);
-
-	cp->c_flags = 0;
-	if (comc_setup(cp))
-		cp->c_flags = C_PRESENTIN | C_PRESENTOUT;
-}
-
-static int
-comc_init(struct console *cp, int arg __attribute((unused)))
-{
-
-	if (comc_setup(cp))
-		return (CMD_OK);
-
-	cp->c_flags = 0;
-	return (CMD_ERROR);
-}
-
-static void
-comc_putchar(struct console *cp, int c)
-{
-	int wait;
-	EFI_STATUS status;
-	UINTN bufsz = 1;
-	char cb = c;
-	struct serial *sp = cp->c_private;
-
-	if (sp->sio == NULL)
-		return;
-
-	for (wait = COMC_TXWAIT; wait > 0; wait--) {
-		status = sp->sio->Write(sp->sio, &bufsz, &cb);
-		if (status != EFI_TIMEOUT)
-			break;
-	}
-}
-
-static int
-comc_getchar(struct console *cp)
-{
-	EFI_STATUS status;
-	UINTN bufsz = 1;
-	char c;
-	struct serial *sp = cp->c_private;
-
-	if (sp->sio == NULL || !comc_ischar(cp))
-		return (-1);
-
-	status = sp->sio->Read(sp->sio, &bufsz, &c);
-	if (EFI_ERROR(status) || bufsz == 0)
-		return (-1);
-
-	return (c);
-}
-
-static int
-comc_ischar(struct console *cp)
-{
-	EFI_STATUS status;
-	uint32_t control;
-	struct serial *sp = cp->c_private;
-
-	if (sp->sio == NULL)
-		return (0);
-
-	status = sp->sio->GetControl(sp->sio, &control);
-	if (EFI_ERROR(status))
-		return (0);
-
-	return (!(control & EFI_SERIAL_INPUT_BUFFER_EMPTY));
-}
-
-static int
-comc_ioctl(struct console *cp __unused, int cmd __unused, void *data __unused)
-{
-	return (ENOTTY);
-}
-
-static void
-comc_devinfo(struct console *cp)
-{
-	struct serial *port = cp->c_private;
-	EFI_HANDLE handle;
-	EFI_DEVICE_PATH *dp;
-	CHAR16 *text;
-
-	handle = efi_serial_get_handle(port->ioaddr);
-	if (handle == NULL) {
-		printf("\tdevice is not present");
-		return;
-	}
-
-	dp = efi_lookup_devpath(handle);
-	if (dp == NULL)
-		return;
-
-	text = efi_devpath_name(dp);
-	if (text == NULL)
-		return;
-
-	printf("\t%S", text);
-	efi_free_devpath_name(text);
-}
-
-static char *
-comc_asprint_mode(struct serial *sp)
-{
-	char par, *buf;
-	char *stop;
-
-	if (sp == NULL)
-		return (NULL);
-
-	switch (sp->parity) {
-	case NoParity:
-		par = 'n';
-		break;
-	case EvenParity:
-		par = 'e';
-		break;
-	case OddParity:
-		par = 'o';
-		break;
-	case MarkParity:
-		par = 'm';
-		break;
-	case SpaceParity:
-		par = 's';
-		break;
-	default:
-		par = 'n';
-		break;
-	}
-
-	switch (sp->stopbits) {
-	case OneStopBit:
-		stop = "1";
-		break;
-	case TwoStopBits:
-		stop = "2";
-		break;
-	case OneFiveStopBits:
-		stop = "1.5";
-		break;
-	default:
-		stop = "1";
-		break;
-	}
-
-	asprintf(&buf, "%ju,%d,%c,%s,-", sp->baudrate, sp->databits, par, stop);
-	return (buf);
-}
-
-static int
-comc_parse_mode(struct serial *sp, const char *value)
-{
-	unsigned long n;
-	uint64_t baudrate;
-	uint8_t databits = 8;
-	int parity = NoParity;
-	int stopbits = OneStopBit;
-	char *ep;
-
-	if (value == NULL || *value == '\0')
-		return (CMD_ERROR);
-
-	errno = 0;
-	n = strtoul(value, &ep, 10);
-	if (errno != 0 || *ep != ',')
-		return (CMD_ERROR);
-	baudrate = n;
-
-	ep++;
-	n = strtoul(ep, &ep, 10);
-	if (errno != 0 || *ep != ',')
-		return (CMD_ERROR);
-
-	switch (n) {
-	case 5: databits = 5;
-		break;
-	case 6: databits = 6;
-		break;
-	case 7: databits = 7;
-		break;
-	case 8: databits = 8;
-		break;
-	default:
-		return (CMD_ERROR);
-	}
-
-	ep++;
-	switch (*ep++) {
-	case 'n': parity = NoParity;
-		break;
-	case 'e': parity = EvenParity;
-		break;
-	case 'o': parity = OddParity;
-		break;
-	case 'm': parity = MarkParity;
-		break;
-	case 's': parity = SpaceParity;
-		break;
-	default:
-		return (CMD_ERROR);
-	}
-
-	if (*ep == ',')
-		ep++;
-	else
-		return (CMD_ERROR);
-
-	switch (*ep++) {
-	case '1': stopbits = OneStopBit;
-		if (ep[0] == '.' && ep[1] == '5') {
-			ep += 2;
-			stopbits = OneFiveStopBits;
-		}
-		break;
-	case '2': stopbits = TwoStopBits;
-		break;
-	default:
-		return (CMD_ERROR);
-	}
-
-	/* handshake is ignored, but we check syntax anyhow */
-	if (*ep == ',')
-		ep++;
-	else
-		return (CMD_ERROR);
-
-	switch (*ep++) {
-	case '-':
-	case 'h':
-	case 's':
-		break;
-	default:
-		return (CMD_ERROR);
-	}
-
-	if (*ep != '\0')
-		return (CMD_ERROR);
-
-	sp->baudrate = baudrate;
-	sp->databits = databits;
-	sp->parity = parity;
-	sp->stopbits = stopbits;
-	return (CMD_OK);
-}
-
-static struct console *
-get_console(char *name)
-{
-	struct console *cp = NULL;
-
-	switch (name[3]) {
-	case 'a': cp = &ttya;
-		break;
-	case 'b': cp = &ttyb;
-		break;
-	case 'c': cp = &ttyc;
-		break;
-	case 'd': cp = &ttyd;
-		break;
-	}
-	return (cp);
-}
-
-static int
-comc_mode_set(struct env_var *ev, int flags, const void *value)
-{
-	struct console *cp;
-
-	if (value == NULL)
-		return (CMD_ERROR);
-
-	if ((cp = get_console(ev->ev_name)) == NULL)
-		return (CMD_ERROR);
-
-	if (comc_parse_mode(cp->c_private, value) == CMD_ERROR)
-		return (CMD_ERROR);
-
-	(void) comc_setup(cp);
-
-	env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL);
-
-	return (CMD_OK);
-}
-
-static int
-comc_cd_set(struct env_var *ev, int flags, const void *value)
-{
-	struct console *cp;
-	struct serial *sp;
-
-	if (value == NULL)
-		return (CMD_ERROR);
-
-	if ((cp = get_console(ev->ev_name)) == NULL)
-		return (CMD_ERROR);
-
-	sp = cp->c_private;
-	if (strcmp(value, "true") == 0)
-		sp->ignore_cd = 1;
-	else if (strcmp(value, "false") == 0)
-		sp->ignore_cd = 0;
-	else
-		return (CMD_ERROR);
-
-	(void) comc_setup(cp);
-
-	env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL);
-
-	return (CMD_OK);
-}
-
-static int
-comc_rtsdtr_set(struct env_var *ev, int flags, const void *value)
-{
-	struct console *cp;
-	struct serial *sp;
-
-	if (value == NULL)
-		return (CMD_ERROR);
-
-	if ((cp = get_console(ev->ev_name)) == NULL)
-		return (CMD_ERROR);
-
-	sp = cp->c_private;
-	if (strcmp(value, "true") == 0)
-		sp->rtsdtr_off = 1;
-	else if (strcmp(value, "false") == 0)
-		sp->rtsdtr_off = 0;
-	else
-		return (CMD_ERROR);
-
-	(void) comc_setup(cp);
-
-	env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL);
-
-	return (CMD_OK);
-}
-
-/*
- * In case of error, we also reset ACTIVE flags, so the console
- * framework will try alternate consoles.
- */
-static bool
-comc_setup(struct console *cp)
-{
-	EFI_STATUS status;
-	UINT32 control;
-	struct serial *sp = cp->c_private;
-
-	/* port is not usable */
-	if (sp->sio == NULL)
-		return (false);
-
-	status = sp->sio->Reset(sp->sio);
-	if (EFI_ERROR(status))
-		return (false);
-
-	status = sp->sio->SetAttributes(sp->sio, sp->baudrate, 0, 0, sp->parity,
-	    sp->databits, sp->stopbits);
-	if (EFI_ERROR(status))
-		return (false);
-
-	status = sp->sio->GetControl(sp->sio, &control);
-	if (EFI_ERROR(status))
-		return (false);
-	if (sp->rtsdtr_off) {
-		control &= ~(EFI_SERIAL_REQUEST_TO_SEND |
-		    EFI_SERIAL_DATA_TERMINAL_READY);
-	} else {
-		control |= EFI_SERIAL_REQUEST_TO_SEND;
-	}
-
-	(void) sp->sio->SetControl(sp->sio, control);
-
-	/* Mark this port usable. */
-	cp->c_flags |= (C_PRESENTIN | C_PRESENTOUT);
-	return (true);
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/boot/sys/boot/efi/loader/efiserialio.c	Mon Dec 09 14:15:34 2019 +0000
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 1998 Michael Smith (msmith@freebsd.org)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * We do not use this implementation on x86 until we can fix two issues:
+ * 1. Reliably identify the serial ports in the correct order.
+ * 2. Ensure we get properly working reads from serial io.
+ */
+
+#include <sys/cdefs.h>
+
+#include <stand.h>
+#include <sys/errno.h>
+#include <bootstrap.h>
+#include <stdbool.h>
+
+#include <efi.h>
+#include <efilib.h>
+
+#include "loader_efi.h"
+
+static EFI_GUID serial = SERIAL_IO_PROTOCOL;
+
+#define	COMC_TXWAIT	0x40000		/* transmit timeout */
+
+#ifndef	COMSPEED
+#define	COMSPEED	9600
+#endif
+
+#define	PNP0501		0x501		/* 16550A-compatible COM port */
+
+struct serial {
+	uint64_t	baudrate;
+	uint8_t		databits;
+	EFI_PARITY_TYPE	parity;
+	EFI_STOP_BITS_TYPE stopbits;
+	uint8_t		ignore_cd;	/* boolean */
+	uint8_t		rtsdtr_off;	/* boolean */
+	int		ioaddr;		/* index in handles array */
+	SERIAL_IO_INTERFACE *sio;
+};
+
+static void	comc_probe(struct console *);
+static int	comc_init(struct console *, int);
+static void	comc_putchar(struct console *, int);
+static int	comc_getchar(struct console *);
+static int	comc_ischar(struct console *);
+static int	comc_ioctl(struct console *, int, void *);
+static void	comc_devinfo(struct console *);
+static bool	comc_setup(struct console *);
+static char	*comc_asprint_mode(struct serial *);
+static int	comc_parse_mode(struct serial *, const char *);
+static int	comc_mode_set(struct env_var *, int, const void *);
+static int	comc_cd_set(struct env_var *, int, const void *);
+static int	comc_rtsdtr_set(struct env_var *, int, const void *);
+
+struct console ttya = {
+	.c_name = "ttya",
+	.c_desc = "serial port a",
+	.c_flags = 0,
+	.c_probe = comc_probe,
+	.c_init = comc_init,
+	.c_out = comc_putchar,
+	.c_in = comc_getchar,
+	.c_ready = comc_ischar,
+	.c_ioctl = comc_ioctl,
+	.c_devinfo = comc_devinfo,
+	.c_private = NULL
+};
+
+struct console ttyb = {
+	.c_name = "ttyb",
+	.c_desc = "serial port b",
+	.c_flags = 0,
+	.c_probe = comc_probe,
+	.c_init = comc_init,
+	.c_out = comc_putchar,
+	.c_in = comc_getchar,
+	.c_ready = comc_ischar,
+	.c_ioctl = comc_ioctl,
+	.c_devinfo = comc_devinfo,
+	.c_private = NULL
+};
+
+struct console ttyc = {
+	.c_name = "ttyc",
+	.c_desc = "serial port c",
+	.c_flags = 0,
+	.c_probe = comc_probe,
+	.c_init = comc_init,
+	.c_out = comc_putchar,
+	.c_in = comc_getchar,
+	.c_ready = comc_ischar,
+	.c_ioctl = comc_ioctl,
+	.c_devinfo = comc_devinfo,
+	.c_private = NULL
+};
+
+struct console ttyd = {
+	.c_name = "ttyd",
+	.c_desc = "serial port d",
+	.c_flags = 0,
+	.c_probe = comc_probe,
+	.c_init = comc_init,
+	.c_out = comc_putchar,
+	.c_in = comc_getchar,
+	.c_ready = comc_ischar,
+	.c_ioctl = comc_ioctl,
+	.c_devinfo = comc_devinfo,
+	.c_private = NULL
+};
+
+static EFI_STATUS
+efi_serial_init(EFI_HANDLE **handlep, int *nhandles)
+{
+	UINTN bufsz = 0;
+	EFI_STATUS status;
+	EFI_HANDLE *handles;
+
+	/*
+	 * get buffer size
+	 */
+	*nhandles = 0;
+	handles = NULL;
+	status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles);
+	if (status != EFI_BUFFER_TOO_SMALL)
+		return (status);
+
+	if ((handles = malloc(bufsz)) == NULL)
+		return (ENOMEM);
+
+	*nhandles = (int)(bufsz / sizeof (EFI_HANDLE));
+	/*
+	 * get handle array
+	 */
+	status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles);
+	if (EFI_ERROR(status)) {
+		free(handles);
+		*nhandles = 0;
+	} else
+		*handlep = handles;
+	return (status);
+}
+
+/*
+ * Find serial device number from device path.
+ * Return -1 if not found.
+ */
+static int
+efi_serial_get_index(EFI_DEVICE_PATH *devpath)
+{
+	ACPI_HID_DEVICE_PATH  *acpi;
+
+	while (!IsDevicePathEnd(devpath)) {
+		if (DevicePathType(devpath) == ACPI_DEVICE_PATH &&
+		    DevicePathSubType(devpath) == ACPI_DP) {
+
+			acpi = (ACPI_HID_DEVICE_PATH *)devpath;
+			if (acpi->HID == EISA_PNP_ID(PNP0501)) {
+				return (acpi->UID);
+			}
+		}
+
+		devpath = NextDevicePathNode(devpath);
+	}
+	return (-1);
+}
+
+/*
+ * The order of handles from LocateHandle() is not known; we need to
+ * iterate over the handles, look up the device path of each handle,
+ * and check the device number.
+ */
+static EFI_HANDLE
+efi_serial_get_handle(int port)
+{
+	EFI_STATUS status;
+	EFI_HANDLE *handles, handle;
+	EFI_DEVICE_PATH *devpath;
+	int index, nhandles;
+
+	if (port == -1)
+		return (NULL);
+
+	handles = NULL;
+	nhandles = 0;
+	status = efi_serial_init(&handles, &nhandles);
+	if (EFI_ERROR(status))
+		return (NULL);
+
+	handle = NULL;
+	for (index = 0; index < nhandles; index++) {
+		devpath = efi_lookup_devpath(handles[index]);
+		if (port == efi_serial_get_index(devpath)) {
+			handle = (handles[index]);
+			break;
+		}
+	}
+
+	/*
+	 * If we failed to identify the device by its path, use port as an
+	 * array index. Note, we checked port == -1 above.
+	 */
+	if (port < nhandles && handle == NULL)
+		handle = handles[port];
+
+	free(handles);
+	return (handle);
+}
+
+static void
+comc_probe(struct console *cp)
+{
+	EFI_STATUS status;
+	EFI_HANDLE handle;
+	struct serial *port;
+	char name[20];
+	char value[20];
+	char *env;
+
+	/* are we already set up? */
+	if (cp->c_private != NULL)
+		return;
+
+	cp->c_private = malloc(sizeof (struct serial));
+	port = cp->c_private;
+	port->baudrate = COMSPEED;
+
+	port->ioaddr = -1;	/* invalid port */
+	if (strcmp(cp->c_name, "ttya") == 0)
+		port->ioaddr = 0;
+	else if (strcmp(cp->c_name, "ttyb") == 0)
+		port->ioaddr = 1;
+	else if (strcmp(cp->c_name, "ttyc") == 0)
+		port->ioaddr = 2;
+	else if (strcmp(cp->c_name, "ttyd") == 0)
+		port->ioaddr = 3;
+
+	port->databits = 8;		/* 8,n,1 */
+	port->parity = NoParity;	/* 8,n,1 */
+	port->stopbits = OneStopBit;	/* 8,n,1 */
+	port->ignore_cd = 1;		/* ignore cd */
+	port->rtsdtr_off = 0;		/* rts-dtr is on */
+	port->sio = NULL;
+
+	handle = efi_serial_get_handle(port->ioaddr);
+
+	if (handle != NULL) {
+		status = BS->OpenProtocol(handle, &serial,
+		    (void**)&port->sio, IH, NULL,
+		    EFI_OPEN_PROTOCOL_GET_PROTOCOL);
+
+		if (EFI_ERROR(status))
+			port->sio = NULL;
+	}
+
+	snprintf(name, sizeof (name), "%s-mode", cp->c_name);
+	env = getenv(name);
+
+	if (env != NULL)
+		(void) comc_parse_mode(port, env);
+
+	env = comc_asprint_mode(port);
+
+	if (env != NULL) {
+		unsetenv(name);
+		env_setenv(name, EV_VOLATILE, env, comc_mode_set, env_nounset);
+		free(env);
+	}
+
+	snprintf(name, sizeof (name), "%s-ignore-cd", cp->c_name);
+	env = getenv(name);
+	if (env != NULL) {
+		if (strcmp(env, "true") == 0)
+			port->ignore_cd = 1;
+		else if (strcmp(env, "false") == 0)
+			port->ignore_cd = 0;
+	}
+
+	snprintf(value, sizeof (value), "%s",
+	    port->ignore_cd? "true" : "false");
+	unsetenv(name);
+	env_setenv(name, EV_VOLATILE, value, comc_cd_set, env_nounset);
+
+	snprintf(name, sizeof (name), "%s-rts-dtr-off", cp->c_name);
+	env = getenv(name);
+	if (env != NULL) {
+		if (strcmp(env, "true") == 0)
+			port->rtsdtr_off = 1;
+		else if (strcmp(env, "false") == 0)
+			port->rtsdtr_off = 0;
+	}
+
+	snprintf(value, sizeof (value), "%s",
+	    port->rtsdtr_off? "true" : "false");
+	unsetenv(name);
+	env_setenv(name, EV_VOLATILE, value, comc_rtsdtr_set, env_nounset);
+
+	cp->c_flags = 0;
+	if (comc_setup(cp))
+		cp->c_flags = C_PRESENTIN | C_PRESENTOUT;
+}
+
+static int
+comc_init(struct console *cp, int arg __attribute((unused)))
+{
+
+	if (comc_setup(cp))
+		return (CMD_OK);
+
+	cp->c_flags = 0;
+	return (CMD_ERROR);
+}
+
+static void
+comc_putchar(struct console *cp, int c)
+{
+	int wait;
+	EFI_STATUS status;
+	UINTN bufsz = 1;
+	char cb = c;
+	struct serial *sp = cp->c_private;
+
+	if (sp->sio == NULL)
+		return;
+
+	for (wait = COMC_TXWAIT; wait > 0; wait--) {
+		status = sp->sio->Write(sp->sio, &bufsz, &cb);
+		if (status != EFI_TIMEOUT)
+			break;
+	}
+}
+
+static int
+comc_getchar(struct console *cp)
+{
+	EFI_STATUS status;
+	UINTN bufsz = 1;
+	char c;
+	struct serial *sp = cp->c_private;
+
+	if (sp->sio == NULL || !comc_ischar(cp))
+		return (-1);
+
+	status = sp->sio->Read(sp->sio, &bufsz, &c);
+	if (EFI_ERROR(status) || bufsz == 0)
+		return (-1);
+
+	return (c);
+}
+
+static int
+comc_ischar(struct console *cp)
+{
+	EFI_STATUS status;
+	uint32_t control;
+	struct serial *sp = cp->c_private;
+
+	if (sp->sio == NULL)
+		return (0);
+
+	status = sp->sio->GetControl(sp->sio, &control);
+	if (EFI_ERROR(status))
+		return (0);
+
+	return (!(control & EFI_SERIAL_INPUT_BUFFER_EMPTY));
+}
+
+static int
+comc_ioctl(struct console *cp __unused, int cmd __unused, void *data __unused)
+{
+	return (ENOTTY);
+}
+
+static void
+comc_devinfo(struct console *cp)
+{
+	struct serial *port = cp->c_private;
+	EFI_HANDLE handle;
+	EFI_DEVICE_PATH *dp;
+	CHAR16 *text;
+
+	handle = efi_serial_get_handle(port->ioaddr);
+	if (handle == NULL) {
+		printf("\tdevice is not present");
+		return;
+	}
+
+	dp = efi_lookup_devpath(handle);
+	if (dp == NULL)
+		return;
+
+	text = efi_devpath_name(dp);
+	if (text == NULL)
+		return;
+
+	printf("\t%S", text);
+	efi_free_devpath_name(text);
+}
+
+static char *
+comc_asprint_mode(struct serial *sp)
+{
+	char par, *buf;
+	char *stop;
+
+	if (sp == NULL)
+		return (NULL);
+
+	switch (sp->parity) {
+	case NoParity:
+		par = 'n';
+		break;
+	case EvenParity:
+		par = 'e';
+		break;
+	case OddParity:
+		par = 'o';
+		break;
+	case MarkParity:
+		par = 'm';
+		break;
+	case SpaceParity:
+		par = 's';
+		break;
+	default:
+		par = 'n';
+		break;
+	}
+
+	switch (sp->stopbits) {
+	case OneStopBit:
+		stop = "1";
+		break;
+	case TwoStopBits:
+		stop = "2";
+		break;
+	case OneFiveStopBits:
+		stop = "1.5";
+		break;
+	default:
+		stop = "1";
+		break;
+	}
+
+	asprintf(&buf, "%ju,%d,%c,%s,-", sp->baudrate, sp->databits, par, stop);
+	return (buf);
+}
+
+static int
+comc_parse_mode(struct serial *sp, const char *value)
+{
+	unsigned long n;
+	uint64_t baudrate;
+	uint8_t databits = 8;
+	int parity = NoParity;
+	int stopbits = OneStopBit;
+	char *ep;
+
+	if (value == NULL || *value == '\0')
+		return (CMD_ERROR);
+
+	errno = 0;
+	n = strtoul(value, &ep, 10);
+	if (errno != 0 || *ep != ',')
+		return (CMD_ERROR);
+	baudrate = n;
+
+	ep++;
+	n = strtoul(ep, &ep, 10);
+	if (errno != 0 || *ep != ',')
+		return (CMD_ERROR);
+
+	switch (n) {
+	case 5: databits = 5;
+		break;
+	case 6: databits = 6;
+		break;
+	case 7: databits = 7;
+		break;
+	case 8: databits = 8;
+		break;
+	default:
+		return (CMD_ERROR);
+	}
+
+	ep++;
+	switch (*ep++) {
+	case 'n': parity = NoParity;
+		break;
+	case 'e': parity = EvenParity;
+		break;
+	case 'o': parity = OddParity;
+		break;
+	case 'm': parity = MarkParity;
+		break;
+	case 's': parity = SpaceParity;
+		break;
+	default:
+		return (CMD_ERROR);
+	}
+
+	if (*ep == ',')
+		ep++;
+	else
+		return (CMD_ERROR);
+
+	switch (*ep++) {
+	case '1': stopbits = OneStopBit;
+		if (ep[0] == '.' && ep[1] == '5') {
+			ep += 2;
+			stopbits = OneFiveStopBits;
+		}
+		break;
+	case '2': stopbits = TwoStopBits;
+		break;
+	default:
+		return (CMD_ERROR);
+	}
+
+	/* handshake is ignored, but we check syntax anyhow */
+	if (*ep == ',')
+		ep++;
+	else
+		return (CMD_ERROR);
+
+	switch (*ep++) {
+	case '-':
+	case 'h':
+	case 's':
+		break;
+	default:
+		return (CMD_ERROR);
+	}
+
+	if (*ep != '\0')
+		return (CMD_ERROR);
+
+	sp->baudrate = baudrate;
+	sp->databits = databits;
+	sp->parity = parity;
+	sp->stopbits = stopbits;
+	return (CMD_OK);
+}
+
+static struct console *
+get_console(char *name)
+{
+	struct console *cp = NULL;
+
+	switch (name[3]) {
+	case 'a': cp = &ttya;
+		break;
+	case 'b': cp = &ttyb;
+		break;
+	case 'c': cp = &ttyc;
+		break;
+	case 'd': cp = &ttyd;
+		break;
+	}
+	return (cp);
+}
+
+static int
+comc_mode_set(struct env_var *ev, int flags, const void *value)
+{
+	struct console *cp;
+
+	if (value == NULL)
+		return (CMD_ERROR);
+
+	if ((cp = get_console(ev->ev_name)) == NULL)
+		return (CMD_ERROR);
+
+	if (comc_parse_mode(cp->c_private, value) == CMD_ERROR)
+		return (CMD_ERROR);
+
+	(void) comc_setup(cp);
+
+	env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL);
+
+	return (CMD_OK);
+}
+
+static int
+comc_cd_set(struct env_var *ev, int flags, const void *value)
+{
+	struct console *cp;
+	struct serial *sp;
+
+	if (value == NULL)
+		return (CMD_ERROR);
+
+	if ((cp = get_console(ev->ev_name)) == NULL)
+		return (CMD_ERROR);
+
+	sp = cp->c_private;
+	if (strcmp(value, "true") == 0)
+		sp->ignore_cd = 1;
+	else if (strcmp(value, "false") == 0)
+		sp->ignore_cd = 0;
+	else
+		return (CMD_ERROR);
+
+	(void) comc_setup(cp);
+
+	env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL);
+
+	return (CMD_OK);
+}
+
+static int
+comc_rtsdtr_set(struct env_var *ev, int flags, const void *value)
+{
+	struct console *cp;
+	struct serial *sp;
+
+	if (value == NULL)
+		return (CMD_ERROR);
+
+	if ((cp = get_console(ev->ev_name)) == NULL)
+		return (CMD_ERROR);
+
+	sp = cp->c_private;
+	if (strcmp(value, "true") == 0)
+		sp->rtsdtr_off = 1;
+	else if (strcmp(value, "false") == 0)
+		sp->rtsdtr_off = 0;
+	else
+		return (CMD_ERROR);
+
+	(void) comc_setup(cp);
+
+	env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL);
+
+	return (CMD_OK);
+}
+
+/*
+ * In case of error, we also reset ACTIVE flags, so the console
+ * framework will try alternate consoles.
+ */
+static bool
+comc_setup(struct console *cp)
+{
+	EFI_STATUS status;
+	UINT32 control;
+	struct serial *sp = cp->c_private;
+
+	/* port is not usable */
+	if (sp->sio == NULL)
+		return (false);
+
+	status = sp->sio->Reset(sp->sio);
+	if (EFI_ERROR(status))
+		return (false);
+
+	status = sp->sio->SetAttributes(sp->sio, sp->baudrate, 0, 0, sp->parity,
+	    sp->databits, sp->stopbits);
+	if (EFI_ERROR(status))
+		return (false);
+
+	status = sp->sio->GetControl(sp->sio, &control);
+	if (EFI_ERROR(status))
+		return (false);
+	if (sp->rtsdtr_off) {
+		control &= ~(EFI_SERIAL_REQUEST_TO_SEND |
+		    EFI_SERIAL_DATA_TERMINAL_READY);
+	} else {
+		control |= EFI_SERIAL_REQUEST_TO_SEND;
+	}
+
+	(void) sp->sio->SetControl(sp->sio, control);
+
+	/* Mark this port usable. */
+	cp->c_flags |= (C_PRESENTIN | C_PRESENTOUT);
+	return (true);
+}
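(The deleted comconsole.c above and this new efiserialio.c carry the same
implementation; the change is the rename plus the new comment explaining why
x86 does not use it.)  All serial settings are driven through loader
environment variables registered by comc_probe(): ttya-mode, ttya-ignore-cd
and ttya-rts-dtr-off, and likewise for ttyb..ttyd. A hedged sketch of what a
mode update amounts to, using only the hooks defined in this file; the value
string must satisfy comc_parse_mode():

	/*
	 * Equivalent of "set ttya-mode=115200,8,n,1,-" at the loader
	 * prompt.  Field grammar, per comc_parse_mode() above:
	 *   baudrate  - decimal, e.g. 9600 (the COMSPEED default) or 115200
	 *   databits  - 5..8
	 *   parity    - n, e, o, m or s
	 *   stopbits  - 1, 1.5 or 2
	 *   handshake - '-', 'h' or 's' (syntax-checked, then ignored)
	 * comc_mode_set() re-parses the string and calls comc_setup()
	 * to push the new attributes to the Serial I/O protocol.
	 */
	env_setenv("ttya-mode", EV_VOLATILE, "115200,8,n,1,-",
	    comc_mode_set, env_nounset);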
--- a/usr/src/boot/sys/boot/efi/loader/i386/Makefile	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/efi/loader/i386/Makefile	Mon Dec 09 14:15:34 2019 +0000
@@ -31,3 +31,6 @@
 CLEANFILES +=	machine x86 $(EFIPROG)
 
 $(OBJS):	machine x86
+
+%.o: ../../../i386/libi386/%.c
+	$(COMPILE.c) $<
--- a/usr/src/boot/sys/boot/efi/loader/main.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/efi/loader/main.c	Mon Dec 09 14:15:34 2019 +0000
@@ -81,7 +81,7 @@
 	EFI_DEVICE_PATH *devpath, *dp, *node;
 	HARDDRIVE_DEVICE_PATH *hd;
 	bool ret;
-	extern UINT64 start_sector;	/* from multiboot.S */
+	extern UINT64 start_sector;	/* from mb_header.S */
 
 	/* This check is true for chainloader case. */
 	if (h == img->DeviceHandle)
--- a/usr/src/boot/sys/boot/i386/gptzfsboot/Makefile	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/i386/gptzfsboot/Makefile	Mon Dec 09 14:15:34 2019 +0000
@@ -62,7 +62,7 @@
 
 install: all $(ROOTBOOTPROG)
 
-OBJS =	multiboot.o zfsboot.o sio.o cons.o devopen.o \
+OBJS =	mb_header.o zfsboot.o sio.o cons.o devopen.o \
 	part.o disk.o bcache.o zfs_cmd.o
 
 part.o := CPPFLAGS += -I$(ZLIB)
@@ -108,8 +108,8 @@
 %.o:	../../common/%.c
 	$(COMPILE.c) $<
 
-multiboot.o: ../../common/multiboot.S
-	$(COMPILE.S) ../../common/multiboot.S
+%.o: ../../common/%.S
+	$(COMPILE.S) $<
 
 clobber: clean
 
--- a/usr/src/boot/sys/boot/i386/isoboot/Makefile	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/i386/isoboot/Makefile	Mon Dec 09 14:15:34 2019 +0000
@@ -56,7 +56,7 @@
 
 install: all $(ROOTBOOTPROG)
 
-OBJS= multiboot.o isoboot.o sio.o drv.o cons.o gptldr.o
+OBJS= mb_header.o isoboot.o sio.o drv.o cons.o gptldr.o
 
 CLEANFILES += isoboot
 
--- a/usr/src/boot/sys/boot/i386/libi386/comconsole.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/boot/sys/boot/i386/libi386/comconsole.c	Mon Dec 09 14:15:34 2019 +0000
@@ -23,6 +23,11 @@
  * SUCH DAMAGE.
  */
 
+/*
+ * This code is shared between BIOS and UEFI systems on x86 because
+ * we can access io ports on both platforms, while the UEFI Serial IO
+ * protocol does not give us a reliable port order and has issues with input.
+ */
 #include <sys/cdefs.h>
 
 #include <stand.h>
@@ -495,7 +500,7 @@
 static uint32_t
 comc_parse_pcidev(const char *string)
 {
-#ifdef NO_PCI
+#ifdef EFI
 	(void) string;
 	return (0);
 #else
@@ -539,7 +544,7 @@
 static int
 comc_pcidev_handle(struct console *cp, uint32_t locator)
 {
-#ifdef NO_PCI
+#ifdef EFI
 	(void) cp;
 	(void) locator;
 	return (CMD_ERROR);
--- a/usr/src/cmd/format/io.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/format/io.c	Mon Dec 09 14:15:34 2019 +0000
@@ -49,7 +49,6 @@
 
 extern int	data_lineno;
 extern char	*space2str();
-extern long	strtol();
 
 /*
  * This variable is used to determine whether a token is present in the pipe
@@ -62,10 +61,6 @@
  */
 int	last_token_type = 0;
 
-#ifdef	__STDC__
-/*
- * Prototypes for ANSI C compilers
- */
 static int	sup_get_token(char *);
 static void	pushchar(int c);
 static int	checkeof(void);
@@ -79,33 +74,11 @@
 static void	sup_pushchar(int c);
 static int	geti64(char *str, uint64_t *iptr, uint64_t *wild);
 
-#else	/* __STDC__ */
-/*
- * Prototypes for non-ANSI C compilers
- */
-
-static int	sup_get_token();
-static void	pushchar(int c);
-static int	checkeof(void);
-static void	flushline(void);
-static int	strcnt(char *s1, char *s2);
-static int	getbn(char *str, diskaddr_t *iptr);
-static void	print_input_choices(int type, u_ioparam_t *param);
-static int	slist_widest_str(slist_t *slist);
-static void	ljust_print(char *str, int width);
-static int	sup_inputchar(void);
-static void	sup_pushchar(int c);
-static int	geti64(char *str, uint64_t *iptr, uint64_t *wild);
-
-#endif	/* __STDC__ */
-
-
 /*
  * This routine pushes the given character back onto the input stream.
  */
 static void
-pushchar(c)
-	int	c;
+pushchar(int c)
 {
 	(void) ungetc(c, stdin);
 }
@@ -114,7 +87,7 @@
  * This routine checks the input stream for an eof condition.
  */
 static int
-checkeof()
+checkeof(void)
 {
 	return (feof(stdin));
 }
@@ -124,8 +97,7 @@
  * basically any consecutive non-white characters.
  */
 char *
-gettoken(inbuf)
-	char	*inbuf;
+gettoken(char *inbuf)
 {
 	char	*ptr = inbuf;
 	int	c, quoted = 0;
@@ -196,8 +168,7 @@
  * This routine removes the leading and trailing spaces from a token.
  */
 void
-clean_token(cleantoken, token)
-	char	*cleantoken, *token;
+clean_token(char *cleantoken, char *token)
 {
 	char	*ptr;
 
@@ -214,7 +185,7 @@
 	 * Strip off trailing white-space.
 	 */
 	for (ptr = cleantoken + strlen(cleantoken) - 1;
-		isspace(*ptr) && (ptr >= cleantoken); ptr--) {
+	    isspace(*ptr) && (ptr >= cleantoken); ptr--) {
 		*ptr = '\0';
 	}
 }
@@ -254,8 +225,7 @@
  * between s1 and s2, stopping as soon as a mismatch is found.
  */
 static int
-strcnt(s1, s2)
-	char	*s1, *s2;
+strcnt(char *s1, char *s2)
 {
 	int	i = 0;
 
@@ -271,9 +241,7 @@
  * is present, the wildcard value will be returned.
  */
 int
-geti(str, iptr, wild)
-	char	*str;
-	int	*iptr, *wild;
+geti(char *str, int *iptr, int *wild)
 {
 	char	*str2;
 
@@ -306,9 +274,7 @@
  * is present, the wildcard value will be returned.
  */
 static int
-geti64(str, iptr, wild)
-	char		*str;
-	uint64_t	*iptr, *wild;
+geti64(char *str, uint64_t *iptr, uint64_t *wild)
 {
 	char	*str2;
 
@@ -345,9 +311,7 @@
  * to the highest possible legal value.
  */
 static int
-getbn(str, iptr)
-	char	*str;
-	diskaddr_t	*iptr;
+getbn(char *str, diskaddr_t *iptr)
 {
 	char	*cptr, *hptr, *sptr;
 	int	cyl, head, sect;
@@ -442,13 +406,8 @@
  * values and prompt strings.
  */
 uint64_t
-input(type, promptstr, delim, param, deflt, cmdflag)
-	int		type;
-	char		*promptstr;
-	int		delim;
-	u_ioparam_t	*param;
-	int		*deflt;
-	int		cmdflag;
+input(int type, char *promptstr, int delim, u_ioparam_t *param, int *deflt,
+    int cmdflag)
 {
 	int		interactive, help, i, length, index, tied;
 	blkaddr_t	bn;
@@ -586,7 +545,7 @@
 				cylno = bn2c(part_deflt->deflt_size) - 1;
 			} else {
 				cylno = (bn2c(part_deflt->deflt_size) +
-					    part_deflt->start_cyl) - 1;
+				    part_deflt->start_cyl) - 1;
 			}
 
 			fmt_print("[%ub, %uc, %de, %1.2fmb, %1.2fgb]",
@@ -609,11 +568,11 @@
 			    efi_deflt->end_sector,
 			    efi_deflt->start_sector + efi_deflt->end_sector - 1,
 			    (efi_deflt->end_sector * cur_blksz) /
-				(1024 * 1024),
+			    (1024 * 1024),
 			    (efi_deflt->end_sector * cur_blksz) /
-				(1024 * 1024 * 1024),
+			    (1024 * 1024 * 1024),
 			    (efi_deflt->end_sector * cur_blksz) /
-				((uint64_t)1024 * 1024 * 1024 * 1024));
+			    ((uint64_t)1024 * 1024 * 1024 * 1024));
 			break;
 		case FIO_OPINT:
 			/* no default value for optional input type */
@@ -659,9 +618,9 @@
 			 * exit gracefully.
 			 */
 			if ((strlcat(shell_argv, arg, sizeof (shell_argv)) >=
-				sizeof (shell_argv)) ||
+			    sizeof (shell_argv)) ||
 			    (strlcat(shell_argv, " ", sizeof (shell_argv)) >=
-				sizeof (shell_argv))) {
+			    sizeof (shell_argv))) {
 				err_print("Error: Command line too long.\n");
 				fullabort();
 			}
@@ -781,8 +740,8 @@
 	 * If token is a '?' or a 'h', it is a request for help.
 	 */
 	if ((strcmp(cleantoken, "?") == 0) ||
-		(strcmp(cleantoken, "h") == 0) ||
-			(strcmp(cleantoken, "help") == 0)) {
+	    (strcmp(cleantoken, "h") == 0) ||
+	    (strcmp(cleantoken, "help") == 0)) {
 		help = 1;
 	}
 	/*
@@ -813,12 +772,12 @@
 		 * Convert token to a disk block number.
 		 */
 		if (cur_label == L_TYPE_EFI) {
-		    if (geti64(cleantoken, (uint64_t *)&bn64,
-			(uint64_t *)NULL))
-			    break;
+			if (geti64(cleantoken, (uint64_t *)&bn64,
+			    (uint64_t *)NULL))
+				break;
 		} else {
-		    if (getbn(cleantoken, &bn64))
-			break;
+			if (getbn(cleantoken, &bn64))
+				break;
 		}
 		/*
 		 * Check to be sure it is within the legal bounds.
@@ -1066,8 +1025,7 @@
 	 * Return the value associated with the matched string.
 	 */
 	case FIO_SLIST:
-		i = find_value((slist_t *)param->io_slist,
-			cleantoken, &value);
+		i = find_value((slist_t *)param->io_slist, cleantoken, &value);
 		if (i == 1) {
 			return (value);
 		} else {
@@ -1520,14 +1478,14 @@
 			fmt_print("Expecting up to %llu sectors,",
 			    cur_parts->etoc->efi_last_u_lba);
 			fmt_print("or %llu megabytes,",
-			    (cur_parts->etoc->efi_last_u_lba * cur_blksz)/
-				(1024 * 1024));
+			    (cur_parts->etoc->efi_last_u_lba * cur_blksz) /
+			    (1024 * 1024));
 			fmt_print("or %llu gigabytes\n",
-			    (cur_parts->etoc->efi_last_u_lba * cur_blksz)/
-				(1024 * 1024 * 1024));
+			    (cur_parts->etoc->efi_last_u_lba * cur_blksz) /
+			    (1024 * 1024 * 1024));
 			fmt_print("or %llu terabytes\n",
-			    (cur_parts->etoc->efi_last_u_lba * cur_blksz)/
-				((uint64_t)1024 * 1024 * 1024 * 1024));
+			    (cur_parts->etoc->efi_last_u_lba * cur_blksz) /
+			    ((uint64_t)1024 * 1024 * 1024 * 1024));
 			break;
 		}
 
@@ -1591,12 +1549,12 @@
 			 * Token is number of blocks
 			 */
 			if (geti64(cleantoken, &blokno, (uint64_t *)NULL)) {
-			    break;
+				break;
 			}
 			if (blokno > bounds->upper) {
-			    err_print(
-"Number of blocks must be less that the total available blocks.\n");
-			    break;
+				err_print("Number of blocks must be less than "
+				    "the total available blocks.\n");
+				break;
 			}
 			return (blokno);
 
@@ -1614,8 +1572,8 @@
 			 * Some sanity check
 			 */
 			if (blokno < efi_deflt->start_sector) {
-				err_print(
-"End Sector must fall on or after start sector %llu\n",
+				err_print("End Sector must fall on or after "
+				    "start sector %llu\n",
 				    efi_deflt->start_sector);
 				break;
 			}
@@ -1624,8 +1582,8 @@
 			 * verify that our input is within range
 			 */
 			if (blokno > cur_parts->etoc->efi_last_u_lba) {
-				err_print(
-"End Sector %llu is beyond max Sector %llu\n",
+				err_print("End Sector %llu is beyond max "
+				    "Sector %llu\n",
 				    blokno, cur_parts->etoc->efi_last_u_lba);
 				break;
 			}
@@ -1681,11 +1639,11 @@
 				break;
 			}
 			return (uint64_t)((float)nmegs * 1024.0 *
-				1024.0 * 1024.0 * 1024.0 / cur_blksz);
+			    1024.0 * 1024.0 * 1024.0 / cur_blksz);
 
 		default:
-			err_print(
-"Please specify units in either b(number of blocks), e(end sector),\n");
+			err_print("Please specify units in either "
+			    "b(number of blocks), e(end sector),\n");
 			err_print(" g(gigabytes), m(megabytes)");
 			err_print(" or t(terabytes)\n");
 			break;
@@ -1721,9 +1679,7 @@
  * Print input choices
  */
 static void
-print_input_choices(type, param)
-	int		type;
-	u_ioparam_t	*param;
+print_input_choices(int type, u_ioparam_t *param)
 {
 	char		**sp;
 	slist_t		*lp;
@@ -1803,10 +1759,7 @@
  * associated with the matched string in match_value.
  */
 int
-find_value(slist, match_str, match_value)
-	slist_t		*slist;
-	char		*match_str;
-	int		*match_value;
+find_value(slist_t *slist, char *match_str, int *match_value)
 {
 	int		i;
 	int		nmatches;
@@ -1851,9 +1804,7 @@
  * Return the string associated with that value.
  */
 char *
-find_string(slist, match_value)
-	slist_t		*slist;
-	int		match_value;
+find_string(slist_t *slist, int match_value)
 {
 	for (; slist->str != NULL; slist++) {
 		if (slist->value == match_value) {
@@ -1861,15 +1812,14 @@
 		}
 	}
 
-	return ((char *)NULL);
+	return (NULL);
 }
 
 /*
  * Return the width of the widest string in an slist
  */
 static int
-slist_widest_str(slist)
-	slist_t	*slist;
+slist_widest_str(slist_t *slist)
 {
 	int	i;
 	int	width;
@@ -1887,9 +1837,7 @@
  * Print a string left-justified to a fixed width.
  */
 static void
-ljust_print(str, width)
-	char	*str;
-	int	width;
+ljust_print(char *str, int width)
 {
 	int	i;
 
@@ -2050,9 +1998,7 @@
  * data is not crud, so be rather defensive.
  */
 void
-print_buf(buf, nbytes)
-	char	*buf;
-	int	nbytes;
+print_buf(char *buf, int nbytes)
 {
 	int	c;
 
@@ -2072,13 +2018,12 @@
  * booting.
  */
 void
-pr_ctlrline(ctlr)
-	register struct ctlr_info *ctlr;
+pr_ctlrline(struct ctlr_info *ctlr)
 {
 
 	fmt_print("           %s%d at %s 0x%x ",
-		ctlr->ctlr_cname, ctlr->ctlr_num,
-		space2str(ctlr->ctlr_space), ctlr->ctlr_addr);
+	    ctlr->ctlr_cname, ctlr->ctlr_num,
+	    space2str(ctlr->ctlr_space), ctlr->ctlr_addr);
 	if (ctlr->ctlr_vec != 0)
 		fmt_print("vec 0x%x ", ctlr->ctlr_vec);
 	else
@@ -2093,9 +2038,7 @@
  * booting.
  */
 void
-pr_diskline(disk, num)
-	register struct disk_info *disk;
-	int	num;
+pr_diskline(struct disk_info *disk, int num)
 {
 	struct	ctlr_info *ctlr = disk->disk_ctlr;
 	struct	disk_type *type = disk->disk_type;
@@ -2103,13 +2046,13 @@
 	fmt_print("    %4d. %s ", num, disk->disk_name);
 	if ((type != NULL) && (disk->label_type == L_TYPE_SOLARIS)) {
 		fmt_print("<%s cyl %u alt %u hd %u sec %u>",
-			type->dtype_asciilabel, type->dtype_ncyl,
-			type->dtype_acyl, type->dtype_nhead,
-			type->dtype_nsect);
+		    type->dtype_asciilabel, type->dtype_ncyl,
+		    type->dtype_acyl, type->dtype_nhead,
+		    type->dtype_nsect);
 	} else if ((type != NULL) && (disk->label_type == L_TYPE_EFI)) {
 		cur_blksz = disk->disk_lbasize;
 		print_efi_string(type->vendor, type->product,
-			type->revision, type->capacity);
+		    type->revision, type->capacity);
 	} else if (disk->disk_flags & DSK_RESERVED) {
 		fmt_print("<drive not available: reserved>");
 	} else if (disk->disk_flags & DSK_UNAVAILABLE) {
@@ -2127,9 +2070,9 @@
 		fmt_print("          %s\n", disk->devfs_name);
 	} else {
 		fmt_print("          %s%d at %s%d slave %d\n",
-			ctlr->ctlr_dname, disk->disk_dkinfo.dki_unit,
-			ctlr->ctlr_cname, ctlr->ctlr_num,
-			disk->disk_dkinfo.dki_slave);
+		    ctlr->ctlr_dname, disk->disk_dkinfo.dki_unit,
+		    ctlr->ctlr_cname, ctlr->ctlr_num,
+		    disk->disk_dkinfo.dki_slave);
 	}
 
 #ifdef	OLD
@@ -2141,8 +2084,7 @@
 	}
 	fmt_print("\n");
 	if (type != NULL) {
-		fmt_print(
-"           %s%d: <%s cyl %u alt %u hd %u sec %u>\n",
+		fmt_print("           %s%d: <%s cyl %u alt %u hd %u sec %u>\n",
 		    ctlr->ctlr_dname, disk->disk_dkinfo.dki_unit,
 		    type->dtype_asciilabel, type->dtype_ncyl,
 		    type->dtype_acyl, type->dtype_nhead,
@@ -2175,7 +2117,7 @@
  * track of the current line in the data file via a global variable.
  */
 static int
-sup_inputchar()
+sup_inputchar(void)
 {
 	int	c;
 
@@ -2210,8 +2152,7 @@
  * This routine pushes a character back onto the input pipe for the data file.
  */
 static void
-sup_pushchar(c)
-	int	c;
+sup_pushchar(int c)
 {
 	(void) ungetc(c, data_file);
 }
@@ -2230,16 +2171,14 @@
  * last token around, which is useful for error recovery.
  */
 int
-sup_gettoken(buf)
-	char	*buf;
+sup_gettoken(char *buf)
 {
 	last_token_type = sup_get_token(buf);
 	return (last_token_type);
 }
 
 static int
-sup_get_token(buf)
-	char	*buf;
+sup_get_token(char *buf)
 {
 	char	*ptr = buf;
 	int	c, quoted = 0;
@@ -2288,7 +2227,7 @@
 		 * a token.
 		 */
 		if (!quoted && (c == '=' || c == ',' || c == ':' ||
-			c == '#' || c == '|' || c == '&' || c == '~'))
+		    c == '#' || c == '|' || c == '&' || c == '~'))
 			break;
 		/*
 		 * Store the character if there's room left.
@@ -2350,9 +2289,7 @@
  * Push back a token
  */
 void
-sup_pushtoken(token_buf, token_type)
-	char	*token_buf;
-	int	token_type;
+sup_pushtoken(char *token_buf, int token_type)
 {
 	/*
 	 * We can only push one token back at a time
@@ -2369,9 +2306,7 @@
  * and EOF.
  */
 void
-get_inputline(line, nbytes)
-	char	*line;
-	int	nbytes;
+get_inputline(char *line, int nbytes)
 {
 	char	*p = line;
 	int	c;
@@ -2481,9 +2416,9 @@
 	/* reopen file descriptor if one was open before */
 	if (cur_disk != NULL) {
 		if ((cur_file = open_disk(cur_disk->disk_path,
-			O_RDWR | O_NDELAY)) < 0) {
+		    O_RDWR | O_NDELAY)) < 0) {
 			err_print("Error: can't reopen selected disk '%s'. \n",
-				cur_disk->disk_name);
+			    cur_disk->disk_name);
 			fullabort();
 		}
 	}
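Most of the io.c hunk above is one mechanical transformation from the 11997
cstyle cleanup: old-style (K&R) parameter declarations become ANSI C
prototypes, with no change in behavior. The pattern in miniature
(hypothetical function, not one from the file):

	/* Before: K&R definition, parameter types declared separately. */
	static int
	add(a, b)
		int	a, b;
	{
		return (a + b);
	}

	/* After: ANSI definition; types move into the parameter list. */
	static int
	add(int a, int b)
	{
		return (a + b);
	}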
--- a/usr/src/cmd/format/label.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/format/label.c	Mon Dec 09 14:15:34 2019 +0000
@@ -282,7 +282,7 @@
 	if (cur_label == L_TYPE_EFI) {
 		enter_critical();
 		vtoc64 = cur_parts->etoc;
-		err_check(vtoc64);
+		efi_err_check(vtoc64);
 		if (efi_write(cur_file, vtoc64) != 0) {
 			err_print("Warning: error writing EFI.\n");
 			error = -1;
@@ -979,97 +979,6 @@
 	return (0);
 }
 
-/* make sure the user specified something reasonable */
-void
-err_check(struct dk_gpt *vtoc)
-{
-	int		resv_part = -1;
-	int		i, j;
-	diskaddr_t	istart, jstart, isize, jsize, endsect;
-	int		overlap = 0;
-	uint_t		reserved;
-
-	/*
-	 * make sure no partitions overlap
-	 */
-	reserved = efi_reserved_sectors(vtoc);
-	for (i = 0; i < vtoc->efi_nparts; i++) {
-		/* It can't be unassigned and have an actual size */
-		if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) &&
-		    (vtoc->efi_parts[i].p_size != 0)) {
-			(void) fprintf(stderr,
-"partition %d is \"unassigned\" but has a size of %llu\n", i,
-			    vtoc->efi_parts[i].p_size);
-		}
-		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) {
-			continue;
-		}
-		if (vtoc->efi_parts[i].p_tag == V_RESERVED) {
-			if (resv_part != -1) {
-				(void) fprintf(stderr,
-"found duplicate reserved partition at %d\n", i);
-			}
-			resv_part = i;
-			if (vtoc->efi_parts[i].p_size != reserved)
-				(void) fprintf(stderr,
-"Warning: reserved partition size must be %u sectors\n",
-				    reserved);
-		}
-		if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) ||
-		    (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) {
-			(void) fprintf(stderr,
-			    "Partition %d starts at %llu\n",
-			    i,
-			    vtoc->efi_parts[i].p_start);
-			(void) fprintf(stderr,
-			    "It must be between %llu and %llu.\n",
-			    vtoc->efi_first_u_lba,
-			    vtoc->efi_last_u_lba);
-		}
-		if ((vtoc->efi_parts[i].p_start +
-		    vtoc->efi_parts[i].p_size <
-		    vtoc->efi_first_u_lba) ||
-		    (vtoc->efi_parts[i].p_start +
-		    vtoc->efi_parts[i].p_size >
-		    vtoc->efi_last_u_lba + 1)) {
-			(void) fprintf(stderr,
-			    "Partition %d ends at %llu\n",
-			    i,
-			    vtoc->efi_parts[i].p_start +
-			    vtoc->efi_parts[i].p_size);
-			(void) fprintf(stderr,
-			    "It must be between %llu and %llu.\n",
-			    vtoc->efi_first_u_lba,
-			    vtoc->efi_last_u_lba);
-		}
-
-		for (j = 0; j < vtoc->efi_nparts; j++) {
-			isize = vtoc->efi_parts[i].p_size;
-			jsize = vtoc->efi_parts[j].p_size;
-			istart = vtoc->efi_parts[i].p_start;
-			jstart = vtoc->efi_parts[j].p_start;
-			if ((i != j) && (isize != 0) && (jsize != 0)) {
-				endsect = jstart + jsize -1;
-				if ((jstart <= istart) &&
-				    (istart <= endsect)) {
-					if (!overlap) {
-					(void) fprintf(stderr,
-"label error: EFI Labels do not support overlapping partitions\n");
-					}
-					(void) fprintf(stderr,
-"Partition %d overlaps partition %d.\n", i, j);
-					overlap = 1;
-				}
-			}
-		}
-	}
-	/* make sure there is a reserved partition */
-	if (resv_part == -1) {
-		(void) fprintf(stderr,
-		    "no reserved partition found\n");
-	}
-}
-
 #ifdef	DEBUG
 static void
 dump_label(struct dk_label *label)
--- a/usr/src/cmd/format/label.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/format/label.h	Mon Dec 09 14:15:34 2019 +0000
@@ -44,7 +44,7 @@
 int	get_disk_info(int, struct efi_info *, struct disk_info *);
 int	label_to_vtoc(struct extvtoc *, struct dk_label *);
 int	SMI_vtoc_to_EFI(int, struct dk_gpt **);
-void	err_check(struct dk_gpt *);
+void	efi_err_check(struct dk_gpt *);
 extern int	is_efi_type(int);
 
 #ifdef	__cplusplus
--- a/usr/src/cmd/format/menu_command.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/format/menu_command.c	Mon Dec 09 14:15:34 2019 +0000
@@ -1709,7 +1709,7 @@
 			return (-1);
 		}
 		if (efi_write(cur_file, vtoc64) != 0) {
-			err_check(vtoc64);
+			efi_err_check(vtoc64);
 			err_print("Warning: error writing EFI.\n");
 			return (-1);
 		} else {
--- a/usr/src/cmd/format/menu_partition.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/format/menu_partition.c	Mon Dec 09 14:15:34 2019 +0000
@@ -35,25 +35,15 @@
 #include "misc.h"
 #include "param.h"
 
-#ifdef __STDC__
-
 /* Function prototypes for ANSI C Compilers */
 static void	nspaces(int);
 static int	ndigits(uint64_t);
 
-#else	/* __STDC__ */
-
-/* Function prototypes for non-ANSI C Compilers */
-static void	nspaces();
-static int	ndigits();
-
-#endif	/* __STDC__ */
-
 /*
  * This routine implements the 'a' command.  It changes the 'a' partition.
  */
 int
-p_apart()
+p_apart(void)
 {
 
 	change_partition(0);
@@ -64,7 +54,7 @@
  * This routine implements the 'b' command.  It changes the 'b' partition.
  */
 int
-p_bpart()
+p_bpart(void)
 {
 
 	change_partition(1);
@@ -75,7 +65,7 @@
  * This routine implements the 'c' command.  It changes the 'c' partition.
  */
 int
-p_cpart()
+p_cpart(void)
 {
 
 	change_partition(2);
@@ -86,7 +76,7 @@
  * This routine implements the 'd' command.  It changes the 'd' partition.
  */
 int
-p_dpart()
+p_dpart(void)
 {
 
 	change_partition(3);
@@ -97,7 +87,7 @@
  * This routine implements the 'e' command.  It changes the 'e' partition.
  */
 int
-p_epart()
+p_epart(void)
 {
 
 	change_partition(4);
@@ -108,7 +98,7 @@
  * This routine implements the 'f' command.  It changes the 'f' partition.
  */
 int
-p_fpart()
+p_fpart(void)
 {
 
 	change_partition(5);
@@ -119,7 +109,7 @@
  * This routine implements the 'g' command.  It changes the 'g' partition.
  */
 int
-p_gpart()
+p_gpart(void)
 {
 
 	change_partition(6);
@@ -130,7 +120,7 @@
  * This routine implements the 'h' command.  It changes the 'h' partition.
  */
 int
-p_hpart()
+p_hpart(void)
 {
 
 	change_partition(7);
@@ -142,7 +132,7 @@
  * labeled disks. This can be used only in expert mode.
  */
 int
-p_ipart()
+p_ipart(void)
 {
 	change_partition(8);
 	return (0);
@@ -153,7 +143,7 @@
  * This routine implements the 'j' command.  It changes the 'j' partition.
  */
 int
-p_jpart()
+p_jpart(void)
 {
 
 	change_partition(9);
@@ -162,7 +152,7 @@
 #endif	/* defined(i386) */
 
 int
-p_expand()
+p_expand(void)
 {
 	uint64_t delta;
 	uint_t nparts;
@@ -193,7 +183,7 @@
  * to make a pre-defined partition map the current map.
  */
 int
-p_select()
+p_select(void)
 {
 	struct partition_info	*pptr, *parts;
 	u_ioparam_t		ioparam;
@@ -254,8 +244,8 @@
 	cyl_offset = pptr->pinfo_map[I_PARTITION].dkl_cylno + 1;
 	if (pptr->pinfo_map[J_PARTITION].dkl_nblk != 0) {
 		cyl_offset = pptr->pinfo_map[J_PARTITION].dkl_cylno +
-			((pptr->pinfo_map[J_PARTITION].dkl_nblk +
-				(spc() - 1)) / spc());
+		    ((pptr->pinfo_map[J_PARTITION].dkl_nblk +
+		    (spc() - 1)) / spc());
 	}
 #else	/* !defined(i386) */
 
@@ -281,11 +271,10 @@
 		}
 #endif		/* defined(i386) */
 		if (pptr->pinfo_map[i].dkl_cylno < b_cylno ||
-			pptr->pinfo_map[i].dkl_cylno > (ncyl-1)) {
-			err_print(
-"partition %c: starting cylinder %d is out of range\n",
-				(PARTITION_BASE+i),
-				pptr->pinfo_map[i].dkl_cylno);
+		    pptr->pinfo_map[i].dkl_cylno > (ncyl-1)) {
+			err_print("partition %c: starting cylinder %d is out "
+			    "of range\n", (PARTITION_BASE + i),
+			    pptr->pinfo_map[i].dkl_cylno);
 			return (0);
 		}
 		if (pptr->pinfo_map[i].dkl_nblk > ((ncyl -
@@ -336,7 +325,7 @@
  * to be created.
  */
 int
-p_name()
+p_name(void)
 {
 	char	*name;
 
@@ -493,25 +482,23 @@
 
 	ncyl2_digits = ndigits(map->efi_last_u_lba);
 	if (want_header) {
-	    fmt_print("Part      ");
-	    fmt_print("Tag    Flag     ");
-	    fmt_print("First Sector");
-	    nspaces(ncyl2_digits);
-	    fmt_print("Size");
-	    nspaces(ncyl2_digits);
-	    fmt_print("Last Sector\n");
+		fmt_print("Part      ");
+		fmt_print("Tag    Flag     ");
+		fmt_print("First Sector");
+		nspaces(ncyl2_digits);
+		fmt_print("Size");
+		nspaces(ncyl2_digits);
+		fmt_print("Last Sector\n");
 	}
 
 	fmt_print("  %d ", partnum);
-	s = find_string(ptag_choices,
-		(int)map->efi_parts[partnum].p_tag);
+	s = find_string(ptag_choices, (int)map->efi_parts[partnum].p_tag);
 	if (s == (char *)NULL)
 		s = "-";
 	nspaces(10 - (int)strlen(s));
 	fmt_print("%s", s);
 
-	s = find_string(pflag_choices,
-		(int)map->efi_parts[partnum].p_flag);
+	s = find_string(pflag_choices, (int)map->efi_parts[partnum].p_flag);
 	if (s == (char *)NULL)
 		s = "-";
 	nspaces(6 - (int)strlen(s));
@@ -521,28 +508,27 @@
 
 	secsize = map->efi_parts[partnum].p_size;
 	if (secsize == 0) {
-	    fmt_print("%16llu", map->efi_parts[partnum].p_start);
-	    nspaces(ncyl2_digits);
-	    fmt_print("  0     ");
+		fmt_print("%16llu", map->efi_parts[partnum].p_start);
+		nspaces(ncyl2_digits);
+		fmt_print("  0     ");
 	} else {
-	    fmt_print("%16llu", map->efi_parts[partnum].p_start);
-	    scaled = bn2mb(secsize);
-	    nspaces(ncyl2_digits - 5);
-	    if (scaled >= (float)1024.0 * 1024) {
-		fmt_print("%8.2fTB", scaled/((float)1024.0 * 1024));
-	    } else if (scaled >= (float)1024.0) {
-		fmt_print("%8.2fGB", scaled/(float)1024.0);
-	    } else {
-		fmt_print("%8.2fMB", scaled);
-	    }
+		fmt_print("%16llu", map->efi_parts[partnum].p_start);
+		scaled = bn2mb(secsize);
+		nspaces(ncyl2_digits - 5);
+		if (scaled >= (float)1024.0 * 1024) {
+			fmt_print("%8.2fTB", scaled/((float)1024.0 * 1024));
+		} else if (scaled >= (float)1024.0) {
+			fmt_print("%8.2fGB", scaled/(float)1024.0);
+		} else {
+			fmt_print("%8.2fMB", scaled);
+		}
 	}
 	nspaces(ncyl2_digits);
-	if ((map->efi_parts[partnum].p_start+secsize - 1) ==
-		UINT_MAX64) {
-	    fmt_print(" 0    \n");
+	if ((map->efi_parts[partnum].p_start + secsize - 1) == UINT_MAX64) {
+		fmt_print(" 0    \n");
 	} else {
-	    fmt_print(" %llu    \n",
-		map->efi_parts[partnum].p_start+secsize - 1);
+		fmt_print(" %llu    \n",
+		    map->efi_parts[partnum].p_start + secsize - 1);
 	}
 }
 
@@ -607,8 +593,7 @@
 	/*
 	 * Print the partition tag.  If invalid, print -
 	 */
-	s = find_string(ptag_choices,
-		(int)pinfo->vtoc.v_part[partnum].p_tag);
+	s = find_string(ptag_choices, (int)pinfo->vtoc.v_part[partnum].p_tag);
 	if (s == (char *)NULL)
 		s = "-";
 	nspaces(10 - (int)strlen(s));
@@ -617,9 +602,8 @@
 	/*
 	 * Print the partition flag.  If invalid print -
 	 */
-	s = find_string(pflag_choices,
-		(int)pinfo->vtoc.v_part[partnum].p_flag);
-	if (s == (char *)NULL)
+	s = find_string(pflag_choices, (int)pinfo->vtoc.v_part[partnum].p_flag);
+	if (s == NULL)
 		s = "-";
 	nspaces(6 - (int)strlen(s));
 	fmt_print("%s", s);
@@ -637,7 +621,7 @@
 		scaled = bn2mb(nblks);
 		if (scaled > (float)1024.0 * 1024.0) {
 			fmt_print("%8.2fTB    ",
-				scaled/((float)1024.0 * 1024.0));
+			    scaled/((float)1024.0 * 1024.0));
 		} else if (scaled > (float)1024.0) {
 			fmt_print("%8.2fGB    ", scaled/(float)1024.0);
 		} else {
@@ -666,8 +650,7 @@
  * Return true if a disk has a volume name
  */
 int
-chk_volname(disk)
-	struct disk_info	*disk;
+chk_volname(struct disk_info *disk)
 {
 	return (disk->v_volume[0] != 0);
 }
@@ -677,8 +660,7 @@
  * Print the volume name, if it appears to be set
  */
 void
-print_volname(disk)
-	struct disk_info	*disk;
+print_volname(struct disk_info *disk)
 {
 	int	i;
 	char	*p;
@@ -696,8 +678,7 @@
  * Print a number of spaces
  */
 static void
-nspaces(n)
-	int	n;
+nspaces(int n)
 {
 	while (n-- > 0)
 		fmt_print(" ");
@@ -707,8 +688,7 @@
  * Return the number of digits required to print a number
  */
 static int
-ndigits(n)
-	uint64_t	n;
+ndigits(uint64_t n)
 {
 	int	i;
 
--- a/usr/src/cmd/format/modify_partition.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/format/modify_partition.c	Mon Dec 09 14:15:34 2019 +0000
@@ -38,26 +38,11 @@
 #include "label.h"
 #include "auto_sense.h"
 
-#ifdef __STDC__
-
-/* Function prototypes for ANSI C Compilers */
-
 static void	adj_cyl_offset(struct dk_map32 *map);
 static int	check_map(struct dk_map32 *map);
 static void	get_user_map(struct dk_map32 *map, int float_part);
 static void	get_user_map_efi(struct dk_gpt *map, int float_part);
 
-#else	/* __STDC__ */
-
-/* Function prototypes for non-ANSI C Compilers */
-
-static void	adj_cyl_offset();
-static int	check_map();
-static void	get_user_map();
-static void	get_user_map_efi();
-
-#endif	/* __STDC__ */
-
 static char *partn_list[] = { "0", "1", "2", "3", "4", "5", "6", "7", NULL };
 
 static char *sel_list[] = { "0", "1", "2", "3", NULL };
@@ -69,7 +54,7 @@
  * Modify/Create a predefined partition table.
  */
 int
-p_modify()
+p_modify(void)
 {
 	struct	partition_info	tmp_pinfo[1];
 	struct	dk_map32	*map = tmp_pinfo->pinfo_map;
@@ -134,89 +119,92 @@
 	 */
 	if (cur_parts->pinfo_name != NULL) {
 		(void) snprintf(tmpstr, sizeof (tmpstr),
-			"\t0. Current partition table (%s)",
-			cur_parts->pinfo_name);
+		    "\t0. Current partition table (%s)",
+		    cur_parts->pinfo_name);
 	} else {
 		(void) sprintf(tmpstr,
-			"\t0. Current partition table (unnamed)");
+		    "\t0. Current partition table (unnamed)");
 	}
 
 	(void) snprintf(tmpstr2, sizeof (tmpstr2),
-"Select partitioning base:\n%s\n"
-"\t1. All Free Hog\n"
-"Choose base (enter number) ",
-		tmpstr);
+	    "Select partitioning base:\n%s\n"
+	    "\t1. All Free Hog\n"
+	    "Choose base (enter number) ",
+	    tmpstr);
 
 	ioparam.io_charlist = sel_list;
 	sel_type = input(FIO_MSTR, tmpstr2, '?', &ioparam,
-		&sel_type, DATA_INPUT);
+	    &sel_type, DATA_INPUT);
 
 	switch (cur_label) {
 	case L_TYPE_SOLARIS:
-	    if (sel_type == 0) {
-		/*
-		 * Check for invalid parameters but do
-		 * not modify the table.
-		 */
-		if (check_map(cur_parts->pinfo_map)) {
-			err_print("\
-Warning: Fix, or select a different partition table.\n");
-			return (0);
-		}
-		/*
-		 * Create partition map from existing map
-		 */
-		tmp_pinfo->vtoc = cur_parts->vtoc;
-		for (i = 0; i < NDKMAP; i++) {
-			map[i].dkl_nblk = cur_parts->pinfo_map[i].dkl_nblk;
-			map[i].dkl_cylno = cur_parts->pinfo_map[i].dkl_cylno;
-		}
-	    } else {
-		/*
-		 * Make an empty partition map, with all the space
-		 * in the c partition.
-		 */
-		set_vtoc_defaults(tmp_pinfo);
-		for (i = 0; i < NDKMAP; i++) {
-			map[i].dkl_nblk = 0;
-			map[i].dkl_cylno = 0;
-		}
-		map[C_PARTITION].dkl_nblk = ncyl * spc();
+		if (sel_type == 0) {
+			/*
+			 * Check for invalid parameters but do
+			 * not modify the table.
+			 */
+			if (check_map(cur_parts->pinfo_map)) {
+				err_print("Warning: Fix, or select a "
+				    "different partition table.\n");
+				return (0);
+			}
+			/*
+			 * Create partition map from existing map
+			 */
+			tmp_pinfo->vtoc = cur_parts->vtoc;
+			for (i = 0; i < NDKMAP; i++) {
+				map[i].dkl_nblk =
+				    cur_parts->pinfo_map[i].dkl_nblk;
+				map[i].dkl_cylno =
+				    cur_parts->pinfo_map[i].dkl_cylno;
+			}
+		} else {
+			/*
+			 * Make an empty partition map, with all the space
+			 * in the c partition.
+			 */
+			set_vtoc_defaults(tmp_pinfo);
+			for (i = 0; i < NDKMAP; i++) {
+				map[i].dkl_nblk = 0;
+				map[i].dkl_cylno = 0;
+			}
+			map[C_PARTITION].dkl_nblk = ncyl * spc();
 
 #if defined(i386)
-		/*
-		 * Adjust for the boot and possibly alternates partitions
-		 */
-		map[I_PARTITION].dkl_nblk = spc();
-		map[I_PARTITION].dkl_cylno = 0;
-		if (cur_ctype->ctype_ctype != DKC_SCSI_CCS) {
-			map[J_PARTITION].dkl_nblk = 2 * spc();
-			map[J_PARTITION].dkl_cylno = spc() / spc();
-		}
+			/*
+			 * Adjust for the boot and possibly alternates
+			 * partitions.
+			 */
+			map[I_PARTITION].dkl_nblk = spc();
+			map[I_PARTITION].dkl_cylno = 0;
+			if (cur_ctype->ctype_ctype != DKC_SCSI_CCS) {
+				map[J_PARTITION].dkl_nblk = 2 * spc();
+				map[J_PARTITION].dkl_cylno = spc() / spc();
+			}
 #endif			/* defined(i386) */
-	    }
-	    break;
+		}
+		break;
 	case L_TYPE_EFI:
-	    if (sel_type == 1) {
-		for (i = 0; i < cur_parts->etoc->efi_nparts; i++) {
-		    cur_parts->etoc->efi_parts[i].p_start = 0;
-		    cur_parts->etoc->efi_parts[i].p_size = 0;
+		if (sel_type == 1) {
+			for (i = 0; i < cur_parts->etoc->efi_nparts; i++) {
+				cur_parts->etoc->efi_parts[i].p_start = 0;
+				cur_parts->etoc->efi_parts[i].p_size = 0;
+			}
 		}
-	    }
-	    break;
+		break;
 	}
 
 	fmt_print("\n");
 	if (cur_label == L_TYPE_SOLARIS) {
-	    print_map(tmp_pinfo);
+		print_map(tmp_pinfo);
 	} else {
-	    print_map(cur_parts);
+		print_map(cur_parts);
 	}
 
 	ioparam.io_charlist = confirm_list;
-	if (input(FIO_MSTR,
-"Do you wish to continue creating a new partition\ntable based on above table",
-			'?', &ioparam, &inpt_dflt, DATA_INPUT)) {
+	if (input(FIO_MSTR, "Do you wish to continue creating a new "
+	    "partition\ntable based on above table",
+	    '?', &ioparam, &inpt_dflt, DATA_INPUT)) {
 		return (0);
 	}
 
@@ -228,11 +216,11 @@
 		free_hog = G_PARTITION;	/* default to g partition */
 		ioparam.io_charlist = partn_list;
 		free_hog = input(FIO_MSTR, "Free Hog partition", '?',
-			&ioparam, &free_hog, DATA_INPUT);
+		    &ioparam, &free_hog, DATA_INPUT);
 		/* disallow c partition */
 		if (free_hog == C_PARTITION) {
 			fmt_print("'%c' cannot be the 'Free Hog' partition.\n",
-				C_PARTITION + PARTITION_BASE);
+			    C_PARTITION + PARTITION_BASE);
 			free_hog = -1;
 			continue;
 		}
@@ -246,7 +234,7 @@
 			map[free_hog].dkl_nblk -= map[I_PARTITION].dkl_nblk;
 			if (cur_ctype->ctype_ctype != DKC_SCSI_CCS) {
 				map[free_hog].dkl_nblk -=
-					map[J_PARTITION].dkl_nblk;
+				    map[J_PARTITION].dkl_nblk;
 			}
 #endif			/* defined(i386) */
 			break;
@@ -256,11 +244,11 @@
 		 * the float partition.
 		 */
 		if (map[free_hog].dkl_nblk == 0) {
-			err_print("\
-Warning: No space available from Free Hog partition.\n");
+			err_print("Warning: No space available from Free Hog "
+			    "partition.\n");
 			ioparam.io_charlist = confirm_list;
 			if (input(FIO_MSTR, "Continue", '?',
-				&ioparam, &inpt_dflt, DATA_INPUT)) {
+			    &ioparam, &inpt_dflt, DATA_INPUT)) {
 				free_hog = -1;
 			}
 		}
@@ -268,27 +256,27 @@
 	inpt_dflt = 0;
 
 	if (cur_label == L_TYPE_EFI) {
-	    free_hog = G_PARTITION; /* default to g partition */
-	    ioparam.io_charlist = partn_list;
-	    free_hog = input(FIO_MSTR, "Free Hog partition", '?',
-		&ioparam, &free_hog, DATA_INPUT);
-	    /* disallow c partition */
-	    if (free_hog == C_PARTITION) {
-		fmt_print("'%c' cannot be the 'Free Hog' partition.\n",
-		    C_PARTITION + PARTITION_BASE);
-		return (-1);
-	    }
-	    get_user_map_efi(cur_parts->etoc, free_hog);
-	    print_map(cur_parts);
-	    if (check("Ready to label disk, continue")) {
-		return (-1);
-	    }
-	    fmt_print("\n");
-	    if (write_label()) {
-		err_print("Writing label failed\n");
-		return (-1);
-	    }
-	    return (0);
+		free_hog = G_PARTITION; /* default to g partition */
+		ioparam.io_charlist = partn_list;
+		free_hog = input(FIO_MSTR, "Free Hog partition", '?',
+		    &ioparam, &free_hog, DATA_INPUT);
+		/* disallow c partition */
+		if (free_hog == C_PARTITION) {
+			fmt_print("'%c' cannot be the 'Free Hog' partition.\n",
+			    C_PARTITION + PARTITION_BASE);
+			return (-1);
+		}
+		get_user_map_efi(cur_parts->etoc, free_hog);
+		print_map(cur_parts);
+		if (check("Ready to label disk, continue")) {
+			return (-1);
+		}
+		fmt_print("\n");
+		if (write_label()) {
+			err_print("Writing label failed\n");
+			return (-1);
+		}
+		return (0);
 	}
 	/*
 	 * get user modified partition table
@@ -304,9 +292,8 @@
 	print_map(tmp_pinfo);
 
 	ioparam.io_charlist = confirm_list;
-	if (input(FIO_MSTR, "\
-Okay to make this the current partition table", '?',
-		&ioparam, &inpt_dflt, DATA_INPUT)) {
+	if (input(FIO_MSTR, "Okay to make this the current partition table",
+	    '?', &ioparam, &inpt_dflt, DATA_INPUT)) {
 		return (0);
 	} else {
 		make_partition();
@@ -318,9 +305,9 @@
 			cur_parts->pinfo_map[i].dkl_cylno = map[i].dkl_cylno;
 #ifdef i386
 			cur_parts->vtoc.v_part[i].p_start =
-				map[i].dkl_cylno * nhead * nsect;
+			    map[i].dkl_cylno * nhead * nsect;
 			cur_parts->vtoc.v_part[i].p_size =
-				map[i].dkl_nblk;
+			    map[i].dkl_nblk;
 #endif
 		}
 		(void) p_name();
@@ -340,14 +327,11 @@
 	}
 }
 
-
-
 /*
  * Adjust cylinder offsets
  */
 static void
-adj_cyl_offset(map)
-	struct	dk_map32 *map;
+adj_cyl_offset(struct dk_map32 *map)
 {
 	int	i;
 	int	cyloffset = 0;
@@ -390,8 +374,7 @@
  * Check partition table
  */
 static int
-check_map(map)
-	struct	dk_map32 *map;
+check_map(struct dk_map32 *map)
 {
 	int		i;
 	int		cyloffset = 0;
@@ -411,16 +394,16 @@
 	 */
 	for (i = 0; i < NDKMAP; i++) {
 		if (map[i].dkl_cylno > (blkaddr32_t)ncyl-1) {
-			err_print("\
-Warning: Partition %c starting cylinder %d is out of range.\n",
-				(PARTITION_BASE+i), map[i].dkl_cylno);
+			err_print("Warning: Partition %c starting cylinder "
+			    "%d is out of range.\n",
+			    (PARTITION_BASE+i), map[i].dkl_cylno);
 			return (-1);
 		}
 		if (map[i].dkl_nblk >
-			(blkaddr32_t)(ncyl - map[i].dkl_cylno) * spc()) {
-			err_print("\
-Warning: Partition %c, specified # of blocks, %u, is out of range.\n",
-				(PARTITION_BASE+i), map[i].dkl_nblk);
+		    (blkaddr32_t)(ncyl - map[i].dkl_cylno) * spc()) {
+			err_print("Warning: Partition %c, specified # of "
+			    "blocks, %u, is out of range.\n",
+			    (PARTITION_BASE+i), map[i].dkl_nblk);
 			return (-1);
 		}
 		if (i != C_PARTITION && map[i].dkl_nblk) {
@@ -429,21 +412,21 @@
 				continue;
 #endif
 			if (map[i].dkl_cylno < cyloffset) {
-				err_print(
-"Warning: Overlapping partition (%c) in table.\n", PARTITION_BASE+i);
+				err_print("Warning: Overlapping partition "
+				    "(%c) in table.\n", PARTITION_BASE+i);
 				return (-1);
 			} else if (map[i].dkl_cylno > cyloffset) {
-				err_print(
-"Warning: Non-contiguous partition (%c) in table.\n", PARTITION_BASE+i);
+				err_print("Warning: Non-contiguous partition "
+				    "(%c) in table.\n", PARTITION_BASE+i);
 			}
 			cyloffset += (map[i].dkl_nblk + (spc()-1))/spc();
 			tot_blks = map[i].dkl_nblk;
 		}
 	}
 	if (tot_blks > map[C_PARTITION].dkl_nblk) {
-		err_print("\
-Warning: Total blocks used is greater than number of blocks in '%c'\n\
-\tpartition.\n", C_PARTITION + PARTITION_BASE);
+		err_print("Warning: Total blocks used is greater than number "
+		    "of blocks in '%c'\n\tpartition.\n",
+		    C_PARTITION + PARTITION_BASE);
 	return (-1);
 	}
 	return (0);
@@ -455,9 +438,7 @@
  * get user defined partitions
  */
 static void
-get_user_map(map, float_part)
-	struct	dk_map32 *map;
-	int	float_part;
+get_user_map(struct dk_map32 *map, int float_part)
 {
 	int		i;
 	blkaddr32_t	newsize;
@@ -471,24 +452,24 @@
 	for (i = 0; i < NDKMAP; i++) {
 		if (partn_list[i] == NULL)
 			break;
-		if ((i == C_PARTITION) || (i == float_part))
+		if ((i == C_PARTITION) || (i == float_part)) {
 			continue;
-		else {
+		} else {
 			ioparam.io_bounds.lower = 0;
 			ioparam.io_bounds.upper = map[i].dkl_nblk +
-				map[float_part].dkl_nblk;
+			    map[float_part].dkl_nblk;
 			deflt = map[i].dkl_nblk;
 			if (ioparam.io_bounds.upper == 0) {
-				err_print("\
-Warning: no space available for '%s' from Free Hog partition\n",
-					partn_list[i]);
+				err_print("Warning: no space available for "
+				    "'%s' from Free Hog partition\n",
+				    partn_list[i]);
 				continue;
 			}
 			(void) snprintf(tmpstr, sizeof (tmpstr),
-				"Enter size of partition '%s' ",
-				partn_list[i]);
+			    "Enter size of partition '%s' ",
+			    partn_list[i]);
 			newsize = (blkaddr32_t)input(FIO_CYL, tmpstr, ':',
-				&ioparam, (int *)&deflt, DATA_INPUT);
+			    &ioparam, (int *)&deflt, DATA_INPUT);
 			map[float_part].dkl_nblk -= (newsize - map[i].dkl_nblk);
 			map[i].dkl_nblk = newsize;
 		}
@@ -496,8 +477,7 @@
 }
 
 static struct partition_info *
-build_partition(tptr)
-struct disk_type *tptr;
+build_partition(struct disk_type *tptr)
 {
 	struct partition_info	*part;
 	struct dk_label		*label;
@@ -524,21 +504,20 @@
 	if (!build_default_partition(label, cur_ctype->ctype_ctype))
 		return (NULL);
 
-	part = (struct partition_info *)
-		    zalloc(sizeof (struct partition_info));
+	part = zalloc(sizeof (struct partition_info));
 	part->pinfo_name = alloc_string(tptr->dtype_asciilabel);
 	/*
 	 * Fill in the partition info from the label
 	 */
 	for (i = 0; i < NDKMAP; i++) {
 #if defined(_SUNOS_VTOC_8)
-	    part->pinfo_map[i] = label->dkl_map[i];
+		part->pinfo_map[i] = label->dkl_map[i];
 #else
-	    part->pinfo_map[i].dkl_cylno =
-		label->dkl_vtoc.v_part[i].p_start /
-		(blkaddr32_t)(tptr->dtype_nhead * tptr->dtype_nsect - apc);
-	    part->pinfo_map[i].dkl_nblk =
-		label->dkl_vtoc.v_part[i].p_size;
+		part->pinfo_map[i].dkl_cylno =
+		    label->dkl_vtoc.v_part[i].p_start /
+		    (blkaddr32_t)(tptr->dtype_nhead * tptr->dtype_nsect - apc);
+		part->pinfo_map[i].dkl_nblk =
+		    label->dkl_vtoc.v_part[i].p_size;
 #endif /* ifdefined(_SUNOS_VTOC_8) */
 	}
 	part->vtoc = label->dkl_vtoc;
@@ -549,11 +528,8 @@
  * build new partition table for given disk type
  */
 static void
-get_user_map_efi(map, float_part)
-	struct dk_gpt *map;
-	int	float_part;
+get_user_map_efi(struct dk_gpt *map, int float_part)
 {
-
 	int		i;
 	efi_deflt_t	efi_deflt;
 	u_ioparam_t	ioparam;
@@ -591,7 +567,7 @@
 	}
 	map->efi_parts[float_part].p_start = start_lba;
 	map->efi_parts[float_part].p_size = map->efi_last_u_lba + 1 -
-		start_lba - reserved;
+	    start_lba - reserved;
 	map->efi_parts[float_part].p_tag = V_USR;
 	if (map->efi_parts[float_part].p_size == 0) {
 		map->efi_parts[float_part].p_size = 0;
@@ -612,8 +588,7 @@
 
 
 void
-new_partitiontable(tptr, oldtptr)
-struct disk_type	*tptr, *oldtptr;
+new_partitiontable(struct disk_type *tptr, struct disk_type *oldtptr)
 {
 	struct partition_info *part;
 
@@ -622,16 +597,15 @@
 	 * partition table else copy the old partition table.(best guess).
 	 */
 	if ((oldtptr != NULL) &&
-		(tptr->dtype_ncyl ==  oldtptr->dtype_ncyl) &&
-		(tptr->dtype_nhead == oldtptr->dtype_nhead) &&
-		(tptr->dtype_nsect == oldtptr->dtype_nsect)) {
-
-	    part = (struct partition_info *)
-			zalloc(sizeof (struct partition_info));
-	    bcopy((char *)cur_parts, (char *)part,
-			sizeof (struct partition_info));
-	    part->pinfo_next = tptr->dtype_plist;
-	    tptr->dtype_plist = part;
+	    (tptr->dtype_ncyl == oldtptr->dtype_ncyl) &&
+	    (tptr->dtype_nhead == oldtptr->dtype_nhead) &&
+	    (tptr->dtype_nsect == oldtptr->dtype_nsect)) {
+		part = (struct partition_info *)
+		    zalloc(sizeof (struct partition_info));
+		bcopy((char *)cur_parts, (char *)part,
+		    sizeof (struct partition_info));
+		part->pinfo_next = tptr->dtype_plist;
+		tptr->dtype_plist = part;
 	} else {
 
 #ifdef DEBUG
--- a/usr/src/cmd/format/partition.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/format/partition.c	Mon Dec 09 14:15:34 2019 +0000
@@ -80,31 +80,31 @@
 	int		i;
 
 	for (i = 0; i < map->efi_nparts - 1; i++) {
-	    start[0] = map->efi_parts[i].p_start;
-	    size[0] = map->efi_parts[i].p_size;
-	    sec_no[0] = start[0] + size[0];
+		start[0] = map->efi_parts[i].p_start;
+		size[0] = map->efi_parts[i].p_size;
+		sec_no[0] = start[0] + size[0];
 
-	    start[1] = map->efi_parts[i+1].p_start;
-	    size[1] = map->efi_parts[i+1].p_size;
-	    sec_no[1] = start[1] + size[1];
+		start[1] = map->efi_parts[i + 1].p_start;
+		size[1] = map->efi_parts[i + 1].p_size;
+		sec_no[1] = start[1] + size[1];
 
-	    if (map->efi_parts[i].p_tag == V_BACKUP) {
-		sec_no[0] = 0;
-	    }
-	    if (map->efi_parts[i+1].p_tag == V_BACKUP) {
-		sec_no[1] = 0;
-	    }
-	    if (i == 0) {
-		max = sec_no[1];
-	    }
-	    if (sec_no[0] > max) {
-		max = sec_no[0];
-	    } else {
-		max = max;
-	    }
+		if (map->efi_parts[i].p_tag == V_BACKUP) {
+			sec_no[0] = 0;
+		}
+		if (map->efi_parts[i+1].p_tag == V_BACKUP) {
+			sec_no[1] = 0;
+		}
+		if (i == 0) {
+			max = sec_no[1];
+		}
+		if (sec_no[0] > max) {
+			max = sec_no[0];
+		} else {
+			max = max;
+		}
 	}
 	if (max == 0)
-	    max = map->efi_first_u_lba;
+		max = map->efi_first_u_lba;
 	return (max);
 }
 
@@ -136,66 +136,66 @@
 	}
 
 	if (cur_label == L_TYPE_EFI) {
-	    if (num > cur_parts->etoc->efi_nparts - 1) {
-		err_print("Invalid partition for EFI label\n");
-		return;
-	    }
-	    print_efi_partition(cur_parts->etoc, num, 1);
-	    fmt_print("\n");
+		if (num > cur_parts->etoc->efi_nparts - 1) {
+			err_print("Invalid partition for EFI label\n");
+			return;
+		}
+		print_efi_partition(cur_parts->etoc, num, 1);
+		fmt_print("\n");
 		/*
 		 * Prompt for p_tag and p_flag values for this partition
 		 */
-	    deflt = cur_parts->etoc->efi_parts[num].p_tag;
-	    if (deflt == V_UNASSIGNED) {
-		deflt = V_USR;
-	    }
-	    (void) sprintf(msg, "Enter partition id tag");
-	    ioparam.io_slist = ptag_choices;
-	    tag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT);
+		deflt = cur_parts->etoc->efi_parts[num].p_tag;
+		if (deflt == V_UNASSIGNED) {
+			deflt = V_USR;
+		}
+		(void) sprintf(msg, "Enter partition id tag");
+		ioparam.io_slist = ptag_choices;
+		tag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT);
 
-	    deflt = cur_parts->etoc->efi_parts[num].p_flag;
-	    (void) sprintf(msg, "Enter partition permission flags");
-	    ioparam.io_slist = pflag_choices;
-	    flag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT);
+		deflt = cur_parts->etoc->efi_parts[num].p_flag;
+		(void) sprintf(msg, "Enter partition permission flags");
+		ioparam.io_slist = pflag_choices;
+		flag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT);
 
-	    ioparam.io_bounds.lower = cur_parts->etoc->efi_first_u_lba;
-	    ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba;
+		ioparam.io_bounds.lower = cur_parts->etoc->efi_first_u_lba;
+		ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba;
 
-	    efi_deflt.start_sector = maxofN(cur_parts->etoc);
-	    if ((cur_parts->etoc->efi_parts[num].p_start != 0) &&
-		(cur_parts->etoc->efi_parts[num].p_size != 0)) {
-		    efi_deflt.start_sector =
-			cur_parts->etoc->efi_parts[num].p_start;
-	    }
-	    efi_deflt.end_sector = ioparam.io_bounds.upper -
-					efi_deflt.start_sector;
-	    i64 = input(FIO_INT64, "Enter new starting Sector", ':', &ioparam,
-		(int *)&efi_deflt, DATA_INPUT);
+		efi_deflt.start_sector = maxofN(cur_parts->etoc);
+		if ((cur_parts->etoc->efi_parts[num].p_start != 0) &&
+		    (cur_parts->etoc->efi_parts[num].p_size != 0)) {
+			efi_deflt.start_sector =
+			    cur_parts->etoc->efi_parts[num].p_start;
+		}
+		efi_deflt.end_sector = ioparam.io_bounds.upper -
+		    efi_deflt.start_sector;
+		i64 = input(FIO_INT64, "Enter new starting Sector", ':',
+		    &ioparam, (int *)&efi_deflt, DATA_INPUT);
 
-	    ioparam.io_bounds.lower = 0;
-	    ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba;
-	    efi_deflt.end_sector = cur_parts->etoc->efi_parts[num].p_size;
-	    efi_deflt.start_sector = i64;
-	    j64 = input(FIO_EFI, "Enter partition size", ':', &ioparam,
-		(int *)&efi_deflt, DATA_INPUT);
-	    if (j64 == 0) {
-		tag = V_UNASSIGNED;
-		i64 = 0;
-	    } else if ((j64 != 0) && (tag == V_UNASSIGNED)) {
-		tag = V_USR;
-	    }
+		ioparam.io_bounds.lower = 0;
+		ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba;
+		efi_deflt.end_sector = cur_parts->etoc->efi_parts[num].p_size;
+		efi_deflt.start_sector = i64;
+		j64 = input(FIO_EFI, "Enter partition size", ':', &ioparam,
+		    (int *)&efi_deflt, DATA_INPUT);
+		if (j64 == 0) {
+			tag = V_UNASSIGNED;
+			i64 = 0;
+		} else if ((j64 != 0) && (tag == V_UNASSIGNED)) {
+			tag = V_USR;
+		}
 
-	    if (cur_parts->pinfo_name != NULL)
-		make_partition();
+		if (cur_parts->pinfo_name != NULL)
+			make_partition();
 
-	    cur_parts->etoc->efi_parts[num].p_tag = tag;
-	    cur_parts->etoc->efi_parts[num].p_flag = flag;
-	    cur_parts->etoc->efi_parts[num].p_start = i64;
-	    cur_parts->etoc->efi_parts[num].p_size = j64;
-	/*
-	 * We are now done with EFI part, so return now
-	 */
-	    return;
+		cur_parts->etoc->efi_parts[num].p_tag = tag;
+		cur_parts->etoc->efi_parts[num].p_flag = flag;
+		cur_parts->etoc->efi_parts[num].p_start = i64;
+		cur_parts->etoc->efi_parts[num].p_size = j64;
+		/*
+		 * We are now done with EFI part, so return now
+		 */
+		return;
 	}
 	/*
 	 * Print out the given partition so the user knows what they're
@@ -237,9 +237,11 @@
 		if (tag != V_ALTSCTR) {
 			if (cur_parts->pinfo_map[J_PARTITION].dkl_nblk != 0) {
 				cyl_offset =
-				cur_parts->pinfo_map[J_PARTITION].dkl_cylno +
-				((cur_parts->pinfo_map[J_PARTITION].dkl_nblk +
-				(spc()-1)) / spc());
+				    cur_parts->
+				    pinfo_map[J_PARTITION].dkl_cylno +
+				    ((cur_parts->
+				    pinfo_map[J_PARTITION].dkl_nblk +
+				    (spc() - 1)) / spc());
 			}
 		}
 	}
@@ -247,8 +249,7 @@
 
 	ioparam.io_bounds.lower = 0;
 	ioparam.io_bounds.upper = ncyl - 1;
-	deflt = max(cur_parts->pinfo_map[num].dkl_cylno,
-		cyl_offset);
+	deflt = max(cur_parts->pinfo_map[num].dkl_cylno, cyl_offset);
 	i = (uint_t)input(FIO_INT, "Enter new starting cyl", ':', &ioparam,
 	    &deflt, DATA_INPUT);
 
@@ -257,9 +258,8 @@
 
 	/* fill in defaults for the current partition */
 	p_deflt.start_cyl = i;
-	p_deflt.deflt_size =
-		min(cur_parts->pinfo_map[num].dkl_nblk,
-		    ioparam.io_bounds.upper);
+	p_deflt.deflt_size = min(cur_parts->pinfo_map[num].dkl_nblk,
+	    ioparam.io_bounds.upper);
 
 	/* call input, passing p_deflt's address, typecast to (int *) */
 	j = (uint_t)input(FIO_ECYL, "Enter partition size", ':', &ioparam,
@@ -378,18 +378,18 @@
 	 */
 	enter_critical();
 	for (pptr = parts; pptr != NULL; pptr = pptr->pinfo_next) {
-	    if (cur_dtype->dtype_asciilabel) {
-		if (pptr->pinfo_name != NULL && strcmp(pptr->pinfo_name,
-				cur_dtype->dtype_asciilabel) == 0) {
-			/*
-			 * Set current partition and name it.
-			 */
-			cur_disk->disk_parts = cur_parts = pptr;
-			cur_parts->pinfo_name = pptr->pinfo_name;
-			exit_critical();
-			return (0);
+		if (cur_dtype->dtype_asciilabel) {
+			if (pptr->pinfo_name != NULL && strcmp(pptr->pinfo_name,
+			    cur_dtype->dtype_asciilabel) == 0) {
+				/*
+				 * Set current partition and name it.
+				 */
+				cur_disk->disk_parts = cur_parts = pptr;
+				cur_parts->pinfo_name = pptr->pinfo_name;
+				exit_critical();
+				return (0);
+			}
 		}
-	    }
 	}
 	/*
 	 * If we couldn't find a match, take the first one.
@@ -436,18 +436,19 @@
 	 * If there was a current map, copy its values.
 	 */
 	if (cur_label == L_TYPE_EFI) {
-	    struct dk_gpt	*map;
-	    int			nparts;
-	    int			size;
+		struct dk_gpt	*map;
+		int		nparts;
+		int		size;
 
-	    nparts = cur_parts->etoc->efi_nparts;
-	    size = sizeof (struct dk_part) * nparts + sizeof (struct dk_gpt);
-	    map = zalloc(size);
-	    (void) memcpy(map, cur_parts->etoc, size);
-	    pptr->etoc = map;
-	    cur_disk->disk_parts = cur_parts = pptr;
-	    exit_critical();
-	    return;
+		nparts = cur_parts->etoc->efi_nparts;
+		size = sizeof (struct dk_part) * nparts +
+		    sizeof (struct dk_gpt);
+		map = zalloc(size);
+		(void) memcpy(map, cur_parts->etoc, size);
+		pptr->etoc = map;
+		cur_disk->disk_parts = cur_parts = pptr;
+		exit_critical();
+		return;
 	}
 	if (cur_parts != NULL) {
 		for (i = 0; i < NDKMAP; i++) {
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Mon Dec 09 14:15:34 2019 +0000
@@ -55,6 +55,7 @@
 #include <sys/zfs_acl.h>
 #include <sys/sa_impl.h>
 #include <sys/multilist.h>
+#include <sys/btree.h>
 
 #ifdef _KERNEL
 #define	ZFS_OBJ_NAME	"zfs"
@@ -1462,13 +1463,15 @@
 	    0, NULL));
 }
 
-
-
 typedef struct mdb_range_tree {
 	struct {
-		uint64_t avl_numnodes;
+		uint64_t bt_num_elems;
+		uint64_t bt_num_nodes;
 	} rt_root;
 	uint64_t rt_space;
+	range_seg_type_t rt_type;
+	uint8_t		rt_shift;
+	uint64_t	rt_start;
 } mdb_range_tree_t;
 
 typedef struct mdb_metaslab_group {
@@ -1566,15 +1569,13 @@
 		    ms.ms_unflushed_frees, 0) == -1)
 			return (DCMD_ERR);
 		ufrees = rt.rt_space;
-		raw_uchanges_mem = rt.rt_root.avl_numnodes *
-		    mdb_ctf_sizeof_by_name("range_seg_t");
+		raw_uchanges_mem = rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE;
 
 		if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t",
 		    ms.ms_unflushed_allocs, 0) == -1)
 			return (DCMD_ERR);
 		uallocs = rt.rt_space;
-		raw_uchanges_mem += rt.rt_root.avl_numnodes *
-		    mdb_ctf_sizeof_by_name("range_seg_t");
+		raw_uchanges_mem += rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE;
 		mdb_nicenum(raw_uchanges_mem, uchanges_mem);
 
 		raw_free = ms.ms_size;
@@ -1644,14 +1645,12 @@
 		if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t",
 		    ms.ms_unflushed_frees, 0) == -1)
 			return (DCMD_ERR);
-		raw_uchanges_mem +=
-		    rt.rt_root.avl_numnodes * sizeof (range_seg_t);
+		raw_uchanges_mem += rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE;
 
 		if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t",
 		    ms.ms_unflushed_allocs, 0) == -1)
 			return (DCMD_ERR);
-		raw_uchanges_mem +=
-		    rt.rt_root.avl_numnodes * sizeof (range_seg_t);
+		raw_uchanges_mem += rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE;
 	}
 	mdb_nicenum(raw_uchanges_mem, uchanges_mem);
 	mdb_printf("%10s\n", uchanges_mem);
@@ -2669,6 +2668,202 @@
 	return (mdb_pwalk_dcmd("zio_root", "zio", argc, argv, addr));
 }
 
+
+typedef struct mdb_zfs_btree_hdr {
+	uintptr_t		bth_parent;
+	boolean_t		bth_core;
+	/*
+	 * For both leaf and core nodes, represents the number of elements in
+	 * the node. For core nodes, they will have bth_count + 1 children.
+	 */
+	uint32_t		bth_count;
+} mdb_zfs_btree_hdr_t;
+
+typedef struct mdb_zfs_btree_core {
+	mdb_zfs_btree_hdr_t	btc_hdr;
+	uintptr_t		btc_children[BTREE_CORE_ELEMS + 1];
+	uint8_t			btc_elems[];
+} mdb_zfs_btree_core_t;
+
+typedef struct mdb_zfs_btree_leaf {
+	mdb_zfs_btree_hdr_t	btl_hdr;
+	uint8_t			btl_elems[];
+} mdb_zfs_btree_leaf_t;
+
+typedef struct mdb_zfs_btree {
+	uintptr_t		bt_root;
+	size_t			bt_elem_size;
+} mdb_zfs_btree_t;
+
+typedef struct btree_walk_data {
+	mdb_zfs_btree_t		bwd_btree;
+	mdb_zfs_btree_hdr_t	*bwd_node;
+	uint64_t		bwd_offset; /* element index within bwd_node */
+} btree_walk_data_t;
+
+static uintptr_t
+btree_leftmost_child(uintptr_t addr, mdb_zfs_btree_hdr_t *buf)
+{
+	size_t size = offsetof(zfs_btree_core_t, btc_children) +
+	    sizeof (uintptr_t);
+	for (;;) {
+		if (mdb_vread(buf, size, addr) == -1) {
+			mdb_warn("failed to read at %p\n", addr);
+			return ((uintptr_t)0ULL);
+		}
+		if (!buf->bth_core)
+			return (addr);
+		mdb_zfs_btree_core_t *node = (mdb_zfs_btree_core_t *)buf;
+		addr = node->btc_children[0];
+	}
+}
+
+static int
+btree_walk_step(mdb_walk_state_t *wsp)
+{
+	btree_walk_data_t *bwd = wsp->walk_data;
+	size_t elem_size = bwd->bwd_btree.bt_elem_size;
+	if (wsp->walk_addr == 0ULL)
+		return (WALK_DONE);
+
+	if (!bwd->bwd_node->bth_core) {
+		/*
+		 * For the first element in a leaf node, read in the full
+		 * leaf, since we only had part of it read in before.
+		 */
+		if (bwd->bwd_offset == 0) {
+			if (mdb_vread(bwd->bwd_node, BTREE_LEAF_SIZE,
+			    wsp->walk_addr) == -1) {
+				mdb_warn("failed to read at %p\n",
+				    wsp->walk_addr);
+				return (WALK_ERR);
+			}
+		}
+
+		int status = wsp->walk_callback((uintptr_t)(wsp->walk_addr +
+		    offsetof(mdb_zfs_btree_leaf_t, btl_elems) +
+		    bwd->bwd_offset * elem_size), bwd->bwd_node,
+		    wsp->walk_cbdata);
+		if (status != WALK_NEXT)
+			return (status);
+		bwd->bwd_offset++;
+
+		/* Find the next element, if we're at the end of the leaf. */
+		while (bwd->bwd_offset == bwd->bwd_node->bth_count) {
+			uintptr_t par = bwd->bwd_node->bth_parent;
+			uintptr_t cur = wsp->walk_addr;
+			wsp->walk_addr = par;
+			if (par == 0ULL)
+				return (WALK_NEXT);
+
+			size_t size = sizeof (zfs_btree_core_t) +
+			    BTREE_CORE_ELEMS * elem_size;
+			if (mdb_vread(bwd->bwd_node, size, wsp->walk_addr) ==
+			    -1) {
+				mdb_warn("failed to read at %p\n",
+				    wsp->walk_addr);
+				return (WALK_ERR);
+			}
+			mdb_zfs_btree_core_t *node =
+			    (mdb_zfs_btree_core_t *)bwd->bwd_node;
+			int i;
+			for (i = 0; i <= bwd->bwd_node->bth_count; i++) {
+				if (node->btc_children[i] == cur)
+					break;
+			}
+			if (i > bwd->bwd_node->bth_count) {
+				mdb_warn("btree parent/child mismatch at "
+				    "%#lx\n", cur);
+				return (WALK_ERR);
+			}
+			bwd->bwd_offset = i;
+		}
+		return (WALK_NEXT);
+	}
+
+	if (!bwd->bwd_node->bth_core) {
+		mdb_warn("Invalid btree node at %#lx\n", wsp->walk_addr);
+		return (WALK_ERR);
+	}
+	mdb_zfs_btree_core_t *node = (mdb_zfs_btree_core_t *)bwd->bwd_node;
+	int status = wsp->walk_callback((uintptr_t)(wsp->walk_addr +
+	    offsetof(mdb_zfs_btree_core_t, btc_elems) + bwd->bwd_offset *
+	    elem_size), bwd->bwd_node, wsp->walk_cbdata);
+	if (status != WALK_NEXT)
+		return (status);
+
+	uintptr_t new_child = node->btc_children[bwd->bwd_offset + 1];
+	wsp->walk_addr = btree_leftmost_child(new_child, bwd->bwd_node);
+	if (wsp->walk_addr == 0ULL)
+		return (WALK_ERR);
+
+	bwd->bwd_offset = 0;
+	return (WALK_NEXT);
+}
+
+static int
+btree_walk_init(mdb_walk_state_t *wsp)
+{
+	btree_walk_data_t *bwd;
+
+	if (wsp->walk_addr == 0ULL) {
+		mdb_warn("must supply address of zfs_btree_t\n");
+		return (WALK_ERR);
+	}
+
+	bwd = mdb_zalloc(sizeof (btree_walk_data_t), UM_SLEEP);
+	if (mdb_ctf_vread(&bwd->bwd_btree, "zfs_btree_t", "mdb_zfs_btree_t",
+	    wsp->walk_addr, 0) == -1) {
+		mdb_free(bwd, sizeof (*bwd));
+		return (WALK_ERR);
+	}
+
+	if (bwd->bwd_btree.bt_elem_size == 0) {
+		mdb_warn("invalid or uninitialized btree at %#lx\n",
+		    wsp->walk_addr);
+		mdb_free(bwd, sizeof (*bwd));
+		return (WALK_ERR);
+	}
+
+	size_t size = MAX(BTREE_LEAF_SIZE, sizeof (zfs_btree_core_t) +
+	    BTREE_CORE_ELEMS * bwd->bwd_btree.bt_elem_size);
+	bwd->bwd_node = mdb_zalloc(size, UM_SLEEP);
+
+	uintptr_t node = (uintptr_t)bwd->bwd_btree.bt_root;
+	if (node == 0ULL) {
+		wsp->walk_addr = 0ULL;
+		wsp->walk_data = bwd;
+		return (WALK_NEXT);
+	}
+	node = btree_leftmost_child(node, bwd->bwd_node);
+	if (node == 0ULL) {
+		mdb_free(bwd->bwd_node, size);
+		mdb_free(bwd, sizeof (*bwd));
+		return (WALK_ERR);
+	}
+	bwd->bwd_offset = 0;
+
+	wsp->walk_addr = node;
+	wsp->walk_data = bwd;
+	return (WALK_NEXT);
+}
+
+static void
+btree_walk_fini(mdb_walk_state_t *wsp)
+{
+	btree_walk_data_t *bwd = (btree_walk_data_t *)wsp->walk_data;
+
+	if (bwd == NULL)
+		return;
+
+	size_t size = MAX(BTREE_LEAF_SIZE, sizeof (zfs_btree_core_t) +
+	    BTREE_CORE_ELEMS * bwd->bwd_btree.bt_elem_size);
+	if (bwd->bwd_node != NULL)
+		mdb_free(bwd->bwd_node, size);
+
+	mdb_free(bwd, sizeof (*bwd));
+}
+
 typedef struct mdb_multilist {
 	uint64_t ml_num_sublists;
 	uintptr_t ml_sublists;
@@ -4170,23 +4365,43 @@
 	return (rc);
 }
 
-typedef struct mdb_range_seg {
+typedef struct mdb_range_seg64 {
 	uint64_t rs_start;
 	uint64_t rs_end;
-} mdb_range_seg_t;
+} mdb_range_seg64_t;
+
+typedef struct mdb_range_seg32 {
+	uint32_t rs_start;
+	uint32_t rs_end;
+} mdb_range_seg32_t;
 
 /* ARGSUSED */
 static int
 range_tree_cb(uintptr_t addr, const void *unknown, void *arg)
 {
-	mdb_range_seg_t rs;
-
-	if (mdb_ctf_vread(&rs, ZFS_STRUCT "range_seg", "mdb_range_seg_t",
-	    addr, 0) == -1)
-		return (DCMD_ERR);
-
-	mdb_printf("\t[%llx %llx) (length %llx)\n",
-	    rs.rs_start, rs.rs_end, rs.rs_end - rs.rs_start);
+	mdb_range_tree_t *rt = (mdb_range_tree_t *)arg;
+	uint64_t start, end;
+
+	if (rt->rt_type == RANGE_SEG64) {
+		mdb_range_seg64_t rs;
+
+		if (mdb_ctf_vread(&rs, ZFS_STRUCT "range_seg64",
+		    "mdb_range_seg64_t", addr, 0) == -1)
+			return (DCMD_ERR);
+		start = rs.rs_start;
+		end = rs.rs_end;
+	} else {
+		ASSERT3U(rt->rt_type, ==, RANGE_SEG32);
+		mdb_range_seg32_t rs;
+
+		if (mdb_ctf_vread(&rs, ZFS_STRUCT "range_seg32",
+		    "mdb_range_seg32_t", addr, 0) == -1)
+			return (DCMD_ERR);
+		start = ((uint64_t)rs.rs_start << rt->rt_shift) + rt->rt_start;
+		end = ((uint64_t)rs.rs_end << rt->rt_shift) + rt->rt_start;
+	}
+
+	mdb_printf("\t[%llx %llx) (length %llx)\n", start, end, end - start);
 
 	return (0);
 }
@@ -4197,7 +4412,7 @@
     const mdb_arg_t *argv)
 {
 	mdb_range_tree_t rt;
-	uintptr_t avl_addr;
+	uintptr_t btree_addr;
 
 	if (!(flags & DCMD_ADDRSPEC))
 		return (DCMD_USAGE);
@@ -4207,12 +4422,12 @@
 		return (DCMD_ERR);
 
 	mdb_printf("%p: range tree of %llu entries, %llu bytes\n",
-	    addr, rt.rt_root.avl_numnodes, rt.rt_space);
-
-	avl_addr = addr +
+	    addr, rt.rt_root.bt_num_elems, rt.rt_space);
+
+	btree_addr = addr +
 	    mdb_ctf_offsetof_by_name(ZFS_STRUCT "range_tree", "rt_root");
 
-	if (mdb_pwalk("avl", range_tree_cb, NULL, avl_addr) != 0) {
+	if (mdb_pwalk("zfs_btree", range_tree_cb, &rt, btree_addr) != 0) {
 		mdb_warn("can't walk range_tree segments");
 		return (DCMD_ERR);
 	}
@@ -4407,6 +4622,8 @@
 	{ "zfs_acl_node_aces0",
 	    "given a zfs_acl_node_t, walk all ACEs as ace_t",
 	    zfs_acl_node_aces0_walk_init, zfs_aces_walk_step, NULL },
+	{ "zfs_btree", "given a zfs_btree_t *, walk all entries",
+	    btree_walk_init, btree_walk_step, btree_walk_fini },
 	{ NULL }
 };
 
--- a/usr/src/cmd/zdb/zdb.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/cmd/zdb/zdb.c	Mon Dec 09 14:15:34 2019 +0000
@@ -86,21 +86,13 @@
 	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
 	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
 
-#ifndef lint
 extern int reference_tracking_enable;
 extern boolean_t zfs_recover;
 extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
 extern int zfs_vdev_async_read_max_active;
 extern int aok;
 extern boolean_t spa_load_verify_dryrun;
-#else
-int reference_tracking_enable;
-boolean_t zfs_recover;
-uint64_t zfs_arc_max, zfs_arc_meta_limit;
-int zfs_vdev_async_read_max_active;
-int aok;
-boolean_t spa_load_verify_dryrun;
-#endif
+extern int zfs_btree_verify_intensity;
 
 static const char cmdname[] = "zdb";
 uint8_t dump_opt[256];
@@ -896,7 +888,7 @@
 {
 	char maxbuf[32];
 	range_tree_t *rt = msp->ms_allocatable;
-	avl_tree_t *t = &msp->ms_allocatable_by_size;
+	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
 	/* max sure nicenum has enough space */
@@ -905,7 +897,7 @@
 	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
-	    "segments", avl_numnodes(t), "maxsize", maxbuf,
+	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
 	    "freepct", free_pct);
 	(void) printf("\tIn-memory histogram:\n");
 	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
@@ -3388,7 +3380,7 @@
 
 	ASSERT0(range_tree_space(svr->svr_allocd_segs));
 
-	range_tree_t *allocs = range_tree_create(NULL, NULL);
+	range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 
@@ -5242,7 +5234,8 @@
 	}
 
 	if (dump_opt['d'] || dump_opt['i']) {
-		mos_refd_objs = range_tree_create(NULL, NULL);
+		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+		    0);
 		dump_dir(dp->dp_meta_objset);
 
 		if (dump_opt['d'] >= 3) {
@@ -5759,6 +5752,13 @@
 	if (spa_config_path_env != NULL)
 		spa_config_path = spa_config_path_env;
 
+	/*
+	 * For performance reasons, we set this tunable down. We do so before
+	 * the arg parsing section so that the user can override this value if
+	 * they choose.
+	 */
+	zfs_btree_verify_intensity = 3;
+
 	while ((c = getopt(argc, argv,
 	    "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
 		switch (c) {
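
For illustration: since zfs_btree_verify_intensity is an ordinary libzpool
global, the value zdb sets here can presumably be overridden from the command
line via the -o option visible in the getopt string above, e.g.

	# zdb -o zfs_btree_verify_intensity=5 tank

to re-enable the most expensive verification levels for a debugging run.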
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c	Mon Dec 09 14:15:34 2019 +0000
@@ -785,7 +785,9 @@
 
 	rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special);
 
-	return (AVL_ISIGN(rv));
+	if (rv == 0)
+		return (0);
+	return (rv > 0 ? 1 : -1);
 }
 
 void
--- a/usr/src/lib/libzfs/common/libzfs_iter.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_iter.c	Mon Dec 09 14:15:34 2019 +0000
@@ -284,7 +284,11 @@
 	lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
 	rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
 
-	return (AVL_CMP(lcreate, rcreate));
+	if (lcreate < rcreate)
+		return (-1);
+	if (lcreate > rcreate)
+		return (+1);
+	return (0);
 }
 
 int
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Mon Dec 09 14:15:34 2019 +0000
@@ -493,7 +493,11 @@
 	const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1;
 	const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2;
 
-	return (AVL_CMP(fn1->fn_guid, fn2->fn_guid));
+	if (fn1->fn_guid > fn2->fn_guid)
+		return (+1);
+	if (fn1->fn_guid < fn2->fn_guid)
+		return (-1);
+	return (0);
 }
 
 /*
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Mon Dec 09 14:15:34 2019 +0000
@@ -328,6 +328,15 @@
 #define	INGLOBALZONE(z)			(1)
 extern uint32_t zone_get_hostid(void *zonep);
 
+/*
+ * In ZoL the following defines were added to their sys/avl.h header, but
+ * we want to limit these to the ZFS code on illumos.
+ */
+#define	TREE_ISIGN(a)	(((a) > 0) - ((a) < 0))
+#define	TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
+#define	TREE_PCMP(a, b)	\
+	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
 extern int zfs_secpolicy_rename_perms(const char *from, const char *to,
     cred_t *cr);
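
These macros evaluate to exactly -1, 0, or +1. As a sketch (not part of this
change), the fsavl comparator from the libzfs_sendrecv.c hunk above could be
written with them in code that can see these definitions; the libzfs hunks
open-code the comparison instead, presumably because the macros are scoped to
the ZFS/libzpool sources:

	static int
	fsavl_compare(const void *arg1, const void *arg2)
	{
		const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1;
		const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2;

		/* TREE_CMP evaluates to exactly -1, 0, or +1 */
		return (TREE_CMP(fn1->fn_guid, fn2->fn_guid));
	}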
--- a/usr/src/uts/common/Makefile.files	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/Makefile.files	Mon Dec 09 14:15:34 2019 +0000
@@ -1365,6 +1365,7 @@
 	bplist.o		\
 	bpobj.o			\
 	bptree.o		\
+	btree.o			\
 	bqueue.o		\
 	cityhash.o		\
 	dbuf.o			\
--- a/usr/src/uts/common/fs/zfs/arc.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/arc.c	Mon Dec 09 14:15:34 2019 +0000
@@ -4943,7 +4943,7 @@
 	kmem_cache_t		*prev_data_cache = NULL;
 	extern kmem_cache_t	*zio_buf_cache[];
 	extern kmem_cache_t	*zio_data_buf_cache[];
-	extern kmem_cache_t	*range_seg_cache;
+	extern kmem_cache_t	*zfs_btree_leaf_cache;
 	extern kmem_cache_t	*abd_chunk_cache;
 
 #ifdef _KERNEL
@@ -4976,7 +4976,7 @@
 	kmem_cache_reap_soon(buf_cache);
 	kmem_cache_reap_soon(hdr_full_cache);
 	kmem_cache_reap_soon(hdr_l2only_cache);
-	kmem_cache_reap_soon(range_seg_cache);
+	kmem_cache_reap_soon(zfs_btree_leaf_cache);
 
 	if (zio_arena != NULL) {
 		/*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/btree.c	Mon Dec 09 14:15:34 2019 +0000
@@ -0,0 +1,2124 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#include	<sys/btree.h>
+#include	<sys/bitops.h>
+#include	<sys/zfs_context.h>
+
+kmem_cache_t *zfs_btree_leaf_cache;
+
+/*
+ * Control the extent of the verification that occurs when zfs_btree_verify is
+ * called. Primarily used for debugging when extending the btree logic and
+ * functionality. As the intensity is increased, new verification steps are
+ * added. These steps are cumulative; intensity = 3 includes the intensity = 1
+ * and intensity = 2 steps as well.
+ *
+ * Intensity 1: Verify that the tree's height is consistent throughout.
+ * Intensity 2: Verify that a core node's children's parent pointers point
+ * to the core node.
+ * Intensity 3: Verify that the total number of elements in the tree matches the
+ * sum of the number of elements in each node. Also verifies that each node's
+ * count obeys the invariants (less than or equal to maximum value, greater than
+ * or equal to half the maximum minus one).
+ * Intensity 4: Verify that each element compares less than the element
+ * immediately after it and greater than the one immediately before it using the
+ * comparator function. For core nodes, also checks that each element is greater
+ * than the last element in the first of the two nodes it separates, and less
+ * than the first element in the second of the two nodes.
+ * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
+ * of each node is poisoned appropriately. Note that poisoning always occurs if
+ * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
+ * operation.
+ *
+ * Intensity 4 and 5 are particularly expensive to perform; the previous levels
+ * are a few memory operations per node, while these levels require multiple
+ * operations per element. In addition, when creating large btrees, these
+ * operations are called at every step, resulting in extremely slow operation
+ * (while the asymptotic complexity of the other steps is the same, the
+ * importance of the constant factors cannot be denied).
+ */
+int zfs_btree_verify_intensity = 0;
+
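
Aside, illustrative only: as a plain int global this tunable can presumably
also be raised on a live kernel, e.g. "zfs_btree_verify_intensity/W 3" under
mdb -kw, subject to the cost warnings above; the zdb.c hunk earlier in this
merge sets it to 3 for that utility.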
+/*
+ * A convenience function to silence warnings from memmove's return value and
+ * change argument order to src, dest.
+ */
+void
+bmov(const void *src, void *dest, size_t size)
+{
+	(void) memmove(dest, src, size);
+}
+
+#ifdef _ILP32
+#define	BTREE_POISON 0xabadb10c
+#else
+#define	BTREE_POISON 0xabadb10cdeadbeef
+#endif
+
+static void
+zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+#ifdef ZFS_DEBUG
+	size_t size = tree->bt_elem_size;
+	if (!hdr->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+		(void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f,
+		    BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) -
+		    hdr->bth_count * size);
+	} else {
+		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+		for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+			node->btc_children[i] =
+			    (zfs_btree_hdr_t *)BTREE_POISON;
+		}
+		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
+		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
+	}
+#endif
+}
+
+static inline void
+zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+    uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+	size_t size = tree->bt_elem_size;
+	ASSERT3U(offset, >=, hdr->bth_count);
+	if (!hdr->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+		(void) memset(leaf->btl_elems + offset * size, 0x0f, size);
+	} else {
+		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+		node->btc_children[offset + 1] =
+		    (zfs_btree_hdr_t *)BTREE_POISON;
+		(void) memset(node->btc_elems + offset * size, 0x0f, size);
+	}
+#endif
+}
+
+static inline void
+zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+    uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+	size_t size = tree->bt_elem_size;
+	uint8_t eval = 0x0f;
+	if (hdr->bth_core) {
+		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
+		VERIFY3P(node->btc_children[offset + 1], ==, cval);
+		for (int i = 0; i < size; i++)
+			VERIFY3U(node->btc_elems[offset * size + i], ==, eval);
+	} else  {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+		for (int i = 0; i < size; i++)
+			VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval);
+	}
+#endif
+}
+
+void
+zfs_btree_init(void)
+{
+	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
+	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL,
+	    NULL, 0);
+}
+
+void
+zfs_btree_fini(void)
+{
+	kmem_cache_destroy(zfs_btree_leaf_cache);
+}
+
+void
+zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
+    size_t size)
+{
+	/*
+	 * We need a minimum of 4 elements so that when we split a node we
+	 * always have at least two elements in each node. This simplifies the
+	 * logic in zfs_btree_bulk_finish, since it means the last leaf will
+	 * always have a left sibling to share with (unless it's the root).
+	 */
+	ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4);
+
+	bzero(tree, sizeof (*tree));
+	tree->bt_compar = compar;
+	tree->bt_elem_size = size;
+	tree->bt_height = -1;
+	tree->bt_bulk = NULL;
+}
+
+/*
+ * Find value in the array of elements provided. Uses a simple binary search.
+ */
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems,
+    const void *value, zfs_btree_index_t *where)
+{
+	uint64_t max = nelems;
+	uint64_t min = 0;
+	while (max > min) {
+		uint64_t idx = (min + max) / 2;
+		uint8_t *cur = buf + idx * tree->bt_elem_size;
+		int comp = tree->bt_compar(cur, value);
+		if (comp == -1) {
+			min = idx + 1;
+		} else if (comp == 1) {
+			max = idx;
+		} else {
+			ASSERT0(comp);
+			where->bti_offset = idx;
+			where->bti_before = B_FALSE;
+			return (cur);
+		}
+	}
+
+	where->bti_offset = max;
+	where->bti_before = B_TRUE;
+	return (NULL);
+}
+
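
Note, editorial: the binary search above tests the comparator result against
-1 and 1 exactly, so comparators handed to zfs_btree_create() must return
values in {-1, 0, +1}. The TREE_CMP macros added to zfs_context.h in this
merge satisfy that contract, while a raw strcmp() or subtraction result
would not.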
+/*
+ * Find the given value in the tree. "where" may be passed as NULL when the
+ * caller only needs a membership test, or when the btree is being used as a
+ * map.
+ */
+void *
+zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		if (where != NULL) {
+			where->bti_node = NULL;
+			where->bti_offset = 0;
+		}
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+
+	/*
+	 * If we're in bulk-insert mode, we check the last spot in the tree
+	 * and the last leaf in the tree before doing the normal search,
+	 * because for most workloads the vast majority of finds in
+	 * bulk-insert mode are to insert new elements.
+	 */
+	zfs_btree_index_t idx;
+	if (tree->bt_bulk != NULL) {
+		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
+		int compar = tree->bt_compar(last_leaf->btl_elems +
+		    ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size),
+		    value);
+		if (compar < 0) {
+			/*
+			 * If what they're looking for is after the last
+			 * element, it's not in the tree.
+			 */
+			if (where != NULL) {
+				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+				where->bti_offset =
+				    last_leaf->btl_hdr.bth_count;
+				where->bti_before = B_TRUE;
+			}
+			return (NULL);
+		} else if (compar == 0) {
+			if (where != NULL) {
+				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+				where->bti_offset =
+				    last_leaf->btl_hdr.bth_count - 1;
+				where->bti_before = B_FALSE;
+			}
+			return (last_leaf->btl_elems +
+			    ((last_leaf->btl_hdr.bth_count - 1) *
+			    tree->bt_elem_size));
+		}
+		if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) {
+			/*
+			 * If what they're looking for is after the first
+			 * element in the last leaf, it's in the last leaf or
+			 * it's not in the tree.
+			 */
+			void *d = zfs_btree_find_in_buf(tree,
+			    last_leaf->btl_elems, last_leaf->btl_hdr.bth_count,
+			    value, &idx);
+
+			if (where != NULL) {
+				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
+				*where = idx;
+			}
+			return (d);
+		}
+	}
+
+	zfs_btree_core_t *node = NULL;
+	uint64_t child = 0;
+	uint64_t depth = 0;
+
+	/*
+	 * Iterate down the tree, finding which child the value should be in
+	 * by comparing with the separators.
+	 */
+	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
+	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
+		ASSERT3P(node, !=, NULL);
+		void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+		    node->btc_hdr.bth_count, value, &idx);
+		EQUIV(d != NULL, !idx.bti_before);
+		if (d != NULL) {
+			if (where != NULL) {
+				idx.bti_node = (zfs_btree_hdr_t *)node;
+				*where = idx;
+			}
+			return (d);
+		}
+		ASSERT(idx.bti_before);
+		child = idx.bti_offset;
+	}
+
+	/*
+	 * The value is in this leaf, or it would be if it were in the
+	 * tree. Find its proper location and return it.
+	 */
+	zfs_btree_leaf_t *leaf = (depth == 0 ?
+	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
+	void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems,
+	    leaf->btl_hdr.bth_count, value, &idx);
+
+	if (where != NULL) {
+		idx.bti_node = (zfs_btree_hdr_t *)leaf;
+		*where = idx;
+	}
+
+	return (d);
+}
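+
+/*
+ * An illustrative find-then-insert sketch (my_data_t and md_key are
+ * placeholders for the caller's element type and key field, as in the
+ * example under zfs_btree_destroy_nodes below):
+ *
+ *      zfs_btree_index_t where;
+ *      my_data_t key = { .md_key = 7 };
+ *      my_data_t *found = zfs_btree_find(tree, &key, &where);
+ *      if (found == NULL)
+ *              zfs_btree_insert(tree, &key, &where);
+ */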
+
+/*
+ * To explain the following functions, it is useful to understand the four
+ * kinds of shifts used in btree operation. First, a shift is a movement of
+ * elements within a node. It is used to create gaps for inserting new
+ * elements and children, or cover gaps created when things are removed. A
+ * shift has two fundamental properties, each of which can be one of two
+ * values, making four types of shifts.  There is the direction of the shift
+ * (left or right) and the shape of the shift (parallelogram or isosceles
+ * trapezoid, shortened to trapezoid hereafter). The shape distinction only
+ * applies to shifts of core nodes.
+ *
+ * The names derive from the following imagining of the layout of a node:
+ *
+ *  Elements:       *   *   *   *   *   *   *   ...   *   *   *
+ *  Children:     *   *   *   *   *   *   *   *   ...   *   *   *
+ *
+ * This layout follows from the fact that the elements act as separators
+ * between pairs of children, and that children root subtrees "below" the
+ * current node. A left and right shift are fairly self-explanatory; a left
+ * shift moves things to the left, while a right shift moves things to the
+ * right. A parallelogram shift is a shift with the same number of elements
+ * and children being moved, while a trapezoid shift is a shift that moves one
+ * more children than elements. An example follows:
+ *
+ * A parallelogram shift could contain the following:
+ *      _______________
+ *      \*   *   *   * \ *   *   *   ...   *   *   *
+ *     * \ *   *   *   *\  *   *   *   ...   *   *   *
+ *        ---------------
+ * A trapezoid shift could contain the following:
+ *          ___________
+ *       * / *   *   * \ *   *   *   ...   *   *   *
+ *     *  / *  *   *   *\  *   *   *   ...   *   *   *
+ *        ---------------
+ *
+ * Note that a parallelogram shift is always shaped like a "left-leaning"
+ * parallelogram, where the starting index of the children being moved is
+ * always one higher than the starting index of the elements being moved. No
+ * "right-leaning" parallelogram shifts (shifts where the starting element
+ * index and starting child index are the same) are needed to implement any
+ * btree operation, so we ignore them.
+ */
+
+enum bt_shift_shape {
+	BSS_TRAPEZOID,
+	BSS_PARALLELOGRAM
+};
+
+enum bt_shift_direction {
+	BSD_LEFT,
+	BSD_RIGHT
+};
+
+/*
+ * Shift elements and children in the provided core node by off spots.  The
+ * first element moved is idx, and count elements are moved. The shape of the
+ * shift is determined by shape. The direction is determined by dir.
+ */
+static inline void
+bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+    uint64_t count, uint64_t off, enum bt_shift_shape shape,
+    enum bt_shift_direction dir)
+{
+	size_t size = tree->bt_elem_size;
+	ASSERT(node->btc_hdr.bth_core);
+
+	uint8_t *e_start = node->btc_elems + idx * size;
+	int sign = (dir == BSD_LEFT ? -1 : +1);
+	uint8_t *e_out = e_start + sign * off * size;
+	uint64_t e_count = count;
+	bmov(e_start, e_out, e_count * size);
+
+	zfs_btree_hdr_t **c_start = node->btc_children + idx +
+	    (shape == BSS_TRAPEZOID ? 0 : 1);
+	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
+	    c_start + off);
+	uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+	bmov(c_start, c_out, c_count * sizeof (*c_start));
+}
+
+/*
+ * Shift elements and children in the provided core node left by one spot.
+ * The first element moved is idx, and count elements are moved. The shape of
+ * the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+    uint64_t count, enum bt_shift_shape shape)
+{
+	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
+}
+
+/*
+ * Shift elements and children in the provided core node right by one spot.
+ * The first element moved is idx, and count elements are moved. The shape of
+ * the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+    uint64_t count, enum bt_shift_shape shape)
+{
+	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
+}
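+
+/*
+ * For a concrete illustration: zfs_btree_insert_core_impl below opens a
+ * one-slot gap at element index offset in a core node via
+ *
+ *      bt_shift_core_right(tree, parent, offset, bth_count - offset,
+ *          BSS_PARALLELOGRAM);
+ *
+ * which moves the trailing elements, and the children to their right, one
+ * spot rightward, leaving elems[offset] and children[offset + 1] free for
+ * the new separator and child.
+ */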
+
+/*
+ * Shift elements and children in the provided leaf node by off spots.
+ * The first element moved is idx, and count elements are moved. The direction
+ * is determined by dir.
+ */
+static inline void
+bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx,
+    uint64_t count, uint64_t off, enum bt_shift_direction dir)
+{
+	size_t size = tree->bt_elem_size;
+	ASSERT(!node->btl_hdr.bth_core);
+
+	uint8_t *start = node->btl_elems + idx * size;
+	int sign = (dir == BSD_LEFT ? -1 : +1);
+	uint8_t *out = start + sign * off * size;
+	bmov(start, out, count * size);
+}
+
+static inline void
+bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+    uint64_t count)
+{
+	bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT);
+}
+
+static inline void
+bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+    uint64_t count)
+{
+	bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT);
+}
+
+/*
+ * Move children and elements from one core node to another. The shape
+ * parameter behaves the same as it does in the shift logic.
+ */
+static inline void
+bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx,
+    uint64_t count, zfs_btree_core_t *dest, uint64_t didx,
+    enum bt_shift_shape shape)
+{
+	size_t size = tree->bt_elem_size;
+	ASSERT(source->btc_hdr.bth_core);
+	ASSERT(dest->btc_hdr.bth_core);
+
+	bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
+	    count * size);
+
+	uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+	bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
+	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
+	    c_count * sizeof (*source->btc_children));
+}
+
+static inline void
+bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx,
+    uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx)
+{
+	size_t size = tree->bt_elem_size;
+	ASSERT(!source->btl_hdr.bth_core);
+	ASSERT(!dest->btl_hdr.bth_core);
+
+	bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size,
+	    count * size);
+}
+
+/*
+ * Find the first element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where)
+{
+	zfs_btree_hdr_t *node;
+
+	for (node = hdr; node->bth_core; node =
+	    ((zfs_btree_core_t *)node)->btc_children[0])
+		;
+
+	ASSERT(!node->bth_core);
+	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+	if (where != NULL) {
+		where->bti_node = node;
+		where->bti_offset = 0;
+		where->bti_before = B_FALSE;
+	}
+	return (&leaf->btl_elems[0]);
+}
+
+/* Insert an element and a child into a core node at the given offset. */
+static void
+zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
+    uint64_t offset, zfs_btree_hdr_t *new_node, void *buf)
+{
+	uint64_t size = tree->bt_elem_size;
+	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+	ASSERT3P(par_hdr, ==, new_node->bth_parent);
+	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);
+
+	if (zfs_btree_verify_intensity >= 5) {
+		zfs_btree_verify_poison_at(tree, par_hdr,
+		    par_hdr->bth_count);
+	}
+	/* Shift existing elements and children */
+	uint64_t count = par_hdr->bth_count - offset;
+	bt_shift_core_right(tree, parent, offset, count,
+	    BSS_PARALLELOGRAM);
+
+	/* Insert new values */
+	parent->btc_children[offset + 1] = new_node;
+	bmov(buf, parent->btc_elems + offset * size, size);
+	par_hdr->bth_count++;
+}
+
+/*
+ * Insert new_node into the parent of old_node directly after old_node, with
+ * buf as the dividing element between the two.
+ */
+static void
+zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
+    zfs_btree_hdr_t *new_node, void *buf)
+{
+	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
+	uint64_t size = tree->bt_elem_size;
+	zfs_btree_core_t *parent = old_node->bth_parent;
+	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+
+	/*
+	 * If this is the root node we were splitting, we create a new root
+	 * and increase the height of the tree.
+	 */
+	if (parent == NULL) {
+		ASSERT3P(old_node, ==, tree->bt_root);
+		tree->bt_num_nodes++;
+		zfs_btree_core_t *new_root =
+		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
+		    size, KM_SLEEP);
+		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
+		new_root_hdr->bth_parent = NULL;
+		new_root_hdr->bth_core = B_TRUE;
+		new_root_hdr->bth_count = 1;
+
+		old_node->bth_parent = new_node->bth_parent = new_root;
+		new_root->btc_children[0] = old_node;
+		new_root->btc_children[1] = new_node;
+		bmov(buf, new_root->btc_elems, size);
+
+		tree->bt_height++;
+		tree->bt_root = new_root_hdr;
+		zfs_btree_poison_node(tree, new_root_hdr);
+		return;
+	}
+
+	/*
+	 * Since we have the new separator, binary search for where to put
+	 * new_node.
+	 */
+	zfs_btree_index_t idx;
+	ASSERT(par_hdr->bth_core);
+	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+	    par_hdr->bth_count, buf, &idx), ==, NULL);
+	ASSERT(idx.bti_before);
+	uint64_t offset = idx.bti_offset;
+	ASSERT3U(offset, <=, par_hdr->bth_count);
+	ASSERT3P(parent->btc_children[offset], ==, old_node);
+
+	/*
+	 * If the parent isn't full, shift things to accommodate our insertions
+	 * and return.
+	 */
+	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
+		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
+		return;
+	}
+
+	/*
+	 * We need to split this core node into two. Currently there are
+	 * BTREE_CORE_ELEMS + 1 child nodes, and adding one more makes
+	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
+	 * current node, and the others will be moved to the new core node.
+	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
+	 * will be used as the new separator in our parent, and the others
+	 * will be split among the two core nodes.
+	 *
+	 * Usually we will split the node in half evenly, with
+	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
+	 * instead move only about a quarter of the elements (and children) to
+	 * the new node. Since the average state after a long time is a 3/4
+	 * full node, shortcutting directly to that state improves efficiency.
+	 *
+	 * We do this in two stages: first we split into two nodes, and then we
+	 * reuse our existing logic to insert the new element and child.
+	 */
+	uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
+	    2 : 4)) - 1, 2);
+	uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
+	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
+	tree->bt_num_nodes++;
+	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
+	    BTREE_CORE_ELEMS * size, KM_SLEEP);
+	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
+	new_par_hdr->bth_parent = par_hdr->bth_parent;
+	new_par_hdr->bth_core = B_TRUE;
+	new_par_hdr->bth_count = move_count;
+	zfs_btree_poison_node(tree, new_par_hdr);
+
+	par_hdr->bth_count = keep_count;
+
+	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
+	    0, BSS_TRAPEZOID);
+
+	/* Store the new separator in a buffer. */
+	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
+	bmov(parent->btc_elems + keep_count * size, tmp_buf,
+	    size);
+	zfs_btree_poison_node(tree, par_hdr);
+
+	if (offset < keep_count) {
+		/* Insert the new node into the left half */
+		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
+		    buf);
+
+		/*
+		 * Move the new separator to the existing buffer.
+		 */
+		bmov(tmp_buf, buf, size);
+	} else if (offset > keep_count) {
+		/* Insert the new node into the right half */
+		new_node->bth_parent = new_parent;
+		zfs_btree_insert_core_impl(tree, new_parent,
+		    offset - keep_count - 1, new_node, buf);
+
+		/*
+		 * Move the new separator to the existing buffer.
+		 */
+		bmov(tmp_buf, buf, size);
+	} else {
+		/*
+		 * Move the new separator into the right half, and replace it
+		 * with buf. We also need to shift back the elements in the
+		 * right half to accommodate new_node.
+		 */
+		bt_shift_core_right(tree, new_parent, 0, move_count,
+		    BSS_TRAPEZOID);
+		new_parent->btc_children[0] = new_node;
+		bmov(tmp_buf, new_parent->btc_elems, size);
+		new_par_hdr->bth_count++;
+	}
+	kmem_free(tmp_buf, size);
+	zfs_btree_poison_node(tree, par_hdr);
+
+	for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++)
+		new_parent->btc_children[i]->bth_parent = new_parent;
+
+	for (int i = 0; i <= parent->btc_hdr.bth_count; i++)
+		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);
+
+	/*
+	 * Now that the node is split, we need to insert the new node into its
+	 * parent. This may cause further splitting.
+	 */
+	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
+	    &new_parent->btc_hdr, buf);
+}
+
+/* Insert an element into a leaf node at the given offset. */
+static void
+zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+    uint64_t idx, const void *value)
+{
+	uint64_t size = tree->bt_elem_size;
+	uint8_t *start = leaf->btl_elems + (idx * size);
+	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+	uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+	    sizeof (zfs_btree_hdr_t)) / size, 2);
+	uint64_t count = leaf->btl_hdr.bth_count - idx;
+	ASSERT3U(leaf->btl_hdr.bth_count, <, capacity);
+
+	if (zfs_btree_verify_intensity >= 5) {
+		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
+		    leaf->btl_hdr.bth_count);
+	}
+
+	bt_shift_leaf_right(tree, leaf, idx, count);
+	bmov(value, start, size);
+	hdr->bth_count++;
+}
+
+/* Helper function for inserting a new value into leaf at the given index. */
+static void
+zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+    const void *value, uint64_t idx)
+{
+	uint64_t size = tree->bt_elem_size;
+	uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+	    sizeof (zfs_btree_hdr_t)) / size, 2);
+
+	/*
+	 * If the leaf isn't full, shift the elements after idx and insert
+	 * value.
+	 */
+	if (leaf->btl_hdr.bth_count != capacity) {
+		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+		return;
+	}
+
+	/*
+	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
+	 * inserting, each is of size (capacity / 2).  If we are bulk
+	 * inserting, we move a quarter of the elements to the new node so
+	 * inserts into the old node don't cause immediate splitting but the
+	 * tree stays relatively dense. Since the average state after a long
+	 * time is a 3/4 full node, shortcutting directly to that state
+	 * improves efficiency.  At the end of the bulk insertion process
+	 * we'll need to go through and fix up any nodes (the last leaf and
+	 * its ancestors, potentially) that are below the minimum.
+	 *
+	 * In either case, we're left with one extra element. The leftover
+	 * element will become the new dividing element between the two nodes.
+	 */
+	uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) -
+	    1, 2);
+	uint64_t keep_count = capacity - move_count - 1;
+	ASSERT3U(capacity - move_count, >=, 2);
+	tree->bt_num_nodes++;
+	zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+	    KM_SLEEP);
+	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
+	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
+	new_hdr->bth_core = B_FALSE;
+	new_hdr->bth_count = move_count;
+	zfs_btree_poison_node(tree, new_hdr);
+
+	leaf->btl_hdr.bth_count = keep_count;
+
+	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
+		tree->bt_bulk = new_leaf;
+
+	/* Copy the back part to the new leaf. */
+	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf,
+	    0);
+
+	/* We store the new separator in a buffer we control for simplicity. */
+	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+	bmov(leaf->btl_elems + (keep_count * size), buf, size);
+	zfs_btree_poison_node(tree, &leaf->btl_hdr);
+
+	if (idx < keep_count) {
+		/* Insert into the existing leaf. */
+		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+	} else if (idx > keep_count) {
+		/* Insert into the new leaf. */
+		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
+		    1, value);
+	} else {
+		/*
+		 * Shift the elements in the new leaf to make room for the
+		 * separator, and use the new value as the new separator.
+		 */
+		bt_shift_leaf_right(tree, new_leaf, 0, move_count);
+		bmov(buf, new_leaf->btl_elems, size);
+		bmov(value, buf, size);
+		new_hdr->bth_count++;
+	}
+
+	/*
+	 * Now that the node is split, we need to insert the new node into its
+	 * parent. This may cause further splitting, but only of core nodes.
+	 */
+	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
+	    buf);
+	kmem_free(buf, size);
+}
+
+static uint64_t
+zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	void *buf;
+	if (hdr->bth_core) {
+		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
+	} else {
+		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems;
+	}
+	zfs_btree_index_t idx;
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
+	ASSERT(idx.bti_before);
+	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
+	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
+	return (idx.bti_offset);
+}
+
+/*
+ * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
+ * nodes may violate the invariant that non-root nodes must be at least half
+ * full. All nodes violating this invariant should be the last node in their
+ * particular level. To correct the invariant, we take values from their left
+ * neighbor until they are half full. They must have a left neighbor at their
+ * level because the last node at a level is not the first node unless it's
+ * the root.
+ */
+static void
+zfs_btree_bulk_finish(zfs_btree_t *tree)
+{
+	ASSERT3P(tree->bt_bulk, !=, NULL);
+	ASSERT3P(tree->bt_root, !=, NULL);
+	zfs_btree_leaf_t *leaf = tree->bt_bulk;
+	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	uint64_t size = tree->bt_elem_size;
+	uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+	    sizeof (zfs_btree_hdr_t)) / size, 2);
+
+	/*
+	 * The invariant doesn't apply to the root node; if that's the only
+	 * node in the tree, we're done.
+	 */
+	if (parent == NULL) {
+		tree->bt_bulk = NULL;
+		return;
+	}
+
+	/* First, take elements to rebalance the leaf node. */
+	if (hdr->bth_count < capacity / 2) {
+		/*
+		 * First, find the left neighbor. The simplest way to do this
+		 * is to call zfs_btree_prev twice; the first time finds some
+		 * ancestor of this node, and the second time finds the left
+		 * neighbor. The ancestor found is the lowest common ancestor
+		 * of leaf and the neighbor.
+		 */
+		zfs_btree_index_t idx = {
+			.bti_node = hdr,
+			.bti_offset = 0
+		};
+		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+		ASSERT(idx.bti_node->bth_core);
+		zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
+		uint64_t common_idx = idx.bti_offset;
+
+		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+		ASSERT(!idx.bti_node->bth_core);
+		zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
+		zfs_btree_hdr_t *l_hdr = idx.bti_node;
+		uint64_t move_count = (capacity / 2) - hdr->bth_count;
+		ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
+		    capacity / 2);
+
+		if (zfs_btree_verify_intensity >= 5) {
+			for (int i = 0; i < move_count; i++) {
+				zfs_btree_verify_poison_at(tree, hdr,
+				    leaf->btl_hdr.bth_count + i);
+			}
+		}
+
+		/* First, shift elements in leaf back. */
+		bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count,
+		    BSD_RIGHT);
+
+		/* Next, move the separator from the common ancestor to leaf. */
+		uint8_t *separator = common->btc_elems + (common_idx * size);
+		uint8_t *out = leaf->btl_elems + ((move_count - 1) * size);
+		bmov(separator, out, size);
+		move_count--;
+
+		/*
+		 * Now we move elements from the tail of the left neighbor to
+		 * fill the remaining spots in leaf.
+		 */
+		bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
+		    move_count, move_count, leaf, 0);
+
+		/*
+		 * Finally, move the new last element in the left neighbor to
+		 * the separator.
+		 */
+		bmov(l_neighbor->btl_elems + (l_hdr->bth_count -
+		    move_count - 1) * size, separator, size);
+
+		/* Adjust the node's counts, and we're done. */
+		l_hdr->bth_count -= move_count + 1;
+		hdr->bth_count += move_count + 1;
+
+		ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
+		ASSERT3U(hdr->bth_count, >=, capacity / 2);
+		zfs_btree_poison_node(tree, l_hdr);
+	}
+
+	/*
+	 * Now we have to rebalance any ancestors of leaf that may also
+	 * violate the invariant.
+	 */
+	capacity = BTREE_CORE_ELEMS;
+	while (parent->btc_hdr.bth_parent != NULL) {
+		zfs_btree_core_t *cur = parent;
+		zfs_btree_hdr_t *hdr = &cur->btc_hdr;
+		parent = hdr->bth_parent;
+		/*
+		 * If the invariant isn't violated, move on to the next
+		 * ancestor.
+		 */
+		if (hdr->bth_count >= capacity / 2)
+			continue;
+
+		/*
+		 * Because the smallest number of nodes we can move when
+		 * splitting is 2, we never need to worry about not having a
+		 * left sibling (a sibling is a neighbor with the same parent).
+		 */
+		uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+		ASSERT3U(parent_idx, >, 0);
+		zfs_btree_core_t *l_neighbor =
+		    (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
+		uint64_t move_count = (capacity / 2) - hdr->bth_count;
+		ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
+		    capacity / 2);
+
+		if (zfs_btree_verify_intensity >= 5) {
+			for (int i = 0; i < move_count; i++) {
+				zfs_btree_verify_poison_at(tree, hdr,
+				    hdr->bth_count + i);
+			}
+		}
+		/* First, shift things in the right node back. */
+		bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
+		    BSS_TRAPEZOID, BSD_RIGHT);
+
+		/* Next, move the separator to the right node. */
+		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
+		    size);
+		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
+		bmov(separator, e_out, size);
+
+		/*
+		 * Now, move elements and children from the left node to the
+		 * right.  We move one more child than elements.
+		 */
+		move_count--;
+		uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
+		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
+		    BSS_TRAPEZOID);
+
+		/*
+		 * Finally, move the last element in the left node to the
+		 * separator's position.
+		 */
+		move_idx--;
+		bmov(l_neighbor->btc_elems + move_idx * size, separator, size);
+
+		l_neighbor->btc_hdr.bth_count -= move_count + 1;
+		hdr->bth_count += move_count + 1;
+
+		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
+		ASSERT3U(hdr->bth_count, >=, capacity / 2);
+
+		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
+
+		for (int i = 0; i <= hdr->bth_count; i++)
+			cur->btc_children[i]->bth_parent = cur;
+	}
+
+	tree->bt_bulk = NULL;
+}
+
+/*
+ * Insert value into tree at the location specified by where.
+ */
+void
+zfs_btree_insert(zfs_btree_t *tree, const void *value,
+    const zfs_btree_index_t *where)
+{
+	zfs_btree_index_t idx = {0};
+
+	/* If we're not inserting in the last leaf, end bulk insert mode. */
+	if (tree->bt_bulk != NULL) {
+		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
+			zfs_btree_bulk_finish(tree);
+			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
+			where = &idx;
+		}
+	}
+
+	tree->bt_num_elems++;
+	/*
+	 * If this is the first element in the tree, create a leaf root node
+	 * and add the value to it.
+	 */
+	if (where->bti_node == NULL) {
+		ASSERT3U(tree->bt_num_elems, ==, 1);
+		ASSERT3S(tree->bt_height, ==, -1);
+		ASSERT3P(tree->bt_root, ==, NULL);
+		ASSERT0(where->bti_offset);
+
+		tree->bt_num_nodes++;
+		zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+		    KM_SLEEP);
+		tree->bt_root = &leaf->btl_hdr;
+		tree->bt_height++;
+
+		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+		hdr->bth_parent = NULL;
+		hdr->bth_core = B_FALSE;
+		hdr->bth_count = 0;
+		zfs_btree_poison_node(tree, hdr);
+
+		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
+		tree->bt_bulk = leaf;
+	} else if (!where->bti_node->bth_core) {
+		/*
+		 * If we're inserting into a leaf, go directly to the helper
+		 * function.
+		 */
+		zfs_btree_insert_into_leaf(tree,
+		    (zfs_btree_leaf_t *)where->bti_node, value,
+		    where->bti_offset);
+	} else {
+		/*
+		 * If we're inserting into a core node, we can't just shift
+		 * the existing element in that slot in the same node without
+		 * breaking our ordering invariants. Instead we place the new
+		 * value in the node at that spot and then insert the old
+		 * separator into the first slot in the subtree to the right.
+		 */
+		ASSERT(where->bti_node->bth_core);
+		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;
+
+		/*
+		 * We can ignore bti_before, because either way the value
+		 * should end up in bti_offset.
+		 */
+		uint64_t off = where->bti_offset;
+		zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
+		size_t size = tree->bt_elem_size;
+		uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+		bmov(node->btc_elems + off * size, buf, size);
+		bmov(value, node->btc_elems + off * size, size);
+
+		/*
+		 * Find the first slot in the subtree to the right, insert
+		 * there.
+		 */
+		zfs_btree_index_t new_idx;
+		VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL);
+		ASSERT0(new_idx.bti_offset);
+		ASSERT(!new_idx.bti_node->bth_core);
+		zfs_btree_insert_into_leaf(tree,
+		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
+		kmem_free(buf, size);
+	}
+	zfs_btree_verify(tree);
+}
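+
+/*
+ * Illustrative note: inserting values in sorted order keeps the tree in
+ * bulk-insert mode, since every insert lands in the last leaf (sorted and n
+ * are placeholders for caller-supplied data):
+ *
+ *      for (uint64_t i = 0; i < n; i++)
+ *              zfs_btree_add(tree, &sorted[i]);
+ *
+ * The first out-of-order insert, or any removal, takes the tree out of bulk
+ * mode via zfs_btree_bulk_finish().
+ */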
+
+/*
+ * Return the first element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_first_helper(tree->bt_root, where));
+}
+
+/*
+ * Find the last element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
+    zfs_btree_index_t *where)
+{
+	zfs_btree_hdr_t *node;
+
+	for (node = hdr; node->bth_core; node =
+	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
+		;
+
+	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+	if (where != NULL) {
+		where->bti_node = node;
+		where->bti_offset = node->bth_count - 1;
+		where->bti_before = B_FALSE;
+	}
+	return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+}
+
+/*
+ * Return the last element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_last_helper(tree, tree->bt_root, where));
+}
+
+/*
+ * This function contains the logic to find the next element in the tree. A
+ * helper function is used because there are multiple internal consumers of
+ * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
+ * node after we've finished with it.
+ */
+static void *
+zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+    zfs_btree_index_t *out_idx,
+    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
+{
+	if (idx->bti_node == NULL) {
+		ASSERT3S(tree->bt_height, ==, -1);
+		return (NULL);
+	}
+
+	uint64_t offset = idx->bti_offset;
+	if (!idx->bti_node->bth_core) {
+		/*
+		 * When finding the next element of an element in a leaf,
+		 * there are two cases. If the element isn't the last one in
+		 * the leaf, we just return the next element in the leaf.
+		 * Otherwise, we need to traverse up our parents
+		 * until we find one where our ancestor isn't the last child
+		 * of its parent. Once we do, the next element is the
+		 * separator after our ancestor in its parent.
+		 */
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+		uint64_t new_off = offset + (idx->bti_before ? 0 : 1);
+		if (leaf->btl_hdr.bth_count > new_off) {
+			out_idx->bti_node = &leaf->btl_hdr;
+			out_idx->bti_offset = new_off;
+			out_idx->bti_before = B_FALSE;
+			return (leaf->btl_elems + new_off * tree->bt_elem_size);
+		}
+
+		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+		    node != NULL; node = node->btc_hdr.bth_parent) {
+			zfs_btree_hdr_t *hdr = &node->btc_hdr;
+			ASSERT(hdr->bth_core);
+			uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+			if (done_func != NULL)
+				done_func(tree, prev);
+			if (i == hdr->bth_count) {
+				prev = hdr;
+				continue;
+			}
+			out_idx->bti_node = hdr;
+			out_idx->bti_offset = i;
+			out_idx->bti_before = B_FALSE;
+			return (node->btc_elems + i * tree->bt_elem_size);
+		}
+		if (done_func != NULL)
+			done_func(tree, prev);
+		/*
+		 * We've traversed all the way up and been at the end of the
+		 * node every time, so this was the last element in the tree.
+		 */
+		return (NULL);
+	}
+
+	/* If we were before an element in a core node, return that element. */
+	ASSERT(idx->bti_node->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+	if (idx->bti_before) {
+		out_idx->bti_before = B_FALSE;
+		return (node->btc_elems + offset * tree->bt_elem_size);
+	}
+
+	/*
+	 * The next element from one in a core node is the first element in
+	 * the subtree just to the right of the separator.
+	 */
+	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
+	return (zfs_btree_first_helper(child, out_idx));
+}
+
+/*
+ * Return the next element in the tree.  The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+    zfs_btree_index_t *out_idx)
+{
+	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
+}
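+
+/*
+ * Example traversal (illustrative; my_data_t and visit() are placeholders
+ * for the caller's element type and per-element work):
+ *
+ *      zfs_btree_index_t where;
+ *      my_data_t *node;
+ *
+ *      for (node = zfs_btree_first(tree, &where); node != NULL;
+ *          node = zfs_btree_next(tree, &where, &where))
+ *              visit(node);
+ */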
+
+/*
+ * Return the previous element in the tree.  The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+    zfs_btree_index_t *out_idx)
+{
+	if (idx->bti_node == NULL) {
+		ASSERT3S(tree->bt_height, ==, -1);
+		return (NULL);
+	}
+
+	uint64_t offset = idx->bti_offset;
+	if (!idx->bti_node->bth_core) {
+		/*
+		 * When finding the previous element of an element in a leaf,
+		 * there are two cases. If the element isn't the first one in
+		 * the leaf, we just return the previous element in the leaf.
+		 * Otherwise, we need to traverse up our parents until we find
+		 * one where our ancestor isn't the first child of its parent.
+		 * Once we do, the previous element is the separator just to
+		 * the left of our ancestor in its parent.
+		 */
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+		if (offset != 0) {
+			out_idx->bti_node = &leaf->btl_hdr;
+			out_idx->bti_offset = offset - 1;
+			out_idx->bti_before = B_FALSE;
+			return (leaf->btl_elems + (offset - 1) *
+			    tree->bt_elem_size);
+		}
+		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+		    node != NULL; node = node->btc_hdr.bth_parent) {
+			zfs_btree_hdr_t *hdr = &node->btc_hdr;
+			ASSERT(hdr->bth_core);
+			uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+			if (i == 0) {
+				prev = hdr;
+				continue;
+			}
+			out_idx->bti_node = hdr;
+			out_idx->bti_offset = i - 1;
+			out_idx->bti_before = B_FALSE;
+			return (node->btc_elems + (i - 1) * tree->bt_elem_size);
+		}
+		/*
+		 * We've traversed all the way up and been at the start of the
+		 * node every time, so this was the first element in the tree.
+		 */
+		return (NULL);
+	}
+
+	/*
+	 * The previous element from one in a core node is the last element in
+	 * the subtree just to the left of the separator.
+	 */
+	ASSERT(idx->bti_node->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+	zfs_btree_hdr_t *child = node->btc_children[offset];
+	return (zfs_btree_last_helper(tree, child, out_idx));
+}
+
+/*
+ * Get the value at the provided index in the tree.
+ *
+ * Note that the value returned from this function can be mutated, but only
+ * if it will not change the ordering of the element with respect to any other
+ * elements that could be in the tree.
+ */
+void *
+zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
+{
+	ASSERT(!idx->bti_before);
+	if (!idx->bti_node->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+		return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size);
+	}
+	ASSERT(idx->bti_node->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+	return (node->btc_elems + idx->bti_offset * tree->bt_elem_size);
+}
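+
+/*
+ * For example (illustrative; md_refcount is a placeholder field), bumping a
+ * non-key field through the returned pointer is safe, since it cannot change
+ * the element's ordering:
+ *
+ *      my_data_t *node = zfs_btree_get(tree, &where);
+ *      node->md_refcount++;
+ */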
+
+/* Add the given value to the tree. Must not already be in the tree. */
+void
+zfs_btree_add(zfs_btree_t *tree, const void *node)
+{
+	zfs_btree_index_t where = {0};
+	VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
+	zfs_btree_insert(tree, node, &where);
+}
+
+/* Helper function to free a tree node. */
+static void
+zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
+{
+	tree->bt_num_nodes--;
+	if (!node->bth_core) {
+		kmem_cache_free(zfs_btree_leaf_cache, node);
+	} else {
+		kmem_free(node, sizeof (zfs_btree_core_t) +
+		    BTREE_CORE_ELEMS * tree->bt_elem_size);
+	}
+}
+
+/*
+ * Remove the rm_hdr and the separator to its left from the parent node. The
+ * buffer that rm_hdr was stored in may already be freed, so its contents
+ * cannot be accessed.
+ */
+static void
+zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
+    zfs_btree_hdr_t *rm_hdr)
+{
+	size_t size = tree->bt_elem_size;
+	uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
+	zfs_btree_hdr_t *hdr = &node->btc_hdr;
+	/*
+	 * If the node is the root node and rm_hdr is one of two children,
+	 * promote the other child to the root.
+	 */
+	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
+		ASSERT3U(hdr->bth_count, ==, 1);
+		ASSERT3P(tree->bt_root, ==, node);
+		ASSERT3P(node->btc_children[1], ==, rm_hdr);
+		tree->bt_root = node->btc_children[0];
+		node->btc_children[0]->bth_parent = NULL;
+		zfs_btree_node_destroy(tree, hdr);
+		tree->bt_height--;
+		return;
+	}
+
+	uint64_t idx;
+	for (idx = 0; idx <= hdr->bth_count; idx++) {
+		if (node->btc_children[idx] == rm_hdr)
+			break;
+	}
+	ASSERT3U(idx, <=, hdr->bth_count);
+
+	/*
+	 * If the node is the root or it has more than the minimum number of
+	 * children, just remove the child and separator, and return.
+	 */
+	if (hdr->bth_parent == NULL ||
+	    hdr->bth_count > min_count) {
+		/*
+		 * Shift the element and children to the right of rm_hdr to
+		 * the left by one spot.
+		 */
+		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+		    BSS_PARALLELOGRAM);
+		hdr->bth_count--;
+		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+		return;
+	}
+
+	ASSERT3U(hdr->bth_count, ==, min_count);
+
+	/*
+	 * Now we try to take a node from a neighbor. We check left, then
+	 * right. If the neighbor exists and has more than the minimum number
+	 * of elements, we move the separator between us and them to our
+	 * node, move their closest element (last for left, first for right)
+	 * to the separator, and move their closest child to our node. Along
+	 * the way we need to collapse the gap made by idx, and (for our right
+	 * neighbor) the gap made by removing their first element and child.
+	 *
+	 * Note: this logic currently doesn't support taking from a neighbor
+	 * that isn't a sibling (i.e. a neighbor with a different
+	 * parent). This isn't critical functionality, but may be worth
+	 * implementing in the future for completeness' sake.
+	 */
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+	    parent->btc_children[parent_idx - 1]);
+	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+		/* We can take a node from the left neighbor. */
+		ASSERT(l_hdr->bth_core);
+		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
+
+		/*
+		 * Start by shifting the elements and children in the current
+		 * node to the right by one spot.
+		 */
+		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
+
+		/*
+		 * Move the separator between node and neighbor to the first
+		 * element slot in the current node.
+		 */
+		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+		    size;
+		bmov(separator, node->btc_elems, size);
+
+		/* Move the last child of neighbor to our first child slot. */
+		zfs_btree_hdr_t **take_child = neighbor->btc_children +
+		    l_hdr->bth_count;
+		bmov(take_child, node->btc_children, sizeof (*take_child));
+		node->btc_children[0]->bth_parent = node;
+
+		/* Move the last element of neighbor to the separator spot. */
+		uint8_t *take_elem = neighbor->btc_elems +
+		    (l_hdr->bth_count - 1) * size;
+		bmov(take_elem, separator, size);
+		l_hdr->bth_count--;
+		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+		return;
+	}
+
+	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+	    NULL : parent->btc_children[parent_idx + 1]);
+	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+		/* We can take a node from the right neighbor. */
+		ASSERT(r_hdr->bth_core);
+		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
+
+		/*
+		 * Shift elements in node left by one spot to overwrite rm_hdr
+		 * and the separator before it.
+		 */
+		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+		    BSS_PARALLELOGRAM);
+
+		/*
+		 * Move the separator between node and neighbor to the last
+		 * element spot in node.
+		 */
+		uint8_t *separator = parent->btc_elems + parent_idx * size;
+		bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size,
+		    size);
+
+		/*
+		 * Move the first child of neighbor to the last child spot in
+		 * node.
+		 */
+		zfs_btree_hdr_t **take_child = neighbor->btc_children;
+		bmov(take_child, node->btc_children + hdr->bth_count,
+		    sizeof (*take_child));
+		node->btc_children[hdr->bth_count]->bth_parent = node;
+
+		/* Move the first element of neighbor to the separator spot. */
+		uint8_t *take_elem = neighbor->btc_elems;
+		bmov(take_elem, separator, size);
+		r_hdr->bth_count--;
+
+		/*
+		 * Shift the elements and children of neighbor to cover the
+		 * stolen elements.
+		 */
+		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
+		    BSS_TRAPEZOID);
+		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+		return;
+	}
+
+	/*
+	 * In this case, neither of our neighbors can spare an element, so we
+	 * need to merge with one of them. We prefer the left one,
+	 * arbitrarily. Move the separator into the leftmost merging node
+	 * (which may be us or the left neighbor), and then move the right
+	 * merging node's elements. Once that's done, we go back and delete
+	 * the element we're removing. Finally, go into the parent and delete
+	 * the right merging node and the separator. This may cause further
+	 * merging.
+	 */
+	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
+	uint64_t new_idx = idx;
+	if (l_hdr != NULL) {
+		keep_hdr = l_hdr;
+		new_rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+	} else {
+		ASSERT3P(r_hdr, !=, NULL);
+		keep_hdr = hdr;
+		new_rm_hdr = r_hdr;
+		parent_idx++;
+	}
+
+	ASSERT(keep_hdr->bth_core);
+	ASSERT(new_rm_hdr->bth_core);
+
+	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
+	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
+
+	if (zfs_btree_verify_intensity >= 5) {
+		for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) {
+			zfs_btree_verify_poison_at(tree, keep_hdr,
+			    keep_hdr->bth_count + i);
+		}
+	}
+
+	/* Move the separator into the left node. */
+	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
+	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+	    size;
+	bmov(separator, e_out, size);
+	keep_hdr->bth_count++;
+
+	/* Move all our elements and children into the left node. */
+	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
+	    keep_hdr->bth_count, BSS_TRAPEZOID);
+
+	uint64_t old_count = keep_hdr->bth_count;
+
+	/* Update bookkeeping */
+	keep_hdr->bth_count += new_rm_hdr->bth_count;
+	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
+
+	/*
+	 * Shift the element and children to the right of rm_hdr to
+	 * the left by one spot.
+	 */
+	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
+	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
+	    BSS_PARALLELOGRAM);
+	keep_hdr->bth_count--;
+
+	/* Reparent all our children to point to the left node. */
+	zfs_btree_hdr_t **new_start = keep->btc_children +
+	    old_count - 1;
+	for (int i = 0; i < new_rm_hdr->bth_count + 1; i++)
+		new_start[i]->bth_parent = keep;
+	for (int i = 0; i <= keep_hdr->bth_count; i++) {
+		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
+		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
+	}
+	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+	new_rm_hdr->bth_count = 0;
+	zfs_btree_node_destroy(tree, new_rm_hdr);
+	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
+}
+
+/* Remove the element at the specific location. */
+void
+zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	size_t size = tree->bt_elem_size;
+	zfs_btree_hdr_t *hdr = where->bti_node;
+	uint64_t idx = where->bti_offset;
+	uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+	    sizeof (zfs_btree_hdr_t)) / size, 2);
+
+	ASSERT(!where->bti_before);
+	if (tree->bt_bulk != NULL) {
+		/*
+		 * Leave bulk insert mode. Note that our index would be
+		 * invalid after we correct the tree, so we copy the value
+		 * we're planning to remove and find it again after
+		 * bulk_finish.
+		 */
+		uint8_t *value = zfs_btree_get(tree, where);
+		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
+		bmov(value, tmp, size);
+		zfs_btree_bulk_finish(tree);
+		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
+		kmem_free(tmp, size);
+		hdr = where->bti_node;
+		idx = where->bti_offset;
+	}
+
+	tree->bt_num_elems--;
+	/*
+	 * If the element happens to be in a core node, we move a leaf node's
+	 * element into its place and then remove the leaf node element. This
+	 * makes the rebalance logic not need to be recursive both upwards and
+	 * downwards.
+	 */
+	if (hdr->bth_core) {
+		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
+		void *new_value = zfs_btree_last_helper(tree, left_subtree,
+		    where);
+		ASSERT3P(new_value, !=, NULL);
+
+		bmov(new_value, node->btc_elems + idx * size, size);
+
+		hdr = where->bti_node;
+		idx = where->bti_offset;
+		ASSERT(!where->bti_before);
+	}
+
+	/*
+	 * First, we'll update the leaf's metadata. Then, we shift any
+	 * elements after idx to the left. After that, we rebalance if
+	 * needed.
+	 */
+	ASSERT(!hdr->bth_core);
+	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+	ASSERT3U(hdr->bth_count, >, 0);
+
+	uint64_t min_count = (capacity / 2) - 1;
+
+	/*
+	 * If we're over the minimum size or this is the root, just overwrite
+	 * the value and return.
+	 */
+	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
+		hdr->bth_count--;
+		bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+		if (hdr->bth_parent == NULL) {
+			ASSERT0(tree->bt_height);
+			if (hdr->bth_count == 0) {
+				tree->bt_root = NULL;
+				tree->bt_height--;
+				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
+			}
+		}
+		if (tree->bt_root != NULL)
+			zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+	ASSERT3U(hdr->bth_count, ==, min_count);
+
+	/*
+	 * Now we try to take a node from a sibling. We check left, then
+	 * right. If they exist and have more than the minimum number of
+	 * elements, we move the separator between us and them to our node
+	 * and move their closest element (last for left, first for right) to
+	 * the separator. Along the way we need to collapse the gap made by
+	 * idx, and (for our right neighbor) the gap made by removing their
+	 * first element.
+	 *
+	 * Note: this logic currently doesn't support taking from a neighbor
+	 * that isn't a sibling. This isn't critical functionality, but may be
+	 * worth implementing in the future for completeness' sake.
+	 */
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+	    parent->btc_children[parent_idx - 1]);
+	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+		/* We can take a node from the left neighbor. */
+		ASSERT(!l_hdr->bth_core);
+
+		/*
+		 * Move our elements back by one spot to make room for the
+		 * stolen element and overwrite the element being removed.
+		 */
+		bt_shift_leaf_right(tree, leaf, 0, idx);
+		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+		    size;
+		uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems +
+		    (l_hdr->bth_count - 1) * size;
+		/* Move the separator to our first spot. */
+		bmov(separator, leaf->btl_elems, size);
+
+		/* Move our neighbor's last element to the separator. */
+		bmov(take_elem, separator, size);
+
+		/* Update the bookkeeping. */
+		l_hdr->bth_count--;
+		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+
+		zfs_btree_verify(tree);
+		return;
+	}
+
+	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+	    NULL : parent->btc_children[parent_idx + 1]);
+	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+		/* We can take a node from the right neighbor. */
+		ASSERT(!r_hdr->bth_core);
+		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
+
+		/*
+		 * Move our elements after the element being removed forwards
+		 * by one spot to make room for the stolen element and
+		 * overwrite the element being removed.
+		 */
+		bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx -
+		    1);
+
+		uint8_t *separator = parent->btc_elems + parent_idx * size;
+		uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems;
+		/* Move the separator between us to our last spot. */
+		bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size,
+		    size);
+
+		/* Move our neighbor's first element to the separator. */
+		bmov(take_elem, separator, size);
+
+		/* Update the bookkeeping. */
+		r_hdr->bth_count--;
+
+		/*
+		 * Move our neighbor's elements forwards to overwrite the
+		 * stolen element.
+		 */
+		bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
+		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+
+	/*
+	 * In this case, neither of our neighbors can spare an element, so we
+	 * need to merge with one of them. We prefer the left one,
+	 * arbitrarily. Move the separator into the leftmost merging node
+	 * (which may be us or the left neighbor), and then move the right
+	 * merging node's elements. Once that's done, we go back and delete
+	 * the element we're removing. Finally, go into the parent and delete
+	 * the right merging node and the separator. This may cause further
+	 * merging.
+	 */
+	zfs_btree_hdr_t *rm_hdr, *keep_hdr;
+	uint64_t new_idx = idx;
+	if (l_hdr != NULL) {
+		keep_hdr = l_hdr;
+		rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+	} else {
+		ASSERT3P(r_hdr, !=, NULL);
+		keep_hdr = hdr;
+		rm_hdr = r_hdr;
+		parent_idx++;
+	}
+
+	ASSERT(!keep_hdr->bth_core);
+	ASSERT(!rm_hdr->bth_core);
+	ASSERT3U(keep_hdr->bth_count, ==, min_count);
+	ASSERT3U(rm_hdr->bth_count, ==, min_count);
+
+	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
+
+	if (zfs_btree_verify_intensity >= 5) {
+		for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
+			zfs_btree_verify_poison_at(tree, keep_hdr,
+			    keep_hdr->bth_count + i);
+		}
+	}
+	/*
+	 * Move the separator into the first open spot in the left
+	 * neighbor.
+	 */
+	uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
+	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+	    size;
+	bmov(separator, out, size);
+	keep_hdr->bth_count++;
+
+	/* Move our elements to the left neighbor. */
+	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
+	    keep_hdr->bth_count);
+
+	/* Update the bookkeeping. */
+	keep_hdr->bth_count += rm_hdr->bth_count;
+	ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+
+	/* Remove the value from the node */
+	keep_hdr->bth_count--;
+	bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
+	    new_idx);
+	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+	rm_hdr->bth_count = 0;
+	zfs_btree_node_destroy(tree, rm_hdr);
+	/* Remove the emptied node from the parent. */
+	zfs_btree_remove_from_node(tree, parent, rm_hdr);
+	zfs_btree_verify(tree);
+}
+
+/* Remove the given value from the tree. */
+void
+zfs_btree_remove(zfs_btree_t *tree, const void *value)
+{
+	zfs_btree_index_t where = {0};
+	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
+	zfs_btree_remove_from(tree, &where);
+}
+
+/* Return the number of elements in the tree. */
+ulong_t
+zfs_btree_numnodes(zfs_btree_t *tree)
+{
+	return (tree->bt_num_elems);
+}
+
+/*
+ * This function is used to visit all the elements in the tree before
+ * destroying the tree. This allows the calling code to perform any cleanup it
+ * needs to do. This is more efficient than just removing the first element
+ * over and over, because it removes all rebalancing. Once the destroy_nodes()
+ * function has been called, no other btree operations are valid until it
+ * returns NULL, which point the only valid operation is zfs_btree_destroy().
+ *
+ * example:
+ *
+ *      zfs_btree_index_t *cookie = NULL;
+ *      my_data_t *node;
+ *
+ *      while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ *              free(node->ptr);
+ *      zfs_btree_destroy(tree);
+ *
+ */
+void *
+zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
+{
+	if (*cookie == NULL) {
+		if (tree->bt_height == -1)
+			return (NULL);
+		*cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
+		return (zfs_btree_first(tree, *cookie));
+	}
+
+	void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
+	    zfs_btree_node_destroy);
+	if (rval == NULL)   {
+		tree->bt_root = NULL;
+		tree->bt_height = -1;
+		tree->bt_num_elems = 0;
+		kmem_free(*cookie, sizeof (**cookie));
+		tree->bt_bulk = NULL;
+	}
+	return (rval);
+}
+
+static void
+zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	if (hdr->bth_core) {
+		zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
+		for (int i = 0; i <= hdr->bth_count; i++) {
+			zfs_btree_clear_helper(tree, btc->btc_children[i]);
+		}
+	}
+
+	zfs_btree_node_destroy(tree, hdr);
+}
+
+void
+zfs_btree_clear(zfs_btree_t *tree)
+{
+	if (tree->bt_root == NULL) {
+		ASSERT0(tree->bt_num_elems);
+		return;
+	}
+
+	zfs_btree_clear_helper(tree, tree->bt_root);
+	tree->bt_num_elems = 0;
+	tree->bt_root = NULL;
+	tree->bt_num_nodes = 0;
+	tree->bt_height = -1;
+	tree->bt_bulk = NULL;
+}
+
+void
+zfs_btree_destroy(zfs_btree_t *tree)
+{
+	ASSERT0(tree->bt_num_elems);
+	ASSERT3P(tree->bt_root, ==, NULL);
+}
+
+/* Verify that every child of this node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	if (!hdr->bth_core)
+		return;
+
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
+		zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
+	}
+}
+
+/* Verify that every node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers(zfs_btree_t *tree)
+{
+	if (tree->bt_height == -1) {
+		VERIFY3P(tree->bt_root, ==, NULL);
+		return;
+	}
+	VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
+	zfs_btree_verify_pointers_helper(tree, tree->bt_root);
+}
+
+/*
+ * Verify that the current node and all of its children satisfy the count
+ * invariants, and return the total count in the subtree rooted at this node.
+ */
+static uint64_t
+zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	if (!hdr->bth_core) {
+		if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) {
+			uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+			    sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2);
+			VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1);
+		}
+
+		return (hdr->bth_count);
+	} else {
+		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+		uint64_t ret = hdr->bth_count;
+		if (tree->bt_root != hdr && tree->bt_bulk == NULL)
+			VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
+		for (int i = 0; i <= hdr->bth_count; i++) {
+			ret += zfs_btree_verify_counts_helper(tree,
+			    node->btc_children[i]);
+		}
+
+		return (ret);
+	}
+}
+
+/*
+ * Verify that all nodes satisfy the invariants and that the total number of
+ * elements is correct.
+ */
+static void
+zfs_btree_verify_counts(zfs_btree_t *tree)
+{
+	EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
+	if (tree->bt_height == -1) {
+		return;
+	}
+	VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
+	    tree->bt_num_elems);
+}
+
+/*
+ * Check that the subtree rooted at this node has a uniform height. Returns
+ * the number of nodes under this node, to help verify bt_num_nodes.
+ */
+static uint64_t
+zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+    int64_t height)
+{
+	if (!hdr->bth_core) {
+		VERIFY0(height);
+		return (1);
+	}
+
+	VERIFY(hdr->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	uint64_t ret = 1;
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		ret += zfs_btree_verify_height_helper(tree,
+		    node->btc_children[i], height - 1);
+	}
+	return (ret);
+}
+
+/*
+ * Check that the tree has a uniform height, and that the bt_height and
+ * bt_num_nodes counts in the tree are correct.
+ */
+static void
+zfs_btree_verify_height(zfs_btree_t *tree)
+{
+	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+	if (tree->bt_height == -1) {
+		return;
+	}
+
+	VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
+	    tree->bt_height), ==, tree->bt_num_nodes);
+}
+
+/*
+ * Check that the elements in this node are sorted, and that if this is a core
+ * node, the separators are properly between the subtrees they separate and
+ * that the children also satisfy this requirement.
+ */
+static void
+zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	size_t size = tree->bt_elem_size;
+	if (!hdr->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+		for (int i = 1; i < hdr->bth_count; i++) {
+			VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) *
+			    size, leaf->btl_elems + i * size), ==, -1);
+		}
+		return;
+	}
+
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	for (int i = 1; i < hdr->bth_count; i++) {
+		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
+		    node->btc_elems + i * size), ==, -1);
+	}
+	for (int i = 0; i < hdr->bth_count; i++) {
+		uint8_t *left_child_last = NULL;
+		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
+		if (left_child_hdr->bth_core) {
+			zfs_btree_core_t *left_child =
+			    (zfs_btree_core_t *)left_child_hdr;
+			left_child_last = left_child->btc_elems +
+			    (left_child_hdr->bth_count - 1) * size;
+		} else {
+			zfs_btree_leaf_t *left_child =
+			    (zfs_btree_leaf_t *)left_child_hdr;
+			left_child_last = left_child->btl_elems +
+			    (left_child_hdr->bth_count - 1) * size;
+		}
+		if (tree->bt_compar(node->btc_elems + i * size,
+		    left_child_last) != 1) {
+			panic("btree: compar returned %d (expected 1) at "
+			    "%px %d: compar(%px,  %px)", tree->bt_compar(
+			    node->btc_elems + i * size, left_child_last),
+			    (void *)node, i, (void *)(node->btc_elems + i *
+			    size), (void *)left_child_last);
+		}
+
+		uint8_t *right_child_first = NULL;
+		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
+		if (right_child_hdr->bth_core) {
+			zfs_btree_core_t *right_child =
+			    (zfs_btree_core_t *)right_child_hdr;
+			right_child_first = right_child->btc_elems;
+		} else {
+			zfs_btree_leaf_t *right_child =
+			    (zfs_btree_leaf_t *)right_child_hdr;
+			right_child_first = right_child->btl_elems;
+		}
+		if (tree->bt_compar(node->btc_elems + i * size,
+		    right_child_first) != -1) {
+			panic("btree: compar returned %d (expected -1) at "
+			    "%px %d: compar(%px,  %px)", tree->bt_compar(
+			    node->btc_elems + i * size, right_child_first),
+			    (void *)node, i, (void *)(node->btc_elems + i *
+			    size), (void *)right_child_first);
+		}
+	}
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
+	}
+}
+
+/* Check that all elements in the tree are in sorted order. */
+static void
+zfs_btree_verify_order(zfs_btree_t *tree)
+{
+	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+	if (tree->bt_height == -1) {
+		return;
+	}
+
+	zfs_btree_verify_order_helper(tree, tree->bt_root);
+}
+
+#ifdef ZFS_DEBUG
+/* Check that all unused memory is poisoned correctly. */
+static void
+zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	size_t size = tree->bt_elem_size;
+	if (!hdr->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+		uint8_t val = 0x0f;
+		for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE -
+		    sizeof (zfs_btree_hdr_t); i++) {
+			VERIFY3U(leaf->btl_elems[i], ==, val);
+		}
+	} else {
+		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+		uint8_t val = 0x0f;
+		for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size;
+		    i++) {
+			VERIFY3U(node->btc_elems[i], ==, val);
+		}
+
+		for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+			VERIFY3P(node->btc_children[i], ==,
+			    (zfs_btree_hdr_t *)BTREE_POISON);
+		}
+
+		for (int i = 0; i <= hdr->bth_count; i++) {
+			zfs_btree_verify_poison_helper(tree,
+			    node->btc_children[i]);
+		}
+	}
+}
+#endif
+
+/* Check that unused memory in the tree is still poisoned. */
+static void
+zfs_btree_verify_poison(zfs_btree_t *tree)
+{
+#ifdef ZFS_DEBUG
+	if (tree->bt_height == -1)
+		return;
+	zfs_btree_verify_poison_helper(tree, tree->bt_root);
+#endif
+}
+
+void
+zfs_btree_verify(zfs_btree_t *tree)
+{
+	if (zfs_btree_verify_intensity == 0)
+		return;
+	zfs_btree_verify_height(tree);
+	if (zfs_btree_verify_intensity == 1)
+		return;
+	zfs_btree_verify_pointers(tree);
+	if (zfs_btree_verify_intensity == 2)
+		return;
+	zfs_btree_verify_counts(tree);
+	if (zfs_btree_verify_intensity == 3)
+		return;
+	zfs_btree_verify_order(tree);
+
+	if (zfs_btree_verify_intensity == 4)
+		return;
+	zfs_btree_verify_poison(tree);
+}
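
/*
 * Editor's note (summary of zfs_btree_verify() above): verification is
 * cumulative by intensity level -- 1 checks tree height, 2 adds pointer
 * checks, 3 adds element counts, 4 adds sort order, and 5 also verifies
 * the 0x0f poison bytes in unused node memory.
 */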
--- a/usr/src/uts/common/fs/zfs/ddt.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/ddt.c	Mon Dec 09 14:15:34 2019 +0000
@@ -794,7 +794,7 @@
 			break;
 	}
 
-	return (AVL_ISIGN(cmp));
+	return (TREE_ISIGN(cmp));
 }
 
 static ddt_t *
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Mon Dec 09 14:15:34 2019 +0000
@@ -1829,7 +1829,7 @@
 	 */
 	rv = strcmp(luqn->uqn_id, ruqn->uqn_id);
 
-	return (AVL_ISIGN(rv));
+	return (TREE_ISIGN(rv));
 }
 
 static void
--- a/usr/src/uts/common/fs/zfs/dmu_recv.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_recv.c	Mon Dec 09 14:15:34 2019 +0000
@@ -883,14 +883,10 @@
 static int
 guid_compare(const void *arg1, const void *arg2)
 {
-	const guid_map_entry_t *gmep1 = arg1;
-	const guid_map_entry_t *gmep2 = arg2;
-
-	if (gmep1->guid < gmep2->guid)
-		return (-1);
-	else if (gmep1->guid > gmep2->guid)
-		return (1);
-	return (0);
+	const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
+	const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;
+
+	return (TREE_CMP(gmep1->guid, gmep2->guid));
 }
 
 static void
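
/*
 * Editor's sketch: the AVL_CMP()/AVL_ISIGN()/AVL_PCMP() to
 * TREE_CMP()/TREE_ISIGN()/TREE_PCMP() renames throughout this merge rely
 * on branch-free three-way comparison macros, presumably defined along
 * these lines (illustrative, not quoted from the headers):
 */
#define	TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define	TREE_ISIGN(a)	(((a) > 0) - ((a) < 0))
#define	TREE_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
/*
 * TREE_ISIGN() collapses an arbitrary strcmp()-style result into
 * {-1, 0, 1}, which the B-tree comparators are required to return.
 */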
--- a/usr/src/uts/common/fs/zfs/dnode.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Mon Dec 09 14:15:34 2019 +0000
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 RackTop Systems.
@@ -90,11 +90,11 @@
 	const dmu_buf_impl_t *d1 = x1;
 	const dmu_buf_impl_t *d2 = x2;
 
-	int cmp = AVL_CMP(d1->db_level, d2->db_level);
+	int cmp = TREE_CMP(d1->db_level, d2->db_level);
 	if (likely(cmp))
 		return (cmp);
 
-	cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
 	if (likely(cmp))
 		return (cmp);
 
@@ -106,7 +106,7 @@
 		return (1);
 	}
 
-	return (AVL_PCMP(d1, d2));
+	return (TREE_PCMP(d1, d2));
 }
 
 /* ARGSUSED */
@@ -2197,7 +2197,8 @@
 	mutex_enter(&dn->dn_mtx);
 	int txgoff = tx->tx_txg & TXG_MASK;
 	if (dn->dn_free_ranges[txgoff] == NULL) {
-		dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
+		dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
+		    RANGE_SEG64, NULL, 0, 0);
 	}
 	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
--- a/usr/src/uts/common/fs/zfs/dsl_deadlist.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c	Mon Dec 09 14:15:34 2019 +0000
@@ -58,7 +58,7 @@
 	const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1;
 	const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2;
 
-	return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
+	return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
 }
 
 static void
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c	Mon Dec 09 14:15:34 2019 +0000
@@ -390,7 +390,7 @@
 
 	val = strcmp(node1->p_setname, node2->p_setname);
 
-	return (AVL_ISIGN(val));
+	return (TREE_ISIGN(val));
 }
 
 /*
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c	Mon Dec 09 14:15:34 2019 +0000
@@ -295,7 +295,7 @@
 
 	/* trees used for sorting I/Os and extents of I/Os */
 	range_tree_t	*q_exts_by_addr;
-	avl_tree_t	q_exts_by_size;
+	zfs_btree_t	q_exts_by_size;
 	avl_tree_t	q_sios_by_addr;
 	uint64_t	q_sio_memused;
 
@@ -653,7 +653,8 @@
 
 			mutex_enter(&vd->vdev_scan_io_queue_lock);
 			ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
-			ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL);
+			ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
+			    NULL);
 			ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
 			mutex_exit(&vd->vdev_scan_io_queue_lock);
 		}
@@ -1239,7 +1240,7 @@
 		queue = tvd->vdev_scan_io_queue;
 		if (queue != NULL) {
 			/* # extents in exts_by_size = # in exts_by_addr */
-			mused += avl_numnodes(&queue->q_exts_by_size) *
+			mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
 			    sizeof (range_seg_t) + queue->q_sio_memused;
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
@@ -2769,7 +2770,7 @@
 
 	srch_sio = sio_alloc(1);
 	srch_sio->sio_nr_dvas = 1;
-	SIO_SET_OFFSET(srch_sio, rs->rs_start);
+	SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr));
 
 	/*
 	 * The exact start of the extent might not contain any matching zios,
@@ -2781,10 +2782,12 @@
 	if (sio == NULL)
 		sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
 
-	while (sio != NULL &&
-	    SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
-		ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
-		ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
+	while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+	    queue->q_exts_by_addr) && num_sios <= 32) {
+		ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs,
+		    queue->q_exts_by_addr));
+		ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs,
+		    queue->q_exts_by_addr));
 
 		next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
 		avl_remove(&queue->q_sios_by_addr, sio);
@@ -2802,16 +2805,19 @@
 	 * in the segment we update it to reflect the work we were able to
 	 * complete. Otherwise, we remove it from the range tree entirely.
 	 */
-	if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
+	if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+	    queue->q_exts_by_addr)) {
 		range_tree_adjust_fill(queue->q_exts_by_addr, rs,
 		    -bytes_issued);
 		range_tree_resize_segment(queue->q_exts_by_addr, rs,
-		    SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
+		    SIO_GET_OFFSET(sio), rs_get_end(rs,
+		    queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
 
 		return (B_TRUE);
 	} else {
-		range_tree_remove(queue->q_exts_by_addr, rs->rs_start,
-		    rs->rs_end - rs->rs_start);
+		uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
+		uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
+		range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
 		return (B_FALSE);
 	}
 }
@@ -2832,6 +2838,7 @@
 scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
 {
 	dsl_scan_t *scn = queue->q_scn;
+	range_tree_t *rt = queue->q_exts_by_addr;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 	ASSERT(scn->scn_is_sorted);
@@ -2839,9 +2846,16 @@
 	/* handle tunable overrides */
 	if (scn->scn_checkpointing || scn->scn_clearing) {
 		if (zfs_scan_issue_strategy == 1) {
-			return (range_tree_first(queue->q_exts_by_addr));
+			return (range_tree_first(rt));
 		} else if (zfs_scan_issue_strategy == 2) {
-			return (avl_first(&queue->q_exts_by_size));
+			range_seg_t *size_rs =
+			    zfs_btree_first(&queue->q_exts_by_size, NULL);
+			uint64_t start = rs_get_start(size_rs, rt);
+			uint64_t size = rs_get_end(size_rs, rt) - start;
+			range_seg_t *addr_rs = range_tree_find(rt, start,
+			    size);
+			ASSERT3P(addr_rs, !=, NULL);
+			return (addr_rs);
 		}
 	}
 
@@ -2855,9 +2869,15 @@
 	 * In this case, we instead switch to issuing extents in LBA order.
 	 */
 	if (scn->scn_checkpointing) {
-		return (range_tree_first(queue->q_exts_by_addr));
+		return (range_tree_first(rt));
 	} else if (scn->scn_clearing) {
-		return (avl_first(&queue->q_exts_by_size));
+		range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
+		    NULL);
+		uint64_t start = rs_get_start(size_rs, rt);
+		uint64_t size = rs_get_end(size_rs, rt) - start;
+		range_seg_t *addr_rs = range_tree_find(rt, start, size);
+		ASSERT3P(addr_rs, !=, NULL);
+		return (addr_rs);
 	} else {
 		return (NULL);
 	}
@@ -3946,9 +3966,10 @@
 static int
 ext_size_compare(const void *x, const void *y)
 {
-	const range_seg_t *rsa = x, *rsb = y;
-	uint64_t sa = rsa->rs_end - rsa->rs_start,
-	    sb = rsb->rs_end - rsb->rs_start;
+	const range_seg_gap_t *rsa = x, *rsb = y;
+
+	uint64_t sa = rsa->rs_end - rsa->rs_start;
+	uint64_t sb = rsb->rs_end - rsb->rs_start;
 	uint64_t score_a, score_b;
 
 	score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
@@ -3977,7 +3998,7 @@
 {
 	const scan_io_t *a = x, *b = y;
 
-	return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
+	return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
 }
 
 /* IO queues are created on demand when they are needed. */
@@ -3991,8 +4012,8 @@
 	q->q_vd = vd;
 	q->q_sio_memused = 0;
 	cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
-	q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
-	    &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
+	q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP,
+	    &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap);
 	avl_create(&q->q_sios_by_addr, sio_addr_compare,
 	    sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
 
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Mon Dec 09 14:15:34 2019 +0000
@@ -38,6 +38,7 @@
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/zap.h>
+#include <sys/btree.h>
 
 #define	GANG_ALLOCATION(flags) \
 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
@@ -179,6 +180,13 @@
 int metaslab_df_max_search = 16 * 1024 * 1024;
 
 /*
+ * Forces the metaslab_block_picker function to examine at least this many
+ * segments before giving up on finding a segment that the allocation
+ * will fit into.
+ */
+uint32_t metaslab_min_search_count = 100;
+
+/*
  * If we are not searching forward (due to metaslab_df_max_search,
  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
  * controls what segment is used.  If it is set, we will use the largest free
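
/*
 * Editor's note tying the tunables above together: metaslab_block_picker()
 * (further down in this file) keeps walking the address-ordered tree while
 * the candidate segment lies within metaslab_df_max_search bytes of the
 * first fit, or while fewer than metaslab_min_search_count segments have
 * been examined, whichever keeps the search going longer.
 */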
@@ -274,17 +282,32 @@
 int max_disabled_ms = 3;
 
 /*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
+/*
  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
  * a metaslab would take it over this percentage, the oldest selected metaslab
  * is automatically unloaded.
  */
-int zfs_metaslab_mem_limit = 25;
+int zfs_metaslab_mem_limit = 75;
 
 /*
- * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
- * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ * Force the per-metaslab range trees to use 64-bit integers to store
+ * segments. Used for debugging purposes.
  */
-unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+boolean_t zfs_metaslab_force_large_segs = B_FALSE;
+
+/*
+ * By default we only store segments over a certain size in the size-sorted
+ * metaslab trees (ms_allocatable_by_size and
+ * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
+ * improves load and unload times at the cost of causing us to use slightly
+ * larger segments than we would otherwise in some cases.
+ */
+uint32_t metaslab_by_size_min_shift = 14;
 
 static uint64_t metaslab_weight(metaslab_t *);
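
/*
 * Editor's example for metaslab_by_size_min_shift above: with the default
 * value of 14, only segments of at least 1 << 14 = 16 KiB are indexed in
 * the size-sorted B-trees; smaller segments are recovered on demand by
 * rebuilding the size tree from the address-sorted range tree (see
 * metaslab_size_tree_full_load() below).
 */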
 static void metaslab_set_fragmentation(metaslab_t *);
@@ -295,9 +318,56 @@
 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 static unsigned int metaslab_idx_func(multilist_t *, void *);
 static void metaslab_evict(metaslab_t *, uint64_t);
+static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
 
 kmem_cache_t *metaslab_alloc_trace_cache;
 
+typedef struct metaslab_stats {
+	kstat_named_t metaslabstat_trace_over_limit;
+	kstat_named_t metaslabstat_df_find_under_floor;
+	kstat_named_t metaslabstat_reload_tree;
+} metaslab_stats_t;
+
+static metaslab_stats_t metaslab_stats = {
+	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
+	{ "df_find_under_floor",	KSTAT_DATA_UINT64 },
+	{ "reload_tree",		KSTAT_DATA_UINT64 },
+};
+
+#define	METASLABSTAT_BUMP(stat) \
+	atomic_inc_64(&metaslab_stats.stat.value.ui64)
+
+kstat_t *metaslab_ksp;
+
+void
+metaslab_stat_init(void)
+{
+	ASSERT(metaslab_alloc_trace_cache == NULL);
+	metaslab_alloc_trace_cache = kmem_cache_create(
+	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
+	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
+	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (metaslab_ksp != NULL) {
+		metaslab_ksp->ks_data = &metaslab_stats;
+		kstat_install(metaslab_ksp);
+	}
+}
+
+void
+metaslab_stat_fini(void)
+{
+	if (metaslab_ksp != NULL) {
+		kstat_delete(metaslab_ksp);
+		metaslab_ksp = NULL;
+	}
+
+	kmem_cache_destroy(metaslab_alloc_trace_cache);
+	metaslab_alloc_trace_cache = NULL;
+}
+
 /*
  * ==========================================================================
  * Metaslab classes
@@ -608,13 +678,13 @@
 	if (sort1 > sort2)
 		return (1);
 
-	int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
+	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
 	if (likely(cmp))
 		return (cmp);
 
-	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
-
-	return (AVL_CMP(m1->ms_start, m2->ms_start));
+	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+
+	return (TREE_CMP(m1->ms_start, m2->ms_start));
 }
 
 /*
@@ -711,17 +781,17 @@
 	const metaslab_t *a = va;
 	const metaslab_t *b = vb;
 
-	int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
+	int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
 	if (likely(cmp))
 		return (cmp);
 
 	uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
 	uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
-	cmp = AVL_CMP(a_vdev_id, b_vdev_id);
+	cmp = TREE_CMP(a_vdev_id, b_vdev_id);
 	if (cmp)
 		return (cmp);
 
-	return (AVL_CMP(a->ms_id, b->ms_id));
+	return (TREE_CMP(a->ms_id, b->ms_id));
 }
 
 metaslab_group_t *
@@ -1248,25 +1318,170 @@
  */
 
 /*
- * Comparison function for the private size-ordered tree. Tree is sorted
- * by size, larger sizes at the end of the tree.
+ * Comparison function for the private size-ordered tree using 32-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize32_compare(const void *x1, const void *x2)
+{
+	const range_seg32_t *r1 = x1;
+	const range_seg32_t *r2 = x2;
+
+	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+	int cmp = TREE_CMP(rs_size1, rs_size2);
+	if (likely(cmp))
+		return (cmp);
+
+	return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+/*
+ * Comparison function for the private size-ordered tree using 64-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  */
 static int
-metaslab_rangesize_compare(const void *x1, const void *x2)
+metaslab_rangesize64_compare(const void *x1, const void *x2)
 {
-	const range_seg_t *r1 = x1;
-	const range_seg_t *r2 = x2;
+	const range_seg64_t *r1 = x1;
+	const range_seg64_t *r2 = x2;
+
 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
-	int cmp = AVL_CMP(rs_size1, rs_size2);
+	int cmp = TREE_CMP(rs_size1, rs_size2);
 	if (likely(cmp))
 		return (cmp);
 
-	return (AVL_CMP(r1->rs_start, r2->rs_start));
+	return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+typedef struct metaslab_rt_arg {
+	zfs_btree_t *mra_bt;
+	uint32_t mra_floor_shift;
+} metaslab_rt_arg_t;
+
+struct mssa_arg {
+	range_tree_t *rt;
+	metaslab_rt_arg_t *mra;
+};
+
+static void
+metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
+{
+	struct mssa_arg *mssap = arg;
+	range_tree_t *rt = mssap->rt;
+	metaslab_rt_arg_t *mrap = mssap->mra;
+	range_seg_max_t seg = {0};
+	rs_set_start(&seg, rt, start);
+	rs_set_end(&seg, rt, start + size);
+	metaslab_rt_add(rt, &seg, mrap);
+}
+
+static void
+metaslab_size_tree_full_load(range_tree_t *rt)
+{
+	metaslab_rt_arg_t *mrap = rt->rt_arg;
+#ifdef _METASLAB_TRACING
+	METASLABSTAT_BUMP(metaslabstat_reload_tree);
+#endif
+	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
+	mrap->mra_floor_shift = 0;
+	struct mssa_arg arg = {0};
+	arg.rt = rt;
+	arg.mra = mrap;
+	range_tree_walk(rt, metaslab_size_sorted_add, &arg);
 }
 
 /*
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered zfs_btree_t and an array of uint64_t's.
+ */
+/* ARGSUSED */
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
+{
+	metaslab_rt_arg_t *mrap = arg;
+	zfs_btree_t *size_tree = mrap->mra_bt;
+
+	size_t size;
+	int (*compare) (const void *, const void *);
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		size = sizeof (range_seg32_t);
+		compare = metaslab_rangesize32_compare;
+		break;
+	case RANGE_SEG64:
+		size = sizeof (range_seg64_t);
+		compare = metaslab_rangesize64_compare;
+		break;
+	default:
+		panic("Invalid range seg type %d", rt->rt_type);
+	}
+	zfs_btree_create(size_tree, compare, size);
+	mrap->mra_floor_shift = metaslab_by_size_min_shift;
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
+{
+	metaslab_rt_arg_t *mrap = arg;
+	zfs_btree_t *size_tree = mrap->mra_bt;
+
+	zfs_btree_destroy(size_tree);
+	kmem_free(mrap, sizeof (*mrap));
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	metaslab_rt_arg_t *mrap = arg;
+	zfs_btree_t *size_tree = mrap->mra_bt;
+
+	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
+	    (1 << mrap->mra_floor_shift))
+		return;
+
+	zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	metaslab_rt_arg_t *mrap = arg;
+	zfs_btree_t *size_tree = mrap->mra_bt;
+
+	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
+	    mrap->mra_floor_shift))
+		return;
+
+	zfs_btree_remove(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
+{
+	metaslab_rt_arg_t *mrap = arg;
+	zfs_btree_t *size_tree = mrap->mra_bt;
+	zfs_btree_clear(size_tree);
+	zfs_btree_destroy(size_tree);
+
+	metaslab_rt_create(rt, arg);
+}
+
+static range_tree_ops_t metaslab_rt_ops = {
+	.rtop_create = metaslab_rt_create,
+	.rtop_destroy = metaslab_rt_destroy,
+	.rtop_add = metaslab_rt_add,
+	.rtop_remove = metaslab_rt_remove,
+	.rtop_vacate = metaslab_rt_vacate
+};
+
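/*
 * Editor's sketch of how an ops table like metaslab_rt_ops above is
 * consumed (hypothetical helper name): range_tree.c invokes each hook,
 * when present, around its own mutations, which is what keeps the
 * size-sorted B-tree shadowing the address-sorted tree.
 */
static void
range_tree_add_notify(range_tree_t *rt, range_seg_t *rs)
{
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
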
+/*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
@@ -1278,16 +1493,20 @@
 uint64_t
 metaslab_largest_allocatable(metaslab_t *msp)
 {
-	avl_tree_t *t = &msp->ms_allocatable_by_size;
+	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	range_seg_t *rs;
 
 	if (t == NULL)
 		return (0);
-	rs = avl_last(t);
+	if (zfs_btree_numnodes(t) == 0)
+		metaslab_size_tree_full_load(msp->ms_allocatable);
+
+	rs = zfs_btree_last(t, NULL);
 	if (rs == NULL)
 		return (0);
 
-	return (rs->rs_end - rs->rs_start);
+	return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
+	    msp->ms_allocatable));
 }
 
 /*
@@ -1302,7 +1521,10 @@
 	if (msp->ms_unflushed_frees == NULL)
 		return (0);
 
-	range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
+	if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
+		metaslab_size_tree_full_load(msp->ms_unflushed_frees);
+	range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
+	    NULL);
 	if (rs == NULL)
 		return (0);
 
@@ -1329,8 +1551,8 @@
 	 * the largest segment; there may be other usable chunks in the
 	 * largest segment, but we ignore them.
 	 */
-	uint64_t rstart = rs->rs_start;
-	uint64_t rsize = rs->rs_end - rstart;
+	uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
+	uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		uint64_t start = 0;
 		uint64_t size = 0;
@@ -1354,44 +1576,52 @@
 }
 
 static range_seg_t *
-metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
+metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
+    uint64_t size, zfs_btree_index_t *where)
 {
-	range_seg_t *rs, rsearch;
-	avl_index_t where;
-
-	rsearch.rs_start = start;
-	rsearch.rs_end = start + size;
-
-	rs = avl_find(t, &rsearch, &where);
+	range_seg_t *rs;
+	range_seg_max_t rsearch;
+
+	rs_set_start(&rsearch, rt, start);
+	rs_set_end(&rsearch, rt, start + size);
+
+	rs = zfs_btree_find(t, &rsearch, where);
 	if (rs == NULL) {
-		rs = avl_nearest(t, where, AVL_AFTER);
+		rs = zfs_btree_next(t, where, where);
 	}
 
 	return (rs);
 }
 
 /*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * This is a helper function that can be used by the allocator to find a
+ * suitable block to allocate. This will search the specified B-tree looking
+ * for a block that matches the specified criteria.
  */
 static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
     uint64_t max_search)
 {
-	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+	if (*cursor == 0)
+		*cursor = rt->rt_start;
+	zfs_btree_t *bt = &rt->rt_root;
+	zfs_btree_index_t where;
+	range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
 	uint64_t first_found;
+	int count_searched = 0;
 
 	if (rs != NULL)
-		first_found = rs->rs_start;
-
-	while (rs != NULL && rs->rs_start - first_found <= max_search) {
-		uint64_t offset = rs->rs_start;
-		if (offset + size <= rs->rs_end) {
+		first_found = rs_get_start(rs, rt);
+
+	while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
+	    max_search || count_searched < metaslab_min_search_count)) {
+		uint64_t offset = rs_get_start(rs, rt);
+		if (offset + size <= rs_get_end(rs, rt)) {
 			*cursor = offset + size;
 			return (offset);
 		}
-		rs = AVL_NEXT(t, rs);
+		rs = zfs_btree_next(bt, &where, &where);
+		count_searched++;
 	}
 
 	*cursor = 0;
@@ -1435,8 +1665,6 @@
 	uint64_t offset;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT3U(avl_numnodes(&rt->rt_root), ==,
-	    avl_numnodes(&msp->ms_allocatable_by_size));
 
 	/*
 	 * If we're running low on space, find a segment based on size,
@@ -1446,22 +1674,33 @@
 	    free_pct < metaslab_df_free_pct) {
 		offset = -1;
 	} else {
-		offset = metaslab_block_picker(&rt->rt_root,
+		offset = metaslab_block_picker(rt,
 		    cursor, size, metaslab_df_max_search);
 	}
 
 	if (offset == -1) {
 		range_seg_t *rs;
+		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
+			metaslab_size_tree_full_load(msp->ms_allocatable);
 		if (metaslab_df_use_largest_segment) {
 			/* use largest free segment */
-			rs = avl_last(&msp->ms_allocatable_by_size);
+			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
 		} else {
+			zfs_btree_index_t where;
 			/* use segment of this size, or next largest */
+#ifdef _METASLAB_TRACING
+			metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
+			if (size < (1 << mrap->mra_floor_shift)) {
+				METASLABSTAT_BUMP(
+				    metaslabstat_df_find_under_floor);
+			}
+#endif
 			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
-			    0, size);
+			    rt, msp->ms_start, size, &where);
 		}
-		if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
-			offset = rs->rs_start;
+		if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
+		    rt)) {
+			offset = rs_get_start(rs, rt);
 			*cursor = offset + size;
 		}
 	}
@@ -1486,25 +1725,27 @@
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *rt = msp->ms_allocatable;
-	avl_tree_t *t = &msp->ms_allocatable_by_size;
+	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	uint64_t *cursor = &msp->ms_lbas[0];
 	uint64_t *cursor_end = &msp->ms_lbas[1];
 	uint64_t offset = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
 
 	ASSERT3U(*cursor_end, >=, *cursor);
 
 	if ((*cursor + size) > *cursor_end) {
 		range_seg_t *rs;
 
-		rs = avl_last(&msp->ms_allocatable_by_size);
-		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+		if (zfs_btree_numnodes(t) == 0)
+			metaslab_size_tree_full_load(msp->ms_allocatable);
+		rs = zfs_btree_last(t, NULL);
+		if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
+		    size)
 			return (-1ULL);
 
-		*cursor = rs->rs_start;
-		*cursor_end = rs->rs_end;
+		*cursor = rs_get_start(rs, rt);
+		*cursor_end = rs_get_end(rs, rt);
 	}
 
 	offset = *cursor;
@@ -1535,39 +1776,40 @@
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &msp->ms_allocatable->rt_root;
-	avl_index_t where;
-	range_seg_t *rs, rsearch;
+	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
+	range_tree_t *rt = msp->ms_allocatable;
+	zfs_btree_index_t where;
+	range_seg_t *rs;
+	range_seg_max_t rsearch;
 	uint64_t hbit = highbit64(size);
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 	uint64_t max_size = metaslab_largest_allocatable(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT3U(avl_numnodes(t), ==,
-	    avl_numnodes(&msp->ms_allocatable_by_size));
 
 	if (max_size < size)
 		return (-1ULL);
 
-	rsearch.rs_start = *cursor;
-	rsearch.rs_end = *cursor + size;
-
-	rs = avl_find(t, &rsearch, &where);
-	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+	rs_set_start(&rsearch, rt, *cursor);
+	rs_set_end(&rsearch, rt, *cursor + size);
+
+	rs = zfs_btree_find(t, &rsearch, &where);
+	if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
 		t = &msp->ms_allocatable_by_size;
 
-		rsearch.rs_start = 0;
-		rsearch.rs_end = MIN(max_size,
-		    1ULL << (hbit + metaslab_ndf_clump_shift));
-		rs = avl_find(t, &rsearch, &where);
+		rs_set_start(&rsearch, rt, 0);
+		rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
+		    metaslab_ndf_clump_shift)));
+
+		rs = zfs_btree_find(t, &rsearch, &where);
 		if (rs == NULL)
-			rs = avl_nearest(t, where, AVL_AFTER);
+			rs = zfs_btree_next(t, &where, &where);
 		ASSERT(rs != NULL);
 	}
 
-	if ((rs->rs_end - rs->rs_start) >= size) {
-		*cursor = rs->rs_start + size;
-		return (rs->rs_start);
+	if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
+		*cursor = rs_get_start(rs, rt) + size;
+		return (rs_get_start(rs, rt));
 	}
 	return (-1ULL);
 }
@@ -1905,9 +2147,9 @@
 {
 #ifdef _KERNEL
 	uint64_t allmem = arc_all_memory();
-	extern kmem_cache_t *range_seg_cache;
-	uint64_t inuse = kmem_cache_stat(range_seg_cache, "buf_inuse");
-	uint64_t size =	kmem_cache_stat(range_seg_cache, "buf_size");
+	extern kmem_cache_t *zfs_btree_leaf_cache;
+	uint64_t inuse = kmem_cache_stat(zfs_btree_leaf_cache, "buf_inuse");
+	uint64_t size = kmem_cache_stat(zfs_btree_leaf_cache, "buf_size");
 	int tries = 0;
 	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
 	    tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
@@ -1944,7 +2186,7 @@
 			 */
 			if (msp->ms_loading) {
 				msp = next_msp;
-				inuse = kmem_cache_stat(range_seg_cache,
+				inuse = kmem_cache_stat(zfs_btree_leaf_cache,
 				    "buf_inuse");
 				continue;
 			}
@@ -1967,7 +2209,8 @@
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
-			inuse = kmem_cache_stat(range_seg_cache, "buf_inuse");
+			inuse = kmem_cache_stat(zfs_btree_leaf_cache,
+			    "buf_inuse");
 		}
 	}
 #endif
@@ -2010,11 +2253,40 @@
 	mutex_exit(&msp->ms_lock);
 
 	hrtime_t load_start = gethrtime();
+	metaslab_rt_arg_t *mrap;
+	if (msp->ms_allocatable->rt_arg == NULL) {
+		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+	} else {
+		mrap = msp->ms_allocatable->rt_arg;
+		msp->ms_allocatable->rt_ops = NULL;
+		msp->ms_allocatable->rt_arg = NULL;
+	}
+	mrap->mra_bt = &msp->ms_allocatable_by_size;
+	mrap->mra_floor_shift = metaslab_by_size_min_shift;
+
 	if (msp->ms_sm != NULL) {
 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
 		    SM_FREE, length);
+
+		/* Now, populate the size-sorted tree. */
+		metaslab_rt_create(msp->ms_allocatable, mrap);
+		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+		msp->ms_allocatable->rt_arg = mrap;
+
+		struct mssa_arg arg = {0};
+		arg.rt = msp->ms_allocatable;
+		arg.mra = mrap;
+		range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
+		    &arg);
 	} else {
 		/*
+		 * Add the size-sorted tree first, since we don't need to load
+		 * the metaslab from the spacemap.
+		 */
+		metaslab_rt_create(msp->ms_allocatable, mrap);
+		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+		msp->ms_allocatable->rt_arg = mrap;
+		/*
 		 * The space map has not been allocated yet, so treat
 		 * all the space in the metaslab as free and add it to the
 		 * ms_allocatable tree.
@@ -2276,6 +2548,29 @@
 		metaslab_recalculate_weight_and_sort(msp);
 }
 
+/*
+ * We want to optimize the memory use of the per-metaslab range
+ * trees. To do this, we store the segments in the range trees in
+ * units of sectors, zero-indexing from the start of the metaslab. If
+ * vdev_ms_shift - vdev_ashift is less than 32, we can store
+ * the ranges using two uint32_ts, rather than two uint64_ts.
+ */
+static range_seg_type_t
+metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
+    uint64_t *start, uint64_t *shift)
+{
+	if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
+	    !zfs_metaslab_force_large_segs) {
+		*shift = vdev->vdev_ashift;
+		*start = msp->ms_start;
+		return (RANGE_SEG32);
+	} else {
+		*shift = 0;
+		*start = 0;
+		return (RANGE_SEG64);
+	}
+}
+
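/*
 * Editor's worked example for the function above (numbers assumed for
 * illustration): with 512-byte sectors (vdev_ashift = 9) and 16 GiB
 * metaslabs (vdev_ms_shift = 34), an in-metaslab offset spans at most
 * 2^(34 - 9) = 2^25 sectors, so the shifted, metaslab-relative value
 * always fits in a uint32_t and RANGE_SEG32 is selected; only vdevs with
 * vdev_ms_shift - vdev_ashift >= 32 fall back to RANGE_SEG64.
 * Conceptually, for an absolute byte offset off:
 *
 *	raw     = (uint32_t)((off - *start) >> *shift);
 *	decoded = ((uint64_t)raw << *shift) + *start;
 */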
 void
 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
 {
@@ -2352,6 +2647,10 @@
 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
 	}
 
+	range_seg_type_t type;
+	uint64_t shift, start;
+	type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
+
 	/*
 	 * We create the ms_allocatable here, but we don't create the
 	 * other range trees until metaslab_sync_done().  This serves
@@ -2360,10 +2659,9 @@
 	 * we'd data fault on any attempt to use this metaslab before
 	 * it's ready.
 	 */
-	ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
-	    &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
-
-	ms->ms_trim = range_tree_create(NULL, NULL);
+	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
+
+	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
 
 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms);
@@ -2418,7 +2716,7 @@
 {
 	return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
 	    range_tree_numsegs(ms->ms_unflushed_frees)) *
-	    sizeof (range_seg_t));
+	    ms->ms_unflushed_allocs->rt_root.bt_elem_size);
 }
 
 void
@@ -3207,7 +3505,7 @@
 	 * We always condense metaslabs that are empty and metaslabs for
 	 * which a condense request has been made.
 	 */
-	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+	if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
 	    msp->ms_condense_wanted)
 		return (B_TRUE);
 
@@ -3253,28 +3551,29 @@
 	 * So to truncate the space map to represent all the entries of
 	 * previous TXGs we do the following:
 	 *
-	 * 1] We create a range tree (condense tree) that is 100% allocated.
-	 * 2] We remove from it all segments found in the ms_defer trees
+	 * 1] We create a range tree (condense tree) that is 100% empty.
+	 * 2] We add to it all segments found in the ms_defer trees
 	 *    as those segments are marked as free in the original space
 	 *    map. We do the same with the ms_allocating trees for the same
-	 *    reason. Removing these segments should be a relatively
+	 *    reason. Adding these segments should be a relatively
 	 *    inexpensive operation since we expect these trees to have a
 	 *    small number of nodes.
-	 * 3] We vacate any unflushed allocs as they should already exist
-	 *    in the condense tree. Then we vacate any unflushed frees as
-	 *    they should already be part of ms_allocatable.
-	 * 4] At this point, we would ideally like to remove all segments
+	 * 3] We vacate any unflushed allocs, since they are not frees we
+	 *    need to add to the condense tree. Then we vacate any
+	 *    unflushed frees as they should already be part of ms_allocatable.
+	 * 4] At this point, we would ideally like to add all segments
 	 *    in the ms_allocatable tree to the condense tree. This way
 	 *    we would write all the entries of the condense tree as the
-	 *    condensed space map, which would only contain allocated
-	 *    segments with everything else assumed to be freed.
+	 *    condensed space map, which would only contain freed
+	 *    segments with everything else assumed to be allocated.
 	 *
 	 *    Doing so can be prohibitively expensive as ms_allocatable can
-	 *    be large, and therefore computationally expensive to subtract
-	 *    from the condense_tree. Instead we first sync out the
-	 *    condense_tree and then the ms_allocatable, in the condensed
-	 *    space map. While this is not optimal, it is typically close to
-	 *    optimal and more importantly much cheaper to compute.
+	 *    be large, and therefore computationally expensive to add to
+	 *    the condense_tree. Instead we first sync out an entry marking
+	 *    everything as allocated, then the condense_tree and then the
+	 *    ms_allocatable, in the condensed space map. While this is not
+	 *    optimal, it is typically close to optimal and more importantly
+	 *    much cheaper to compute.
 	 *
 	 * 5] Finally, as both of the unflushed trees were written to our
 	 *    new and condensed metaslab space map, we basically flushed
@@ -3288,22 +3587,26 @@
 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
 	    spa->spa_name, space_map_length(msp->ms_sm),
-	    avl_numnodes(&msp->ms_allocatable->rt_root),
+	    range_tree_numsegs(msp->ms_allocatable),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
 	msp->ms_condense_wanted = B_FALSE;
 
-	condense_tree = range_tree_create(NULL, NULL);
-	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
+	range_seg_type_t type;
+	uint64_t shift, start;
+	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
+	    &start, &shift);
+
+	condense_tree = range_tree_create(NULL, type, NULL, start, shift);
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
-		    range_tree_remove, condense_tree);
+		    range_tree_add, condense_tree);
 	}
 
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
-		    range_tree_remove, condense_tree);
+		    range_tree_add, condense_tree);
 	}
 
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
@@ -3351,11 +3654,17 @@
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
-	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+	range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
+	    shift);
+	range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
+	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
 
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
+	range_tree_vacate(tmp_tree, NULL, NULL);
+	range_tree_destroy(tmp_tree);
 	mutex_enter(&msp->ms_lock);
 
 	msp->ms_condensing = B_FALSE;
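
/*
 * Editor's summary of the rewritten condense path above, in space map
 * write order (per the space_map_write() calls in this hunk):
 *
 *	ALLOC	[ms_start, ms_start + ms_size)			<- tmp_tree
 *	FREE	every segment of ms_allocatable
 *	FREE	every segment of condense_tree (defer + allocating ranges)
 */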
@@ -3598,7 +3907,7 @@
 		return;
 
 
-	VERIFY(txg <= spa_final_dirty_txg(spa));
+	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
 
 	/*
 	 * The only state that can actually be changing concurrently
@@ -3887,32 +4196,46 @@
 	 * range trees and add its capacity to the vdev.
 	 */
 	if (msp->ms_freed == NULL) {
+		range_seg_type_t type;
+		uint64_t shift, start;
+		type = metaslab_calculate_range_tree_type(vd, msp, &start,
+		    &shift);
+
 		for (int t = 0; t < TXG_SIZE; t++) {
 			ASSERT(msp->ms_allocating[t] == NULL);
 
-			msp->ms_allocating[t] = range_tree_create(NULL, NULL);
+			msp->ms_allocating[t] = range_tree_create(NULL, type,
+			    NULL, start, shift);
 		}
 
 		ASSERT3P(msp->ms_freeing, ==, NULL);
-		msp->ms_freeing = range_tree_create(NULL, NULL);
+		msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
+		    shift);
 
 		ASSERT3P(msp->ms_freed, ==, NULL);
-		msp->ms_freed = range_tree_create(NULL, NULL);
+		msp->ms_freed = range_tree_create(NULL, type, NULL, start,
+		    shift);
 
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			ASSERT3P(msp->ms_defer[t], ==, NULL);
-			msp->ms_defer[t] = range_tree_create(NULL, NULL);
+			msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
+			    start, shift);
 		}
 
 		ASSERT3P(msp->ms_checkpointing, ==, NULL);
-		msp->ms_checkpointing = range_tree_create(NULL, NULL);
+		msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
+		    start, shift);
 
 		ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
-		msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
+		msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
+		    start, shift);
+
+		metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+		mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
+		mrap->mra_floor_shift = metaslab_by_size_min_shift;
 		ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
-		msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
-		    &msp->ms_unflushed_frees_by_size,
-		    metaslab_rangesize_compare, 0);
+		msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
+		    type, mrap, start, shift);
 
 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
 	}
@@ -4091,36 +4414,6 @@
  * Metaslab allocation tracing facility
  * ==========================================================================
  */
-kstat_t *metaslab_trace_ksp;
-kstat_named_t metaslab_trace_over_limit;
-
-void
-metaslab_alloc_trace_init(void)
-{
-	ASSERT(metaslab_alloc_trace_cache == NULL);
-	metaslab_alloc_trace_cache = kmem_cache_create(
-	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
-	    0, NULL, NULL, NULL, NULL, NULL, 0);
-	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
-	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
-	if (metaslab_trace_ksp != NULL) {
-		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
-		kstat_named_init(&metaslab_trace_over_limit,
-		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
-		kstat_install(metaslab_trace_ksp);
-	}
-}
-
-void
-metaslab_alloc_trace_fini(void)
-{
-	if (metaslab_trace_ksp != NULL) {
-		kstat_delete(metaslab_trace_ksp);
-		metaslab_trace_ksp = NULL;
-	}
-	kmem_cache_destroy(metaslab_alloc_trace_cache);
-	metaslab_alloc_trace_cache = NULL;
-}
 
 /*
  * Add an allocation trace element to the allocation tracing list.
@@ -4145,7 +4438,7 @@
 #ifdef DEBUG
 		panic("too many entries in allocation list");
 #endif
-		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+		METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
 		zal->zal_size--;
 		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
 		list_remove(&zal->zal_list, mat_next);
--- a/usr/src/uts/common/fs/zfs/range_tree.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/range_tree.c	Mon Dec 09 14:15:34 2019 +0000
@@ -74,42 +74,38 @@
  * support removing complete segments.
  */
 
-kmem_cache_t *range_seg_cache;
-
-/* Generic ops for managing an AVL tree alongside a range tree */
-struct range_tree_ops rt_avl_ops = {
-	.rtop_create = rt_avl_create,
-	.rtop_destroy = rt_avl_destroy,
-	.rtop_add = rt_avl_add,
-	.rtop_remove = rt_avl_remove,
-	.rtop_vacate = rt_avl_vacate,
-};
-
-void
-range_tree_init(void)
+static inline void
+rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt)
 {
-	ASSERT(range_seg_cache == NULL);
-	range_seg_cache = kmem_cache_create("range_seg_cache",
-	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-range_tree_fini(void)
-{
-	kmem_cache_destroy(range_seg_cache);
-	range_seg_cache = NULL;
+	ASSERT3U(rt->rt_type, <, RANGE_SEG_NUM_TYPES);
+	size_t size = 0;
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		size = sizeof (range_seg32_t);
+		break;
+	case RANGE_SEG64:
+		size = sizeof (range_seg64_t);
+		break;
+	case RANGE_SEG_GAP:
+		size = sizeof (range_seg_gap_t);
+		break;
+	default:
+		VERIFY(0);
+	}
+	bcopy(src, dest, size);
 }
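
/*
 * Editor's sketch of the accessor pattern rs_copy() and the rest of this
 * file rely on (assumed shape; the real definitions live in the range
 * tree header): raw stored values are decoded relative to the tree-wide
 * rt_start/rt_shift, which is what lets RANGE_SEG32 trees hold 64-bit
 * offsets in 32-bit fields.
 */
static inline uint64_t
rs_get_start(const range_seg_t *rs, const range_tree_t *rt)
{
	return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
}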
 
 void
 range_tree_stat_verify(range_tree_t *rt)
 {
 	range_seg_t *rs;
+	zfs_btree_index_t where;
 	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
 	int i;
 
-	for (rs = avl_first(&rt->rt_root); rs != NULL;
-	    rs = AVL_NEXT(&rt->rt_root, rs)) {
-		uint64_t size = rs->rs_end - rs->rs_start;
+	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
+	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+		uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
 		int idx	= highbit64(size) - 1;
 
 		hist[idx]++;
@@ -128,7 +124,7 @@
 static void
 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
 {
-	uint64_t size = rs->rs_end - rs->rs_start;
+	uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
 	int idx = highbit64(size) - 1;
 
 	ASSERT(size != 0);
@@ -142,7 +138,7 @@
 static void
 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
 {
-	uint64_t size = rs->rs_end - rs->rs_start;
+	uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
 	int idx = highbit64(size) - 1;
 
 	ASSERT(size != 0);
@@ -153,14 +149,35 @@
 	rt->rt_histogram[idx]--;
 }
 
-/*
- * NOTE: caller is responsible for all locking.
- */
+static int
+range_tree_seg32_compare(const void *x1, const void *x2)
+{
+	const range_seg32_t *r1 = x1;
+	const range_seg32_t *r2 = x2;
+
+	ASSERT3U(r1->rs_start, <=, r1->rs_end);
+	ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
 static int
-range_tree_seg_compare(const void *x1, const void *x2)
+range_tree_seg64_compare(const void *x1, const void *x2)
 {
-	const range_seg_t *r1 = (const range_seg_t *)x1;
-	const range_seg_t *r2 = (const range_seg_t *)x2;
+	const range_seg64_t *r1 = x1;
+	const range_seg64_t *r2 = x2;
+
+	ASSERT3U(r1->rs_start, <=, r1->rs_end);
+	ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+static int
+range_tree_seg_gap_compare(const void *x1, const void *x2)
+{
+	const range_seg_gap_t *r1 = x1;
+	const range_seg_gap_t *r2 = x2;
 
 	ASSERT3U(r1->rs_start, <=, r1->rs_end);
 	ASSERT3U(r2->rs_start, <=, r2->rs_end);
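
/*
 * Editor's note on the three comparators above: two segments compare
 * equal exactly when they overlap. For example, [5, 10) vs. [8, 12)
 * yields (5 >= 12) - (10 <= 8) = 0 - 0 = 0, so zfs_btree_find() returns
 * any segment intersecting the search range rather than requiring an
 * exact key match.
 */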
@@ -169,18 +186,42 @@
 }
 
 range_tree_t *
-range_tree_create_impl(range_tree_ops_t *ops, void *arg,
-    int (*avl_compare) (const void *, const void *), uint64_t gap)
+range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
+    uint64_t start, uint64_t shift,
+    int (*zfs_btree_compare) (const void *, const void *),
+    uint64_t gap)
 {
 	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
 
-	avl_create(&rt->rt_root, range_tree_seg_compare,
-	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
+	ASSERT3U(shift, <, 64);
+	ASSERT3U(type, <, RANGE_SEG_NUM_TYPES);
+	size_t size;
+	int (*compare) (const void *, const void *);
+	switch (type) {
+	case RANGE_SEG32:
+		size = sizeof (range_seg32_t);
+		compare = range_tree_seg32_compare;
+		break;
+	case RANGE_SEG64:
+		size = sizeof (range_seg64_t);
+		compare = range_tree_seg64_compare;
+		break;
+	case RANGE_SEG_GAP:
+		size = sizeof (range_seg_gap_t);
+		compare = range_tree_seg_gap_compare;
+		break;
+	default:
+		panic("Invalid range seg type %d", type);
+	}
+	zfs_btree_create(&rt->rt_root, compare, size);
 
 	rt->rt_ops = ops;
 	rt->rt_arg = arg;
 	rt->rt_gap = gap;
-	rt->rt_avl_compare = avl_compare;
+	rt->rt_type = type;
+	rt->rt_start = start;
+	rt->rt_shift = shift;
+	rt->rt_btree_compare = zfs_btree_compare;
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
@@ -189,9 +230,10 @@
 }
 
 range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg)
+range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+    void *arg, uint64_t start, uint64_t shift)
 {
-	return (range_tree_create_impl(ops, arg, NULL, 0));
+	return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0));
 }
 
 void
@@ -202,19 +244,20 @@
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
 
-	avl_destroy(&rt->rt_root);
+	zfs_btree_destroy(&rt->rt_root);
 	kmem_free(rt, sizeof (*rt));
 }
 
 void
 range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
 {
-	ASSERT3U(rs->rs_fill + delta, !=, 0);
-	ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+	ASSERT3U(rs_get_fill(rs, rt) + delta, !=, 0);
+	ASSERT3U(rs_get_fill(rs, rt) + delta, <=, rs_get_end(rs, rt) -
+	    rs_get_start(rs, rt));
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
-	rs->rs_fill += delta;
+	rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta);
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 }
@@ -223,28 +266,20 @@
 range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 {
 	range_tree_t *rt = arg;
-	avl_index_t where;
-	range_seg_t rsearch, *rs_before, *rs_after, *rs;
+	zfs_btree_index_t where;
+	range_seg_t *rs_before, *rs_after, *rs;
+	range_seg_max_t tmp, rsearch;
 	uint64_t end = start + size, gap = rt->rt_gap;
 	uint64_t bridge_size = 0;
 	boolean_t merge_before, merge_after;
 
 	ASSERT3U(size, !=, 0);
 	ASSERT3U(fill, <=, size);
-
-	rsearch.rs_start = start;
-	rsearch.rs_end = end;
-	rs = avl_find(&rt->rt_root, &rsearch, &where);
+	ASSERT3U(start + size, >, start);
 
-	if (gap == 0 && rs != NULL &&
-	    rs->rs_start <= start && rs->rs_end >= end) {
-		zfs_panic_recover("zfs: allocating allocated segment"
-		    "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
-		    (longlong_t)start, (longlong_t)size,
-		    (longlong_t)rs->rs_start,
-		    (longlong_t)rs->rs_end - rs->rs_start);
-		return;
-	}
+	rs_set_start(&rsearch, rt, start);
+	rs_set_end(&rsearch, rt, end);
+	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
 
 	/*
 	 * If this is a gap-supporting range tree, it is possible that we
@@ -255,27 +290,28 @@
 	 * the normal code paths.
 	 */
 	if (rs != NULL) {
+		ASSERT3U(rt->rt_gap, !=, 0);
+		uint64_t rstart = rs_get_start(rs, rt);
+		uint64_t rend = rs_get_end(rs, rt);
 		ASSERT3U(gap, !=, 0);
-		if (rs->rs_start <= start && rs->rs_end >= end) {
+		if (rstart <= start && rend >= end) {
 			range_tree_adjust_fill(rt, rs, fill);
 			return;
 		}
 
-		avl_remove(&rt->rt_root, rs);
+		zfs_btree_remove(&rt->rt_root, rs);
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
 		range_tree_stat_decr(rt, rs);
-		rt->rt_space -= rs->rs_end - rs->rs_start;
+		rt->rt_space -= rend - rstart;
 
-		fill += rs->rs_fill;
-		start = MIN(start, rs->rs_start);
-		end = MAX(end, rs->rs_end);
+		fill += rs_get_fill(rs, rt);
+		start = MIN(start, rstart);
+		end = MAX(end, rend);
 		size = end - start;
 
 		range_tree_add_impl(rt, start, size, fill);
-
-		kmem_cache_free(range_seg_cache, rs);
 		return;
 	}
 
@@ -286,19 +322,21 @@
 	 * If gap != 0, we might need to merge with our neighbors even if we
 	 * aren't directly touching.
 	 */
-	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
-	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
+	zfs_btree_index_t where_before, where_after;
+	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
+	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);
 
-	merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
-	merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
+	merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >=
+	    start - gap);
+	merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end +
+	    gap);
 
 	if (merge_before && gap != 0)
-		bridge_size += start - rs_before->rs_end;
+		bridge_size += start - rs_get_end(rs_before, rt);
 	if (merge_after && gap != 0)
-		bridge_size += rs_after->rs_start - end;
+		bridge_size += rs_get_start(rs_after, rt) - end;
 
 	if (merge_before && merge_after) {
-		avl_remove(&rt->rt_root, rs_before);
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
@@ -307,9 +345,19 @@
 		range_tree_stat_decr(rt, rs_before);
 		range_tree_stat_decr(rt, rs_after);
 
-		rs_after->rs_fill += rs_before->rs_fill + fill;
-		rs_after->rs_start = rs_before->rs_start;
-		kmem_cache_free(range_seg_cache, rs_before);
+		rs_copy(rs_after, &tmp, rt);
+		uint64_t before_start = rs_get_start_raw(rs_before, rt);
+		uint64_t before_fill = rs_get_fill(rs_before, rt);
+		uint64_t after_fill = rs_get_fill(rs_after, rt);
+		zfs_btree_remove_from(&rt->rt_root, &where_before);
+
+		/*
+		 * We have to re-find the node because our old reference is
+		 * invalid as soon as we do any mutating btree operations.
+		 */
+		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
+		rs_set_start_raw(rs_after, rt, before_start);
+		rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
 		rs = rs_after;
 	} else if (merge_before) {
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
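
/*
 * Editor's note on the copy-and-re-find dance above: the B-tree stores
 * segments by value, so any mutating operation (zfs_btree_remove_from(),
 * zfs_btree_insert(), ...) may shift elements and invalidate pointers
 * previously returned by find/next/prev. Saving the segment into a
 * range_seg_max_t and re-finding it afterwards is the required idiom.
 */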
@@ -317,8 +365,9 @@
 
 		range_tree_stat_decr(rt, rs_before);
 
-		rs_before->rs_fill += fill;
-		rs_before->rs_end = end;
+		uint64_t before_fill = rs_get_fill(rs_before, rt);
+		rs_set_end(rs_before, rt, end);
+		rs_set_fill(rs_before, rt, before_fill + fill);
 		rs = rs_before;
 	} else if (merge_after) {
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
@@ -326,22 +375,26 @@
 
 		range_tree_stat_decr(rt, rs_after);
 
-		rs_after->rs_fill += fill;
-		rs_after->rs_start = start;
+		uint64_t after_fill = rs_get_fill(rs_after, rt);
+		rs_set_start(rs_after, rt, start);
+		rs_set_fill(rs_after, rt, after_fill + fill);
 		rs = rs_after;
 	} else {
-		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+		rs = &tmp;
 
-		rs->rs_fill = fill;
-		rs->rs_start = start;
-		rs->rs_end = end;
-		avl_insert(&rt->rt_root, rs, where);
+		rs_set_start(rs, rt, start);
+		rs_set_end(rs, rt, end);
+		rs_set_fill(rs, rt, fill);
+		zfs_btree_insert(&rt->rt_root, rs, &where);
 	}
 
-	if (gap != 0)
-		ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
-	else
-		ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
+	if (gap != 0) {
+		ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) -
+		    rs_get_start(rs, rt));
+	} else {
+		ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) -
+		    rs_get_start(rs, rt));
+	}
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
@@ -360,22 +413,25 @@
 range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
     boolean_t do_fill)
 {
-	avl_index_t where;
-	range_seg_t rsearch, *rs, *newseg;
+	zfs_btree_index_t where;
+	range_seg_t *rs;
+	range_seg_max_t rsearch, rs_tmp;
 	uint64_t end = start + size;
 	boolean_t left_over, right_over;
 
 	VERIFY3U(size, !=, 0);
 	VERIFY3U(size, <=, rt->rt_space);
+	if (rt->rt_type == RANGE_SEG64)
+		ASSERT3U(start + size, >, start);
 
-	rsearch.rs_start = start;
-	rsearch.rs_end = end;
-	rs = avl_find(&rt->rt_root, &rsearch, &where);
+	rs_set_start(&rsearch, rt, start);
+	rs_set_end(&rsearch, rt, end);
+	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
 
 	/* Make sure we completely overlap with someone */
 	if (rs == NULL) {
-		zfs_panic_recover("zfs: freeing free segment "
-		    "(offset=%llu size=%llu)",
+		zfs_panic_recover("zfs: removing nonexistent segment from "
+		    "range tree (offset=%llu size=%llu)",
 		    (longlong_t)start, (longlong_t)size);
 		return;
 	}
@@ -388,30 +444,32 @@
 	 */
 	if (rt->rt_gap != 0) {
 		if (do_fill) {
-			if (rs->rs_fill == size) {
-				start = rs->rs_start;
-				end = rs->rs_end;
+			if (rs_get_fill(rs, rt) == size) {
+				start = rs_get_start(rs, rt);
+				end = rs_get_end(rs, rt);
 				size = end - start;
 			} else {
 				range_tree_adjust_fill(rt, rs, -size);
 				return;
 			}
-		} else if (rs->rs_start != start || rs->rs_end != end) {
+		} else if (rs_get_start(rs, rt) != start ||
+		    rs_get_end(rs, rt) != end) {
 			zfs_panic_recover("zfs: freeing partial segment of "
 			    "gap tree (offset=%llu size=%llu) of "
 			    "(offset=%llu size=%llu)",
 			    (longlong_t)start, (longlong_t)size,
-			    (longlong_t)rs->rs_start,
-			    (longlong_t)rs->rs_end - rs->rs_start);
+			    (longlong_t)rs_get_start(rs, rt),
+			    (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs,
+			    rt));
 			return;
 		}
 	}
 
-	VERIFY3U(rs->rs_start, <=, start);
-	VERIFY3U(rs->rs_end, >=, end);
+	VERIFY3U(rs_get_start(rs, rt), <=, start);
+	VERIFY3U(rs_get_end(rs, rt), >=, end);
 
-	left_over = (rs->rs_start != start);
-	right_over = (rs->rs_end != end);
+	left_over = (rs_get_start(rs, rt) != start);
+	right_over = (rs_get_end(rs, rt) != end);
 
 	range_tree_stat_decr(rt, rs);
 
@@ -419,24 +477,33 @@
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
 	if (left_over && right_over) {
-		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
-		newseg->rs_start = end;
-		newseg->rs_end = rs->rs_end;
-		newseg->rs_fill = newseg->rs_end - newseg->rs_start;
-		range_tree_stat_incr(rt, newseg);
+		range_seg_max_t newseg;
+		rs_set_start(&newseg, rt, end);
+		rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt));
+		rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end);
+		range_tree_stat_incr(rt, &newseg);
 
-		rs->rs_end = start;
+		/* This modifies the buffer already inside the range tree. */
+		rs_set_end(rs, rt, start);
 
-		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
+		rs_copy(rs, &rs_tmp, rt);
+		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
+			zfs_btree_insert(&rt->rt_root, &newseg, &where);
+		else
+			zfs_btree_add(&rt->rt_root, &newseg);
+
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
-			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
+			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
 	} else if (left_over) {
-		rs->rs_end = start;
+	/* This modifies the buffer already inside the range tree */
+		rs_set_end(rs, rt, start);
+		rs_copy(rs, &rs_tmp, rt);
 	} else if (right_over) {
-		rs->rs_start = end;
+	/* This modifies the buffer already inside the range tree */
+		rs_set_start(rs, rt, end);
+		rs_copy(rs, &rs_tmp, rt);
 	} else {
-		avl_remove(&rt->rt_root, rs);
-		kmem_cache_free(range_seg_cache, rs);
+		zfs_btree_remove_from(&rt->rt_root, &where);
 		rs = NULL;
 	}
 
@@ -446,11 +513,12 @@
 		 * the size, since we do not support removing partial segments
 		 * of range trees with gaps.
 		 */
-		rs->rs_fill = rs->rs_end - rs->rs_start;
-		range_tree_stat_incr(rt, rs);
+		rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) -
+		    rs_get_start_raw(rs, rt));
+		range_tree_stat_incr(rt, &rs_tmp);
 
 		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
-			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
 	}
 
 	rt->rt_space -= size;
@@ -472,14 +540,14 @@
 range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
     uint64_t newstart, uint64_t newsize)
 {
-	int64_t delta = newsize - (rs->rs_end - rs->rs_start);
+	int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt));
 
 	range_tree_stat_decr(rt, rs);
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
-	rs->rs_start = newstart;
-	rs->rs_end = newstart + newsize;
+	rs_set_start(rs, rt, newstart);
+	rs_set_end(rs, rt, newstart + newsize);
 
 	range_tree_stat_incr(rt, rs);
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
@@ -491,22 +559,27 @@
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
-	range_seg_t rsearch;
+	range_seg_max_t rsearch;
 	uint64_t end = start + size;
 
 	VERIFY(size != 0);
 
-	rsearch.rs_start = start;
-	rsearch.rs_end = end;
-	return (avl_find(&rt->rt_root, &rsearch, NULL));
+	rs_set_start(&rsearch, rt, start);
+	rs_set_end(&rsearch, rt, end);
+	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
 }
 
 range_seg_t *
 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
 {
+	if (rt->rt_type == RANGE_SEG64)
+		ASSERT3U(start + size, >, start);
+
 	range_seg_t *rs = range_tree_find_impl(rt, start, size);
-	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
+	if (rs != NULL && rs_get_start(rs, rt) <= start &&
+	    rs_get_end(rs, rt) >= start + size) {
 		return (rs);
+	}
 	return (NULL);
 }
 
@@ -533,24 +606,28 @@
 range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
     uint64_t *ostart, uint64_t *osize)
 {
-	range_seg_t rsearch;
-	rsearch.rs_start = start;
-	rsearch.rs_end = start + 1;
+	if (rt->rt_type == RANGE_SEG64)
+		ASSERT3U(start + size, >, start);
 
-	avl_index_t where;
-	range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where);
+	range_seg_max_t rsearch;
+	rs_set_start(&rsearch, rt, start);
+	rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1);
+
+	zfs_btree_index_t where;
+	range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
 	if (rs != NULL) {
 		*ostart = start;
-		*osize = MIN(size, rs->rs_end - start);
+		*osize = MIN(size, rs_get_end(rs, rt) - start);
 		return (B_TRUE);
 	}
 
-	rs = avl_nearest(&rt->rt_root, where, AVL_AFTER);
-	if (rs == NULL || rs->rs_start > start + size)
+	rs = zfs_btree_next(&rt->rt_root, &where, &where);
+	if (rs == NULL || rs_get_start(rs, rt) > start + size)
 		return (B_FALSE);
 
-	*ostart = rs->rs_start;
-	*osize = MIN(start + size, rs->rs_end) - rs->rs_start;
+	*ostart = rs_get_start(rs, rt);
+	*osize = MIN(start + size, rs_get_end(rs, rt)) -
+	    rs_get_start(rs, rt);
 	return (B_TRUE);
 }
 
@@ -566,9 +643,12 @@
 	if (size == 0)
 		return;
 
+	if (rt->rt_type == RANGE_SEG64)
+		ASSERT3U(start + size, >, start);
+
 	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
-		uint64_t free_start = MAX(rs->rs_start, start);
-		uint64_t free_end = MIN(rs->rs_end, start + size);
+		uint64_t free_start = MAX(rs_get_start(rs, rt), start);
+		uint64_t free_end = MIN(rs_get_end(rs, rt), start + size);
 		range_tree_remove(rt, free_start, free_end - free_start);
 	}
 }
@@ -579,7 +659,7 @@
 	range_tree_t *rt;
 
 	ASSERT0(range_tree_space(*rtdst));
-	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
+	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));
 
 	rt = *rtsrc;
 	*rtsrc = *rtdst;
@@ -589,17 +669,21 @@
 void
 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
 {
-	range_seg_t *rs;
-	void *cookie = NULL;
-
 
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
 
-	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
-		if (func != NULL)
-			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
-		kmem_cache_free(range_seg_cache, rs);
+	if (func != NULL) {
+		range_seg_t *rs;
+		zfs_btree_index_t *cookie = NULL;
+
+		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
+		    NULL) {
+			func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+			    rs_get_start(rs, rt));
+		}
+	} else {
+		zfs_btree_clear(&rt->rt_root);
 	}
 
 	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
@@ -609,16 +693,18 @@
 void
 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
 {
-	for (range_seg_t *rs = avl_first(&rt->rt_root); rs;
-	    rs = AVL_NEXT(&rt->rt_root, rs)) {
-		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
+	zfs_btree_index_t where;
+	for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
+	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+		func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+		    rs_get_start(rs, rt));
 	}
 }
 
 range_seg_t *
 range_tree_first(range_tree_t *rt)
 {
-	return (avl_first(&rt->rt_root));
+	return (zfs_btree_first(&rt->rt_root, NULL));
 }
 
 uint64_t
@@ -630,52 +716,7 @@
 uint64_t
 range_tree_numsegs(range_tree_t *rt)
 {
-	return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root));
-}
-
-/* Generic range tree functions for maintaining segments in an AVL tree. */
-void
-rt_avl_create(range_tree_t *rt, void *arg)
-{
-	avl_tree_t *tree = arg;
-
-	avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
-	    offsetof(range_seg_t, rs_pp_node));
-}
-
-void
-rt_avl_destroy(range_tree_t *rt, void *arg)
-{
-	avl_tree_t *tree = arg;
-
-	ASSERT0(avl_numnodes(tree));
-	avl_destroy(tree);
-}
-
-void
-rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	avl_tree_t *tree = arg;
-	avl_add(tree, rs);
-}
-
-void
-rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	avl_tree_t *tree = arg;
-	avl_remove(tree, rs);
-}
-
-void
-rt_avl_vacate(range_tree_t *rt, void *arg)
-{
-	/*
-	 * Normally one would walk the tree freeing nodes along the way.
-	 * Since the nodes are shared with the range trees we can avoid
-	 * walking all nodes and just reinitialize the avl tree. The nodes
-	 * will be freed by the range tree, so we don't want to free them here.
-	 */
-	rt_avl_create(rt, arg);
+	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
 }
 
 boolean_t
@@ -685,26 +726,76 @@
 	return (range_tree_space(rt) == 0);
 }
 
-uint64_t
-range_tree_min(range_tree_t *rt)
+/* ARGSUSED */
+void
+rt_btree_create(range_tree_t *rt, void *arg)
 {
-	range_seg_t *rs = avl_first(&rt->rt_root);
-	return (rs != NULL ? rs->rs_start : 0);
+	zfs_btree_t *size_tree = arg;
+
+	size_t size;
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		size = sizeof (range_seg32_t);
+		break;
+	case RANGE_SEG64:
+		size = sizeof (range_seg64_t);
+		break;
+	case RANGE_SEG_GAP:
+		size = sizeof (range_seg_gap_t);
+		break;
+	default:
+		panic("Invalid range seg type %d", rt->rt_type);
+	}
+	zfs_btree_create(size_tree, rt->rt_btree_compare, size);
+}
+
+/* ARGSUSED */
+void
+rt_btree_destroy(range_tree_t *rt, void *arg)
+{
+	zfs_btree_t *size_tree = arg;
+	ASSERT0(zfs_btree_numnodes(size_tree));
+
+	zfs_btree_destroy(size_tree);
 }
 
-uint64_t
-range_tree_max(range_tree_t *rt)
+/* ARGSUSED */
+void
+rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-	range_seg_t *rs = avl_last(&rt->rt_root);
-	return (rs != NULL ? rs->rs_end : 0);
+	zfs_btree_t *size_tree = arg;
+
+	zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+void
+rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	zfs_btree_t *size_tree = arg;
+
+	zfs_btree_remove(size_tree, rs);
 }
 
-uint64_t
-range_tree_span(range_tree_t *rt)
+/* ARGSUSED */
+void
+rt_btree_vacate(range_tree_t *rt, void *arg)
 {
-	return (range_tree_max(rt) - range_tree_min(rt));
+	zfs_btree_t *size_tree = arg;
+	zfs_btree_clear(size_tree);
+	zfs_btree_destroy(size_tree);
+
+	rt_btree_create(rt, arg);
 }
 
+range_tree_ops_t rt_btree_ops = {
+	.rtop_create = rt_btree_create,
+	.rtop_destroy = rt_btree_destroy,
+	.rtop_add = rt_btree_add,
+	.rtop_remove = rt_btree_remove,
+	.rtop_vacate = rt_btree_vacate
+};
+
 /*
  * Remove any overlapping ranges between the given segment [start, end)
  * from removefrom. Add non-overlapping leftovers to addto.
@@ -713,42 +804,62 @@
 range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
     range_tree_t *removefrom, range_tree_t *addto)
 {
-	avl_index_t where;
-	range_seg_t starting_rs = {
-		.rs_start = start,
-		.rs_end = start + 1
-	};
+	zfs_btree_index_t where;
+	range_seg_max_t starting_rs;
+	rs_set_start(&starting_rs, removefrom, start);
+	rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs,
+	    removefrom) + 1);
 
-	range_seg_t *curr = avl_find(&removefrom->rt_root,
+	range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
 	    &starting_rs, &where);
 
 	if (curr == NULL)
-		curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER);
+		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);
 
 	range_seg_t *next;
 	for (; curr != NULL; curr = next) {
-		next = AVL_NEXT(&removefrom->rt_root, curr);
-
 		if (start == end)
 			return;
 		VERIFY3U(start, <, end);
 
 		/* there is no overlap */
-		if (end <= curr->rs_start) {
+		if (end <= rs_get_start(curr, removefrom)) {
 			range_tree_add(addto, start, end - start);
 			return;
 		}
 
-		uint64_t overlap_start = MAX(curr->rs_start, start);
-		uint64_t overlap_end = MIN(curr->rs_end, end);
+		uint64_t overlap_start = MAX(rs_get_start(curr, removefrom),
+		    start);
+		uint64_t overlap_end = MIN(rs_get_end(curr, removefrom),
+		    end);
 		uint64_t overlap_size = overlap_end - overlap_start;
 		ASSERT3S(overlap_size, >, 0);
+		range_seg_max_t rs;
+		rs_copy(curr, &rs, removefrom);
+
 		range_tree_remove(removefrom, overlap_start, overlap_size);
 
 		if (start < overlap_start)
 			range_tree_add(addto, start, overlap_start - start);
 
 		start = overlap_end;
+		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
+		/*
+		 * If we find something here, we only removed part of the
+		 * curr segment. Either there's some left at the end
+		 * because we've reached the end of the range we're removing,
+		 * or there's some left at the start because we started
+		 * partway through the range.  Either way, we continue with
+		 * the loop. If it's the former, we'll return at the start of
+		 * the loop, and if it's the latter we'll see if there is more
+		 * area to process.
+		 */
+		if (next != NULL) {
+			ASSERT(start == end || start == rs_get_end(&rs,
+			    removefrom));
+		}
+
+		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
 	}
 	VERIFY3P(curr, ==, NULL);
 
@@ -768,9 +879,30 @@
 range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
     range_tree_t *addto)
 {
-	for (range_seg_t *rs = avl_first(&rt->rt_root); rs;
-	    rs = AVL_NEXT(&rt->rt_root, rs)) {
-		range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end,
-		    removefrom, addto);
+	zfs_btree_index_t where;
+	for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
+	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+		range_tree_remove_xor_add_segment(rs_get_start(rs, rt),
+		    rs_get_end(rs, rt), removefrom, addto);
 	}
 }
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+	range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
+	return (rs != NULL ? rs_get_start(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+	range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
+	return (rs != NULL ? rs_get_end(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+	return (range_tree_max(rt) - range_tree_min(rt));
+}
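
For reference, a minimal standalone sketch of the four cases that
range_tree_remove_impl() above distinguishes (split, trim-left, trim-right,
whole-segment removal). Note the snapshot into rs_tmp in the real code: since
zfs_btree_insert() may relocate tree-owned buffers, the modified segment is
copied out before the insert and the copy is used afterward. The names and
printf reporting below are invented for illustration only.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Remove [start, end) from an existing segment [seg_start, seg_end)
	 * that fully contains it, and report what survives.
	 */
	static void
	remove_cases(uint64_t seg_start, uint64_t seg_end, uint64_t start,
	    uint64_t end)
	{
		assert(seg_start <= start && end <= seg_end && start < end);

		int left_over = (seg_start != start);
		int right_over = (seg_end != end);

		if (left_over && right_over) {
			/* Middle removed: the segment splits in two. */
			printf("keep [%llu, %llu) and [%llu, %llu)\n",
			    (unsigned long long)seg_start,
			    (unsigned long long)start,
			    (unsigned long long)end,
			    (unsigned long long)seg_end);
		} else if (left_over) {
			/* Right part removed: trim the segment's end. */
			printf("keep [%llu, %llu)\n",
			    (unsigned long long)seg_start,
			    (unsigned long long)start);
		} else if (right_over) {
			/* Left part removed: trim the segment's start. */
			printf("keep [%llu, %llu)\n",
			    (unsigned long long)end,
			    (unsigned long long)seg_end);
		} else {
			/* Exact match: the whole segment goes away. */
			printf("segment removed entirely\n");
		}
	}

	int
	main(void)
	{
		remove_cases(100, 200, 130, 170);	/* split */
		remove_cases(100, 200, 100, 200);	/* delete */
		return (0);
	}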
--- a/usr/src/uts/common/fs/zfs/sa.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sa.c	Mon Dec 09 14:15:34 2019 +0000
@@ -252,7 +252,7 @@
 	const sa_lot_t *node1 = (const sa_lot_t *)arg1;
 	const sa_lot_t *node2 = (const sa_lot_t *)arg2;
 
-	return (AVL_CMP(node1->lot_num, node2->lot_num));
+	return (TREE_CMP(node1->lot_num, node2->lot_num));
 }
 
 static int
@@ -261,11 +261,11 @@
 	const sa_lot_t *node1 = (const sa_lot_t *)arg1;
 	const sa_lot_t *node2 = (const sa_lot_t *)arg2;
 
-	int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash);
+	int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash);
 	if (likely(cmp))
 		return (cmp);
 
-	return (AVL_CMP(node1->lot_instance, node2->lot_instance));
+	return (TREE_CMP(node1->lot_instance, node2->lot_instance));
 }
 
 boolean_t
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Dec 09 14:15:34 2019 +0000
@@ -917,7 +917,7 @@
 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
 	    sizeof (zbookmark_phys_t));
 
-	return (AVL_ISIGN(ret));
+	return (TREE_ISIGN(ret));
 }
 
 /*
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Dec 09 14:15:34 2019 +0000
@@ -57,6 +57,7 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include "zfs_prop.h"
+#include <sys/btree.h>
 #include <sys/zfeature.h>
 
 /*
@@ -601,7 +602,7 @@
 	const spa_log_sm_t *a = va;
 	const spa_log_sm_t *b = vb;
 
-	return (AVL_CMP(a->sls_txg, b->sls_txg));
+	return (TREE_CMP(a->sls_txg, b->sls_txg));
 }
 
 /*
@@ -943,7 +944,7 @@
 	const spa_aux_t *sa = (const spa_aux_t *)a;
 	const spa_aux_t *sb = (const spa_aux_t *)b;
 
-	return (AVL_CMP(sa->aux_guid, sb->aux_guid));
+	return (TREE_CMP(sa->aux_guid, sb->aux_guid));
 }
 
 void
@@ -2058,7 +2059,7 @@
 
 	s = strcmp(s1->spa_name, s2->spa_name);
 
-	return (AVL_ISIGN(s));
+	return (TREE_ISIGN(s));
 }
 
 int
@@ -2108,8 +2109,8 @@
 
 	zfs_refcount_init();
 	unique_init();
-	range_tree_init();
-	metaslab_alloc_trace_init();
+	zfs_btree_init();
+	metaslab_stat_init();
 	zio_init();
 	dmu_init();
 	zil_init();
@@ -2135,8 +2136,8 @@
 	zil_fini();
 	dmu_fini();
 	zio_fini();
-	metaslab_alloc_trace_fini();
-	range_tree_fini();
+	metaslab_stat_fini();
+	zfs_btree_fini();
 	unique_fini();
 	zfs_refcount_fini();
 	scan_fini();
--- a/usr/src/uts/common/fs/zfs/space_map.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/space_map.c	Mon Dec 09 14:15:34 2019 +0000
@@ -525,8 +525,9 @@
  * dbuf must be dirty for the changes in sm_phys to take effect.
  */
 static void
-space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
-    uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
+space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
+    maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
+    void *tag, dmu_tx_t *tx)
 {
 	ASSERT3U(words, !=, 0);
 	ASSERT3U(words, <=, 2);
@@ -550,14 +551,14 @@
 
 	ASSERT3P(block_cursor, <=, block_end);
 
-	uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
-	uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+	uint64_t size = (rend - rstart) >> sm->sm_shift;
+	uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
 	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
 
-	ASSERT3U(rs->rs_start, >=, sm->sm_start);
-	ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
-	ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
-	ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
+	ASSERT3U(rstart, >=, sm->sm_start);
+	ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
+	ASSERT3U(rend - rstart, <=, sm->sm_size);
+	ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);
 
 	while (size != 0) {
 		ASSERT3P(block_cursor, <=, block_end);
@@ -675,10 +676,14 @@
 
 	dmu_buf_will_dirty(db, tx);
 
-	avl_tree_t *t = &rt->rt_root;
-	for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
-		uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
-		uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+	zfs_btree_t *t = &rt->rt_root;
+	zfs_btree_index_t where;
+	for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
+	    rs = zfs_btree_next(t, &where, &where)) {
+		uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >>
+		    sm->sm_shift;
+		uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >>
+		    sm->sm_shift;
 		uint8_t words = 1;
 
 		/*
@@ -703,8 +708,8 @@
 		    spa_get_random(100) == 0)))
 			words = 2;
 
-		space_map_write_seg(sm, rs, maptype, vdev_id, words,
-		    &db, FTAG, tx);
+		space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs,
+		    rt), maptype, vdev_id, words, &db, FTAG, tx);
 	}
 
 	dmu_buf_rele(db, FTAG);
@@ -753,7 +758,7 @@
 	else
 		sm->sm_phys->smp_alloc -= range_tree_space(rt);
 
-	uint64_t nodes = avl_numnodes(&rt->rt_root);
+	uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
 	uint64_t rt_space = range_tree_space(rt);
 
 	space_map_write_impl(sm, rt, maptype, vdev_id, tx);
@@ -762,7 +767,7 @@
 	 * Ensure that the space_map's accounting wasn't changed
 	 * while we were in the middle of writing it out.
 	 */
-	VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
+	VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
 	VERIFY3U(range_tree_space(rt), ==, rt_space);
 }
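
A quick numeric check of the unit conversion space_map_write_seg() above now
performs on its raw byte bounds; sm_start = 0 and sm_shift = 9 (512-byte
units) are example values only.

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t sm_start = 0, sm_shift = 9;
		uint64_t rstart = 0x200000, rend = 0x280000;

		/* Offsets and lengths are stored in 1 << sm_shift units. */
		uint64_t start = (rstart - sm_start) >> sm_shift;
		uint64_t size = (rend - rstart) >> sm_shift;

		assert(start == 0x1000);	/* 2 MiB / 512 */
		assert(size == 0x400);		/* 512 KiB / 512 */
		return (0);
	}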
 
--- a/usr/src/uts/common/fs/zfs/space_reftree.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/space_reftree.c	Mon Dec 09 14:15:34 2019 +0000
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -57,11 +57,11 @@
 	const space_ref_t *sr1 = (const space_ref_t *)x1;
 	const space_ref_t *sr2 = (const space_ref_t *)x2;
 
-	int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset);
+	int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset);
 	if (likely(cmp))
 		return (cmp);
 
-	return (AVL_PCMP(sr1, sr2));
+	return (TREE_PCMP(sr1, sr2));
 }
 
 void
@@ -109,10 +109,13 @@
 void
 space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
 {
-	range_seg_t *rs;
+	zfs_btree_index_t where;
 
-	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
-		space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
+	for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs =
+	    zfs_btree_next(&rt->rt_root, &where, &where)) {
+		space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs,
+		    rt), refcnt);
+	}
 }
 
 /*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/bitops.h	Mon Dec 09 14:15:34 2019 +0000
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#ifndef _SYS_BITOPS_H
+#define	_SYS_BITOPS_H
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
+#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
+#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
+#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))
+
+#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
+#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
+
+#define	BF32_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1U << (len)); \
+	ASSERT3U(low + len, <=, 32); \
+	(x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BF64_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1ULL << (len)); \
+	ASSERT3U(low + len, <=, 64); \
+	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BF32_GET_SB(x, low, len, shift, bias)	\
+	((BF32_GET(x, low, len) + (bias)) << (shift))
+#define	BF64_GET_SB(x, low, len, shift, bias)	\
+	((BF64_GET(x, low, len) + (bias)) << (shift))
+
+/*
+ * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in
+ * the case where val is a constant.  We can't fix ASSERT because it's used as
+ * an expression in several places in the kernel; as a result, changing it to
+ * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option.
+ */
+#define	BF32_SET_SB(x, low, len, shift, bias, val) do { \
+	ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \
+	ASSERT3S((val) >> (shift), >=, bias); \
+	BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+#define	BF64_SET_SB(x, low, len, shift, bias, val) do { \
+	ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \
+	ASSERT3S((val) >> (shift), >=, bias); \
+	BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_BITOPS_H */
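
A self-contained illustration of the bitfield macros above, with P2PHASE
expanded to its usual power-of-two form; the field layout chosen here is
arbitrary. Note how the XOR form of BF64_SET overwrites a nonzero field in
place without disturbing its neighbors.

	#include <assert.h>
	#include <stdint.h>

	#define	P2PHASE(x, align)	((x) & ((align) - 1))
	#define	BF64_DECODE(x, low, len)	\
		P2PHASE((x) >> (low), 1ULL << (len))
	#define	BF64_ENCODE(x, low, len)	\
		(P2PHASE((x), 1ULL << (len)) << (low))

	int
	main(void)
	{
		uint64_t word = 0;

		/* Pack 0x2a into bits [8, 24) and 0x5 into bits [24, 28). */
		word |= BF64_ENCODE(0x2aULL, 8, 16);
		word |= BF64_ENCODE(0x5ULL, 24, 4);

		assert(BF64_DECODE(word, 8, 16) == 0x2a);
		assert(BF64_DECODE(word, 24, 4) == 0x5);

		/* The XOR trick used by BF64_SET: field ^ field ^ val = val. */
		word ^= BF64_ENCODE((word >> 8) ^ 0x7ULL, 8, 16);
		assert(BF64_DECODE(word, 8, 16) == 0x7);
		assert(BF64_DECODE(word, 24, 4) == 0x5);	/* untouched */
		return (0);
	}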
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/btree.h	Mon Dec 09 14:15:34 2019 +0000
@@ -0,0 +1,236 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef	_BTREE_H
+#define	_BTREE_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include	<sys/zfs_context.h>
+
+/*
+ * This file defines the interface for a B-Tree implementation for ZFS. The
+ * tree can be used to store arbitrary sortable data types with low overhead
+ * and good operation performance. In addition the tree intelligently
+ * optimizes bulk in-order insertions to improve memory use and performance.
+ *
+ * Note that for all B-Tree functions, the values returned are pointers to the
+ * internal copies of the data in the tree. The internal data can only be
+ * safely mutated if the change does not alter the ordering of the element
+ * with respect to any other elements in the tree.
+ *
+ * The major drawback of the B-Tree is that any returned elements or indexes
+ * are only valid until a side-effectful operation occurs, since these can
+ * result in reallocation or relocation of data. Side effectful operations are
+ * defined as insertion, removal, and zfs_btree_destroy_nodes.
+ *
+ * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core
+ * nodes have an array of children pointing to other nodes, and an array of
+ * elements that act as separators between the elements of the subtrees rooted
+ * at its children. Leaf nodes only contain data elements, and form the bottom
+ * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the
+ * elements in the core nodes are not copies of or references to leaf node
+ * elements.  Each element occurs only once in the tree, no matter what kind
+ * of node it is in.
+ *
+ * The tree's height is the same throughout, unlike many other forms of search
+ * tree. Each node (except for the root) must be between half minus one and
+ * completely full of elements (and children) at all times. Any operation that
+ * would put the node outside of that range results in a rebalancing operation
+ * (taking, merging, or splitting).
+ *
+ * This tree was implemented using descriptions from Wikipedia's articles on
+ * B-Trees and B+ Trees.
+ */
+
+/*
+ * Decreasing these values results in smaller memmove operations, but more of
+ * them, and increased memory overhead. Increasing these values results in
+ * higher variance in operation time, and reduces memory overhead.
+ */
+#define	BTREE_CORE_ELEMS	128
+#define	BTREE_LEAF_SIZE		4096
+
+typedef struct zfs_btree_hdr {
+	struct zfs_btree_core	*bth_parent;
+	boolean_t		bth_core;
+	/*
+	 * For both leaf and core nodes, represents the number of elements in
+	 * the node. For core nodes, they will have bth_count + 1 children.
+	 * the node. A core node has bth_count + 1 children.
+	uint32_t		bth_count;
+} zfs_btree_hdr_t;
+
+typedef struct zfs_btree_core {
+	zfs_btree_hdr_t	btc_hdr;
+	zfs_btree_hdr_t	*btc_children[BTREE_CORE_ELEMS + 1];
+	uint8_t		btc_elems[];
+} zfs_btree_core_t;
+
+typedef struct zfs_btree_leaf {
+	zfs_btree_hdr_t	btl_hdr;
+	uint8_t		btl_elems[];
+} zfs_btree_leaf_t;
+
+typedef struct zfs_btree_index {
+	zfs_btree_hdr_t	*bti_node;
+	uint64_t	bti_offset;
+	/*
+	 * True if the location is before the listed offset, false if it is
+	 * at the listed offset.
+	 */
+	boolean_t	bti_before;
+} zfs_btree_index_t;
+
+typedef struct btree {
+	zfs_btree_hdr_t		*bt_root;
+	int64_t			bt_height;
+	size_t			bt_elem_size;
+	uint64_t		bt_num_elems;
+	uint64_t		bt_num_nodes;
+	zfs_btree_leaf_t	*bt_bulk; /* non-null if bulk loading */
+	int (*bt_compar) (const void *, const void *);
+} zfs_btree_t;
+
+/*
+ * Allocate and deallocate caches for btree nodes.
+ */
+void zfs_btree_init(void);
+void zfs_btree_fini(void);
+
+/*
+ * Initialize a B-Tree. Arguments are:
+ *
+ * tree   - the tree to be initialized
+ * compar - function to compare two nodes; it must return exactly -1, 0,
+ *          or +1: -1 for <, 0 for ==, and +1 for >
+ * size   - the value of sizeof(struct my_type)
+ */
+void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
+    size_t);
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * found. If not found, it returns NULL and then if "where" is not NULL it sets
+ * "where" for use with zfs_btree_insert() or zfs_btree_nearest().
+ *
+ * node   - node that has the value being looked for
+ * where  - position for use with zfs_btree_nearest() or zfs_btree_insert(),
+ *          may be NULL
+ */
+void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node   - the node to insert
+ * where  - position as returned from zfs_btree_find()
+ */
+void zfs_btree_insert(zfs_btree_t *, const void *, const zfs_btree_index_t *);
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL
+ * if the tree is empty.
+ */
+void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *);
+void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the next or previous valued node in the tree.
+ */
+void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *,
+    zfs_btree_index_t *);
+void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *,
+    zfs_btree_index_t *);
+
+/*
+ * Get a value from a tree and an index.
+ */
+void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Add a single value to the tree. The value must not compare equal to any
+ * other node already in the tree.
+ */
+void zfs_btree_add(zfs_btree_t *, const void *);
+
+/*
+ * Remove a single value from the tree.  The value must be in the tree. The
+ * pointer passed in may be a pointer into a tree-controlled buffer, but it
+ * need not be.
+ */
+void zfs_btree_remove(zfs_btree_t *, const void *);
+
+/*
+ * Remove the value at the given location from the tree.
+ */
+void zfs_btree_remove_from(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the number of nodes in the tree
+ */
+ulong_t zfs_btree_numnodes(zfs_btree_t *);
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call zfs_btree_destroy_nodes(), you can only continue calling it
+ * and finally zfs_btree_destroy(). No other B-Tree routines will be valid.
+ *
+ * cookie - an index used to save state between calls to
+ * zfs_btree_destroy_nodes()
+ *
+ * EXAMPLE:
+ *	zfs_btree_t *tree;
+ *	struct my_data *node;
+ *	zfs_btree_index_t *cookie;
+ *
+ *	cookie = NULL;
+ *	while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ *		data_destroy(node);
+ *	zfs_btree_destroy(tree);
+ */
+void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **);
+
+/*
+ * Destroys all nodes in the tree quickly. This doesn't give the caller an
+ * opportunity to iterate over each node and do its own cleanup; for that, use
+ * zfs_btree_destroy_nodes().
+ */
+void zfs_btree_clear(zfs_btree_t *);
+
+/*
+ * Final destroy of a B-Tree. Arguments are:
+ *
+ * tree   - the empty tree to destroy
+ */
+void zfs_btree_destroy(zfs_btree_t *tree);
+
+/* Runs a variety of self-checks on the btree to verify integrity. */
+void zfs_btree_verify(zfs_btree_t *tree);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _BTREE_H */
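
A sketch of typical use of this interface, relying only on the declarations
above plus TREE_CMP from zfs_context.h; my_elem_t and its comparator are
invented for illustration, and this is not a standalone program since the
zfs_btree implementation lives in the ZFS sources.

	typedef struct my_elem {
		uint64_t me_key;
	} my_elem_t;

	static int
	my_elem_compare(const void *a, const void *b)
	{
		const my_elem_t *ma = a, *mb = b;

		return (TREE_CMP(ma->me_key, mb->me_key));
	}

	static void
	my_btree_demo(void)
	{
		zfs_btree_t bt;
		zfs_btree_index_t where;

		zfs_btree_create(&bt, my_elem_compare, sizeof (my_elem_t));

		/* The tree copies the element; the local can go away. */
		my_elem_t e = { .me_key = 42 };
		zfs_btree_add(&bt, &e);

		/* Returned pointers reference tree-owned storage. */
		my_elem_t *found = zfs_btree_find(&bt, &e, &where);
		ASSERT3P(found, !=, NULL);

		/* In-order walk; indexes go stale after insert/remove. */
		for (my_elem_t *cur = zfs_btree_first(&bt, &where);
		    cur != NULL; cur = zfs_btree_next(&bt, &where, &where))
			;

		zfs_btree_clear(&bt);
		zfs_btree_destroy(&bt);
	}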
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Mon Dec 09 14:15:34 2019 +0000
@@ -93,8 +93,8 @@
 int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
 void metaslab_check_free(spa_t *, const blkptr_t *);
 
-void metaslab_alloc_trace_init(void);
-void metaslab_alloc_trace_fini(void);
+void metaslab_stat_init(void);
+void metaslab_stat_fini(void);
 void metaslab_trace_init(zio_alloc_list_t *);
 void metaslab_trace_fini(zio_alloc_list_t *);
 
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Mon Dec 09 14:15:34 2019 +0000
@@ -517,8 +517,8 @@
 	 * only difference is that the ms_allocatable_by_size is ordered by
 	 * segment sizes.
 	 */
-	avl_tree_t	ms_allocatable_by_size;
-	avl_tree_t	ms_unflushed_frees_by_size;
+	zfs_btree_t		ms_allocatable_by_size;
+	zfs_btree_t		ms_unflushed_frees_by_size;
 	uint64_t	ms_lbas[MAX_LBAS];
 
 	metaslab_group_t *ms_group;	/* metaslab group		*/
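
The two by-size trees above are the rt_arg secondary b-trees maintained
through rt_btree_ops (declared in sys/range_tree.h below): the range tree
hands every segment to rtop_add/rtop_remove, which mirror it into a tree
ordered by segment length. A sketch of the wiring, using a simplified
stand-in for the real metaslab size comparator; the my_* names are invented.

	static int
	my_size_compare(const void *a, const void *b)
	{
		const range_seg32_t *ra = a, *rb = b;
		uint32_t sa = ra->rs_end - ra->rs_start;
		uint32_t sb = rb->rs_end - rb->rs_start;

		int cmp = TREE_CMP(sa, sb);
		if (cmp != 0)
			return (cmp);
		/* Tie-break on start so equal-sized segments stay distinct. */
		return (TREE_CMP(ra->rs_start, rb->rs_start));
	}

	/* Attach size_tree as the by-size mirror of a new 32-bit range tree. */
	static range_tree_t *
	my_create_with_size_tree(zfs_btree_t *size_tree, uint64_t start,
	    uint64_t shift)
	{
		return (range_tree_create_impl(&rt_btree_ops, RANGE_SEG32,
		    size_tree, start, shift, my_size_compare, 0));
	}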
--- a/usr/src/uts/common/fs/zfs/sys/range_tree.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h	Mon Dec 09 14:15:34 2019 +0000
@@ -30,7 +30,7 @@
 #ifndef _SYS_RANGE_TREE_H
 #define	_SYS_RANGE_TREE_H
 
-#include <sys/avl.h>
+#include <sys/btree.h>
 #include <sys/dmu.h>
 
 #ifdef	__cplusplus
@@ -41,19 +41,35 @@
 
 typedef struct range_tree_ops range_tree_ops_t;
 
+typedef enum range_seg_type {
+	RANGE_SEG32,
+	RANGE_SEG64,
+	RANGE_SEG_GAP,
+	RANGE_SEG_NUM_TYPES,
+} range_seg_type_t;
+
 /*
  * Note: the range_tree may not be accessed concurrently; consumers
  * must provide external locking if required.
  */
 typedef struct range_tree {
-	avl_tree_t	rt_root;	/* offset-ordered segment AVL tree */
+	zfs_btree_t	rt_root;	/* offset-ordered segment b-tree */
 	uint64_t	rt_space;	/* sum of all segments in the map */
-	uint64_t	rt_gap;		/* allowable inter-segment gap */
+	range_seg_type_t rt_type;	/* type of range_seg_t in use */
+	/*
+	 * All data that is stored in the range tree must have a start higher
+	 * than or equal to rt_start, and all sizes and offsets must be
+	 * multiples of 1 << rt_shift.
+	 */
+	uint8_t		rt_shift;
+	uint64_t	rt_start;
 	range_tree_ops_t *rt_ops;
-	void		*rt_arg;
 
-	/* rt_avl_compare should only be set it rt_arg is an AVL tree */
-	int (*rt_avl_compare)(const void *, const void *);
+	/* rt_btree_compare should only be set if rt_arg is a b-tree */
+	void		*rt_arg;
+	int (*rt_btree_compare)(const void *, const void *);
+
+	uint64_t	rt_gap;		/* allowable inter-segment gap */
 
 	/*
 	 * The rt_histogram maintains a histogram of ranges. Each bucket,
@@ -63,36 +79,217 @@
 	uint64_t	rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 } range_tree_t;
 
-typedef struct range_seg {
-	avl_node_t	rs_node;	/* AVL node */
-	avl_node_t	rs_pp_node;	/* AVL picker-private node */
+typedef struct range_seg32 {
+	uint32_t	rs_start;	/* starting offset of this segment */
+	uint32_t	rs_end;		/* ending offset (non-inclusive) */
+} range_seg32_t;
+
+/*
+ * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may
+ * require 64-bit integers for ranges.
+ */
+typedef struct range_seg64 {
+	uint64_t	rs_start;	/* starting offset of this segment */
+	uint64_t	rs_end;		/* ending offset (non-inclusive) */
+} range_seg64_t;
+
+typedef struct range_seg_gap {
 	uint64_t	rs_start;	/* starting offset of this segment */
 	uint64_t	rs_end;		/* ending offset (non-inclusive) */
 	uint64_t	rs_fill;	/* actual fill if gap mode is on */
-} range_seg_t;
+} range_seg_gap_t;
+
+/*
+ * This type needs to be the largest of the range segs, since it will be stack
+ * allocated and then cast the actual type to do tree operations.
+ */
+typedef range_seg_gap_t range_seg_max_t;
+
+/*
+ * This is just for code clarity, making it explicit that a
+ * pointer is to a range seg of some type; when we need to do the actual math,
+ * we'll figure out the real type.
+ */
+typedef void range_seg_t;
 
 struct range_tree_ops {
 	void    (*rtop_create)(range_tree_t *rt, void *arg);
 	void    (*rtop_destroy)(range_tree_t *rt, void *arg);
-	void	(*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
-	void    (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
+	void	(*rtop_add)(range_tree_t *rt, void *rs, void *arg);
+	void    (*rtop_remove)(range_tree_t *rt, void *rs, void *arg);
 	void	(*rtop_vacate)(range_tree_t *rt, void *arg);
 };
 
+static inline uint64_t
+rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+	ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		return (((range_seg32_t *)rs)->rs_start);
+	case RANGE_SEG64:
+		return (((range_seg64_t *)rs)->rs_start);
+	case RANGE_SEG_GAP:
+		return (((range_seg_gap_t *)rs)->rs_start);
+	default:
+		VERIFY(0);
+		return (0);
+	}
+}
+
+static inline uint64_t
+rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+	ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		return (((range_seg32_t *)rs)->rs_end);
+	case RANGE_SEG64:
+		return (((range_seg64_t *)rs)->rs_end);
+	case RANGE_SEG_GAP:
+		return (((range_seg_gap_t *)rs)->rs_end);
+	default:
+		VERIFY(0);
+		return (0);
+	}
+}
+
+static inline uint64_t
+rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt)
+{
+	ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+	switch (rt->rt_type) {
+	case RANGE_SEG32: {
+		const range_seg32_t *r32 = rs;
+		return (r32->rs_end - r32->rs_start);
+	}
+	case RANGE_SEG64: {
+		const range_seg64_t *r64 = rs;
+		return (r64->rs_end - r64->rs_start);
+	}
+	case RANGE_SEG_GAP:
+		return (((range_seg_gap_t *)rs)->rs_fill);
+	default:
+		VERIFY(0);
+		return (0);
+	}
+}
+
+static inline uint64_t
+rs_get_start(const range_seg_t *rs, const range_tree_t *rt)
+{
+	return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
+}
+
+static inline uint64_t
+rs_get_end(const range_seg_t *rs, const range_tree_t *rt)
+{
+	return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start);
+}
+
+static inline uint64_t
+rs_get_fill(const range_seg_t *rs, const range_tree_t *rt)
+{
+	return (rs_get_fill_raw(rs, rt) << rt->rt_shift);
+}
+
+static inline void
+rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start)
+{
+	ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		ASSERT3U(start, <=, UINT32_MAX);
+		((range_seg32_t *)rs)->rs_start = (uint32_t)start;
+		break;
+	case RANGE_SEG64:
+		((range_seg64_t *)rs)->rs_start = start;
+		break;
+	case RANGE_SEG_GAP:
+		((range_seg_gap_t *)rs)->rs_start = start;
+		break;
+	default:
+		VERIFY(0);
+	}
+}
+
+static inline void
+rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end)
+{
+	ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		ASSERT3U(end, <=, UINT32_MAX);
+		((range_seg32_t *)rs)->rs_end = (uint32_t)end;
+		break;
+	case RANGE_SEG64:
+		((range_seg64_t *)rs)->rs_end = end;
+		break;
+	case RANGE_SEG_GAP:
+		((range_seg_gap_t *)rs)->rs_end = end;
+		break;
+	default:
+		VERIFY(0);
+	}
+}
+
+static inline void
+rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
+{
+	ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+	switch (rt->rt_type) {
+	case RANGE_SEG32:
+		/* fall through */
+	case RANGE_SEG64:
+		ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs,
+		    rt));
+		break;
+	case RANGE_SEG_GAP:
+		((range_seg_gap_t *)rs)->rs_fill = fill;
+		break;
+	default:
+		VERIFY(0);
+	}
+}
+
+static inline void
+rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start)
+{
+	ASSERT3U(start, >=, rt->rt_start);
+	ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift));
+	rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift);
+}
+
+static inline void
+rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end)
+{
+	ASSERT3U(end, >=, rt->rt_start);
+	ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift));
+	rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift);
+}
+
+static inline void
+rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
+{
+	ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift));
+	rs_set_fill_raw(rs, rt, fill >> rt->rt_shift);
+}
+
 typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
 
-void range_tree_init(void);
-void range_tree_fini(void);
-range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
-    int (*avl_compare)(const void*, const void*), uint64_t gap);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops,
+    range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    int (*zfs_btree_compare) (const void *, const void *), uint64_t gap);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+    void *arg, uint64_t start, uint64_t shift);
 void range_tree_destroy(range_tree_t *rt);
 boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
 boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
     uint64_t *ostart, uint64_t *osize);
 void range_tree_verify_not_present(range_tree_t *rt,
     uint64_t start, uint64_t size);
-range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
     uint64_t newstart, uint64_t newsize);
 uint64_t range_tree_space(range_tree_t *rt);
@@ -119,19 +316,12 @@
 void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
     range_tree_t *addto);
 
-void rt_avl_create(range_tree_t *rt, void *arg);
-void rt_avl_destroy(range_tree_t *rt, void *arg);
-void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_vacate(range_tree_t *rt, void *arg);
-extern struct range_tree_ops rt_avl_ops;
-
-void rt_avl_create(range_tree_t *rt, void *arg);
-void rt_avl_destroy(range_tree_t *rt, void *arg);
-void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_avl_vacate(range_tree_t *rt, void *arg);
-extern struct range_tree_ops rt_avl_ops;
+void rt_btree_create(range_tree_t *rt, void *arg);
+void rt_btree_destroy(range_tree_t *rt, void *arg);
+void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_btree_vacate(range_tree_t *rt, void *arg);
+extern range_tree_ops_t rt_btree_ops;
 
 #ifdef	__cplusplus
 }
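
The memory saving in this changeset comes from the raw encoding above:
offsets are stored relative to rt_start in units of 1 << rt_shift, so with
rt_shift = 9 a 32-bit raw offset can describe any 512-byte-aligned location
within 2^41 bytes (2 TiB) of rt_start in an 8-byte range_seg32_t. A
standalone round-trip check, with example values:

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t rt_start = 0x40000000;	/* e.g. a metaslab base */
		uint64_t rt_shift = 9;		/* 512-byte granularity */
		uint64_t actual = 0x40000a00;	/* must be 512-aligned */

		/* Encode as rs_set_start() does. */
		uint32_t raw = (uint32_t)((actual - rt_start) >> rt_shift);
		assert(raw == 5);

		/* Decode as rs_get_start() does; the round trip is exact. */
		assert(((uint64_t)raw << rt_shift) + rt_start == actual);
		return (0);
	}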
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Dec 09 14:15:34 2019 +0000
@@ -42,6 +42,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
+#include <sys/bitops.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -65,45 +66,6 @@
 struct dsl_crypto_params;
 
 /*
- * General-purpose 32-bit and 64-bit bitfield encodings.
- */
-#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
-#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
-#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
-#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))
-
-#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
-#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
-
-#define	BF32_SET(x, low, len, val) do { \
-	ASSERT3U(val, <, 1U << (len)); \
-	ASSERT3U(low + len, <=, 32); \
-	(x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
-_NOTE(CONSTCOND) } while (0)
-
-#define	BF64_SET(x, low, len, val) do { \
-	ASSERT3U(val, <, 1ULL << (len)); \
-	ASSERT3U(low + len, <=, 64); \
-	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
-_NOTE(CONSTCOND) } while (0)
-
-#define	BF32_GET_SB(x, low, len, shift, bias)	\
-	((BF32_GET(x, low, len) + (bias)) << (shift))
-#define	BF64_GET_SB(x, low, len, shift, bias)	\
-	((BF64_GET(x, low, len) + (bias)) << (shift))
-
-#define	BF32_SET_SB(x, low, len, shift, bias, val) do { \
-	ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
-	ASSERT3S((val) >> (shift), >=, bias); \
-	BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-#define	BF64_SET_SB(x, low, len, shift, bias, val) do { \
-	ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
-	ASSERT3S((val) >> (shift), >=, bias); \
-	BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
-_NOTE(CONSTCOND) } while (0)
-
-/*
  * We currently support block sizes from 512 bytes to 16MB.
  * The benefits of larger blocks, and thus larger IO, need to be weighed
  * against the cost of COWing a giant block to modify one byte, and the
--- a/usr/src/uts/common/fs/zfs/sys/space_reftree.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/space_reftree.h	Mon Dec 09 14:15:34 2019 +0000
@@ -31,7 +31,7 @@
 #define	_SYS_SPACE_REFTREE_H
 
 #include <sys/range_tree.h>
-
+#include <sys/avl.h>
 #ifdef	__cplusplus
 extern "C" {
 #endif
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Dec 09 14:15:34 2019 +0000
@@ -96,8 +96,8 @@
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
 extern void vdev_deadman(vdev_t *vd);
-extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
-    range_seg_t *physical_rs);
+extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs);
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Mon Dec 09 14:15:34 2019 +0000
@@ -87,8 +87,8 @@
  * Given a target vdev, translates the logical range "in" to the physical
  * range "res"
  */
-typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
-    range_seg_t *res);
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in,
+    range_seg64_t *res);
 
 typedef struct vdev_ops {
 	vdev_open_func_t		*vdev_op_open;
@@ -517,8 +517,8 @@
 /*
  * Common size functions
  */
-extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
-    range_seg_t *out);
+extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in,
+    range_seg64_t *out);
 extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
 extern uint64_t vdev_get_min_asize(vdev_t *vd);
 extern void vdev_set_min_asize(vdev_t *vd);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h	Mon Dec 09 14:15:34 2019 +0000
@@ -26,6 +26,7 @@
 /*
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
@@ -84,6 +85,16 @@
 
 #define	CPU_SEQID	(CPU->cpu_seqid)
 
+/*
+ * In ZoL the following defines were added to their sys/avl.h header, but
+ * we want to limit these to the ZFS code on illumos.
+ */
+#define	TREE_ISIGN(a)	(((a) > 0) - ((a) < 0))
+#define	TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
+#define	TREE_PCMP(a, b)	\
+	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
 #ifdef	__cplusplus
 }
 #endif
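
A standalone check of the TREE_* helpers just added above. TREE_CMP yields
exactly -1, 0, or +1 without the overflow risk of subtraction-based
comparators, TREE_ISIGN collapses memcmp()/strcmp() results into that range
(as the spa.c and spa_misc.c hunks above use it), and TREE_PCMP gives an
address-based tie-break.

	#include <assert.h>
	#include <stdint.h>

	#define	TREE_ISIGN(a)	(((a) > 0) - ((a) < 0))
	#define	TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
	#define	TREE_PCMP(a, b)	\
		(((uintptr_t)(a) > (uintptr_t)(b)) - \
		((uintptr_t)(a) < (uintptr_t)(b)))

	int
	main(void)
	{
		assert(TREE_CMP(5, 9) == -1);
		assert(TREE_CMP(9, 9) == 0);
		assert(TREE_CMP((uint64_t)9, (uint64_t)5) == 1);

		assert(TREE_ISIGN(-37) == -1);
		assert(TREE_ISIGN(0) == 0);
		assert(TREE_ISIGN(42) == 1);

		int x, y;
		assert(TREE_PCMP(&x, &x) == 0);
		assert(TREE_PCMP(&x, &y) != 0);
		return (0);
	}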
--- a/usr/src/uts/common/fs/zfs/unique.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/unique.c	Mon Dec 09 14:15:34 2019 +0000
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/avl.h>
 #include <sys/unique.h>
@@ -45,7 +43,7 @@
 	const unique_t *una = (const unique_t *)a;
 	const unique_t *unb = (const unique_t *)b;
 
-	return (AVL_CMP(una->un_value, unb->un_value));
+	return (TREE_CMP(una->un_value, unb->un_value));
 }
 
 void
--- a/usr/src/uts/common/fs/zfs/vdev.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Dec 09 14:15:34 2019 +0000
@@ -215,7 +215,7 @@
 
 /* ARGSUSED */
 void
-vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
 {
 	res->rs_start = in->rs_start;
 	res->rs_end = in->rs_end;
@@ -496,7 +496,8 @@
 
 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
-	vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
+	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
+	    0, 0);
 
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
@@ -517,7 +518,8 @@
 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
 	for (int t = 0; t < DTL_TYPES; t++) {
-		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
+		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+		    0);
 	}
 	txg_list_create(&vd->vdev_ms_list, spa,
 	    offsetof(struct metaslab, ms_txg_node));
@@ -2434,14 +2436,11 @@
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
-	range_seg_t *rs;
-
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
-	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
-	return (rs->rs_start - 1);
+	return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
 }
 
 /*
@@ -2450,14 +2449,11 @@
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
-	range_seg_t *rs;
-
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
-	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
-	return (rs->rs_end);
+	return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
 }
 
 /*
@@ -2768,7 +2764,7 @@
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
-	rtsync = range_tree_create(NULL, NULL);
+	rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	range_tree_walk(rt, range_tree_add, rtsync);
@@ -4475,7 +4471,8 @@
  * translation function to do the real conversion.
  */
 void
-vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs)
 {
 	/*
 	 * Walk up the vdev tree
@@ -4502,7 +4499,7 @@
 	 * range into its physical components by calling the
 	 * vdev specific translate function.
 	 */
-	range_seg_t intermediate = { { { 0, 0 } } };
+	range_seg64_t intermediate = { 0 };
 	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
 
 	physical_rs->rs_start = intermediate.rs_start;
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c	Mon Dec 09 14:15:34 2019 +0000
@@ -111,7 +111,7 @@
 	const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
 	const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
 
-	return (AVL_CMP(ve1->ve_offset, ve2->ve_offset));
+	return (TREE_CMP(ve1->ve_offset, ve2->ve_offset));
 }
 
 static int
@@ -120,7 +120,7 @@
 	const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
 	const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
 
-	int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused);
+	int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused);
 	if (likely(cmp))
 		return (cmp);
 
--- a/usr/src/uts/common/fs/zfs/vdev_initialize.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c	Mon Dec 09 14:15:34 2019 +0000
@@ -278,11 +278,13 @@
 static int
 vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 {
-	avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
+	range_tree_t *rt = vd->vdev_initialize_tree;
+	zfs_btree_t *bt = &rt->rt_root;
+	zfs_btree_index_t where;
 
-	for (range_seg_t *rs = avl_first(rt); rs != NULL;
-	    rs = AVL_NEXT(rt, rs)) {
-		uint64_t size = rs->rs_end - rs->rs_start;
+	for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
+	    rs = zfs_btree_next(bt, &where, &where)) {
+		uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
 
 		/* Split range into legally-sized physical chunks */
 		uint64_t writes_required =
@@ -292,7 +294,7 @@
 			int error;
 
 			error = vdev_initialize_write(vd,
-			    VDEV_LABEL_START_SIZE + rs->rs_start +
+			    VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
 			    (w * zfs_initialize_chunk_size),
 			    MIN(size - (w * zfs_initialize_chunk_size),
 			    zfs_initialize_chunk_size), data);
@@ -328,7 +330,7 @@
 		 * on our vdev. We use this to determine if we are
 		 * in the middle of this metaslab range.
 		 */
-		range_seg_t logical_rs, physical_rs;
+		range_seg64_t logical_rs, physical_rs;
 		logical_rs.rs_start = msp->ms_start;
 		logical_rs.rs_end = msp->ms_start + msp->ms_size;
 		vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -352,10 +354,14 @@
 		 */
 		VERIFY0(metaslab_load(msp));
 
-		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
-		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
-			logical_rs.rs_start = rs->rs_start;
-			logical_rs.rs_end = rs->rs_end;
+		zfs_btree_index_t where;
+		range_tree_t *rt = msp->ms_allocatable;
+		for (range_seg_t *rs =
+		    zfs_btree_first(&rt->rt_root, &where); rs;
+		    rs = zfs_btree_next(&rt->rt_root, &where,
+		    &where)) {
+			logical_rs.rs_start = rs_get_start(rs, rt);
+			logical_rs.rs_end = rs_get_end(rs, rt);
 			vdev_xlate(vd, &logical_rs, &physical_rs);
 
 			uint64_t size = physical_rs.rs_end -
@@ -410,7 +416,7 @@
 vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
-	range_seg_t logical_rs, physical_rs;
+	range_seg64_t logical_rs, physical_rs;
 	logical_rs.rs_start = start;
 	logical_rs.rs_end = start + size;
 
@@ -469,7 +475,8 @@
 
 	abd_t *deadbeef = vdev_initialize_block_alloc();
 
-	vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+	vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+	    0, 0);
 
 	for (uint64_t i = 0; !vd->vdev_detached &&
 	    i < vd->vdev_top->vdev_ms_count; i++) {
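
The loop in vdev_initialize_ranges() above splits each segment into writes of
at most zfs_initialize_chunk_size bytes. A standalone check of that chunking
arithmetic, assuming writes_required (elided by the diff context) is the
usual round-up division:

	#include <assert.h>
	#include <stdint.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		uint64_t size = 2500, chunk = 1024;
		uint64_t writes = (size + chunk - 1) / chunk;	/* 3 */

		/* The per-write MIN() mirrors the loop body above. */
		uint64_t total = 0;
		for (uint64_t w = 0; w < writes; w++)
			total += MIN(size - (w * chunk), chunk);
		assert(total == size);
		return (0);
	}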
--- a/usr/src/uts/common/fs/zfs/vdev_label.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c	Mon Dec 09 14:15:34 2019 +0000
@@ -1021,12 +1021,12 @@
 static int
 vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
 {
-	int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+	int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg);
 
 	if (likely(cmp))
 		return (cmp);
 
-	cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+	cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
 	if (likely(cmp))
 		return (cmp);
 
@@ -1050,7 +1050,7 @@
 	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
 		seq2 = MMP_SEQ(ub2);
 
-	return (AVL_CMP(seq1, seq2));
+	return (TREE_CMP(seq1, seq2));
 }
 
 struct ubl_cbdata {
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Mon Dec 09 14:15:34 2019 +0000
@@ -216,12 +216,12 @@
 	const zio_t *z1 = (const zio_t *)x1;
 	const zio_t *z2 = (const zio_t *)x2;
 
-	int cmp = AVL_CMP(z1->io_offset, z2->io_offset);
+	int cmp = TREE_CMP(z1->io_offset, z2->io_offset);
 
 	if (likely(cmp))
 		return (cmp);
 
-	return (AVL_PCMP(z1, z2));
+	return (TREE_PCMP(z1, z2));
 }
 
 static inline avl_tree_t *
@@ -248,12 +248,12 @@
 	const zio_t *z1 = (const zio_t *)x1;
 	const zio_t *z2 = (const zio_t *)x2;
 
-	int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp);
+	int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
 
 	if (likely(cmp))
 		return (cmp);
 
-	return (AVL_PCMP(z1, z2));
+	return (TREE_PCMP(z1, z2));
 }
 
 void
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Mon Dec 09 14:15:34 2019 +0000
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
@@ -1911,7 +1911,7 @@
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 
-	range_seg_t logical_rs, physical_rs;
+	range_seg64_t logical_rs, physical_rs;
 	logical_rs.rs_start = zio->io_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
 	    vdev_raidz_asize(zio->io_vd, zio->io_size);
@@ -2655,7 +2655,7 @@
 }
 
 static void
-vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
 {
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
--- a/usr/src/uts/common/fs/zfs/vdev_removal.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_removal.c	Mon Dec 09 14:15:34 2019 +0000
@@ -189,11 +189,12 @@
 	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
 	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
-	svr->svr_allocd_segs = range_tree_create(NULL, NULL);
+	svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	svr->svr_vdev_id = vd->vdev_id;
 
 	for (int i = 0; i < TXG_SIZE; i++) {
-		svr->svr_frees[i] = range_tree_create(NULL, NULL);
+		svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL,
+		    0, 0);
 		list_create(&svr->svr_new_segments[i],
 		    sizeof (vdev_indirect_mapping_entry_t),
 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
@@ -920,18 +921,15 @@
 		 * the allocation at the end of a segment, thus avoiding
 		 * additional split blocks.
 		 */
-		range_seg_t search;
-		avl_index_t where;
-		search.rs_start = start + maxalloc;
-		search.rs_end = search.rs_start;
-		range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
-		if (rs == NULL) {
-			rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
-		} else {
-			rs = AVL_PREV(&segs->rt_root, rs);
-		}
+		range_seg_max_t search;
+		zfs_btree_index_t where;
+		rs_set_start(&search, segs, start + maxalloc);
+		rs_set_end(&search, segs, start + maxalloc);
+		(void) zfs_btree_find(&segs->rt_root, &search, &where);
+		range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
+		    &where);
 		if (rs != NULL) {
-			size = rs->rs_end - start;
+			size = rs_get_end(rs, segs) - start;
 		} else {
 			/*
 			 * There are no segments that end before maxalloc.
@@ -963,20 +961,22 @@
 	 * relative to the start of the range to be copied (i.e. relative to the
 	 * local variable "start").
 	 */
-	range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+	range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL,
+	    0, 0);
 
-	range_seg_t *rs = avl_first(&segs->rt_root);
-	ASSERT3U(rs->rs_start, ==, start);
-	uint64_t prev_seg_end = rs->rs_end;
-	while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
-		if (rs->rs_start >= start + size) {
+	zfs_btree_index_t where;
+	range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
+	ASSERT3U(rs_get_start(rs, segs), ==, start);
+	uint64_t prev_seg_end = rs_get_end(rs, segs);
+	while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
+		if (rs_get_start(rs, segs) >= start + size) {
 			break;
 		} else {
 			range_tree_add(obsolete_segs,
 			    prev_seg_end - start,
-			    rs->rs_start - prev_seg_end);
+			    rs_get_start(rs, segs) - prev_seg_end);
 		}
-		prev_seg_end = rs->rs_end;
+		prev_seg_end = rs_get_end(rs, segs);
 	}
 	/* We don't end in the middle of an obsolete range */
 	ASSERT3U(start + size, <=, prev_seg_end);
@@ -1222,9 +1222,11 @@
 	 * allocated segments that we are copying.  We may also be copying
 	 * free segments (of up to vdev_removal_max_span bytes).
 	 */
-	range_tree_t *segs = range_tree_create(NULL, NULL);
+	range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	for (;;) {
-		range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+		range_tree_t *rt = svr->svr_allocd_segs;
+		range_seg_t *rs = range_tree_first(rt);
+
 		if (rs == NULL)
 			break;
 
@@ -1232,17 +1234,17 @@
 
 		if (range_tree_is_empty(segs)) {
 			/* need to truncate the first seg based on max_alloc */
-			seg_length =
-			    MIN(rs->rs_end - rs->rs_start, *max_alloc);
+			seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs,
+			    rt), *max_alloc);
 		} else {
-			if (rs->rs_start - range_tree_max(segs) >
+			if (rs_get_start(rs, rt) - range_tree_max(segs) >
 			    vdev_removal_max_span) {
 				/*
 				 * Including this segment would cause us to
 				 * copy a larger unneeded chunk than is allowed.
 				 */
 				break;
-			} else if (rs->rs_end - range_tree_min(segs) >
+			} else if (rs_get_end(rs, rt) - range_tree_min(segs) >
 			    *max_alloc) {
 				/*
 				 * This additional segment would extend past
@@ -1251,13 +1253,14 @@
 				 */
 				break;
 			} else {
-				seg_length = rs->rs_end - rs->rs_start;
+				seg_length = rs_get_end(rs, rt) -
+				    rs_get_start(rs, rt);
 			}
 		}
 
-		range_tree_add(segs, rs->rs_start, seg_length);
+		range_tree_add(segs, rs_get_start(rs, rt), seg_length);
 		range_tree_remove(svr->svr_allocd_segs,
-		    rs->rs_start, seg_length);
+		    rs_get_start(rs, rt), seg_length);
 	}
 
 	if (range_tree_is_empty(segs)) {
@@ -1420,7 +1423,7 @@
 
 		vca.vca_msp = msp;
 		zfs_dbgmsg("copying %llu segments for metaslab %llu",
-		    avl_numnodes(&svr->svr_allocd_segs->rt_root),
+		    zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root),
 		    msp->ms_id);
 
 		while (!svr->svr_thread_exit &&
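
Two API changes drive most of the churn in this file. range_tree_create() now
takes the segment type plus a start/shift pair (used to bias and scale offsets
when the tree stores RANGE_SEG32 segments), and since callers can no longer
dereference rs_start/rs_end directly, segment bounds are read through
accessors that take the owning tree; a stack-allocated range_seg_max_t serves
as the search key because it is large enough for any layout. Paraphrased
declarations, to be treated as a sketch rather than the authoritative
prototypes:

	range_tree_t *range_tree_create(range_tree_ops_t *ops,
	    range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);

	/* Decode a segment's bounds using the tree's type, start and shift. */
	uint64_t rs_get_start(const range_seg_t *rs, const range_tree_t *rt);
	uint64_t rs_get_end(const range_seg_t *rs, const range_tree_t *rt);

So range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0) in the hunks above is the
direct replacement for the old range_tree_create(NULL, NULL): absolute 64-bit
segments, no ops, no offset bias.
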
--- a/usr/src/uts/common/fs/zfs/vdev_trim.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_trim.c	Mon Dec 09 14:15:34 2019 +0000
@@ -534,7 +534,8 @@
 vdev_trim_ranges(trim_args_t *ta)
 {
 	vdev_t *vd = ta->trim_vdev;
-	avl_tree_t *rt = &ta->trim_tree->rt_root;
+	zfs_btree_t *t = &ta->trim_tree->rt_root;
+	zfs_btree_index_t idx;
 	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
 	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
 	spa_t *spa = vd->vdev_spa;
@@ -542,9 +543,10 @@
 	ta->trim_start_time = gethrtime();
 	ta->trim_bytes_done = 0;
 
-	for (range_seg_t *rs = avl_first(rt); rs != NULL;
-	    rs = AVL_NEXT(rt, rs)) {
-		uint64_t size = rs->rs_end - rs->rs_start;
+	for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+	    rs = zfs_btree_next(t, &idx, &idx)) {
+		uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
+		    ta->trim_tree);
 
 		if (extent_bytes_min && size < extent_bytes_min) {
 			/*
@@ -561,9 +563,9 @@
 			int error;
 
 			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
-			    rs->rs_start + (w * extent_bytes_max),
-			    MIN(size - (w * extent_bytes_max),
-			    extent_bytes_max));
+			    rs_get_start(rs, ta->trim_tree) +
+			    (w * extent_bytes_max), MIN(size -
+			    (w * extent_bytes_max), extent_bytes_max));
 			if (error != 0) {
 				return (error);
 			}
@@ -601,7 +603,7 @@
 		 * on our vdev. We use this to determine if we are
 		 * in the middle of this metaslab range.
 		 */
-		range_seg_t logical_rs, physical_rs;
+		range_seg64_t logical_rs, physical_rs;
 		logical_rs.rs_start = msp->ms_start;
 		logical_rs.rs_end = msp->ms_start + msp->ms_size;
 		vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -624,10 +626,13 @@
 		 */
 		VERIFY0(metaslab_load(msp));
 
-		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
-		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
-			logical_rs.rs_start = rs->rs_start;
-			logical_rs.rs_end = rs->rs_end;
+		range_tree_t *rt = msp->ms_allocatable;
+		zfs_btree_t *bt = &rt->rt_root;
+		zfs_btree_index_t idx;
+		for (range_seg_t *rs = zfs_btree_first(bt, &idx);
+		    rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
+			logical_rs.rs_start = rs_get_start(rs, rt);
+			logical_rs.rs_end = rs_get_end(rs, rt);
 			vdev_xlate(vd, &logical_rs, &physical_rs);
 
 			uint64_t size = physical_rs.rs_end -
@@ -719,7 +724,7 @@
 {
 	trim_args_t *ta = arg;
 	vdev_t *vd = ta->trim_vdev;
-	range_seg_t logical_rs, physical_rs;
+	range_seg64_t logical_rs, physical_rs;
 	logical_rs.rs_start = start;
 	logical_rs.rs_end = start + size;
 
@@ -732,7 +737,7 @@
 		metaslab_t *msp = ta->trim_msp;
 		VERIFY0(metaslab_load(msp));
 		VERIFY3B(msp->ms_loaded, ==, B_TRUE);
-		VERIFY(range_tree_find(msp->ms_allocatable, start, size));
+		VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
 	}
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -811,7 +816,7 @@
 	ta.trim_vdev = vd;
 	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
 	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
-	ta.trim_tree = range_tree_create(NULL, NULL);
+	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	ta.trim_type = TRIM_TYPE_MANUAL;
 	ta.trim_flags = 0;
 
@@ -1093,7 +1098,7 @@
 
 	VERIFY3B(msp->ms_loaded, ==, B_TRUE);
 	VERIFY3U(msp->ms_disabled, >, 0);
-	VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL);
+	VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
 }
 
 /*
@@ -1191,7 +1196,8 @@
 			 * Allocate an empty range tree which is swapped in
 			 * for the existing ms_trim tree while it is processed.
 			 */
-			trim_tree = range_tree_create(NULL, NULL);
+			trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+			    0, 0);
 			range_tree_swap(&msp->ms_trim, &trim_tree);
 			ASSERT(range_tree_is_empty(msp->ms_trim));
 
@@ -1245,7 +1251,8 @@
 				if (!cvd->vdev_ops->vdev_op_leaf)
 					continue;
 
-				ta->trim_tree = range_tree_create(NULL, NULL);
+				ta->trim_tree = range_tree_create(NULL,
+				    RANGE_SEG64, NULL, 0, 0);
 				range_tree_walk(trim_tree,
 				    vdev_trim_range_add, ta);
 			}
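
The iteration idiom changes along with the backing structure: instead of
avl_first()/AVL_NEXT(), walkers keep an explicit zfs_btree_index_t cursor, and
boolean membership checks move from range_tree_find() (which returns a segment
pointer) to range_tree_contains(). A minimal sketch of the new walk, mirroring
the hunks above, where passing the same index as both input and output to
zfs_btree_next() advances the cursor in place:

	/* Visit every segment of a range tree in address order. */
	static void
	walk_range_tree(range_tree_t *rt)
	{
		zfs_btree_t *bt = &rt->rt_root;
		zfs_btree_index_t idx;

		for (range_seg_t *rs = zfs_btree_first(bt, &idx); rs != NULL;
		    rs = zfs_btree_next(bt, &idx, &idx)) {
			uint64_t start = rs_get_start(rs, rt);
			uint64_t size = rs_get_end(rs, rt) - start;
			/* process [start, start + size) */
		}
	}
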
--- a/usr/src/uts/common/fs/zfs/zap_micro.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c	Mon Dec 09 14:15:34 2019 +0000
@@ -281,11 +281,11 @@
 	const mzap_ent_t *mze1 = arg1;
 	const mzap_ent_t *mze2 = arg2;
 
-	int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
+	int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash);
 	if (likely(cmp))
 		return (cmp);
 
-	return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
+	return (TREE_CMP(mze1->mze_cd, mze2->mze_cd));
 }
 
 static void
--- a/usr/src/uts/common/fs/zfs/zfs_fuid.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_fuid.c	Mon Dec 09 14:15:34 2019 +0000
@@ -75,7 +75,7 @@
 	const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
 	const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
 
-	return (AVL_CMP(node1->f_idx, node2->f_idx));
+	return (TREE_CMP(node1->f_idx, node2->f_idx));
 }
 
 /*
@@ -90,7 +90,7 @@
 
 	val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
 
-	return (AVL_ISIGN(val));
+	return (TREE_ISIGN(val));
 }
 
 void
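
strcmp() only guarantees a negative, zero, or positive result, while the tree
comparators must return exactly -1, 0, or 1; TREE_ISIGN (formerly AVL_ISIGN)
collapses the former into the latter. A self-contained illustration
(name_compare is hypothetical, not part of the source):

	#include <string.h>

	#define	TREE_ISIGN(a)	(((a) > 0) - ((a) < 0))

	/* Collapse strcmp()'s sign-only contract to exactly -1, 0 or 1. */
	static int
	name_compare(const char *a, const char *b)
	{
		return (TREE_ISIGN(strcmp(a, b)));
	}
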
--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c	Mon Dec 09 14:15:34 2019 +0000
@@ -109,7 +109,7 @@
 	const locked_range_t *rl1 = (const locked_range_t *)arg1;
 	const locked_range_t *rl2 = (const locked_range_t *)arg2;
 
-	return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
+	return (TREE_CMP(rl1->lr_offset, rl2->lr_offset));
 }
 
 /*
--- a/usr/src/uts/common/fs/zfs/zil.c	Fri Dec 06 12:00:18 2019 -0600
+++ b/usr/src/uts/common/fs/zfs/zil.c	Mon Dec 09 14:15:34 2019 +0000
@@ -139,11 +139,11 @@
 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
-	int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+	int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (likely(cmp))
 		return (cmp);
 
-	return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
+	return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
 }
 
 static void
@@ -526,7 +526,7 @@
 	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
 	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
-	return (AVL_CMP(v1, v2));
+	return (TREE_CMP(v1, v2));
 }
 
 static lwb_t *
@@ -1759,7 +1759,7 @@
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
-	return (AVL_CMP(o1, o2));
+	return (TREE_CMP(o1, o2));
 }
 
 /*