Mercurial > illumos > git > illumos-joyent
changeset 24358:b0bb68020907
[illumos-gate merge]
commit d9172ac4b58b7d14d2dc462871e5a554135283bf
12004 format: err_check is duplicate of efi_err_check
commit edbad4fe075c5178507eaf0bfb85aa903b66bf91
11997 format: cstyle cleanup
commit 5ac07b12fb4c39cb2415c0997f7c5b4dd5209f96
12058 loader.efi: use libi386/comconsole with x86
commit 4d7988d6050abba5c1ff60e7fd196e95c22e20f4
11971 Reduce loaded range tree memory usage
line wrap: on
line diff
--- a/usr/src/boot/Makefile.version Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/Makefile.version Mon Dec 09 14:15:34 2019 +0000 @@ -33,4 +33,4 @@ # Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes. # The version is processed from left to right, the version number can only # be increased. -BOOT_VERSION = $(LOADER_VERSION)-2019.12.03.1 +BOOT_VERSION = $(LOADER_VERSION)-2019.12.05.1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/boot/sys/boot/common/mb_header.S Mon Dec 09 14:15:34 2019 +0000 @@ -0,0 +1,43 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Toomas Soome <tsoome@me.com> + */ + + .file "mb_header.S" + +/* + * Provide fake multiboot header to support versioning and partition + * start. The fake MB header is used by versioning code located in + * usr/src/cmd/boot/common. Since the BIOS bootblock is stored on raw disk, + * this fake header is used to store the location of the version info block. + * Additionally we use it to store partition start_sector, so we can identify + * our root file system partition. Note we are using LBA64 here. + */ + +#define ASM_FILE +#include <sys/multiboot.h> + + .globl mb_header, start_sector + .text + + .align 4 +mb_header: + .long MULTIBOOT_HEADER_MAGIC + .long MULTIBOOT_AOUT_KLUDGE + .long -(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_AOUT_KLUDGE) + .long 0 /* header_addr */ + .long 0 /* load_addr */ + .long 0 /* load_end_addr */ +start_sector: .long 0 /* partition LBA */ + .long 0 +
--- a/usr/src/boot/sys/boot/common/multiboot.S Fri Dec 06 12:00:18 2019 -0600 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2018 Toomas Soome <tsoome@me.com> - */ - - .file "multiboot.s" - -/* - * Provide fake multiboot header to support versioning and partition - * start. The fake MB header is used by versioning code located in - * usr/src/cmd/boot/common. Since the BIOS bootblock is stored on raw disk, - * this fake header is used to store the location of the version info block. - * Additionally we use it to store partition start_sector, so we can identify - * our root file system partition. Note we are using LBA64 here. - */ - -#define ASM_FILE -#include <sys/multiboot.h> - - .globl mb_header, start_sector - .text - - .align 4 -mb_header: - .long MULTIBOOT_HEADER_MAGIC - .long MULTIBOOT_AOUT_KLUDGE - .long -(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_AOUT_KLUDGE) - .long 0 /* header_addr */ - .long 0 /* load_addr */ - .long 0 /* load_end_addr */ -start_sector: .long 0 /* partition LBA */ - .long 0 -
--- a/usr/src/boot/sys/boot/efi/loader/Makefile.com Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/efi/loader/Makefile.com Mon Dec 09 14:15:34 2019 +0000 @@ -34,7 +34,7 @@ framebuffer.c \ main.c \ memmap.c \ - multiboot.S \ + mb_header.S \ multiboot2.c \ self_reloc.c \ smbios.c \ @@ -53,7 +53,7 @@ framebuffer.o \ main.o \ memmap.o \ - multiboot.o \ + mb_header.o \ multiboot2.o \ self_reloc.o \ smbios.o \ @@ -186,9 +186,6 @@ %.o: ../../../common/linenoise/%.c $(COMPILE.c) $< -%.o: ../../../i386/libi386/%.c - $(COMPILE.c) $< - %.o: $(SRC)/common/font/%.c $(COMPILE.c) $<
--- a/usr/src/boot/sys/boot/efi/loader/amd64/Makefile Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/efi/loader/amd64/Makefile Mon Dec 09 14:15:34 2019 +0000 @@ -31,3 +31,6 @@ CLEANFILES += machine x86 $(EFIPROG) $(OBJS): machine x86 + +%.o: ../../../i386/libi386/%.c + $(COMPILE.c) $<
--- a/usr/src/boot/sys/boot/efi/loader/arch/amd64/ldscript.amd64 Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/efi/loader/arch/amd64/ldscript.amd64 Mon Dec 09 14:15:34 2019 +0000 @@ -9,7 +9,7 @@ .hash : { *(.hash) } /* this MUST come first! */ . = ALIGN(4096); .text : { - multiboot.o(.text) + mb_header.o(.text) *(.text .stub .text.* .gnu.linkonce.t.*) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning)
--- a/usr/src/boot/sys/boot/efi/loader/arch/i386/ldscript.i386 Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/efi/loader/arch/i386/ldscript.i386 Mon Dec 09 14:15:34 2019 +0000 @@ -9,7 +9,7 @@ . = SIZEOF_HEADERS; . = ALIGN(4096); .text : { - multiboot.o(.text) + mb_header.o(.text) *(.text .stub .text.* .gnu.linkonce.t.*) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning)
--- a/usr/src/boot/sys/boot/efi/loader/comconsole.c Fri Dec 06 12:00:18 2019 -0600 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,694 +0,0 @@ -/* - * Copyright (c) 1998 Michael Smith (msmith@freebsd.org) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include <sys/cdefs.h> - -#include <stand.h> -#include <sys/errno.h> -#include <bootstrap.h> -#include <stdbool.h> - -#include <efi.h> -#include <efilib.h> - -#include "loader_efi.h" - -static EFI_GUID serial = SERIAL_IO_PROTOCOL; - -#define COMC_TXWAIT 0x40000 /* transmit timeout */ - -#ifndef COMSPEED -#define COMSPEED 9600 -#endif - -#define PNP0501 0x501 /* 16550A-compatible COM port */ - -struct serial { - uint64_t baudrate; - uint8_t databits; - EFI_PARITY_TYPE parity; - EFI_STOP_BITS_TYPE stopbits; - uint8_t ignore_cd; /* boolean */ - uint8_t rtsdtr_off; /* boolean */ - int ioaddr; /* index in handles array */ - SERIAL_IO_INTERFACE *sio; -}; - -static void comc_probe(struct console *); -static int comc_init(struct console *, int); -static void comc_putchar(struct console *, int); -static int comc_getchar(struct console *); -static int comc_ischar(struct console *); -static int comc_ioctl(struct console *, int, void *); -static void comc_devinfo(struct console *); -static bool comc_setup(struct console *); -static char *comc_asprint_mode(struct serial *); -static int comc_parse_mode(struct serial *, const char *); -static int comc_mode_set(struct env_var *, int, const void *); -static int comc_cd_set(struct env_var *, int, const void *); -static int comc_rtsdtr_set(struct env_var *, int, const void *); - -struct console ttya = { - .c_name = "ttya", - .c_desc = "serial port a", - .c_flags = 0, - .c_probe = comc_probe, - .c_init = comc_init, - .c_out = comc_putchar, - .c_in = comc_getchar, - .c_ready = comc_ischar, - .c_ioctl = comc_ioctl, - .c_devinfo = comc_devinfo, - .c_private = NULL -}; - -struct console ttyb = { - .c_name = "ttyb", - .c_desc = "serial port b", - .c_flags = 0, - .c_probe = comc_probe, - .c_init = comc_init, - .c_out = comc_putchar, - .c_in = comc_getchar, - .c_ready = comc_ischar, - .c_ioctl = comc_ioctl, - .c_devinfo = comc_devinfo, - .c_private = NULL -}; - -struct console ttyc = { - .c_name = "ttyc", - .c_desc = "serial port c", 
- .c_flags = 0, - .c_probe = comc_probe, - .c_init = comc_init, - .c_out = comc_putchar, - .c_in = comc_getchar, - .c_ready = comc_ischar, - .c_ioctl = comc_ioctl, - .c_devinfo = comc_devinfo, - .c_private = NULL -}; - -struct console ttyd = { - .c_name = "ttyd", - .c_desc = "serial port d", - .c_flags = 0, - .c_probe = comc_probe, - .c_init = comc_init, - .c_out = comc_putchar, - .c_in = comc_getchar, - .c_ready = comc_ischar, - .c_ioctl = comc_ioctl, - .c_devinfo = comc_devinfo, - .c_private = NULL -}; - -static EFI_STATUS -efi_serial_init(EFI_HANDLE **handlep, int *nhandles) -{ - UINTN bufsz = 0; - EFI_STATUS status; - EFI_HANDLE *handles; - - /* - * get buffer size - */ - *nhandles = 0; - handles = NULL; - status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles); - if (status != EFI_BUFFER_TOO_SMALL) - return (status); - - if ((handles = malloc(bufsz)) == NULL) - return (ENOMEM); - - *nhandles = (int)(bufsz / sizeof (EFI_HANDLE)); - /* - * get handle array - */ - status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles); - if (EFI_ERROR(status)) { - free(handles); - *nhandles = 0; - } else - *handlep = handles; - return (status); -} - -/* - * Find serial device number from device path. - * Return -1 if not found. - */ -static int -efi_serial_get_index(EFI_DEVICE_PATH *devpath) -{ - ACPI_HID_DEVICE_PATH *acpi; - - while (!IsDevicePathEnd(devpath)) { - if (DevicePathType(devpath) == ACPI_DEVICE_PATH && - DevicePathSubType(devpath) == ACPI_DP) { - - acpi = (ACPI_HID_DEVICE_PATH *)devpath; - if (acpi->HID == EISA_PNP_ID(PNP0501)) { - return (acpi->UID); - } - } - - devpath = NextDevicePathNode(devpath); - } - return (-1); -} - -/* - * The order of handles from LocateHandle() is not known, we need to - * iterate handles, pick device path for handle, and check the device - * number. 
- */ -static EFI_HANDLE -efi_serial_get_handle(int port) -{ - EFI_STATUS status; - EFI_HANDLE *handles, handle; - EFI_DEVICE_PATH *devpath; - int index, nhandles; - - if (port == -1) - return (NULL); - - handles = NULL; - nhandles = 0; - status = efi_serial_init(&handles, &nhandles); - if (EFI_ERROR(status)) - return (NULL); - - handle = NULL; - for (index = 0; index < nhandles; index++) { - devpath = efi_lookup_devpath(handles[index]); - if (port == efi_serial_get_index(devpath)) { - handle = (handles[index]); - break; - } - } - - /* - * In case we did fail to identify the device by path, use port as - * array index. Note, we did check port == -1 above. - */ - if (port < nhandles && handle == NULL) - handle = handles[port]; - - free(handles); - return (handle); -} - -static void -comc_probe(struct console *cp) -{ - EFI_STATUS status; - EFI_HANDLE handle; - struct serial *port; - char name[20]; - char value[20]; - char *env; - - /* are we already set up? */ - if (cp->c_private != NULL) - return; - - cp->c_private = malloc(sizeof (struct serial)); - port = cp->c_private; - port->baudrate = COMSPEED; - - port->ioaddr = -1; /* invalid port */ - if (strcmp(cp->c_name, "ttya") == 0) - port->ioaddr = 0; - else if (strcmp(cp->c_name, "ttyb") == 0) - port->ioaddr = 1; - else if (strcmp(cp->c_name, "ttyc") == 0) - port->ioaddr = 2; - else if (strcmp(cp->c_name, "ttyd") == 0) - port->ioaddr = 3; - - port->databits = 8; /* 8,n,1 */ - port->parity = NoParity; /* 8,n,1 */ - port->stopbits = OneStopBit; /* 8,n,1 */ - port->ignore_cd = 1; /* ignore cd */ - port->rtsdtr_off = 0; /* rts-dtr is on */ - port->sio = NULL; - - handle = efi_serial_get_handle(port->ioaddr); - - if (handle != NULL) { - status = BS->OpenProtocol(handle, &serial, - (void**)&port->sio, IH, NULL, - EFI_OPEN_PROTOCOL_GET_PROTOCOL); - - if (EFI_ERROR(status)) - port->sio = NULL; - } - - snprintf(name, sizeof (name), "%s-mode", cp->c_name); - env = getenv(name); - - if (env != NULL) - (void) 
comc_parse_mode(port, env); - - env = comc_asprint_mode(port); - - if (env != NULL) { - unsetenv(name); - env_setenv(name, EV_VOLATILE, env, comc_mode_set, env_nounset); - free(env); - } - - snprintf(name, sizeof (name), "%s-ignore-cd", cp->c_name); - env = getenv(name); - if (env != NULL) { - if (strcmp(env, "true") == 0) - port->ignore_cd = 1; - else if (strcmp(env, "false") == 0) - port->ignore_cd = 0; - } - - snprintf(value, sizeof (value), "%s", - port->ignore_cd? "true" : "false"); - unsetenv(name); - env_setenv(name, EV_VOLATILE, value, comc_cd_set, env_nounset); - - snprintf(name, sizeof (name), "%s-rts-dtr-off", cp->c_name); - env = getenv(name); - if (env != NULL) { - if (strcmp(env, "true") == 0) - port->rtsdtr_off = 1; - else if (strcmp(env, "false") == 0) - port->rtsdtr_off = 0; - } - - snprintf(value, sizeof (value), "%s", - port->rtsdtr_off? "true" : "false"); - unsetenv(name); - env_setenv(name, EV_VOLATILE, value, comc_rtsdtr_set, env_nounset); - - cp->c_flags = 0; - if (comc_setup(cp)) - cp->c_flags = C_PRESENTIN | C_PRESENTOUT; -} - -static int -comc_init(struct console *cp, int arg __attribute((unused))) -{ - - if (comc_setup(cp)) - return (CMD_OK); - - cp->c_flags = 0; - return (CMD_ERROR); -} - -static void -comc_putchar(struct console *cp, int c) -{ - int wait; - EFI_STATUS status; - UINTN bufsz = 1; - char cb = c; - struct serial *sp = cp->c_private; - - if (sp->sio == NULL) - return; - - for (wait = COMC_TXWAIT; wait > 0; wait--) { - status = sp->sio->Write(sp->sio, &bufsz, &cb); - if (status != EFI_TIMEOUT) - break; - } -} - -static int -comc_getchar(struct console *cp) -{ - EFI_STATUS status; - UINTN bufsz = 1; - char c; - struct serial *sp = cp->c_private; - - if (sp->sio == NULL || !comc_ischar(cp)) - return (-1); - - status = sp->sio->Read(sp->sio, &bufsz, &c); - if (EFI_ERROR(status) || bufsz == 0) - return (-1); - - return (c); -} - -static int -comc_ischar(struct console *cp) -{ - EFI_STATUS status; - uint32_t control; - struct 
serial *sp = cp->c_private; - - if (sp->sio == NULL) - return (0); - - status = sp->sio->GetControl(sp->sio, &control); - if (EFI_ERROR(status)) - return (0); - - return (!(control & EFI_SERIAL_INPUT_BUFFER_EMPTY)); -} - -static int -comc_ioctl(struct console *cp __unused, int cmd __unused, void *data __unused) -{ - return (ENOTTY); -} - -static void -comc_devinfo(struct console *cp) -{ - struct serial *port = cp->c_private; - EFI_HANDLE handle; - EFI_DEVICE_PATH *dp; - CHAR16 *text; - - handle = efi_serial_get_handle(port->ioaddr); - if (handle == NULL) { - printf("\tdevice is not present"); - return; - } - - dp = efi_lookup_devpath(handle); - if (dp == NULL) - return; - - text = efi_devpath_name(dp); - if (text == NULL) - return; - - printf("\t%S", text); - efi_free_devpath_name(text); -} - -static char * -comc_asprint_mode(struct serial *sp) -{ - char par, *buf; - char *stop; - - if (sp == NULL) - return (NULL); - - switch (sp->parity) { - case NoParity: - par = 'n'; - break; - case EvenParity: - par = 'e'; - break; - case OddParity: - par = 'o'; - break; - case MarkParity: - par = 'm'; - break; - case SpaceParity: - par = 's'; - break; - default: - par = 'n'; - break; - } - - switch (sp->stopbits) { - case OneStopBit: - stop = "1"; - break; - case TwoStopBits: - stop = "2"; - break; - case OneFiveStopBits: - stop = "1.5"; - break; - default: - stop = "1"; - break; - } - - asprintf(&buf, "%ju,%d,%c,%s,-", sp->baudrate, sp->databits, par, stop); - return (buf); -} - -static int -comc_parse_mode(struct serial *sp, const char *value) -{ - unsigned long n; - uint64_t baudrate; - uint8_t databits = 8; - int parity = NoParity; - int stopbits = OneStopBit; - char *ep; - - if (value == NULL || *value == '\0') - return (CMD_ERROR); - - errno = 0; - n = strtoul(value, &ep, 10); - if (errno != 0 || *ep != ',') - return (CMD_ERROR); - baudrate = n; - - ep++; - n = strtoul(ep, &ep, 10); - if (errno != 0 || *ep != ',') - return (CMD_ERROR); - - switch (n) { - case 5: databits 
= 5; - break; - case 6: databits = 6; - break; - case 7: databits = 7; - break; - case 8: databits = 8; - break; - default: - return (CMD_ERROR); - } - - ep++; - switch (*ep++) { - case 'n': parity = NoParity; - break; - case 'e': parity = EvenParity; - break; - case 'o': parity = OddParity; - break; - case 'm': parity = MarkParity; - break; - case 's': parity = SpaceParity; - break; - default: - return (CMD_ERROR); - } - - if (*ep == ',') - ep++; - else - return (CMD_ERROR); - - switch (*ep++) { - case '1': stopbits = OneStopBit; - if (ep[0] == '.' && ep[1] == '5') { - ep += 2; - stopbits = OneFiveStopBits; - } - break; - case '2': stopbits = TwoStopBits; - break; - default: - return (CMD_ERROR); - } - - /* handshake is ignored, but we check syntax anyhow */ - if (*ep == ',') - ep++; - else - return (CMD_ERROR); - - switch (*ep++) { - case '-': - case 'h': - case 's': - break; - default: - return (CMD_ERROR); - } - - if (*ep != '\0') - return (CMD_ERROR); - - sp->baudrate = baudrate; - sp->databits = databits; - sp->parity = parity; - sp->stopbits = stopbits; - return (CMD_OK); -} - -static struct console * -get_console(char *name) -{ - struct console *cp = NULL; - - switch (name[3]) { - case 'a': cp = &ttya; - break; - case 'b': cp = &ttyb; - break; - case 'c': cp = &ttyc; - break; - case 'd': cp = &ttyd; - break; - } - return (cp); -} - -static int -comc_mode_set(struct env_var *ev, int flags, const void *value) -{ - struct console *cp; - - if (value == NULL) - return (CMD_ERROR); - - if ((cp = get_console(ev->ev_name)) == NULL) - return (CMD_ERROR); - - if (comc_parse_mode(cp->c_private, value) == CMD_ERROR) - return (CMD_ERROR); - - (void) comc_setup(cp); - - env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL); - - return (CMD_OK); -} - -static int -comc_cd_set(struct env_var *ev, int flags, const void *value) -{ - struct console *cp; - struct serial *sp; - - if (value == NULL) - return (CMD_ERROR); - - if ((cp = get_console(ev->ev_name)) == NULL) - 
return (CMD_ERROR); - - sp = cp->c_private; - if (strcmp(value, "true") == 0) - sp->ignore_cd = 1; - else if (strcmp(value, "false") == 0) - sp->ignore_cd = 0; - else - return (CMD_ERROR); - - (void) comc_setup(cp); - - env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL); - - return (CMD_OK); -} - -static int -comc_rtsdtr_set(struct env_var *ev, int flags, const void *value) -{ - struct console *cp; - struct serial *sp; - - if (value == NULL) - return (CMD_ERROR); - - if ((cp = get_console(ev->ev_name)) == NULL) - return (CMD_ERROR); - - sp = cp->c_private; - if (strcmp(value, "true") == 0) - sp->rtsdtr_off = 1; - else if (strcmp(value, "false") == 0) - sp->rtsdtr_off = 0; - else - return (CMD_ERROR); - - (void) comc_setup(cp); - - env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL); - - return (CMD_OK); -} - -/* - * In case of error, we also reset ACTIVE flags, so the console - * framefork will try alternate consoles. - */ -static bool -comc_setup(struct console *cp) -{ - EFI_STATUS status; - UINT32 control; - struct serial *sp = cp->c_private; - - /* port is not usable */ - if (sp->sio == NULL) - return (false); - - status = sp->sio->Reset(sp->sio); - if (EFI_ERROR(status)) - return (false); - - status = sp->sio->SetAttributes(sp->sio, sp->baudrate, 0, 0, sp->parity, - sp->databits, sp->stopbits); - if (EFI_ERROR(status)) - return (false); - - status = sp->sio->GetControl(sp->sio, &control); - if (EFI_ERROR(status)) - return (false); - if (sp->rtsdtr_off) { - control &= ~(EFI_SERIAL_REQUEST_TO_SEND | - EFI_SERIAL_DATA_TERMINAL_READY); - } else { - control |= EFI_SERIAL_REQUEST_TO_SEND; - } - - (void) sp->sio->SetControl(sp->sio, control); - - /* Mark this port usable. */ - cp->c_flags |= (C_PRESENTIN | C_PRESENTOUT); - return (true); -}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/boot/sys/boot/efi/loader/efiserialio.c Mon Dec 09 14:15:34 2019 +0000 @@ -0,0 +1,700 @@ +/* + * Copyright (c) 1998 Michael Smith (msmith@freebsd.org) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * We do not use this implementation with x86 till we can fix two issues: + * 1. Reliably identify the serial ports in correct order. + * 2. Ensure we get properly working reads from serial io. 
+ */ + +#include <sys/cdefs.h> + +#include <stand.h> +#include <sys/errno.h> +#include <bootstrap.h> +#include <stdbool.h> + +#include <efi.h> +#include <efilib.h> + +#include "loader_efi.h" + +static EFI_GUID serial = SERIAL_IO_PROTOCOL; + +#define COMC_TXWAIT 0x40000 /* transmit timeout */ + +#ifndef COMSPEED +#define COMSPEED 9600 +#endif + +#define PNP0501 0x501 /* 16550A-compatible COM port */ + +struct serial { + uint64_t baudrate; + uint8_t databits; + EFI_PARITY_TYPE parity; + EFI_STOP_BITS_TYPE stopbits; + uint8_t ignore_cd; /* boolean */ + uint8_t rtsdtr_off; /* boolean */ + int ioaddr; /* index in handles array */ + SERIAL_IO_INTERFACE *sio; +}; + +static void comc_probe(struct console *); +static int comc_init(struct console *, int); +static void comc_putchar(struct console *, int); +static int comc_getchar(struct console *); +static int comc_ischar(struct console *); +static int comc_ioctl(struct console *, int, void *); +static void comc_devinfo(struct console *); +static bool comc_setup(struct console *); +static char *comc_asprint_mode(struct serial *); +static int comc_parse_mode(struct serial *, const char *); +static int comc_mode_set(struct env_var *, int, const void *); +static int comc_cd_set(struct env_var *, int, const void *); +static int comc_rtsdtr_set(struct env_var *, int, const void *); + +struct console ttya = { + .c_name = "ttya", + .c_desc = "serial port a", + .c_flags = 0, + .c_probe = comc_probe, + .c_init = comc_init, + .c_out = comc_putchar, + .c_in = comc_getchar, + .c_ready = comc_ischar, + .c_ioctl = comc_ioctl, + .c_devinfo = comc_devinfo, + .c_private = NULL +}; + +struct console ttyb = { + .c_name = "ttyb", + .c_desc = "serial port b", + .c_flags = 0, + .c_probe = comc_probe, + .c_init = comc_init, + .c_out = comc_putchar, + .c_in = comc_getchar, + .c_ready = comc_ischar, + .c_ioctl = comc_ioctl, + .c_devinfo = comc_devinfo, + .c_private = NULL +}; + +struct console ttyc = { + .c_name = "ttyc", + .c_desc = "serial port c", 
+ .c_flags = 0, + .c_probe = comc_probe, + .c_init = comc_init, + .c_out = comc_putchar, + .c_in = comc_getchar, + .c_ready = comc_ischar, + .c_ioctl = comc_ioctl, + .c_devinfo = comc_devinfo, + .c_private = NULL +}; + +struct console ttyd = { + .c_name = "ttyd", + .c_desc = "serial port d", + .c_flags = 0, + .c_probe = comc_probe, + .c_init = comc_init, + .c_out = comc_putchar, + .c_in = comc_getchar, + .c_ready = comc_ischar, + .c_ioctl = comc_ioctl, + .c_devinfo = comc_devinfo, + .c_private = NULL +}; + +static EFI_STATUS +efi_serial_init(EFI_HANDLE **handlep, int *nhandles) +{ + UINTN bufsz = 0; + EFI_STATUS status; + EFI_HANDLE *handles; + + /* + * get buffer size + */ + *nhandles = 0; + handles = NULL; + status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles); + if (status != EFI_BUFFER_TOO_SMALL) + return (status); + + if ((handles = malloc(bufsz)) == NULL) + return (ENOMEM); + + *nhandles = (int)(bufsz / sizeof (EFI_HANDLE)); + /* + * get handle array + */ + status = BS->LocateHandle(ByProtocol, &serial, NULL, &bufsz, handles); + if (EFI_ERROR(status)) { + free(handles); + *nhandles = 0; + } else + *handlep = handles; + return (status); +} + +/* + * Find serial device number from device path. + * Return -1 if not found. + */ +static int +efi_serial_get_index(EFI_DEVICE_PATH *devpath) +{ + ACPI_HID_DEVICE_PATH *acpi; + + while (!IsDevicePathEnd(devpath)) { + if (DevicePathType(devpath) == ACPI_DEVICE_PATH && + DevicePathSubType(devpath) == ACPI_DP) { + + acpi = (ACPI_HID_DEVICE_PATH *)devpath; + if (acpi->HID == EISA_PNP_ID(PNP0501)) { + return (acpi->UID); + } + } + + devpath = NextDevicePathNode(devpath); + } + return (-1); +} + +/* + * The order of handles from LocateHandle() is not known, we need to + * iterate handles, pick device path for handle, and check the device + * number. 
+ */ +static EFI_HANDLE +efi_serial_get_handle(int port) +{ + EFI_STATUS status; + EFI_HANDLE *handles, handle; + EFI_DEVICE_PATH *devpath; + int index, nhandles; + + if (port == -1) + return (NULL); + + handles = NULL; + nhandles = 0; + status = efi_serial_init(&handles, &nhandles); + if (EFI_ERROR(status)) + return (NULL); + + handle = NULL; + for (index = 0; index < nhandles; index++) { + devpath = efi_lookup_devpath(handles[index]); + if (port == efi_serial_get_index(devpath)) { + handle = (handles[index]); + break; + } + } + + /* + * In case we did fail to identify the device by path, use port as + * array index. Note, we did check port == -1 above. + */ + if (port < nhandles && handle == NULL) + handle = handles[port]; + + free(handles); + return (handle); +} + +static void +comc_probe(struct console *cp) +{ + EFI_STATUS status; + EFI_HANDLE handle; + struct serial *port; + char name[20]; + char value[20]; + char *env; + + /* are we already set up? */ + if (cp->c_private != NULL) + return; + + cp->c_private = malloc(sizeof (struct serial)); + port = cp->c_private; + port->baudrate = COMSPEED; + + port->ioaddr = -1; /* invalid port */ + if (strcmp(cp->c_name, "ttya") == 0) + port->ioaddr = 0; + else if (strcmp(cp->c_name, "ttyb") == 0) + port->ioaddr = 1; + else if (strcmp(cp->c_name, "ttyc") == 0) + port->ioaddr = 2; + else if (strcmp(cp->c_name, "ttyd") == 0) + port->ioaddr = 3; + + port->databits = 8; /* 8,n,1 */ + port->parity = NoParity; /* 8,n,1 */ + port->stopbits = OneStopBit; /* 8,n,1 */ + port->ignore_cd = 1; /* ignore cd */ + port->rtsdtr_off = 0; /* rts-dtr is on */ + port->sio = NULL; + + handle = efi_serial_get_handle(port->ioaddr); + + if (handle != NULL) { + status = BS->OpenProtocol(handle, &serial, + (void**)&port->sio, IH, NULL, + EFI_OPEN_PROTOCOL_GET_PROTOCOL); + + if (EFI_ERROR(status)) + port->sio = NULL; + } + + snprintf(name, sizeof (name), "%s-mode", cp->c_name); + env = getenv(name); + + if (env != NULL) + (void) 
comc_parse_mode(port, env); + + env = comc_asprint_mode(port); + + if (env != NULL) { + unsetenv(name); + env_setenv(name, EV_VOLATILE, env, comc_mode_set, env_nounset); + free(env); + } + + snprintf(name, sizeof (name), "%s-ignore-cd", cp->c_name); + env = getenv(name); + if (env != NULL) { + if (strcmp(env, "true") == 0) + port->ignore_cd = 1; + else if (strcmp(env, "false") == 0) + port->ignore_cd = 0; + } + + snprintf(value, sizeof (value), "%s", + port->ignore_cd? "true" : "false"); + unsetenv(name); + env_setenv(name, EV_VOLATILE, value, comc_cd_set, env_nounset); + + snprintf(name, sizeof (name), "%s-rts-dtr-off", cp->c_name); + env = getenv(name); + if (env != NULL) { + if (strcmp(env, "true") == 0) + port->rtsdtr_off = 1; + else if (strcmp(env, "false") == 0) + port->rtsdtr_off = 0; + } + + snprintf(value, sizeof (value), "%s", + port->rtsdtr_off? "true" : "false"); + unsetenv(name); + env_setenv(name, EV_VOLATILE, value, comc_rtsdtr_set, env_nounset); + + cp->c_flags = 0; + if (comc_setup(cp)) + cp->c_flags = C_PRESENTIN | C_PRESENTOUT; +} + +static int +comc_init(struct console *cp, int arg __attribute((unused))) +{ + + if (comc_setup(cp)) + return (CMD_OK); + + cp->c_flags = 0; + return (CMD_ERROR); +} + +static void +comc_putchar(struct console *cp, int c) +{ + int wait; + EFI_STATUS status; + UINTN bufsz = 1; + char cb = c; + struct serial *sp = cp->c_private; + + if (sp->sio == NULL) + return; + + for (wait = COMC_TXWAIT; wait > 0; wait--) { + status = sp->sio->Write(sp->sio, &bufsz, &cb); + if (status != EFI_TIMEOUT) + break; + } +} + +static int +comc_getchar(struct console *cp) +{ + EFI_STATUS status; + UINTN bufsz = 1; + char c; + struct serial *sp = cp->c_private; + + if (sp->sio == NULL || !comc_ischar(cp)) + return (-1); + + status = sp->sio->Read(sp->sio, &bufsz, &c); + if (EFI_ERROR(status) || bufsz == 0) + return (-1); + + return (c); +} + +static int +comc_ischar(struct console *cp) +{ + EFI_STATUS status; + uint32_t control; + struct 
serial *sp = cp->c_private; + + if (sp->sio == NULL) + return (0); + + status = sp->sio->GetControl(sp->sio, &control); + if (EFI_ERROR(status)) + return (0); + + return (!(control & EFI_SERIAL_INPUT_BUFFER_EMPTY)); +} + +static int +comc_ioctl(struct console *cp __unused, int cmd __unused, void *data __unused) +{ + return (ENOTTY); +} + +static void +comc_devinfo(struct console *cp) +{ + struct serial *port = cp->c_private; + EFI_HANDLE handle; + EFI_DEVICE_PATH *dp; + CHAR16 *text; + + handle = efi_serial_get_handle(port->ioaddr); + if (handle == NULL) { + printf("\tdevice is not present"); + return; + } + + dp = efi_lookup_devpath(handle); + if (dp == NULL) + return; + + text = efi_devpath_name(dp); + if (text == NULL) + return; + + printf("\t%S", text); + efi_free_devpath_name(text); +} + +static char * +comc_asprint_mode(struct serial *sp) +{ + char par, *buf; + char *stop; + + if (sp == NULL) + return (NULL); + + switch (sp->parity) { + case NoParity: + par = 'n'; + break; + case EvenParity: + par = 'e'; + break; + case OddParity: + par = 'o'; + break; + case MarkParity: + par = 'm'; + break; + case SpaceParity: + par = 's'; + break; + default: + par = 'n'; + break; + } + + switch (sp->stopbits) { + case OneStopBit: + stop = "1"; + break; + case TwoStopBits: + stop = "2"; + break; + case OneFiveStopBits: + stop = "1.5"; + break; + default: + stop = "1"; + break; + } + + asprintf(&buf, "%ju,%d,%c,%s,-", sp->baudrate, sp->databits, par, stop); + return (buf); +} + +static int +comc_parse_mode(struct serial *sp, const char *value) +{ + unsigned long n; + uint64_t baudrate; + uint8_t databits = 8; + int parity = NoParity; + int stopbits = OneStopBit; + char *ep; + + if (value == NULL || *value == '\0') + return (CMD_ERROR); + + errno = 0; + n = strtoul(value, &ep, 10); + if (errno != 0 || *ep != ',') + return (CMD_ERROR); + baudrate = n; + + ep++; + n = strtoul(ep, &ep, 10); + if (errno != 0 || *ep != ',') + return (CMD_ERROR); + + switch (n) { + case 5: databits 
= 5; + break; + case 6: databits = 6; + break; + case 7: databits = 7; + break; + case 8: databits = 8; + break; + default: + return (CMD_ERROR); + } + + ep++; + switch (*ep++) { + case 'n': parity = NoParity; + break; + case 'e': parity = EvenParity; + break; + case 'o': parity = OddParity; + break; + case 'm': parity = MarkParity; + break; + case 's': parity = SpaceParity; + break; + default: + return (CMD_ERROR); + } + + if (*ep == ',') + ep++; + else + return (CMD_ERROR); + + switch (*ep++) { + case '1': stopbits = OneStopBit; + if (ep[0] == '.' && ep[1] == '5') { + ep += 2; + stopbits = OneFiveStopBits; + } + break; + case '2': stopbits = TwoStopBits; + break; + default: + return (CMD_ERROR); + } + + /* handshake is ignored, but we check syntax anyhow */ + if (*ep == ',') + ep++; + else + return (CMD_ERROR); + + switch (*ep++) { + case '-': + case 'h': + case 's': + break; + default: + return (CMD_ERROR); + } + + if (*ep != '\0') + return (CMD_ERROR); + + sp->baudrate = baudrate; + sp->databits = databits; + sp->parity = parity; + sp->stopbits = stopbits; + return (CMD_OK); +} + +static struct console * +get_console(char *name) +{ + struct console *cp = NULL; + + switch (name[3]) { + case 'a': cp = &ttya; + break; + case 'b': cp = &ttyb; + break; + case 'c': cp = &ttyc; + break; + case 'd': cp = &ttyd; + break; + } + return (cp); +} + +static int +comc_mode_set(struct env_var *ev, int flags, const void *value) +{ + struct console *cp; + + if (value == NULL) + return (CMD_ERROR); + + if ((cp = get_console(ev->ev_name)) == NULL) + return (CMD_ERROR); + + if (comc_parse_mode(cp->c_private, value) == CMD_ERROR) + return (CMD_ERROR); + + (void) comc_setup(cp); + + env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL); + + return (CMD_OK); +} + +static int +comc_cd_set(struct env_var *ev, int flags, const void *value) +{ + struct console *cp; + struct serial *sp; + + if (value == NULL) + return (CMD_ERROR); + + if ((cp = get_console(ev->ev_name)) == NULL) + 
return (CMD_ERROR); + + sp = cp->c_private; + if (strcmp(value, "true") == 0) + sp->ignore_cd = 1; + else if (strcmp(value, "false") == 0) + sp->ignore_cd = 0; + else + return (CMD_ERROR); + + (void) comc_setup(cp); + + env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL); + + return (CMD_OK); +} + +static int +comc_rtsdtr_set(struct env_var *ev, int flags, const void *value) +{ + struct console *cp; + struct serial *sp; + + if (value == NULL) + return (CMD_ERROR); + + if ((cp = get_console(ev->ev_name)) == NULL) + return (CMD_ERROR); + + sp = cp->c_private; + if (strcmp(value, "true") == 0) + sp->rtsdtr_off = 1; + else if (strcmp(value, "false") == 0) + sp->rtsdtr_off = 0; + else + return (CMD_ERROR); + + (void) comc_setup(cp); + + env_setenv(ev->ev_name, flags | EV_NOHOOK, value, NULL, NULL); + + return (CMD_OK); +} + +/* + * In case of error, we also reset ACTIVE flags, so the console + * framefork will try alternate consoles. + */ +static bool +comc_setup(struct console *cp) +{ + EFI_STATUS status; + UINT32 control; + struct serial *sp = cp->c_private; + + /* port is not usable */ + if (sp->sio == NULL) + return (false); + + status = sp->sio->Reset(sp->sio); + if (EFI_ERROR(status)) + return (false); + + status = sp->sio->SetAttributes(sp->sio, sp->baudrate, 0, 0, sp->parity, + sp->databits, sp->stopbits); + if (EFI_ERROR(status)) + return (false); + + status = sp->sio->GetControl(sp->sio, &control); + if (EFI_ERROR(status)) + return (false); + if (sp->rtsdtr_off) { + control &= ~(EFI_SERIAL_REQUEST_TO_SEND | + EFI_SERIAL_DATA_TERMINAL_READY); + } else { + control |= EFI_SERIAL_REQUEST_TO_SEND; + } + + (void) sp->sio->SetControl(sp->sio, control); + + /* Mark this port usable. */ + cp->c_flags |= (C_PRESENTIN | C_PRESENTOUT); + return (true); +}
--- a/usr/src/boot/sys/boot/efi/loader/i386/Makefile Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/efi/loader/i386/Makefile Mon Dec 09 14:15:34 2019 +0000 @@ -31,3 +31,6 @@ CLEANFILES += machine x86 $(EFIPROG) $(OBJS): machine x86 + +%.o: ../../../i386/libi386/%.c + $(COMPILE.c) $<
--- a/usr/src/boot/sys/boot/efi/loader/main.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/efi/loader/main.c Mon Dec 09 14:15:34 2019 +0000 @@ -81,7 +81,7 @@ EFI_DEVICE_PATH *devpath, *dp, *node; HARDDRIVE_DEVICE_PATH *hd; bool ret; - extern UINT64 start_sector; /* from multiboot.S */ + extern UINT64 start_sector; /* from mb_header.S */ /* This check is true for chainloader case. */ if (h == img->DeviceHandle)
--- a/usr/src/boot/sys/boot/i386/gptzfsboot/Makefile Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/i386/gptzfsboot/Makefile Mon Dec 09 14:15:34 2019 +0000 @@ -62,7 +62,7 @@ install: all $(ROOTBOOTPROG) -OBJS = multiboot.o zfsboot.o sio.o cons.o devopen.o \ +OBJS = mb_header.o zfsboot.o sio.o cons.o devopen.o \ part.o disk.o bcache.o zfs_cmd.o part.o := CPPFLAGS += -I$(ZLIB) @@ -108,8 +108,8 @@ %.o: ../../common/%.c $(COMPILE.c) $< -multiboot.o: ../../common/multiboot.S - $(COMPILE.S) ../../common/multiboot.S +%.o: ../../common/%.S + $(COMPILE.S) $< clobber: clean
--- a/usr/src/boot/sys/boot/i386/isoboot/Makefile Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/i386/isoboot/Makefile Mon Dec 09 14:15:34 2019 +0000 @@ -56,7 +56,7 @@ install: all $(ROOTBOOTPROG) -OBJS= multiboot.o isoboot.o sio.o drv.o cons.o gptldr.o +OBJS= mb_header.o isoboot.o sio.o drv.o cons.o gptldr.o CLEANFILES += isoboot
--- a/usr/src/boot/sys/boot/i386/libi386/comconsole.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/boot/sys/boot/i386/libi386/comconsole.c Mon Dec 09 14:15:34 2019 +0000 @@ -23,6 +23,11 @@ * SUCH DAMAGE. */ +/* + * This code is shared on BIOS and UEFI systems on x86 because + * we can access io ports on both platforms and the UEFI Serial IO protocol + * is not giving us reliable port order and we see issues with input. + */ #include <sys/cdefs.h> #include <stand.h> @@ -495,7 +500,7 @@ static uint32_t comc_parse_pcidev(const char *string) { -#ifdef NO_PCI +#ifdef EFI (void) string; return (0); #else @@ -539,7 +544,7 @@ static int comc_pcidev_handle(struct console *cp, uint32_t locator) { -#ifdef NO_PCI +#ifdef EFI (void) cp; (void) locator; return (CMD_ERROR);
--- a/usr/src/cmd/format/io.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/format/io.c Mon Dec 09 14:15:34 2019 +0000 @@ -49,7 +49,6 @@ extern int data_lineno; extern char *space2str(); -extern long strtol(); /* * This variable is used to determine whether a token is present in the pipe @@ -62,10 +61,6 @@ */ int last_token_type = 0; -#ifdef __STDC__ -/* - * Prototypes for ANSI C compilers - */ static int sup_get_token(char *); static void pushchar(int c); static int checkeof(void); @@ -79,33 +74,11 @@ static void sup_pushchar(int c); static int geti64(char *str, uint64_t *iptr, uint64_t *wild); -#else /* __STDC__ */ -/* - * Prototypes for non-ANSI C compilers - */ - -static int sup_get_token(); -static void pushchar(int c); -static int checkeof(void); -static void flushline(void); -static int strcnt(char *s1, char *s2); -static int getbn(char *str, diskaddr_t *iptr); -static void print_input_choices(int type, u_ioparam_t *param); -static int slist_widest_str(slist_t *slist); -static void ljust_print(char *str, int width); -static int sup_inputchar(void); -static void sup_pushchar(int c); -static int geti64(char *str, uint64_t *iptr, uint64_t *wild); - -#endif /* __STDC__ */ - - /* * This routine pushes the given character back onto the input stream. */ static void -pushchar(c) - int c; +pushchar(int c) { (void) ungetc(c, stdin); } @@ -114,7 +87,7 @@ * This routine checks the input stream for an eof condition. */ static int -checkeof() +checkeof(void) { return (feof(stdin)); } @@ -124,8 +97,7 @@ * basically any consecutive non-white characters. */ char * -gettoken(inbuf) - char *inbuf; +gettoken(char *inbuf) { char *ptr = inbuf; int c, quoted = 0; @@ -196,8 +168,7 @@ * This routine removes the leading and trailing spaces from a token. */ void -clean_token(cleantoken, token) - char *cleantoken, *token; +clean_token(char *cleantoken, char *token) { char *ptr; @@ -214,7 +185,7 @@ * Strip off trailing white-space. 
*/ for (ptr = cleantoken + strlen(cleantoken) - 1; - isspace(*ptr) && (ptr >= cleantoken); ptr--) { + isspace(*ptr) && (ptr >= cleantoken); ptr--) { *ptr = '\0'; } } @@ -254,8 +225,7 @@ * between s1 and s2, stopping as soon as a mismatch is found. */ static int -strcnt(s1, s2) - char *s1, *s2; +strcnt(char *s1, char *s2) { int i = 0; @@ -271,9 +241,7 @@ * is present, the wildcard value will be returned. */ int -geti(str, iptr, wild) - char *str; - int *iptr, *wild; +geti(char *str, int *iptr, int *wild) { char *str2; @@ -306,9 +274,7 @@ * is present, the wildcard value will be returned. */ static int -geti64(str, iptr, wild) - char *str; - uint64_t *iptr, *wild; +geti64(char *str, uint64_t *iptr, uint64_t *wild) { char *str2; @@ -345,9 +311,7 @@ * to the highest possible legal value. */ static int -getbn(str, iptr) - char *str; - diskaddr_t *iptr; +getbn(char *str, diskaddr_t *iptr) { char *cptr, *hptr, *sptr; int cyl, head, sect; @@ -442,13 +406,8 @@ * values and prompt strings. */ uint64_t -input(type, promptstr, delim, param, deflt, cmdflag) - int type; - char *promptstr; - int delim; - u_ioparam_t *param; - int *deflt; - int cmdflag; +input(int type, char *promptstr, int delim, u_ioparam_t *param, int *deflt, + int cmdflag) { int interactive, help, i, length, index, tied; blkaddr_t bn; @@ -586,7 +545,7 @@ cylno = bn2c(part_deflt->deflt_size) - 1; } else { cylno = (bn2c(part_deflt->deflt_size) + - part_deflt->start_cyl) - 1; + part_deflt->start_cyl) - 1; } fmt_print("[%ub, %uc, %de, %1.2fmb, %1.2fgb]", @@ -609,11 +568,11 @@ efi_deflt->end_sector, efi_deflt->start_sector + efi_deflt->end_sector - 1, (efi_deflt->end_sector * cur_blksz) / - (1024 * 1024), + (1024 * 1024), (efi_deflt->end_sector * cur_blksz) / - (1024 * 1024 * 1024), + (1024 * 1024 * 1024), (efi_deflt->end_sector * cur_blksz) / - ((uint64_t)1024 * 1024 * 1024 * 1024)); + ((uint64_t)1024 * 1024 * 1024 * 1024)); break; case FIO_OPINT: /* no default value for optional input type */ @@ -659,9 +618,9 @@ 
* exit gracefully. */ if ((strlcat(shell_argv, arg, sizeof (shell_argv)) >= - sizeof (shell_argv)) || + sizeof (shell_argv)) || (strlcat(shell_argv, " ", sizeof (shell_argv)) >= - sizeof (shell_argv))) { + sizeof (shell_argv))) { err_print("Error: Command line too long.\n"); fullabort(); } @@ -781,8 +740,8 @@ * If token is a '?' or a 'h', it is a request for help. */ if ((strcmp(cleantoken, "?") == 0) || - (strcmp(cleantoken, "h") == 0) || - (strcmp(cleantoken, "help") == 0)) { + (strcmp(cleantoken, "h") == 0) || + (strcmp(cleantoken, "help") == 0)) { help = 1; } /* @@ -813,12 +772,12 @@ * Convert token to a disk block number. */ if (cur_label == L_TYPE_EFI) { - if (geti64(cleantoken, (uint64_t *)&bn64, - (uint64_t *)NULL)) - break; + if (geti64(cleantoken, (uint64_t *)&bn64, + (uint64_t *)NULL)) + break; } else { - if (getbn(cleantoken, &bn64)) - break; + if (getbn(cleantoken, &bn64)) + break; } /* * Check to be sure it is within the legal bounds. @@ -1066,8 +1025,7 @@ * Return the value associated with the matched string. 
*/ case FIO_SLIST: - i = find_value((slist_t *)param->io_slist, - cleantoken, &value); + i = find_value((slist_t *)param->io_slist, cleantoken, &value); if (i == 1) { return (value); } else { @@ -1520,14 +1478,14 @@ fmt_print("Expecting up to %llu sectors,", cur_parts->etoc->efi_last_u_lba); fmt_print("or %llu megabytes,", - (cur_parts->etoc->efi_last_u_lba * cur_blksz)/ - (1024 * 1024)); + (cur_parts->etoc->efi_last_u_lba * cur_blksz) / + (1024 * 1024)); fmt_print("or %llu gigabytes\n", - (cur_parts->etoc->efi_last_u_lba * cur_blksz)/ - (1024 * 1024 * 1024)); + (cur_parts->etoc->efi_last_u_lba * cur_blksz) / + (1024 * 1024 * 1024)); fmt_print("or %llu terabytes\n", - (cur_parts->etoc->efi_last_u_lba * cur_blksz)/ - ((uint64_t)1024 * 1024 * 1024 * 1024)); + (cur_parts->etoc->efi_last_u_lba * cur_blksz) / + ((uint64_t)1024 * 1024 * 1024 * 1024)); break; } @@ -1591,12 +1549,12 @@ * Token is number of blocks */ if (geti64(cleantoken, &blokno, (uint64_t *)NULL)) { - break; + break; } if (blokno > bounds->upper) { - err_print( -"Number of blocks must be less that the total available blocks.\n"); - break; + err_print("Number of blocks must be less that " + "the total available blocks.\n"); + break; } return (blokno); @@ -1614,8 +1572,8 @@ * Some sanity check */ if (blokno < efi_deflt->start_sector) { - err_print( -"End Sector must fall on or after start sector %llu\n", + err_print("End Sector must fall on or after " + "start sector %llu\n", efi_deflt->start_sector); break; } @@ -1624,8 +1582,8 @@ * verify that our input is within range */ if (blokno > cur_parts->etoc->efi_last_u_lba) { - err_print( -"End Sector %llu is beyond max Sector %llu\n", + err_print("End Sector %llu is beyond max " + "Sector %llu\n", blokno, cur_parts->etoc->efi_last_u_lba); break; } @@ -1681,11 +1639,11 @@ break; } return (uint64_t)((float)nmegs * 1024.0 * - 1024.0 * 1024.0 * 1024.0 / cur_blksz); + 1024.0 * 1024.0 * 1024.0 / cur_blksz); default: - err_print( -"Please specify units in either 
b(number of blocks), e(end sector),\n"); + err_print("Please specify units in either " + "b(number of blocks), e(end sector),\n"); err_print(" g(gigabytes), m(megabytes)"); err_print(" or t(terabytes)\n"); break; @@ -1721,9 +1679,7 @@ * Print input choices */ static void -print_input_choices(type, param) - int type; - u_ioparam_t *param; +print_input_choices(int type, u_ioparam_t *param) { char **sp; slist_t *lp; @@ -1803,10 +1759,7 @@ * associated with the matched string in match_value. */ int -find_value(slist, match_str, match_value) - slist_t *slist; - char *match_str; - int *match_value; +find_value(slist_t *slist, char *match_str, int *match_value) { int i; int nmatches; @@ -1851,9 +1804,7 @@ * Return the string associated with that value. */ char * -find_string(slist, match_value) - slist_t *slist; - int match_value; +find_string(slist_t *slist, int match_value) { for (; slist->str != NULL; slist++) { if (slist->value == match_value) { @@ -1861,15 +1812,14 @@ } } - return ((char *)NULL); + return (NULL); } /* * Return the width of the widest string in an slist */ static int -slist_widest_str(slist) - slist_t *slist; +slist_widest_str(slist_t *slist) { int i; int width; @@ -1887,9 +1837,7 @@ * Print a string left-justified to a fixed width. */ static void -ljust_print(str, width) - char *str; - int width; +ljust_print(char *str, int width) { int i; @@ -2050,9 +1998,7 @@ * data is not crud, so be rather defensive. */ void -print_buf(buf, nbytes) - char *buf; - int nbytes; +print_buf(char *buf, int nbytes) { int c; @@ -2072,13 +2018,12 @@ * booting. */ void -pr_ctlrline(ctlr) - register struct ctlr_info *ctlr; +pr_ctlrline(struct ctlr_info *ctlr) { fmt_print(" %s%d at %s 0x%x ", - ctlr->ctlr_cname, ctlr->ctlr_num, - space2str(ctlr->ctlr_space), ctlr->ctlr_addr); + ctlr->ctlr_cname, ctlr->ctlr_num, + space2str(ctlr->ctlr_space), ctlr->ctlr_addr); if (ctlr->ctlr_vec != 0) fmt_print("vec 0x%x ", ctlr->ctlr_vec); else @@ -2093,9 +2038,7 @@ * booting. 
*/ void -pr_diskline(disk, num) - register struct disk_info *disk; - int num; +pr_diskline(struct disk_info *disk, int num) { struct ctlr_info *ctlr = disk->disk_ctlr; struct disk_type *type = disk->disk_type; @@ -2103,13 +2046,13 @@ fmt_print(" %4d. %s ", num, disk->disk_name); if ((type != NULL) && (disk->label_type == L_TYPE_SOLARIS)) { fmt_print("<%s cyl %u alt %u hd %u sec %u>", - type->dtype_asciilabel, type->dtype_ncyl, - type->dtype_acyl, type->dtype_nhead, - type->dtype_nsect); + type->dtype_asciilabel, type->dtype_ncyl, + type->dtype_acyl, type->dtype_nhead, + type->dtype_nsect); } else if ((type != NULL) && (disk->label_type == L_TYPE_EFI)) { cur_blksz = disk->disk_lbasize; print_efi_string(type->vendor, type->product, - type->revision, type->capacity); + type->revision, type->capacity); } else if (disk->disk_flags & DSK_RESERVED) { fmt_print("<drive not available: reserved>"); } else if (disk->disk_flags & DSK_UNAVAILABLE) { @@ -2127,9 +2070,9 @@ fmt_print(" %s\n", disk->devfs_name); } else { fmt_print(" %s%d at %s%d slave %d\n", - ctlr->ctlr_dname, disk->disk_dkinfo.dki_unit, - ctlr->ctlr_cname, ctlr->ctlr_num, - disk->disk_dkinfo.dki_slave); + ctlr->ctlr_dname, disk->disk_dkinfo.dki_unit, + ctlr->ctlr_cname, ctlr->ctlr_num, + disk->disk_dkinfo.dki_slave); } #ifdef OLD @@ -2141,8 +2084,7 @@ } fmt_print("\n"); if (type != NULL) { - fmt_print( -" %s%d: <%s cyl %u alt %u hd %u sec %u>\n", + fmt_print(" %s%d: <%s cyl %u alt %u hd %u sec %u>\n", ctlr->ctlr_dname, disk->disk_dkinfo.dki_unit, type->dtype_asciilabel, type->dtype_ncyl, type->dtype_acyl, type->dtype_nhead, @@ -2175,7 +2117,7 @@ * track of the current line in the data file via a global variable. */ static int -sup_inputchar() +sup_inputchar(void) { int c; @@ -2210,8 +2152,7 @@ * This routine pushes a character back onto the input pipe for the data file. 
*/ static void -sup_pushchar(c) - int c; +sup_pushchar(int c) { (void) ungetc(c, data_file); } @@ -2230,16 +2171,14 @@ * last token around, which is useful for error recovery. */ int -sup_gettoken(buf) - char *buf; +sup_gettoken(char *buf) { last_token_type = sup_get_token(buf); return (last_token_type); } static int -sup_get_token(buf) - char *buf; +sup_get_token(char *buf) { char *ptr = buf; int c, quoted = 0; @@ -2288,7 +2227,7 @@ * a token. */ if (!quoted && (c == '=' || c == ',' || c == ':' || - c == '#' || c == '|' || c == '&' || c == '~')) + c == '#' || c == '|' || c == '&' || c == '~')) break; /* * Store the character if there's room left. @@ -2350,9 +2289,7 @@ * Push back a token */ void -sup_pushtoken(token_buf, token_type) - char *token_buf; - int token_type; +sup_pushtoken(char *token_buf, int token_type) { /* * We can only push one token back at a time @@ -2369,9 +2306,7 @@ * and EOF. */ void -get_inputline(line, nbytes) - char *line; - int nbytes; +get_inputline(char *line, int nbytes) { char *p = line; int c; @@ -2481,9 +2416,9 @@ /* reopen file descriptor if one was open before */ if (cur_disk != NULL) { if ((cur_file = open_disk(cur_disk->disk_path, - O_RDWR | O_NDELAY)) < 0) { + O_RDWR | O_NDELAY)) < 0) { err_print("Error: can't reopen selected disk '%s'. \n", - cur_disk->disk_name); + cur_disk->disk_name); fullabort(); } }
--- a/usr/src/cmd/format/label.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/format/label.c Mon Dec 09 14:15:34 2019 +0000 @@ -282,7 +282,7 @@ if (cur_label == L_TYPE_EFI) { enter_critical(); vtoc64 = cur_parts->etoc; - err_check(vtoc64); + efi_err_check(vtoc64); if (efi_write(cur_file, vtoc64) != 0) { err_print("Warning: error writing EFI.\n"); error = -1; @@ -979,97 +979,6 @@ return (0); } -/* make sure the user specified something reasonable */ -void -err_check(struct dk_gpt *vtoc) -{ - int resv_part = -1; - int i, j; - diskaddr_t istart, jstart, isize, jsize, endsect; - int overlap = 0; - uint_t reserved; - - /* - * make sure no partitions overlap - */ - reserved = efi_reserved_sectors(vtoc); - for (i = 0; i < vtoc->efi_nparts; i++) { - /* It can't be unassigned and have an actual size */ - if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) && - (vtoc->efi_parts[i].p_size != 0)) { - (void) fprintf(stderr, -"partition %d is \"unassigned\" but has a size of %llu\n", i, - vtoc->efi_parts[i].p_size); - } - if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) { - continue; - } - if (vtoc->efi_parts[i].p_tag == V_RESERVED) { - if (resv_part != -1) { - (void) fprintf(stderr, -"found duplicate reserved partition at %d\n", i); - } - resv_part = i; - if (vtoc->efi_parts[i].p_size != reserved) - (void) fprintf(stderr, -"Warning: reserved partition size must be %u sectors\n", - reserved); - } - if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) || - (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) { - (void) fprintf(stderr, - "Partition %d starts at %llu\n", - i, - vtoc->efi_parts[i].p_start); - (void) fprintf(stderr, - "It must be between %llu and %llu.\n", - vtoc->efi_first_u_lba, - vtoc->efi_last_u_lba); - } - if ((vtoc->efi_parts[i].p_start + - vtoc->efi_parts[i].p_size < - vtoc->efi_first_u_lba) || - (vtoc->efi_parts[i].p_start + - vtoc->efi_parts[i].p_size > - vtoc->efi_last_u_lba + 1)) { - (void) fprintf(stderr, - "Partition %d ends at %llu\n", - i, - 
vtoc->efi_parts[i].p_start + - vtoc->efi_parts[i].p_size); - (void) fprintf(stderr, - "It must be between %llu and %llu.\n", - vtoc->efi_first_u_lba, - vtoc->efi_last_u_lba); - } - - for (j = 0; j < vtoc->efi_nparts; j++) { - isize = vtoc->efi_parts[i].p_size; - jsize = vtoc->efi_parts[j].p_size; - istart = vtoc->efi_parts[i].p_start; - jstart = vtoc->efi_parts[j].p_start; - if ((i != j) && (isize != 0) && (jsize != 0)) { - endsect = jstart + jsize -1; - if ((jstart <= istart) && - (istart <= endsect)) { - if (!overlap) { - (void) fprintf(stderr, -"label error: EFI Labels do not support overlapping partitions\n"); - } - (void) fprintf(stderr, -"Partition %d overlaps partition %d.\n", i, j); - overlap = 1; - } - } - } - } - /* make sure there is a reserved partition */ - if (resv_part == -1) { - (void) fprintf(stderr, - "no reserved partition found\n"); - } -} - #ifdef DEBUG static void dump_label(struct dk_label *label)
--- a/usr/src/cmd/format/label.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/format/label.h Mon Dec 09 14:15:34 2019 +0000 @@ -44,7 +44,7 @@ int get_disk_info(int, struct efi_info *, struct disk_info *); int label_to_vtoc(struct extvtoc *, struct dk_label *); int SMI_vtoc_to_EFI(int, struct dk_gpt **); -void err_check(struct dk_gpt *); +void efi_err_check(struct dk_gpt *); extern int is_efi_type(int); #ifdef __cplusplus
--- a/usr/src/cmd/format/menu_command.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/format/menu_command.c Mon Dec 09 14:15:34 2019 +0000 @@ -1709,7 +1709,7 @@ return (-1); } if (efi_write(cur_file, vtoc64) != 0) { - err_check(vtoc64); + efi_err_check(vtoc64); err_print("Warning: error writing EFI.\n"); return (-1); } else {
--- a/usr/src/cmd/format/menu_partition.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/format/menu_partition.c Mon Dec 09 14:15:34 2019 +0000 @@ -35,25 +35,15 @@ #include "misc.h" #include "param.h" -#ifdef __STDC__ - /* Function prototypes for ANSI C Compilers */ static void nspaces(int); static int ndigits(uint64_t); -#else /* __STDC__ */ - -/* Function prototypes for non-ANSI C Compilers */ -static void nspaces(); -static int ndigits(); - -#endif /* __STDC__ */ - /* * This routine implements the 'a' command. It changes the 'a' partition. */ int -p_apart() +p_apart(void) { change_partition(0); @@ -64,7 +54,7 @@ * This routine implements the 'b' command. It changes the 'b' partition. */ int -p_bpart() +p_bpart(void) { change_partition(1); @@ -75,7 +65,7 @@ * This routine implements the 'c' command. It changes the 'c' partition. */ int -p_cpart() +p_cpart(void) { change_partition(2); @@ -86,7 +76,7 @@ * This routine implements the 'd' command. It changes the 'd' partition. */ int -p_dpart() +p_dpart(void) { change_partition(3); @@ -97,7 +87,7 @@ * This routine implements the 'e' command. It changes the 'e' partition. */ int -p_epart() +p_epart(void) { change_partition(4); @@ -108,7 +98,7 @@ * This routine implements the 'f' command. It changes the 'f' partition. */ int -p_fpart() +p_fpart(void) { change_partition(5); @@ -119,7 +109,7 @@ * This routine implements the 'g' command. It changes the 'g' partition. */ int -p_gpart() +p_gpart(void) { change_partition(6); @@ -130,7 +120,7 @@ * This routine implements the 'h' command. It changes the 'h' partition. */ int -p_hpart() +p_hpart(void) { change_partition(7); @@ -142,7 +132,7 @@ * labeled disks. This can be used only in expert mode. */ int -p_ipart() +p_ipart(void) { change_partition(8); return (0); @@ -153,7 +143,7 @@ * This routine implements the 'j' command. It changes the 'j' partition. 
*/ int -p_jpart() +p_jpart(void) { change_partition(9); @@ -162,7 +152,7 @@ #endif /* defined(i386) */ int -p_expand() +p_expand(void) { uint64_t delta; uint_t nparts; @@ -193,7 +183,7 @@ * to make a pre-defined partition map the current map. */ int -p_select() +p_select(void) { struct partition_info *pptr, *parts; u_ioparam_t ioparam; @@ -254,8 +244,8 @@ cyl_offset = pptr->pinfo_map[I_PARTITION].dkl_cylno + 1; if (pptr->pinfo_map[J_PARTITION].dkl_nblk != 0) { cyl_offset = pptr->pinfo_map[J_PARTITION].dkl_cylno + - ((pptr->pinfo_map[J_PARTITION].dkl_nblk + - (spc() - 1)) / spc()); + ((pptr->pinfo_map[J_PARTITION].dkl_nblk + + (spc() - 1)) / spc()); } #else /* !defined(i386) */ @@ -281,11 +271,10 @@ } #endif /* defined(i386) */ if (pptr->pinfo_map[i].dkl_cylno < b_cylno || - pptr->pinfo_map[i].dkl_cylno > (ncyl-1)) { - err_print( -"partition %c: starting cylinder %d is out of range\n", - (PARTITION_BASE+i), - pptr->pinfo_map[i].dkl_cylno); + pptr->pinfo_map[i].dkl_cylno > (ncyl-1)) { + err_print("partition %c: starting cylinder %d is out " + "of range\n", (PARTITION_BASE + i), + pptr->pinfo_map[i].dkl_cylno); return (0); } if (pptr->pinfo_map[i].dkl_nblk > ((ncyl - @@ -336,7 +325,7 @@ * to be created. 
*/ int -p_name() +p_name(void) { char *name; @@ -493,25 +482,23 @@ ncyl2_digits = ndigits(map->efi_last_u_lba); if (want_header) { - fmt_print("Part "); - fmt_print("Tag Flag "); - fmt_print("First Sector"); - nspaces(ncyl2_digits); - fmt_print("Size"); - nspaces(ncyl2_digits); - fmt_print("Last Sector\n"); + fmt_print("Part "); + fmt_print("Tag Flag "); + fmt_print("First Sector"); + nspaces(ncyl2_digits); + fmt_print("Size"); + nspaces(ncyl2_digits); + fmt_print("Last Sector\n"); } fmt_print(" %d ", partnum); - s = find_string(ptag_choices, - (int)map->efi_parts[partnum].p_tag); + s = find_string(ptag_choices, (int)map->efi_parts[partnum].p_tag); if (s == (char *)NULL) s = "-"; nspaces(10 - (int)strlen(s)); fmt_print("%s", s); - s = find_string(pflag_choices, - (int)map->efi_parts[partnum].p_flag); + s = find_string(pflag_choices, (int)map->efi_parts[partnum].p_flag); if (s == (char *)NULL) s = "-"; nspaces(6 - (int)strlen(s)); @@ -521,28 +508,27 @@ secsize = map->efi_parts[partnum].p_size; if (secsize == 0) { - fmt_print("%16llu", map->efi_parts[partnum].p_start); - nspaces(ncyl2_digits); - fmt_print(" 0 "); + fmt_print("%16llu", map->efi_parts[partnum].p_start); + nspaces(ncyl2_digits); + fmt_print(" 0 "); } else { - fmt_print("%16llu", map->efi_parts[partnum].p_start); - scaled = bn2mb(secsize); - nspaces(ncyl2_digits - 5); - if (scaled >= (float)1024.0 * 1024) { - fmt_print("%8.2fTB", scaled/((float)1024.0 * 1024)); - } else if (scaled >= (float)1024.0) { - fmt_print("%8.2fGB", scaled/(float)1024.0); - } else { - fmt_print("%8.2fMB", scaled); - } + fmt_print("%16llu", map->efi_parts[partnum].p_start); + scaled = bn2mb(secsize); + nspaces(ncyl2_digits - 5); + if (scaled >= (float)1024.0 * 1024) { + fmt_print("%8.2fTB", scaled/((float)1024.0 * 1024)); + } else if (scaled >= (float)1024.0) { + fmt_print("%8.2fGB", scaled/(float)1024.0); + } else { + fmt_print("%8.2fMB", scaled); + } } nspaces(ncyl2_digits); - if ((map->efi_parts[partnum].p_start+secsize - 1) == 
- UINT_MAX64) { - fmt_print(" 0 \n"); + if ((map->efi_parts[partnum].p_start + secsize - 1) == UINT_MAX64) { + fmt_print(" 0 \n"); } else { - fmt_print(" %llu \n", - map->efi_parts[partnum].p_start+secsize - 1); + fmt_print(" %llu \n", + map->efi_parts[partnum].p_start + secsize - 1); } } @@ -607,8 +593,7 @@ /* * Print the partition tag. If invalid, print - */ - s = find_string(ptag_choices, - (int)pinfo->vtoc.v_part[partnum].p_tag); + s = find_string(ptag_choices, (int)pinfo->vtoc.v_part[partnum].p_tag); if (s == (char *)NULL) s = "-"; nspaces(10 - (int)strlen(s)); @@ -617,9 +602,8 @@ /* * Print the partition flag. If invalid print - */ - s = find_string(pflag_choices, - (int)pinfo->vtoc.v_part[partnum].p_flag); - if (s == (char *)NULL) + s = find_string(pflag_choices, (int)pinfo->vtoc.v_part[partnum].p_flag); + if (s == NULL) s = "-"; nspaces(6 - (int)strlen(s)); fmt_print("%s", s); @@ -637,7 +621,7 @@ scaled = bn2mb(nblks); if (scaled > (float)1024.0 * 1024.0) { fmt_print("%8.2fTB ", - scaled/((float)1024.0 * 1024.0)); + scaled/((float)1024.0 * 1024.0)); } else if (scaled > (float)1024.0) { fmt_print("%8.2fGB ", scaled/(float)1024.0); } else { @@ -666,8 +650,7 @@ * Return true if a disk has a volume name */ int -chk_volname(disk) - struct disk_info *disk; +chk_volname(struct disk_info *disk) { return (disk->v_volume[0] != 0); } @@ -677,8 +660,7 @@ * Print the volume name, if it appears to be set */ void -print_volname(disk) - struct disk_info *disk; +print_volname(struct disk_info *disk) { int i; char *p; @@ -696,8 +678,7 @@ * Print a number of spaces */ static void -nspaces(n) - int n; +nspaces(int n) { while (n-- > 0) fmt_print(" "); @@ -707,8 +688,7 @@ * Return the number of digits required to print a number */ static int -ndigits(n) - uint64_t n; +ndigits(uint64_t n) { int i;
--- a/usr/src/cmd/format/modify_partition.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/format/modify_partition.c Mon Dec 09 14:15:34 2019 +0000 @@ -38,26 +38,11 @@ #include "label.h" #include "auto_sense.h" -#ifdef __STDC__ - -/* Function prototypes for ANSI C Compilers */ - static void adj_cyl_offset(struct dk_map32 *map); static int check_map(struct dk_map32 *map); static void get_user_map(struct dk_map32 *map, int float_part); static void get_user_map_efi(struct dk_gpt *map, int float_part); -#else /* __STDC__ */ - -/* Function prototypes for non-ANSI C Compilers */ - -static void adj_cyl_offset(); -static int check_map(); -static void get_user_map(); -static void get_user_map_efi(); - -#endif /* __STDC__ */ - static char *partn_list[] = { "0", "1", "2", "3", "4", "5", "6", "7", NULL }; static char *sel_list[] = { "0", "1", "2", "3", NULL }; @@ -69,7 +54,7 @@ * Modify/Create a predefined partition table. */ int -p_modify() +p_modify(void) { struct partition_info tmp_pinfo[1]; struct dk_map32 *map = tmp_pinfo->pinfo_map; @@ -134,89 +119,92 @@ */ if (cur_parts->pinfo_name != NULL) { (void) snprintf(tmpstr, sizeof (tmpstr), - "\t0. Current partition table (%s)", - cur_parts->pinfo_name); + "\t0. Current partition table (%s)", + cur_parts->pinfo_name); } else { (void) sprintf(tmpstr, - "\t0. Current partition table (unnamed)"); + "\t0. Current partition table (unnamed)"); } (void) snprintf(tmpstr2, sizeof (tmpstr2), -"Select partitioning base:\n%s\n" -"\t1. All Free Hog\n" -"Choose base (enter number) ", - tmpstr); + "Select partitioning base:\n%s\n" + "\t1. All Free Hog\n" + "Choose base (enter number) ", + tmpstr); ioparam.io_charlist = sel_list; sel_type = input(FIO_MSTR, tmpstr2, '?', &ioparam, - &sel_type, DATA_INPUT); + &sel_type, DATA_INPUT); switch (cur_label) { case L_TYPE_SOLARIS: - if (sel_type == 0) { - /* - * Check for invalid parameters but do - * not modify the table. 
- */ - if (check_map(cur_parts->pinfo_map)) { - err_print("\ -Warning: Fix, or select a different partition table.\n"); - return (0); - } - /* - * Create partition map from existing map - */ - tmp_pinfo->vtoc = cur_parts->vtoc; - for (i = 0; i < NDKMAP; i++) { - map[i].dkl_nblk = cur_parts->pinfo_map[i].dkl_nblk; - map[i].dkl_cylno = cur_parts->pinfo_map[i].dkl_cylno; - } - } else { - /* - * Make an empty partition map, with all the space - * in the c partition. - */ - set_vtoc_defaults(tmp_pinfo); - for (i = 0; i < NDKMAP; i++) { - map[i].dkl_nblk = 0; - map[i].dkl_cylno = 0; - } - map[C_PARTITION].dkl_nblk = ncyl * spc(); + if (sel_type == 0) { + /* + * Check for invalid parameters but do + * not modify the table. + */ + if (check_map(cur_parts->pinfo_map)) { + err_print("Warning: Fix, or select a " + "different partition table.\n"); + return (0); + } + /* + * Create partition map from existing map + */ + tmp_pinfo->vtoc = cur_parts->vtoc; + for (i = 0; i < NDKMAP; i++) { + map[i].dkl_nblk = + cur_parts->pinfo_map[i].dkl_nblk; + map[i].dkl_cylno = + cur_parts->pinfo_map[i].dkl_cylno; + } + } else { + /* + * Make an empty partition map, with all the space + * in the c partition. + */ + set_vtoc_defaults(tmp_pinfo); + for (i = 0; i < NDKMAP; i++) { + map[i].dkl_nblk = 0; + map[i].dkl_cylno = 0; + } + map[C_PARTITION].dkl_nblk = ncyl * spc(); #if defined(i386) - /* - * Adjust for the boot and possibly alternates partitions - */ - map[I_PARTITION].dkl_nblk = spc(); - map[I_PARTITION].dkl_cylno = 0; - if (cur_ctype->ctype_ctype != DKC_SCSI_CCS) { - map[J_PARTITION].dkl_nblk = 2 * spc(); - map[J_PARTITION].dkl_cylno = spc() / spc(); - } + /* + * Adjust for the boot and possibly alternates + * partitions. 
+ */ + map[I_PARTITION].dkl_nblk = spc(); + map[I_PARTITION].dkl_cylno = 0; + if (cur_ctype->ctype_ctype != DKC_SCSI_CCS) { + map[J_PARTITION].dkl_nblk = 2 * spc(); + map[J_PARTITION].dkl_cylno = spc() / spc(); + } #endif /* defined(i386) */ - } - break; + } + break; case L_TYPE_EFI: - if (sel_type == 1) { - for (i = 0; i < cur_parts->etoc->efi_nparts; i++) { - cur_parts->etoc->efi_parts[i].p_start = 0; - cur_parts->etoc->efi_parts[i].p_size = 0; + if (sel_type == 1) { + for (i = 0; i < cur_parts->etoc->efi_nparts; i++) { + cur_parts->etoc->efi_parts[i].p_start = 0; + cur_parts->etoc->efi_parts[i].p_size = 0; + } } - } - break; + break; } fmt_print("\n"); if (cur_label == L_TYPE_SOLARIS) { - print_map(tmp_pinfo); + print_map(tmp_pinfo); } else { - print_map(cur_parts); + print_map(cur_parts); } ioparam.io_charlist = confirm_list; - if (input(FIO_MSTR, -"Do you wish to continue creating a new partition\ntable based on above table", - '?', &ioparam, &inpt_dflt, DATA_INPUT)) { + if (input(FIO_MSTR, "Do you wish to continue creating a new " + "partition\ntable based on above table", + '?', &ioparam, &inpt_dflt, DATA_INPUT)) { return (0); } @@ -228,11 +216,11 @@ free_hog = G_PARTITION; /* default to g partition */ ioparam.io_charlist = partn_list; free_hog = input(FIO_MSTR, "Free Hog partition", '?', - &ioparam, &free_hog, DATA_INPUT); + &ioparam, &free_hog, DATA_INPUT); /* disallow c partition */ if (free_hog == C_PARTITION) { fmt_print("'%c' cannot be the 'Free Hog' partition.\n", - C_PARTITION + PARTITION_BASE); + C_PARTITION + PARTITION_BASE); free_hog = -1; continue; } @@ -246,7 +234,7 @@ map[free_hog].dkl_nblk -= map[I_PARTITION].dkl_nblk; if (cur_ctype->ctype_ctype != DKC_SCSI_CCS) { map[free_hog].dkl_nblk -= - map[J_PARTITION].dkl_nblk; + map[J_PARTITION].dkl_nblk; } #endif /* defined(i386) */ break; @@ -256,11 +244,11 @@ * the float partition. 
*/ if (map[free_hog].dkl_nblk == 0) { - err_print("\ -Warning: No space available from Free Hog partition.\n"); + err_print("Warning: No space available from Free Hog " + "partition.\n"); ioparam.io_charlist = confirm_list; if (input(FIO_MSTR, "Continue", '?', - &ioparam, &inpt_dflt, DATA_INPUT)) { + &ioparam, &inpt_dflt, DATA_INPUT)) { free_hog = -1; } } @@ -268,27 +256,27 @@ inpt_dflt = 0; if (cur_label == L_TYPE_EFI) { - free_hog = G_PARTITION; /* default to g partition */ - ioparam.io_charlist = partn_list; - free_hog = input(FIO_MSTR, "Free Hog partition", '?', - &ioparam, &free_hog, DATA_INPUT); - /* disallow c partition */ - if (free_hog == C_PARTITION) { - fmt_print("'%c' cannot be the 'Free Hog' partition.\n", - C_PARTITION + PARTITION_BASE); - return (-1); - } - get_user_map_efi(cur_parts->etoc, free_hog); - print_map(cur_parts); - if (check("Ready to label disk, continue")) { - return (-1); - } - fmt_print("\n"); - if (write_label()) { - err_print("Writing label failed\n"); - return (-1); - } - return (0); + free_hog = G_PARTITION; /* default to g partition */ + ioparam.io_charlist = partn_list; + free_hog = input(FIO_MSTR, "Free Hog partition", '?', + &ioparam, &free_hog, DATA_INPUT); + /* disallow c partition */ + if (free_hog == C_PARTITION) { + fmt_print("'%c' cannot be the 'Free Hog' partition.\n", + C_PARTITION + PARTITION_BASE); + return (-1); + } + get_user_map_efi(cur_parts->etoc, free_hog); + print_map(cur_parts); + if (check("Ready to label disk, continue")) { + return (-1); + } + fmt_print("\n"); + if (write_label()) { + err_print("Writing label failed\n"); + return (-1); + } + return (0); } /* * get user modified partition table @@ -304,9 +292,8 @@ print_map(tmp_pinfo); ioparam.io_charlist = confirm_list; - if (input(FIO_MSTR, "\ -Okay to make this the current partition table", '?', - &ioparam, &inpt_dflt, DATA_INPUT)) { + if (input(FIO_MSTR, "Okay to make this the current partition table", + '?', &ioparam, &inpt_dflt, DATA_INPUT)) { return 
(0); } else { make_partition(); @@ -318,9 +305,9 @@ cur_parts->pinfo_map[i].dkl_cylno = map[i].dkl_cylno; #ifdef i386 cur_parts->vtoc.v_part[i].p_start = - map[i].dkl_cylno * nhead * nsect; + map[i].dkl_cylno * nhead * nsect; cur_parts->vtoc.v_part[i].p_size = - map[i].dkl_nblk; + map[i].dkl_nblk; #endif } (void) p_name(); @@ -340,14 +327,11 @@ } } - - /* * Adjust cylinder offsets */ static void -adj_cyl_offset(map) - struct dk_map32 *map; +adj_cyl_offset(struct dk_map32 *map) { int i; int cyloffset = 0; @@ -390,8 +374,7 @@ * Check partition table */ static int -check_map(map) - struct dk_map32 *map; +check_map(struct dk_map32 *map) { int i; int cyloffset = 0; @@ -411,16 +394,16 @@ */ for (i = 0; i < NDKMAP; i++) { if (map[i].dkl_cylno > (blkaddr32_t)ncyl-1) { - err_print("\ -Warning: Partition %c starting cylinder %d is out of range.\n", - (PARTITION_BASE+i), map[i].dkl_cylno); + err_print("Warning: Partition %c starting cylinder " + "%d is out of range.\n", + (PARTITION_BASE+i), map[i].dkl_cylno); return (-1); } if (map[i].dkl_nblk > - (blkaddr32_t)(ncyl - map[i].dkl_cylno) * spc()) { - err_print("\ -Warning: Partition %c, specified # of blocks, %u, is out of range.\n", - (PARTITION_BASE+i), map[i].dkl_nblk); + (blkaddr32_t)(ncyl - map[i].dkl_cylno) * spc()) { + err_print("Warning: Partition %c, specified # of " + "blocks, %u, is out of range.\n", + (PARTITION_BASE+i), map[i].dkl_nblk); return (-1); } if (i != C_PARTITION && map[i].dkl_nblk) { @@ -429,21 +412,21 @@ continue; #endif if (map[i].dkl_cylno < cyloffset) { - err_print( -"Warning: Overlapping partition (%c) in table.\n", PARTITION_BASE+i); + err_print("Warning: Overlapping partition " + "(%c) in table.\n", PARTITION_BASE+i); return (-1); } else if (map[i].dkl_cylno > cyloffset) { - err_print( -"Warning: Non-contiguous partition (%c) in table.\n", PARTITION_BASE+i); + err_print("Warning: Non-contiguous partition " + "(%c) in table.\n", PARTITION_BASE+i); } cyloffset += (map[i].dkl_nblk + 
(spc()-1))/spc(); tot_blks = map[i].dkl_nblk; } } if (tot_blks > map[C_PARTITION].dkl_nblk) { - err_print("\ -Warning: Total blocks used is greater than number of blocks in '%c'\n\ -\tpartition.\n", C_PARTITION + PARTITION_BASE); + err_print("Warning: Total blocks used is greater than number " + "of blocks in '%c'\n\tpartition.\n", + C_PARTITION + PARTITION_BASE); return (-1); } return (0); @@ -455,9 +438,7 @@ * get user defined partitions */ static void -get_user_map(map, float_part) - struct dk_map32 *map; - int float_part; +get_user_map(struct dk_map32 *map, int float_part) { int i; blkaddr32_t newsize; @@ -471,24 +452,24 @@ for (i = 0; i < NDKMAP; i++) { if (partn_list[i] == NULL) break; - if ((i == C_PARTITION) || (i == float_part)) + if ((i == C_PARTITION) || (i == float_part)) { continue; - else { + } else { ioparam.io_bounds.lower = 0; ioparam.io_bounds.upper = map[i].dkl_nblk + - map[float_part].dkl_nblk; + map[float_part].dkl_nblk; deflt = map[i].dkl_nblk; if (ioparam.io_bounds.upper == 0) { - err_print("\ -Warning: no space available for '%s' from Free Hog partition\n", - partn_list[i]); + err_print("Warning: no space available for " + "'%s' from Free Hog partition\n", + partn_list[i]); continue; } (void) snprintf(tmpstr, sizeof (tmpstr), - "Enter size of partition '%s' ", - partn_list[i]); + "Enter size of partition '%s' ", + partn_list[i]); newsize = (blkaddr32_t)input(FIO_CYL, tmpstr, ':', - &ioparam, (int *)&deflt, DATA_INPUT); + &ioparam, (int *)&deflt, DATA_INPUT); map[float_part].dkl_nblk -= (newsize - map[i].dkl_nblk); map[i].dkl_nblk = newsize; } @@ -496,8 +477,7 @@ } static struct partition_info * -build_partition(tptr) -struct disk_type *tptr; +build_partition(struct disk_type *tptr) { struct partition_info *part; struct dk_label *label; @@ -524,21 +504,20 @@ if (!build_default_partition(label, cur_ctype->ctype_ctype)) return (NULL); - part = (struct partition_info *) - zalloc(sizeof (struct partition_info)); + part = zalloc(sizeof (struct 
partition_info)); part->pinfo_name = alloc_string(tptr->dtype_asciilabel); /* * Fill in the partition info from the label */ for (i = 0; i < NDKMAP; i++) { #if defined(_SUNOS_VTOC_8) - part->pinfo_map[i] = label->dkl_map[i]; + part->pinfo_map[i] = label->dkl_map[i]; #else - part->pinfo_map[i].dkl_cylno = - label->dkl_vtoc.v_part[i].p_start / - (blkaddr32_t)(tptr->dtype_nhead * tptr->dtype_nsect - apc); - part->pinfo_map[i].dkl_nblk = - label->dkl_vtoc.v_part[i].p_size; + part->pinfo_map[i].dkl_cylno = + label->dkl_vtoc.v_part[i].p_start / + (blkaddr32_t)(tptr->dtype_nhead * tptr->dtype_nsect - apc); + part->pinfo_map[i].dkl_nblk = + label->dkl_vtoc.v_part[i].p_size; #endif /* ifdefined(_SUNOS_VTOC_8) */ } part->vtoc = label->dkl_vtoc; @@ -549,11 +528,8 @@ * build new partition table for given disk type */ static void -get_user_map_efi(map, float_part) - struct dk_gpt *map; - int float_part; +get_user_map_efi(struct dk_gpt *map, int float_part) { - int i; efi_deflt_t efi_deflt; u_ioparam_t ioparam; @@ -591,7 +567,7 @@ } map->efi_parts[float_part].p_start = start_lba; map->efi_parts[float_part].p_size = map->efi_last_u_lba + 1 - - start_lba - reserved; + start_lba - reserved; map->efi_parts[float_part].p_tag = V_USR; if (map->efi_parts[float_part].p_size == 0) { map->efi_parts[float_part].p_size = 0; @@ -612,8 +588,7 @@ void -new_partitiontable(tptr, oldtptr) -struct disk_type *tptr, *oldtptr; +new_partitiontable(struct disk_type *tptr, struct disk_type *oldtptr) { struct partition_info *part; @@ -622,16 +597,15 @@ * partition table else copy the old partition table.(best guess). 
*/ if ((oldtptr != NULL) && - (tptr->dtype_ncyl == oldtptr->dtype_ncyl) && - (tptr->dtype_nhead == oldtptr->dtype_nhead) && - (tptr->dtype_nsect == oldtptr->dtype_nsect)) { - - part = (struct partition_info *) - zalloc(sizeof (struct partition_info)); - bcopy((char *)cur_parts, (char *)part, - sizeof (struct partition_info)); - part->pinfo_next = tptr->dtype_plist; - tptr->dtype_plist = part; + (tptr->dtype_ncyl == oldtptr->dtype_ncyl) && + (tptr->dtype_nhead == oldtptr->dtype_nhead) && + (tptr->dtype_nsect == oldtptr->dtype_nsect)) { + part = (struct partition_info *) + zalloc(sizeof (struct partition_info)); + bcopy((char *)cur_parts, (char *)part, + sizeof (struct partition_info)); + part->pinfo_next = tptr->dtype_plist; + tptr->dtype_plist = part; } else { #ifdef DEBUG
--- a/usr/src/cmd/format/partition.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/format/partition.c Mon Dec 09 14:15:34 2019 +0000 @@ -80,31 +80,31 @@ int i; for (i = 0; i < map->efi_nparts - 1; i++) { - start[0] = map->efi_parts[i].p_start; - size[0] = map->efi_parts[i].p_size; - sec_no[0] = start[0] + size[0]; + start[0] = map->efi_parts[i].p_start; + size[0] = map->efi_parts[i].p_size; + sec_no[0] = start[0] + size[0]; - start[1] = map->efi_parts[i+1].p_start; - size[1] = map->efi_parts[i+1].p_size; - sec_no[1] = start[1] + size[1]; + start[1] = map->efi_parts[i + 1].p_start; + size[1] = map->efi_parts[i + 1].p_size; + sec_no[1] = start[1] + size[1]; - if (map->efi_parts[i].p_tag == V_BACKUP) { - sec_no[0] = 0; - } - if (map->efi_parts[i+1].p_tag == V_BACKUP) { - sec_no[1] = 0; - } - if (i == 0) { - max = sec_no[1]; - } - if (sec_no[0] > max) { - max = sec_no[0]; - } else { - max = max; - } + if (map->efi_parts[i].p_tag == V_BACKUP) { + sec_no[0] = 0; + } + if (map->efi_parts[i+1].p_tag == V_BACKUP) { + sec_no[1] = 0; + } + if (i == 0) { + max = sec_no[1]; + } + if (sec_no[0] > max) { + max = sec_no[0]; + } else { + max = max; + } } if (max == 0) - max = map->efi_first_u_lba; + max = map->efi_first_u_lba; return (max); } @@ -136,66 +136,66 @@ } if (cur_label == L_TYPE_EFI) { - if (num > cur_parts->etoc->efi_nparts - 1) { - err_print("Invalid partition for EFI label\n"); - return; - } - print_efi_partition(cur_parts->etoc, num, 1); - fmt_print("\n"); + if (num > cur_parts->etoc->efi_nparts - 1) { + err_print("Invalid partition for EFI label\n"); + return; + } + print_efi_partition(cur_parts->etoc, num, 1); + fmt_print("\n"); /* * Prompt for p_tag and p_flag values for this partition */ - deflt = cur_parts->etoc->efi_parts[num].p_tag; - if (deflt == V_UNASSIGNED) { - deflt = V_USR; - } - (void) sprintf(msg, "Enter partition id tag"); - ioparam.io_slist = ptag_choices; - tag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT); + deflt = 
cur_parts->etoc->efi_parts[num].p_tag; + if (deflt == V_UNASSIGNED) { + deflt = V_USR; + } + (void) sprintf(msg, "Enter partition id tag"); + ioparam.io_slist = ptag_choices; + tag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT); - deflt = cur_parts->etoc->efi_parts[num].p_flag; - (void) sprintf(msg, "Enter partition permission flags"); - ioparam.io_slist = pflag_choices; - flag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT); + deflt = cur_parts->etoc->efi_parts[num].p_flag; + (void) sprintf(msg, "Enter partition permission flags"); + ioparam.io_slist = pflag_choices; + flag = input(FIO_SLIST, msg, ':', &ioparam, &deflt, DATA_INPUT); - ioparam.io_bounds.lower = cur_parts->etoc->efi_first_u_lba; - ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba; + ioparam.io_bounds.lower = cur_parts->etoc->efi_first_u_lba; + ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba; - efi_deflt.start_sector = maxofN(cur_parts->etoc); - if ((cur_parts->etoc->efi_parts[num].p_start != 0) && - (cur_parts->etoc->efi_parts[num].p_size != 0)) { - efi_deflt.start_sector = - cur_parts->etoc->efi_parts[num].p_start; - } - efi_deflt.end_sector = ioparam.io_bounds.upper - - efi_deflt.start_sector; - i64 = input(FIO_INT64, "Enter new starting Sector", ':', &ioparam, - (int *)&efi_deflt, DATA_INPUT); + efi_deflt.start_sector = maxofN(cur_parts->etoc); + if ((cur_parts->etoc->efi_parts[num].p_start != 0) && + (cur_parts->etoc->efi_parts[num].p_size != 0)) { + efi_deflt.start_sector = + cur_parts->etoc->efi_parts[num].p_start; + } + efi_deflt.end_sector = ioparam.io_bounds.upper - + efi_deflt.start_sector; + i64 = input(FIO_INT64, "Enter new starting Sector", ':', + &ioparam, (int *)&efi_deflt, DATA_INPUT); - ioparam.io_bounds.lower = 0; - ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba; - efi_deflt.end_sector = cur_parts->etoc->efi_parts[num].p_size; - efi_deflt.start_sector = i64; - j64 = input(FIO_EFI, "Enter partition size", ':', &ioparam, - (int 
*)&efi_deflt, DATA_INPUT); - if (j64 == 0) { - tag = V_UNASSIGNED; - i64 = 0; - } else if ((j64 != 0) && (tag == V_UNASSIGNED)) { - tag = V_USR; - } + ioparam.io_bounds.lower = 0; + ioparam.io_bounds.upper = cur_parts->etoc->efi_last_u_lba; + efi_deflt.end_sector = cur_parts->etoc->efi_parts[num].p_size; + efi_deflt.start_sector = i64; + j64 = input(FIO_EFI, "Enter partition size", ':', &ioparam, + (int *)&efi_deflt, DATA_INPUT); + if (j64 == 0) { + tag = V_UNASSIGNED; + i64 = 0; + } else if ((j64 != 0) && (tag == V_UNASSIGNED)) { + tag = V_USR; + } - if (cur_parts->pinfo_name != NULL) - make_partition(); + if (cur_parts->pinfo_name != NULL) + make_partition(); - cur_parts->etoc->efi_parts[num].p_tag = tag; - cur_parts->etoc->efi_parts[num].p_flag = flag; - cur_parts->etoc->efi_parts[num].p_start = i64; - cur_parts->etoc->efi_parts[num].p_size = j64; - /* - * We are now done with EFI part, so return now - */ - return; + cur_parts->etoc->efi_parts[num].p_tag = tag; + cur_parts->etoc->efi_parts[num].p_flag = flag; + cur_parts->etoc->efi_parts[num].p_start = i64; + cur_parts->etoc->efi_parts[num].p_size = j64; + /* + * We are now done with EFI part, so return now + */ + return; } /* * Print out the given partition so the user knows what they're @@ -237,9 +237,11 @@ if (tag != V_ALTSCTR) { if (cur_parts->pinfo_map[J_PARTITION].dkl_nblk != 0) { cyl_offset = - cur_parts->pinfo_map[J_PARTITION].dkl_cylno + - ((cur_parts->pinfo_map[J_PARTITION].dkl_nblk + - (spc()-1)) / spc()); + cur_parts-> + pinfo_map[J_PARTITION].dkl_cylno + + ((cur_parts-> + pinfo_map[J_PARTITION].dkl_nblk + + (spc() - 1)) / spc()); } } } @@ -247,8 +249,7 @@ ioparam.io_bounds.lower = 0; ioparam.io_bounds.upper = ncyl - 1; - deflt = max(cur_parts->pinfo_map[num].dkl_cylno, - cyl_offset); + deflt = max(cur_parts->pinfo_map[num].dkl_cylno, cyl_offset); i = (uint_t)input(FIO_INT, "Enter new starting cyl", ':', &ioparam, &deflt, DATA_INPUT); @@ -257,9 +258,8 @@ /* fill in defaults for the current partition 
*/ p_deflt.start_cyl = i; - p_deflt.deflt_size = - min(cur_parts->pinfo_map[num].dkl_nblk, - ioparam.io_bounds.upper); + p_deflt.deflt_size = min(cur_parts->pinfo_map[num].dkl_nblk, + ioparam.io_bounds.upper); /* call input, passing p_deflt's address, typecast to (int *) */ j = (uint_t)input(FIO_ECYL, "Enter partition size", ':', &ioparam, @@ -378,18 +378,18 @@ */ enter_critical(); for (pptr = parts; pptr != NULL; pptr = pptr->pinfo_next) { - if (cur_dtype->dtype_asciilabel) { - if (pptr->pinfo_name != NULL && strcmp(pptr->pinfo_name, - cur_dtype->dtype_asciilabel) == 0) { - /* - * Set current partition and name it. - */ - cur_disk->disk_parts = cur_parts = pptr; - cur_parts->pinfo_name = pptr->pinfo_name; - exit_critical(); - return (0); + if (cur_dtype->dtype_asciilabel) { + if (pptr->pinfo_name != NULL && strcmp(pptr->pinfo_name, + cur_dtype->dtype_asciilabel) == 0) { + /* + * Set current partition and name it. + */ + cur_disk->disk_parts = cur_parts = pptr; + cur_parts->pinfo_name = pptr->pinfo_name; + exit_critical(); + return (0); + } } - } } /* * If we couldn't find a match, take the first one. @@ -436,18 +436,19 @@ * If there was a current map, copy its values. */ if (cur_label == L_TYPE_EFI) { - struct dk_gpt *map; - int nparts; - int size; + struct dk_gpt *map; + int nparts; + int size; - nparts = cur_parts->etoc->efi_nparts; - size = sizeof (struct dk_part) * nparts + sizeof (struct dk_gpt); - map = zalloc(size); - (void) memcpy(map, cur_parts->etoc, size); - pptr->etoc = map; - cur_disk->disk_parts = cur_parts = pptr; - exit_critical(); - return; + nparts = cur_parts->etoc->efi_nparts; + size = sizeof (struct dk_part) * nparts + + sizeof (struct dk_gpt); + map = zalloc(size); + (void) memcpy(map, cur_parts->etoc, size); + pptr->etoc = map; + cur_disk->disk_parts = cur_parts = pptr; + exit_critical(); + return; } if (cur_parts != NULL) { for (i = 0; i < NDKMAP; i++) {
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c Mon Dec 09 14:15:34 2019 +0000 @@ -55,6 +55,7 @@ #include <sys/zfs_acl.h> #include <sys/sa_impl.h> #include <sys/multilist.h> +#include <sys/btree.h> #ifdef _KERNEL #define ZFS_OBJ_NAME "zfs" @@ -1462,13 +1463,15 @@ 0, NULL)); } - - typedef struct mdb_range_tree { struct { - uint64_t avl_numnodes; + uint64_t bt_num_elems; + uint64_t bt_num_nodes; } rt_root; uint64_t rt_space; + range_seg_type_t rt_type; + uint8_t rt_shift; + uint64_t rt_start; } mdb_range_tree_t; typedef struct mdb_metaslab_group { @@ -1566,15 +1569,13 @@ ms.ms_unflushed_frees, 0) == -1) return (DCMD_ERR); ufrees = rt.rt_space; - raw_uchanges_mem = rt.rt_root.avl_numnodes * - mdb_ctf_sizeof_by_name("range_seg_t"); + raw_uchanges_mem = rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE; if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", ms.ms_unflushed_allocs, 0) == -1) return (DCMD_ERR); uallocs = rt.rt_space; - raw_uchanges_mem += rt.rt_root.avl_numnodes * - mdb_ctf_sizeof_by_name("range_seg_t"); + raw_uchanges_mem += rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE; mdb_nicenum(raw_uchanges_mem, uchanges_mem); raw_free = ms.ms_size; @@ -1644,14 +1645,12 @@ if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", ms.ms_unflushed_frees, 0) == -1) return (DCMD_ERR); - raw_uchanges_mem += - rt.rt_root.avl_numnodes * sizeof (range_seg_t); + raw_uchanges_mem += rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE; if (mdb_ctf_vread(&rt, "range_tree_t", "mdb_range_tree_t", ms.ms_unflushed_allocs, 0) == -1) return (DCMD_ERR); - raw_uchanges_mem += - rt.rt_root.avl_numnodes * sizeof (range_seg_t); + raw_uchanges_mem += rt.rt_root.bt_num_nodes * BTREE_LEAF_SIZE; } mdb_nicenum(raw_uchanges_mem, uchanges_mem); mdb_printf("%10s\n", uchanges_mem); @@ -2669,6 +2668,202 @@ return (mdb_pwalk_dcmd("zio_root", "zio", argc, argv, addr)); } + +typedef struct mdb_zfs_btree_hdr { + uintptr_t bth_parent; + 
boolean_t bth_core; + /* + * For both leaf and core nodes, represents the number of elements in + * the node. For core nodes, they will have bth_count + 1 children. + */ + uint32_t bth_count; +} mdb_zfs_btree_hdr_t; + +typedef struct mdb_zfs_btree_core { + mdb_zfs_btree_hdr_t btc_hdr; + uintptr_t btc_children[BTREE_CORE_ELEMS + 1]; + uint8_t btc_elems[]; +} mdb_zfs_btree_core_t; + +typedef struct mdb_zfs_btree_leaf { + mdb_zfs_btree_hdr_t btl_hdr; + uint8_t btl_elems[]; +} mdb_zfs_btree_leaf_t; + +typedef struct mdb_zfs_btree { + uintptr_t bt_root; + size_t bt_elem_size; +} mdb_zfs_btree_t; + +typedef struct btree_walk_data { + mdb_zfs_btree_t bwd_btree; + mdb_zfs_btree_hdr_t *bwd_node; + uint64_t bwd_offset; // In units of bt_node_size +} btree_walk_data_t; + +static uintptr_t +btree_leftmost_child(uintptr_t addr, mdb_zfs_btree_hdr_t *buf) +{ + size_t size = offsetof(zfs_btree_core_t, btc_children) + + sizeof (uintptr_t); + for (;;) { + if (mdb_vread(buf, size, addr) == -1) { + mdb_warn("failed to read at %p\n", addr); + return ((uintptr_t)0ULL); + } + if (!buf->bth_core) + return (addr); + mdb_zfs_btree_core_t *node = (mdb_zfs_btree_core_t *)buf; + addr = node->btc_children[0]; + } +} + +static int +btree_walk_step(mdb_walk_state_t *wsp) +{ + btree_walk_data_t *bwd = wsp->walk_data; + size_t elem_size = bwd->bwd_btree.bt_elem_size; + if (wsp->walk_addr == 0ULL) + return (WALK_DONE); + + if (!bwd->bwd_node->bth_core) { + /* + * For the first element in a leaf node, read in the full + * leaf, since we only had part of it read in before. 
+ */ + if (bwd->bwd_offset == 0) { + if (mdb_vread(bwd->bwd_node, BTREE_LEAF_SIZE, + wsp->walk_addr) == -1) { + mdb_warn("failed to read at %p\n", + wsp->walk_addr); + return (WALK_ERR); + } + } + + int status = wsp->walk_callback((uintptr_t)(wsp->walk_addr + + offsetof(mdb_zfs_btree_leaf_t, btl_elems) + + bwd->bwd_offset * elem_size), bwd->bwd_node, + wsp->walk_cbdata); + if (status != WALK_NEXT) + return (status); + bwd->bwd_offset++; + + /* Find the next element, if we're at the end of the leaf. */ + while (bwd->bwd_offset == bwd->bwd_node->bth_count) { + uintptr_t par = bwd->bwd_node->bth_parent; + uintptr_t cur = wsp->walk_addr; + wsp->walk_addr = par; + if (par == 0ULL) + return (WALK_NEXT); + + size_t size = sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * elem_size; + if (mdb_vread(bwd->bwd_node, size, wsp->walk_addr) == + -1) { + mdb_warn("failed to read at %p\n", + wsp->walk_addr); + return (WALK_ERR); + } + mdb_zfs_btree_core_t *node = + (mdb_zfs_btree_core_t *)bwd->bwd_node; + int i; + for (i = 0; i <= bwd->bwd_node->bth_count; i++) { + if (node->btc_children[i] == cur) + break; + } + if (i > bwd->bwd_node->bth_count) { + mdb_warn("btree parent/child mismatch at " + "%#lx\n", cur); + return (WALK_ERR); + } + bwd->bwd_offset = i; + } + return (WALK_NEXT); + } + + if (!bwd->bwd_node->bth_core) { + mdb_warn("Invalid btree node at %#lx\n", wsp->walk_addr); + return (WALK_ERR); + } + mdb_zfs_btree_core_t *node = (mdb_zfs_btree_core_t *)bwd->bwd_node; + int status = wsp->walk_callback((uintptr_t)(wsp->walk_addr + + offsetof(mdb_zfs_btree_core_t, btc_elems) + bwd->bwd_offset * + elem_size), bwd->bwd_node, wsp->walk_cbdata); + if (status != WALK_NEXT) + return (status); + + uintptr_t new_child = node->btc_children[bwd->bwd_offset + 1]; + wsp->walk_addr = btree_leftmost_child(new_child, bwd->bwd_node); + if (wsp->walk_addr == 0ULL) + return (WALK_ERR); + + bwd->bwd_offset = 0; + return (WALK_NEXT); +} + +static int +btree_walk_init(mdb_walk_state_t *wsp) +{ + 
btree_walk_data_t *bwd; + + if (wsp->walk_addr == 0ULL) { + mdb_warn("must supply address of zfs_btree_t\n"); + return (WALK_ERR); + } + + bwd = mdb_zalloc(sizeof (btree_walk_data_t), UM_SLEEP); + if (mdb_ctf_vread(&bwd->bwd_btree, "zfs_btree_t", "mdb_zfs_btree_t", + wsp->walk_addr, 0) == -1) { + mdb_free(bwd, sizeof (*bwd)); + return (WALK_ERR); + } + + if (bwd->bwd_btree.bt_elem_size == 0) { + mdb_warn("invalid or uninitialized btree at %#lx\n", + wsp->walk_addr); + mdb_free(bwd, sizeof (*bwd)); + return (WALK_ERR); + } + + size_t size = MAX(BTREE_LEAF_SIZE, sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * bwd->bwd_btree.bt_elem_size); + bwd->bwd_node = mdb_zalloc(size, UM_SLEEP); + + uintptr_t node = (uintptr_t)bwd->bwd_btree.bt_root; + if (node == 0ULL) { + wsp->walk_addr = 0ULL; + wsp->walk_data = bwd; + return (WALK_NEXT); + } + node = btree_leftmost_child(node, bwd->bwd_node); + if (node == 0ULL) { + mdb_free(bwd->bwd_node, size); + mdb_free(bwd, sizeof (*bwd)); + return (WALK_ERR); + } + bwd->bwd_offset = 0; + + wsp->walk_addr = node; + wsp->walk_data = bwd; + return (WALK_NEXT); +} + +static void +btree_walk_fini(mdb_walk_state_t *wsp) +{ + btree_walk_data_t *bwd = (btree_walk_data_t *)wsp->walk_data; + + if (bwd == NULL) + return; + + size_t size = MAX(BTREE_LEAF_SIZE, sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * bwd->bwd_btree.bt_elem_size); + if (bwd->bwd_node != NULL) + mdb_free(bwd->bwd_node, size); + + mdb_free(bwd, sizeof (*bwd)); +} + typedef struct mdb_multilist { uint64_t ml_num_sublists; uintptr_t ml_sublists; @@ -4170,23 +4365,43 @@ return (rc); } -typedef struct mdb_range_seg { +typedef struct mdb_range_seg64 { uint64_t rs_start; uint64_t rs_end; -} mdb_range_seg_t; +} mdb_range_seg64_t; + +typedef struct mdb_range_seg32 { + uint32_t rs_start; + uint32_t rs_end; +} mdb_range_seg32_t; /* ARGSUSED */ static int range_tree_cb(uintptr_t addr, const void *unknown, void *arg) { - mdb_range_seg_t rs; - - if (mdb_ctf_vread(&rs, ZFS_STRUCT 
"range_seg", "mdb_range_seg_t", - addr, 0) == -1) - return (DCMD_ERR); - - mdb_printf("\t[%llx %llx) (length %llx)\n", - rs.rs_start, rs.rs_end, rs.rs_end - rs.rs_start); + mdb_range_tree_t *rt = (mdb_range_tree_t *)arg; + uint64_t start, end; + + if (rt->rt_type == RANGE_SEG64) { + mdb_range_seg64_t rs; + + if (mdb_ctf_vread(&rs, ZFS_STRUCT "range_seg64", + "mdb_range_seg64_t", addr, 0) == -1) + return (DCMD_ERR); + start = rs.rs_start; + end = rs.rs_end; + } else { + ASSERT3U(rt->rt_type, ==, RANGE_SEG32); + mdb_range_seg32_t rs; + + if (mdb_ctf_vread(&rs, ZFS_STRUCT "range_seg32", + "mdb_range_seg32_t", addr, 0) == -1) + return (DCMD_ERR); + start = ((uint64_t)rs.rs_start << rt->rt_shift) + rt->rt_start; + end = ((uint64_t)rs.rs_end << rt->rt_shift) + rt->rt_start; + } + + mdb_printf("\t[%llx %llx) (length %llx)\n", start, end, end - start); return (0); } @@ -4197,7 +4412,7 @@ const mdb_arg_t *argv) { mdb_range_tree_t rt; - uintptr_t avl_addr; + uintptr_t btree_addr; if (!(flags & DCMD_ADDRSPEC)) return (DCMD_USAGE); @@ -4207,12 +4422,12 @@ return (DCMD_ERR); mdb_printf("%p: range tree of %llu entries, %llu bytes\n", - addr, rt.rt_root.avl_numnodes, rt.rt_space); - - avl_addr = addr + + addr, rt.rt_root.bt_num_elems, rt.rt_space); + + btree_addr = addr + mdb_ctf_offsetof_by_name(ZFS_STRUCT "range_tree", "rt_root"); - if (mdb_pwalk("avl", range_tree_cb, NULL, avl_addr) != 0) { + if (mdb_pwalk("zfs_btree", range_tree_cb, &rt, btree_addr) != 0) { mdb_warn("can't walk range_tree segments"); return (DCMD_ERR); } @@ -4407,6 +4622,8 @@ { "zfs_acl_node_aces0", "given a zfs_acl_node_t, walk all ACEs as ace_t", zfs_acl_node_aces0_walk_init, zfs_aces_walk_step, NULL }, + { "zfs_btree", "given a zfs_btree_t *, walk all entries", + btree_walk_init, btree_walk_step, btree_walk_fini }, { NULL } };
--- a/usr/src/cmd/zdb/zdb.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/cmd/zdb/zdb.c Mon Dec 09 14:15:34 2019 +0000 @@ -86,21 +86,13 @@ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) -#ifndef lint extern int reference_tracking_enable; extern boolean_t zfs_recover; extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern int aok; extern boolean_t spa_load_verify_dryrun; -#else -int reference_tracking_enable; -boolean_t zfs_recover; -uint64_t zfs_arc_max, zfs_arc_meta_limit; -int zfs_vdev_async_read_max_active; -int aok; -boolean_t spa_load_verify_dryrun; -#endif +extern int zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; @@ -896,7 +888,7 @@ { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ @@ -905,7 +897,7 @@ zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", - "segments", avl_numnodes(t), "maxsize", maxbuf, + "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); @@ -3388,7 +3380,7 @@ ASSERT0(range_tree_space(svr->svr_allocd_segs)); - range_tree_t *allocs = range_tree_create(NULL, NULL); + range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; @@ -5242,7 +5234,8 @@ } if (dump_opt['d'] || dump_opt['i']) { - mos_refd_objs = range_tree_create(NULL, NULL); + mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { @@ -5759,6 +5752,13 @@ if 
(spa_config_path_env != NULL) spa_config_path = spa_config_path_env; + /* + * For performance reasons, we set this tunable down. We do so before + * the arg parsing section so that the user can override this value if + * they choose. + */ + zfs_btree_verify_intensity = 3; + while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { switch (c) {
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c Mon Dec 09 14:15:34 2019 +0000 @@ -785,7 +785,9 @@ rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); - return (AVL_ISIGN(rv)); + if (rv == 0) + return (0); + return (rv > 0 ? 1 : -1); } void
--- a/usr/src/lib/libzfs/common/libzfs_iter.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/lib/libzfs/common/libzfs_iter.c Mon Dec 09 14:15:34 2019 +0000 @@ -284,7 +284,11 @@ lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - return (AVL_CMP(lcreate, rcreate)); + if (lcreate < rcreate) + return (-1); + if (lcreate > rcreate) + return (+1); + return (0); } int
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c Mon Dec 09 14:15:34 2019 +0000 @@ -493,7 +493,11 @@ const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; - return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); + if (fn1->fn_guid > fn2->fn_guid) + return (+1); + if (fn1->fn_guid < fn2->fn_guid) + return (-1); + return (0); } /*
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h Mon Dec 09 14:15:34 2019 +0000 @@ -328,6 +328,15 @@ #define INGLOBALZONE(z) (1) extern uint32_t zone_get_hostid(void *zonep); +/* + * In ZoL the following defines were added to their sys/avl.h header, but + * we want to limit these to the ZFS code on illumos. + */ +#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define TREE_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr);
--- a/usr/src/uts/common/Makefile.files Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/Makefile.files Mon Dec 09 14:15:34 2019 +0000 @@ -1365,6 +1365,7 @@ bplist.o \ bpobj.o \ bptree.o \ + btree.o \ bqueue.o \ cityhash.o \ dbuf.o \
--- a/usr/src/uts/common/fs/zfs/arc.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/arc.c Mon Dec 09 14:15:34 2019 +0000 @@ -4943,7 +4943,7 @@ kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; - extern kmem_cache_t *range_seg_cache; + extern kmem_cache_t *zfs_btree_leaf_cache; extern kmem_cache_t *abd_chunk_cache; #ifdef _KERNEL @@ -4976,7 +4976,7 @@ kmem_cache_reap_soon(buf_cache); kmem_cache_reap_soon(hdr_full_cache); kmem_cache_reap_soon(hdr_l2only_cache); - kmem_cache_reap_soon(range_seg_cache); + kmem_cache_reap_soon(zfs_btree_leaf_cache); if (zio_arena != NULL) { /*
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/btree.c Mon Dec 09 14:15:34 2019 +0000 @@ -0,0 +1,2124 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#include <sys/btree.h> +#include <sys/bitops.h> +#include <sys/zfs_context.h> + +kmem_cache_t *zfs_btree_leaf_cache; + +/* + * Control the extent of the verification that occurs when zfs_btree_verify is + * called. Primarily used for debugging when extending the btree logic and + * functionality. As the intensity is increased, new verification steps are + * added. These steps are cumulative; intensity = 3 includes the intensity = 1 + * and intensity = 2 steps as well. + * + * Intensity 1: Verify that the tree's height is consistent throughout. + * Intensity 2: Verify that a core node's children's parent pointers point + * to the core node. + * Intensity 3: Verify that the total number of elements in the tree matches the + * sum of the number of elements in each node. Also verifies that each node's + * count obeys the invariants (less than or equal to maximum value, greater than + * or equal to half the maximum minus one). + * Intensity 4: Verify that each element compares less than the element + * immediately after it and greater than the one immediately before it using the + * comparator function. For core nodes, also checks that each element is greater + * than the last element in the first of the two nodes it separates, and less + * than the first element in the second of the two nodes. 
+ * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside + * of each node is poisoned appropriately. Note that poisoning always occurs if + * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal + * operation. + * + * Intensity 4 and 5 are particularly expensive to perform; the previous levels + * are a few memory operations per node, while these levels require multiple + * operations per element. In addition, when creating large btrees, these + * operations are called at every step, resulting in extremely slow operation + * (while the asymptotic complexity of the other steps is the same, the + * importance of the constant factors cannot be denied). + */ +int zfs_btree_verify_intensity = 0; + +/* + * A convenience function to silence warnings from memmove's return value and + * change argument order to src, dest. + */ +void +bmov(const void *src, void *dest, size_t size) +{ + (void) memmove(dest, src, size); +} + +#ifdef _ILP32 +#define BTREE_POISON 0xabadb10c +#else +#define BTREE_POISON 0xabadb10cdeadbeef +#endif + +static void +zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, + BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - + hdr->bth_count * size); + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + node->btc_children[i] = + (zfs_btree_hdr_t *)BTREE_POISON; + } + (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, + (BTREE_CORE_ELEMS - hdr->bth_count) * size); + } +#endif +} + +static inline void +zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + ASSERT3U(offset, >=, hdr->bth_count); + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = 
(zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + offset * size, 0x0f, size); + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + node->btc_children[offset + 1] = + (zfs_btree_hdr_t *)BTREE_POISON; + (void) memset(node->btc_elems + offset * size, 0x0f, size); + } +#endif +} + +static inline void +zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + uint8_t eval = 0x0f; + if (hdr->bth_core) { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; + VERIFY3P(node->btc_children[offset + 1], ==, cval); + for (int i = 0; i < size; i++) + VERIFY3U(node->btc_elems[offset * size + i], ==, eval); + } else { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + for (int i = 0; i < size; i++) + VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); + } +#endif +} + +void +zfs_btree_init(void) +{ + zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", + BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, + NULL, 0); +} + +void +zfs_btree_fini(void) +{ + kmem_cache_destroy(zfs_btree_leaf_cache); +} + +void +zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), + size_t size) +{ + /* + * We need a minimum of 4 elements so that when we split a node we + * always have at least two elements in each node. This simplifies the + * logic in zfs_btree_bulk_finish, since it means the last leaf will + * always have a left sibling to share with (unless it's the root). + */ + ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); + + bzero(tree, sizeof (*tree)); + tree->bt_compar = compar; + tree->bt_elem_size = size; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +/* + * Find value in the array of elements provided. Uses a simple binary search. 
+ */ +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, + const void *value, zfs_btree_index_t *where) +{ + uint64_t max = nelems; + uint64_t min = 0; + while (max > min) { + uint64_t idx = (min + max) / 2; + uint8_t *cur = buf + idx * tree->bt_elem_size; + int comp = tree->bt_compar(cur, value); + if (comp == -1) { + min = idx + 1; + } else if (comp == 1) { + max = idx; + } else { + ASSERT0(comp); + where->bti_offset = idx; + where->bti_before = B_FALSE; + return (cur); + } + } + + where->bti_offset = max; + where->bti_before = B_TRUE; + return (NULL); +} + +/* + * Find the given value in the tree. where may be passed as null to use as a + * membership test or if the btree is being used as a map. + */ +void * +zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) +{ + if (tree->bt_height == -1) { + if (where != NULL) { + where->bti_node = NULL; + where->bti_offset = 0; + } + ASSERT0(tree->bt_num_elems); + return (NULL); + } + + /* + * If we're in bulk-insert mode, we check the last spot in the tree + * and the last leaf in the tree before doing the normal search, + * because for most workloads the vast majority of finds in + * bulk-insert mode are to insert new elements. + */ + zfs_btree_index_t idx; + if (tree->bt_bulk != NULL) { + zfs_btree_leaf_t *last_leaf = tree->bt_bulk; + int compar = tree->bt_compar(last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), + value); + if (compar < 0) { + /* + * If what they're looking for is after the last + * element, it's not in the tree. 
+ */ + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count; + where->bti_before = B_TRUE; + } + return (NULL); + } else if (compar == 0) { + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count - 1; + where->bti_before = B_FALSE; + } + return (last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * + tree->bt_elem_size)); + } + if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { + /* + * If what they're looking for is after the first + * element in the last leaf, it's in the last leaf or + * it's not in the tree. + */ + void *d = zfs_btree_find_in_buf(tree, + last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, + value, &idx); + + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)last_leaf; + *where = idx; + } + return (d); + } + } + + zfs_btree_core_t *node = NULL; + uint64_t child = 0; + uint64_t depth = 0; + + /* + * Iterate down the tree, finding which child the value should be in + * by comparing with the separators. + */ + for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; + node = (zfs_btree_core_t *)node->btc_children[child], depth++) { + ASSERT3P(node, !=, NULL); + void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + node->btc_hdr.bth_count, value, &idx); + EQUIV(d != NULL, !idx.bti_before); + if (d != NULL) { + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)node; + *where = idx; + } + return (d); + } + ASSERT(idx.bti_before); + child = idx.bti_offset; + } + + /* + * The value is in this leaf, or it would be if it were in the + * tree. Find its proper location and return it. + */ + zfs_btree_leaf_t *leaf = (depth == 0 ? 
+ (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); + void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, + leaf->btl_hdr.bth_count, value, &idx); + + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)leaf; + *where = idx; + } + + return (d); +} + +/* + * To explain the following functions, it is useful to understand the four + * kinds of shifts used in btree operation. First, a shift is a movement of + * elements within a node. It is used to create gaps for inserting new + * elements and children, or cover gaps created when things are removed. A + * shift has two fundamental properties, each of which can be one of two + * values, making four types of shifts. There is the direction of the shift + * (left or right) and the shape of the shift (parallelogram or isoceles + * trapezoid (shortened to trapezoid hereafter)). The shape distinction only + * applies to shifts of core nodes. + * + * The names derive from the following imagining of the layout of a node: + * + * Elements: * * * * * * * ... * * * + * Children: * * * * * * * * ... * * * + * + * This layout follows from the fact that the elements act as separators + * between pairs of children, and that children root subtrees "below" the + * current node. A left and right shift are fairly self-explanatory; a left + * shift moves things to the left, while a right shift moves things to the + * right. A parallelogram shift is a shift with the same number of elements + * and children being moved, while a trapezoid shift is a shift that moves one + * more children than elements. An example follows: + * + * A parallelogram shift could contain the following: + * _______________ + * \* * * * \ * * * ... * * * + * * \ * * * *\ * * * ... * * * + * --------------- + * A trapezoid shift could contain the following: + * ___________ + * * / * * * \ * * * ... * * * + * * / * * * *\ * * * ... 
* * * + * --------------- + * + * Note that a parellelogram shift is always shaped like a "left-leaning" + * parallelogram, where the starting index of the children being moved is + * always one higher than the starting index of the elements being moved. No + * "right-leaning" parallelogram shifts are needed (shifts where the starting + * element index and starting child index being moved are the same) to achieve + * any btree operations, so we ignore them. + */ + +enum bt_shift_shape { + BSS_TRAPEZOID, + BSS_PARALLELOGRAM +}; + +enum bt_shift_direction { + BSD_LEFT, + BSD_RIGHT +}; + +/* + * Shift elements and children in the provided core node by off spots. The + * first element moved is idx, and count elements are moved. The shape of the + * shift is determined by shape. The direction is determined by dir. + */ +static inline void +bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_shape shape, + enum bt_shift_direction dir) +{ + size_t size = tree->bt_elem_size; + ASSERT(node->btc_hdr.bth_core); + + uint8_t *e_start = node->btc_elems + idx * size; + int sign = (dir == BSD_LEFT ? -1 : +1); + uint8_t *e_out = e_start + sign * off * size; + uint64_t e_count = count; + bmov(e_start, e_out, e_count * size); + + zfs_btree_hdr_t **c_start = node->btc_children + idx + + (shape == BSS_TRAPEZOID ? 0 : 1); + zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : + c_start + off); + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bmov(c_start, c_out, c_count * sizeof (*c_start)); +} + +/* + * Shift elements and children in the provided core node left by one spot. + * The first element moved is idx, and count elements are moved. The + * shape of the shift is determined by trap; true if the shift is a trapezoid, + * false if it is a parallelogram. 
+ */ +static inline void +bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) +{ + bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); +} + +/* + * Shift elements and children in the provided core node right by one spot. + * Starts with elements[idx] and children[idx] and one more child than element. + */ +static inline void +bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) +{ + bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); +} + +/* + * Shift elements and children in the provided leaf node by off spots. + * The first element moved is idx, and count elements are moved. The direction + * is determined by left. + */ +static inline void +bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_direction dir) +{ + size_t size = tree->bt_elem_size; + ASSERT(!node->btl_hdr.bth_core); + + uint8_t *start = node->btl_elems + idx * size; + int sign = (dir == BSD_LEFT ? -1 : +1); + uint8_t *out = start + sign * off * size; + bmov(start, out, count * size); +} + +static inline void +bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); +} + +static inline void +bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); +} + +/* + * Move children and elements from one core node to another. The shape + * parameter behaves the same as it does in the shift logic. 
+ */ +static inline void +bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, + uint64_t count, zfs_btree_core_t *dest, uint64_t didx, + enum bt_shift_shape shape) +{ + size_t size = tree->bt_elem_size; + ASSERT(source->btc_hdr.bth_core); + ASSERT(dest->btc_hdr.bth_core); + + bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size, + count * size); + + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), + dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), + c_count * sizeof (*source->btc_children)); +} + +static inline void +bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, + uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) +{ + size_t size = tree->bt_elem_size; + ASSERT(!source->btl_hdr.bth_core); + ASSERT(!dest->btl_hdr.bth_core); + + bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + count * size); +} + +/* + * Find the first element in the subtree rooted at hdr, return its value and + * put its location in where if non-null. + */ +static void * +zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) +{ + zfs_btree_hdr_t *node; + + for (node = hdr; node->bth_core; node = + ((zfs_btree_core_t *)node)->btc_children[0]) + ; + + ASSERT(!node->bth_core); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; + if (where != NULL) { + where->bti_node = node; + where->bti_offset = 0; + where->bti_before = B_FALSE; + } + return (&leaf->btl_elems[0]); +} + +/* Insert an element and a child into a core node at the given offset. 
*/ +static void +zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, + uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) +{ + uint64_t size = tree->bt_elem_size; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + ASSERT3P(par_hdr, ==, new_node->bth_parent); + ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, par_hdr, + par_hdr->bth_count); + } + /* Shift existing elements and children */ + uint64_t count = par_hdr->bth_count - offset; + bt_shift_core_right(tree, parent, offset, count, + BSS_PARALLELOGRAM); + + /* Insert new values */ + parent->btc_children[offset + 1] = new_node; + bmov(buf, parent->btc_elems + offset * size, size); + par_hdr->bth_count++; +} + +/* + * Insert new_node into the parent of old_node directly after old_node, with + * buf as the dividing element between the two. + */ +static void +zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, + zfs_btree_hdr_t *new_node, void *buf) +{ + ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); + uint64_t size = tree->bt_elem_size; + zfs_btree_core_t *parent = old_node->bth_parent; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + + /* + * If this is the root node we were splitting, we create a new root + * and increase the height of the tree. 
+ */ + if (parent == NULL) { + ASSERT3P(old_node, ==, tree->bt_root); + tree->bt_num_nodes++; + zfs_btree_core_t *new_root = + kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * + size, KM_SLEEP); + zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; + new_root_hdr->bth_parent = NULL; + new_root_hdr->bth_core = B_TRUE; + new_root_hdr->bth_count = 1; + + old_node->bth_parent = new_node->bth_parent = new_root; + new_root->btc_children[0] = old_node; + new_root->btc_children[1] = new_node; + bmov(buf, new_root->btc_elems, size); + + tree->bt_height++; + tree->bt_root = new_root_hdr; + zfs_btree_poison_node(tree, new_root_hdr); + return; + } + + /* + * Since we have the new separator, binary search for where to put + * new_node. + */ + zfs_btree_index_t idx; + ASSERT(par_hdr->bth_core); + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + par_hdr->bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + uint64_t offset = idx.bti_offset; + ASSERT3U(offset, <=, par_hdr->bth_count); + ASSERT3P(parent->btc_children[offset], ==, old_node); + + /* + * If the parent isn't full, shift things to accommodate our insertions + * and return. + */ + if (par_hdr->bth_count != BTREE_CORE_ELEMS) { + zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); + return; + } + + /* + * We need to split this core node into two. Currently there are + * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for + * BTREE_CORE_ELEMS + 2. Some of the children will be part of the + * current node, and the others will be moved to the new core node. + * There are BTREE_CORE_ELEMS + 1 elements including the new one. One + * will be used as the new separator in our parent, and the others + * will be split among the two core nodes. + * + * Usually we will split the node in half evenly, with + * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we + * instead move only about a quarter of the elements (and children) to + * the new node. 
Since the average state after a long time is a 3/4 + * full node, shortcutting directly to that state improves efficiency. + * + * We do this in two stages: first we split into two nodes, and then we + * reuse our existing logic to insert the new element and child. + */ + uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? + 2 : 4)) - 1, 2); + uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; + ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * size, KM_SLEEP); + zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; + new_par_hdr->bth_parent = par_hdr->bth_parent; + new_par_hdr->bth_core = B_TRUE; + new_par_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_par_hdr); + + par_hdr->bth_count = keep_count; + + bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, + 0, BSS_TRAPEZOID); + + /* Store the new separator in a buffer. */ + uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); + bmov(parent->btc_elems + keep_count * size, tmp_buf, + size); + zfs_btree_poison_node(tree, par_hdr); + + if (offset < keep_count) { + /* Insert the new node into the left half */ + zfs_btree_insert_core_impl(tree, parent, offset, new_node, + buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else if (offset > keep_count) { + /* Insert the new node into the right half */ + new_node->bth_parent = new_parent; + zfs_btree_insert_core_impl(tree, new_parent, + offset - keep_count - 1, new_node, buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else { + /* + * Move the new separator into the right half, and replace it + * with buf. We also need to shift back the elements in the + * right half to accommodate new_node. 
+ */ + bt_shift_core_right(tree, new_parent, 0, move_count, + BSS_TRAPEZOID); + new_parent->btc_children[0] = new_node; + bmov(tmp_buf, new_parent->btc_elems, size); + new_par_hdr->bth_count++; + } + kmem_free(tmp_buf, size); + zfs_btree_poison_node(tree, par_hdr); + + for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) + new_parent->btc_children[i]->bth_parent = new_parent; + + for (int i = 0; i <= parent->btc_hdr.bth_count; i++) + ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting. + */ + zfs_btree_insert_into_parent(tree, &parent->btc_hdr, + &new_parent->btc_hdr, buf); +} + +/* Insert an element into a leaf node at the given offset. */ +static void +zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + uint64_t idx, const void *value) +{ + uint64_t size = tree->bt_elem_size; + uint8_t *start = leaf->btl_elems + (idx * size); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + uint64_t count = leaf->btl_hdr.bth_count - idx; + ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, + leaf->btl_hdr.bth_count); + } + + bt_shift_leaf_right(tree, leaf, idx, count); + bmov(value, start, size); + hdr->bth_count++; +} + +/* Helper function for inserting a new value into leaf at the given index. */ +static void +zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + const void *value, uint64_t idx) +{ + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + /* + * If the leaf isn't full, shift the elements after idx and insert + * value. 
+ */ + if (leaf->btl_hdr.bth_count != capacity) { + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); + return; + } + + /* + * Otherwise, we split the leaf node into two nodes. If we're not bulk + * inserting, each is of size (capacity / 2). If we are bulk + * inserting, we move a quarter of the elements to the new node so + * inserts into the old node don't cause immediate splitting but the + * tree stays relatively dense. Since the average state after a long + * time is a 3/4 full node, shortcutting directly to that state + * improves efficiency. At the end of the bulk insertion process + * we'll need to go through and fix up any nodes (the last leaf and + * its ancestors, potentially) that are below the minimum. + * + * In either case, we're left with one extra element. The leftover + * element will become the new dividing element between the two nodes. + */ + uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - + 1, 2); + uint64_t keep_count = capacity - move_count - 1; + ASSERT3U(capacity - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; + new_hdr->bth_parent = leaf->btl_hdr.bth_parent; + new_hdr->bth_core = B_FALSE; + new_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_hdr); + + leaf->btl_hdr.bth_count = keep_count; + + if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) + tree->bt_bulk = new_leaf; + + /* Copy the back part to the new leaf. */ + bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, + 0); + + /* We store the new separator in a buffer we control for simplicity. */ + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(leaf->btl_elems + (keep_count * size), buf, size); + zfs_btree_poison_node(tree, &leaf->btl_hdr); + + if (idx < keep_count) { + /* Insert into the existing leaf. 
*/ + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); + } else if (idx > keep_count) { + /* Insert into the new leaf. */ + zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - + 1, value); + } else { + /* + * Shift the elements in the new leaf to make room for the + * separator, and use the new value as the new separator. + */ + bt_shift_leaf_right(tree, new_leaf, 0, move_count); + bmov(buf, new_leaf->btl_elems, size); + bmov(value, buf, size); + new_hdr->bth_count++; + } + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting, but only of core nodes. + */ + zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, + buf); + kmem_free(buf, size); +} + +static uint64_t +zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + void *buf; + if (hdr->bth_core) { + buf = ((zfs_btree_core_t *)hdr)->btc_elems; + } else { + buf = ((zfs_btree_leaf_t *)hdr)->btl_elems; + } + zfs_btree_index_t idx; + zfs_btree_core_t *parent = hdr->bth_parent; + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + parent->btc_hdr.bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); + ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); + return (idx.bti_offset); +} + +/* + * Take the b-tree out of bulk insert mode. During bulk-insert mode, some + * nodes may violate the invariant that non-root nodes must be at least half + * full. All nodes violating this invariant should be the last node in their + * particular level. To correct the invariant, we take values from their left + * neighbor until they are half full. They must have a left neighbor at their + * level because the last node at a level is not the first node unless it's + * the root. 
+ */ +static void +zfs_btree_bulk_finish(zfs_btree_t *tree) +{ + ASSERT3P(tree->bt_bulk, !=, NULL); + ASSERT3P(tree->bt_root, !=, NULL); + zfs_btree_leaf_t *leaf = tree->bt_bulk; + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + /* + * The invariant doesn't apply to the root node, if that's the only + * node in the tree we're done. + */ + if (parent == NULL) { + tree->bt_bulk = NULL; + return; + } + + /* First, take elements to rebalance the leaf node. */ + if (hdr->bth_count < capacity / 2) { + /* + * First, find the left neighbor. The simplest way to do this + * is to call zfs_btree_prev twice; the first time finds some + * ancestor of this node, and the second time finds the left + * neighbor. The ancestor found is the lowest common ancestor + * of leaf and the neighbor. + */ + zfs_btree_index_t idx = { + .bti_node = hdr, + .bti_offset = 0 + }; + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(idx.bti_node->bth_core); + zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; + uint64_t common_idx = idx.bti_offset; + + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(!idx.bti_node->bth_core); + zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; + zfs_btree_hdr_t *l_hdr = idx.bti_node; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + leaf->btl_hdr.bth_count + i); + } + } + + /* First, shift elements in leaf back. */ + bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, + BSD_RIGHT); + + /* Next, move the separator from the common ancestor to leaf. 
*/ + uint8_t *separator = common->btc_elems + (common_idx * size); + uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); + bmov(separator, out, size); + move_count--; + + /* + * Now we move elements from the tail of the left neighbor to + * fill the remaining spots in leaf. + */ + bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - + move_count, move_count, leaf, 0); + + /* + * Finally, move the new last element in the left neighbor to + * the separator. + */ + bmov(l_neighbor->btl_elems + (l_hdr->bth_count - + move_count - 1) * size, separator, size); + + /* Adjust the node's counts, and we're done. */ + l_hdr->bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_hdr->bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + zfs_btree_poison_node(tree, l_hdr); + } + + /* + * Now we have to rebalance any ancestors of leaf that may also + * violate the invariant. + */ + capacity = BTREE_CORE_ELEMS; + while (parent->btc_hdr.bth_parent != NULL) { + zfs_btree_core_t *cur = parent; + zfs_btree_hdr_t *hdr = &cur->btc_hdr; + parent = hdr->bth_parent; + /* + * If the invariant isn't violated, move on to the next + * ancestor. + */ + if (hdr->bth_count >= capacity / 2) + continue; + + /* + * Because the smallest number of nodes we can move when + * splitting is 2, we never need to worry about not having a + * left sibling (a sibling is a neighbor with the same parent). + */ + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + ASSERT3U(parent_idx, >, 0); + zfs_btree_core_t *l_neighbor = + (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + hdr->bth_count + i); + } + } + /* First, shift things in the right node back. 
*/ + bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, + BSS_TRAPEZOID, BSD_RIGHT); + + /* Next, move the separator to the right node. */ + uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * + size); + uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); + bmov(separator, e_out, size); + + /* + * Now, move elements and children from the left node to the + * right. We move one more child than elements. + */ + move_count--; + uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; + bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, + BSS_TRAPEZOID); + + /* + * Finally, move the last element in the left node to the + * separator's position. + */ + move_idx--; + bmov(l_neighbor->btc_elems + move_idx * size, separator, size); + + l_neighbor->btc_hdr.bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + + zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); + + for (int i = 0; i <= hdr->bth_count; i++) + cur->btc_children[i]->bth_parent = cur; + } + + tree->bt_bulk = NULL; +} + +/* + * Insert value into tree at the location specified by where. + */ +void +zfs_btree_insert(zfs_btree_t *tree, const void *value, + const zfs_btree_index_t *where) +{ + zfs_btree_index_t idx = {0}; + + /* If we're not inserting in the last leaf, end bulk insert mode. */ + if (tree->bt_bulk != NULL) { + if (where->bti_node != &tree->bt_bulk->btl_hdr) { + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); + where = &idx; + } + } + + tree->bt_num_elems++; + /* + * If this is the first element in the tree, create a leaf root node + * and add the value to it. 
+ */ + if (where->bti_node == NULL) { + ASSERT3U(tree->bt_num_elems, ==, 1); + ASSERT3S(tree->bt_height, ==, -1); + ASSERT3P(tree->bt_root, ==, NULL); + ASSERT0(where->bti_offset); + + tree->bt_num_nodes++; + zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + tree->bt_root = &leaf->btl_hdr; + tree->bt_height++; + + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + hdr->bth_parent = NULL; + hdr->bth_core = B_FALSE; + hdr->bth_count = 0; + zfs_btree_poison_node(tree, hdr); + + zfs_btree_insert_into_leaf(tree, leaf, value, 0); + tree->bt_bulk = leaf; + } else if (!where->bti_node->bth_core) { + /* + * If we're inserting into a leaf, go directly to the helper + * function. + */ + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)where->bti_node, value, + where->bti_offset); + } else { + /* + * If we're inserting into a core node, we can't just shift + * the existing element in that slot in the same node without + * breaking our ordering invariants. Instead we place the new + * value in the node at that spot and then insert the old + * separator into the first slot in the subtree to the right. + */ + ASSERT(where->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; + + /* + * We can ignore bti_before, because either way the value + * should end up in bti_offset. + */ + uint64_t off = where->bti_offset; + zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; + size_t size = tree->bt_elem_size; + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(node->btc_elems + off * size, buf, size); + bmov(value, node->btc_elems + off * size, size); + + /* + * Find the first slot in the subtree to the right, insert + * there. 
+		 */
+		zfs_btree_index_t new_idx;
+		VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL);
+		ASSERT0(new_idx.bti_offset);
+		ASSERT(!new_idx.bti_node->bth_core);
+		zfs_btree_insert_into_leaf(tree,
+		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
+		kmem_free(buf, size);
+	}
+	zfs_btree_verify(tree);
+}
+
+/*
+ * Return the first element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_first_helper(tree->bt_root, where));
+}
+
+/*
+ * Find the last element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
+    zfs_btree_index_t *where)
+{
+	zfs_btree_hdr_t *node;
+
+	for (node = hdr; node->bth_core; node =
+	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
+		;
+
+	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+	if (where != NULL) {
+		where->bti_node = node;
+		where->bti_offset = node->bth_count - 1;
+		where->bti_before = B_FALSE;
+	}
+	return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+}
+
+/*
+ * Return the last element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_last_helper(tree, tree->bt_root, where));
+}
+
+/*
+ * This function contains the logic to find the next node in the tree. A
+ * helper function is used because there are multiple internal consumers of
+ * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
+ * node after we've finished with it. 
+ */ +static void * +zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx, + void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) +{ + if (idx->bti_node == NULL) { + ASSERT3S(tree->bt_height, ==, -1); + return (NULL); + } + + uint64_t offset = idx->bti_offset; + if (!idx->bti_node->bth_core) { + /* + * When finding the next element of an element in a leaf, + * there are two cases. If the element isn't the last one in + * the leaf, in which case we just return the next element in + * the leaf. Otherwise, we need to traverse up our parents + * until we find one where our ancestor isn't the last child + * of its parent. Once we do, the next element is the + * separator after our ancestor in its parent. + */ + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + uint64_t new_off = offset + (idx->bti_before ? 0 : 1); + if (leaf->btl_hdr.bth_count > new_off) { + out_idx->bti_node = &leaf->btl_hdr; + out_idx->bti_offset = new_off; + out_idx->bti_before = B_FALSE; + return (leaf->btl_elems + new_off * tree->bt_elem_size); + } + + zfs_btree_hdr_t *prev = &leaf->btl_hdr; + for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; + node != NULL; node = node->btc_hdr.bth_parent) { + zfs_btree_hdr_t *hdr = &node->btc_hdr; + ASSERT(hdr->bth_core); + uint64_t i = zfs_btree_find_parent_idx(tree, prev); + if (done_func != NULL) + done_func(tree, prev); + if (i == hdr->bth_count) { + prev = hdr; + continue; + } + out_idx->bti_node = hdr; + out_idx->bti_offset = i; + out_idx->bti_before = B_FALSE; + return (node->btc_elems + i * tree->bt_elem_size); + } + if (done_func != NULL) + done_func(tree, prev); + /* + * We've traversed all the way up and been at the end of the + * node every time, so this was the last element in the tree. + */ + return (NULL); + } + + /* If we were before an element in a core node, return that element. 
*/ + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + if (idx->bti_before) { + out_idx->bti_before = B_FALSE; + return (node->btc_elems + offset * tree->bt_elem_size); + } + + /* + * The next element from one in a core node is the first element in + * the subtree just to the right of the separator. + */ + zfs_btree_hdr_t *child = node->btc_children[offset + 1]; + return (zfs_btree_first_helper(child, out_idx)); +} + +/* + * Return the next valued node in the tree. The same address can be safely + * passed for idx and out_idx. + */ +void * +zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) +{ + return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); +} + +/* + * Return the previous valued node in the tree. The same value can be safely + * passed for idx and out_idx. + */ +void * +zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) +{ + if (idx->bti_node == NULL) { + ASSERT3S(tree->bt_height, ==, -1); + return (NULL); + } + + uint64_t offset = idx->bti_offset; + if (!idx->bti_node->bth_core) { + /* + * When finding the previous element of an element in a leaf, + * there are two cases. If the element isn't the first one in + * the leaf, in which case we just return the previous element + * in the leaf. Otherwise, we need to traverse up our parents + * until we find one where our previous ancestor isn't the + * first child. Once we do, the previous element is the + * separator after our previous ancestor. 
+ */ + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + if (offset != 0) { + out_idx->bti_node = &leaf->btl_hdr; + out_idx->bti_offset = offset - 1; + out_idx->bti_before = B_FALSE; + return (leaf->btl_elems + (offset - 1) * + tree->bt_elem_size); + } + zfs_btree_hdr_t *prev = &leaf->btl_hdr; + for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; + node != NULL; node = node->btc_hdr.bth_parent) { + zfs_btree_hdr_t *hdr = &node->btc_hdr; + ASSERT(hdr->bth_core); + uint64_t i = zfs_btree_find_parent_idx(tree, prev); + if (i == 0) { + prev = hdr; + continue; + } + out_idx->bti_node = hdr; + out_idx->bti_offset = i - 1; + out_idx->bti_before = B_FALSE; + return (node->btc_elems + (i - 1) * tree->bt_elem_size); + } + /* + * We've traversed all the way up and been at the start of the + * node every time, so this was the first node in the tree. + */ + return (NULL); + } + + /* + * The previous element from one in a core node is the last element in + * the subtree just to the left of the separator. + */ + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + zfs_btree_hdr_t *child = node->btc_children[offset]; + return (zfs_btree_last_helper(tree, child, out_idx)); +} + +/* + * Get the value at the provided index in the tree. + * + * Note that the value returned from this function can be mutated, but only + * if it will not change the ordering of the element with respect to any other + * elements that could be in the tree. + */ +void * +zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) +{ + ASSERT(!idx->bti_before); + if (!idx->bti_node->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; + return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); + } + ASSERT(idx->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); +} + +/* Add the given value to the tree. 
Must not already be in the tree. */ +void +zfs_btree_add(zfs_btree_t *tree, const void *node) +{ + zfs_btree_index_t where = {0}; + VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); + zfs_btree_insert(tree, node, &where); +} + +/* Helper function to free a tree node. */ +static void +zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) +{ + tree->bt_num_nodes--; + if (!node->bth_core) { + kmem_cache_free(zfs_btree_leaf_cache, node); + } else { + kmem_free(node, sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * tree->bt_elem_size); + } +} + +/* + * Remove the rm_hdr and the separator to its left from the parent node. The + * buffer that rm_hdr was stored in may already be freed, so its contents + * cannot be accessed. + */ +static void +zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, + zfs_btree_hdr_t *rm_hdr) +{ + size_t size = tree->bt_elem_size; + uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; + zfs_btree_hdr_t *hdr = &node->btc_hdr; + /* + * If the node is the root node and rm_hdr is one of two children, + * promote the other child to the root. + */ + if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { + ASSERT3U(hdr->bth_count, ==, 1); + ASSERT3P(tree->bt_root, ==, node); + ASSERT3P(node->btc_children[1], ==, rm_hdr); + tree->bt_root = node->btc_children[0]; + node->btc_children[0]->bth_parent = NULL; + zfs_btree_node_destroy(tree, hdr); + tree->bt_height--; + return; + } + + uint64_t idx; + for (idx = 0; idx <= hdr->bth_count; idx++) { + if (node->btc_children[idx] == rm_hdr) + break; + } + ASSERT3U(idx, <=, hdr->bth_count); + + /* + * If the node is the root or it has more than the minimum number of + * children, just remove the child and separator, and return. + */ + if (hdr->bth_parent == NULL || + hdr->bth_count > min_count) { + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. 
+		 */
+		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+		    BSS_PARALLELOGRAM);
+		hdr->bth_count--;
+		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+		return;
+	}
+
+	ASSERT3U(hdr->bth_count, ==, min_count);
+
+	/*
+	 * Now we try to take a node from a neighbor. We check left, then
+	 * right. If the neighbor exists and has more than the minimum number
+	 * of elements, we move the separator between us and them to our
+	 * node, move their closest element (last for left, first for right)
+	 * to the separator, and move their closest child to our node. Along
+	 * the way we need to collapse the gap made by idx, and (for our right
+	 * neighbor) the gap made by removing their first element and child.
+	 *
+	 * Note: this logic currently doesn't support taking from a neighbor
+	 * that isn't a sibling (i.e. a neighbor with a different
+	 * parent). This isn't critical functionality, but may be worth
+	 * implementing in the future for completeness' sake.
+	 */
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+	    parent->btc_children[parent_idx - 1]);
+	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+		/* We can take a node from the left neighbor. */
+		ASSERT(l_hdr->bth_core);
+		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
+
+		/*
+		 * Start by shifting the elements and children in the current
+		 * node to the right by one spot.
+		 */
+		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
+
+		/*
+		 * Move the separator between node and neighbor to the first
+		 * element slot in the current node.
+		 */
+		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+		    size;
+		bmov(separator, node->btc_elems, size);
+
+		/* Move the last child of neighbor to our first child slot. 
*/ + zfs_btree_hdr_t **take_child = neighbor->btc_children + + l_hdr->bth_count; + bmov(take_child, node->btc_children, sizeof (*take_child)); + node->btc_children[0]->bth_parent = node; + + /* Move the last element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems + + (l_hdr->bth_count - 1) * size; + bmov(take_elem, separator, size); + l_hdr->bth_count--; + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + return; + } + + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can take a node from the right neighbor. */ + ASSERT(r_hdr->bth_core); + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; + + /* + * Shift elements in node left by one spot to overwrite rm_hdr + * and the separator before it. + */ + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, + BSS_PARALLELOGRAM); + + /* + * Move the separator between node and neighbor to the last + * element spot in node. + */ + uint8_t *separator = parent->btc_elems + parent_idx * size; + bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, + size); + + /* + * Move the first child of neighbor to the last child spot in + * node. + */ + zfs_btree_hdr_t **take_child = neighbor->btc_children; + bmov(take_child, node->btc_children + hdr->bth_count, + sizeof (*take_child)); + node->btc_children[hdr->bth_count]->bth_parent = node; + + /* Move the first element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems; + bmov(take_elem, separator, size); + r_hdr->bth_count--; + + /* + * Shift the elements and children of neighbor to cover the + * stolen elements. 
+		 */
+		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
+		    BSS_TRAPEZOID);
+		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+		return;
+	}
+
+	/*
+	 * In this case, neither of our neighbors can spare an element, so we
+	 * need to merge with one of them. We prefer the left one,
+	 * arbitrarily. Move the separator into the leftmost merging node
+	 * (which may be us or the left neighbor), and then move the right
+	 * merging node's elements. Once that's done, we go back and delete
+	 * the element we're removing. Finally, go into the parent and delete
+	 * the right merging node and the separator. This may cause further
+	 * merging.
+	 */
+	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
+	uint64_t new_idx = idx;
+	if (l_hdr != NULL) {
+		keep_hdr = l_hdr;
+		new_rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+	} else {
+		ASSERT3P(r_hdr, !=, NULL);
+		keep_hdr = hdr;
+		new_rm_hdr = r_hdr;
+		parent_idx++;
+	}
+
+	ASSERT(keep_hdr->bth_core);
+	ASSERT(new_rm_hdr->bth_core);
+
+	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
+	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
+
+	if (zfs_btree_verify_intensity >= 5) {
+		for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) {
+			zfs_btree_verify_poison_at(tree, keep_hdr,
+			    keep_hdr->bth_count + i);
+		}
+	}
+
+	/* Move the separator into the left node. */
+	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
+	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+	    size;
+	bmov(separator, e_out, size);
+	keep_hdr->bth_count++;
+
+	/* Move all our elements and children into the left node. */
+	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
+	    keep_hdr->bth_count, BSS_TRAPEZOID);
+
+	uint64_t old_count = keep_hdr->bth_count;
+
+	/* Update bookkeeping */
+	keep_hdr->bth_count += new_rm_hdr->bth_count;
+	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
+
+	/*
+	 * Shift the element and children to the right of rm_hdr to
+	 * the left by one spot. 
+ */ + ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); + bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, + BSS_PARALLELOGRAM); + keep_hdr->bth_count--; + + /* Reparent all our children to point to the left node. */ + zfs_btree_hdr_t **new_start = keep->btc_children + + old_count - 1; + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + new_start[i]->bth_parent = keep; + for (int i = 0; i <= keep_hdr->bth_count; i++) { + ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); + ASSERT3P(keep->btc_children[i], !=, rm_hdr); + } + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + + new_rm_hdr->bth_count = 0; + zfs_btree_node_destroy(tree, new_rm_hdr); + zfs_btree_remove_from_node(tree, parent, new_rm_hdr); +} + +/* Remove the element at the specific location. */ +void +zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) +{ + size_t size = tree->bt_elem_size; + zfs_btree_hdr_t *hdr = where->bti_node; + uint64_t idx = where->bti_offset; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + ASSERT(!where->bti_before); + if (tree->bt_bulk != NULL) { + /* + * Leave bulk insert mode. Note that our index would be + * invalid after we correct the tree, so we copy the value + * we're planning to remove and find it again after + * bulk_finish. + */ + uint8_t *value = zfs_btree_get(tree, where); + uint8_t *tmp = kmem_alloc(size, KM_SLEEP); + bmov(value, tmp, size); + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); + kmem_free(tmp, size); + hdr = where->bti_node; + idx = where->bti_offset; + } + + tree->bt_num_elems--; + /* + * If the element happens to be in a core node, we move a leaf node's + * element into its place and then remove the leaf node element. This + * makes the rebalance logic not need to be recursive both upwards and + * downwards. 
+	 */
+	if (hdr->bth_core) {
+		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
+		void *new_value = zfs_btree_last_helper(tree, left_subtree,
+		    where);
+		ASSERT3P(new_value, !=, NULL);
+
+		bmov(new_value, node->btc_elems + idx * size, size);
+
+		hdr = where->bti_node;
+		idx = where->bti_offset;
+		ASSERT(!where->bti_before);
+	}
+
+	/*
+	 * First, we'll update the leaf's metadata. Then, we shift any
+	 * elements after the idx to the left. After that, we rebalance if
+	 * needed.
+	 */
+	ASSERT(!hdr->bth_core);
+	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+	ASSERT3U(hdr->bth_count, >, 0);
+
+	uint64_t min_count = (capacity / 2) - 1;
+
+	/*
+	 * If we're over the minimum size or this is the root, just overwrite
+	 * the value and return.
+	 */
+	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
+		hdr->bth_count--;
+		bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+		if (hdr->bth_parent == NULL) {
+			ASSERT0(tree->bt_height);
+			if (hdr->bth_count == 0) {
+				tree->bt_root = NULL;
+				tree->bt_height--;
+				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
+			}
+		}
+		if (tree->bt_root != NULL)
+			zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+	ASSERT3U(hdr->bth_count, ==, min_count);
+
+	/*
+	 * Now we try to take a node from a sibling. We check left, then
+	 * right. If they exist and have more than the minimum number of
+	 * elements, we move the separator between us and them to our node
+	 * and move their closest element (last for left, first for right) to
+	 * the separator. Along the way we need to collapse the gap made by
+	 * idx, and (for our right neighbor) the gap made by removing their
+	 * first element.
+	 *
+	 * Note: this logic currently doesn't support taking from a neighbor
+	 * that isn't a sibling. This isn't critical functionality, but may be
+	 * worth implementing in the future for completeness' sake. 
+ */ + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + + zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + parent->btc_children[parent_idx - 1]); + if (l_hdr != NULL && l_hdr->bth_count > min_count) { + /* We can take a node from the left neighbor. */ + ASSERT(!l_hdr->bth_core); + + /* + * Move our elements back by one spot to make room for the + * stolen element and overwrite the element being removed. + */ + bt_shift_leaf_right(tree, leaf, 0, idx); + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems + + (l_hdr->bth_count - 1) * size; + /* Move the separator to our first spot. */ + bmov(separator, leaf->btl_elems, size); + + /* Move our neighbor's last element to the separator. */ + bmov(take_elem, separator, size); + + /* Update the bookkeeping. */ + l_hdr->bth_count--; + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + + zfs_btree_verify(tree); + return; + } + + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can take a node from the right neighbor. */ + ASSERT(!r_hdr->bth_core); + zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; + + /* + * Move our elements after the element being removed forwards + * by one spot to make room for the stolen element and + * overwrite the element being removed. + */ + bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx - + 1); + + uint8_t *separator = parent->btc_elems + parent_idx * size; + uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; + /* Move the separator between us to our last spot. */ + bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, + size); + + /* Move our neighbor's first element to the separator. */ + bmov(take_elem, separator, size); + + /* Update the bookkeeping. 
*/
+		r_hdr->bth_count--;
+
+		/*
+		 * Move our neighbors elements forwards to overwrite the
+		 * stolen element.
+		 */
+		bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
+		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+
+	/*
+	 * In this case, neither of our neighbors can spare an element, so we
+	 * need to merge with one of them. We prefer the left one,
+	 * arbitrarily. Move the separator into the leftmost merging node
+	 * (which may be us or the left neighbor), and then move the right
+	 * merging node's elements. Once that's done, we go back and delete
+	 * the element we're removing. Finally, go into the parent and delete
+	 * the right merging node and the separator. This may cause further
+	 * merging.
+	 */
+	zfs_btree_hdr_t *rm_hdr, *keep_hdr;
+	uint64_t new_idx = idx;
+	if (l_hdr != NULL) {
+		keep_hdr = l_hdr;
+		rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+	} else {
+		ASSERT3P(r_hdr, !=, NULL);
+		keep_hdr = hdr;
+		rm_hdr = r_hdr;
+		parent_idx++;
+	}
+
+	ASSERT(!keep_hdr->bth_core);
+	ASSERT(!rm_hdr->bth_core);
+	ASSERT3U(keep_hdr->bth_count, ==, min_count);
+	ASSERT3U(rm_hdr->bth_count, ==, min_count);
+
+	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
+
+	if (zfs_btree_verify_intensity >= 5) {
+		for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
+			zfs_btree_verify_poison_at(tree, keep_hdr,
+			    keep_hdr->bth_count + i);
+		}
+	}
+	/*
+	 * Move the separator into the first open spot in the left
+	 * neighbor.
+	 */
+	uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
+	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+	    size;
+	bmov(separator, out, size);
+	keep_hdr->bth_count++;
+
+	/* Move our elements to the left neighbor. */
+	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
+	    keep_hdr->bth_count);
+
+	/* Update the bookkeeping. 
*/
+	keep_hdr->bth_count += rm_hdr->bth_count;
+	ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+
+	/* Remove the value from the node */
+	keep_hdr->bth_count--;
+	bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
+	    new_idx);
+	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+	rm_hdr->bth_count = 0;
+	zfs_btree_node_destroy(tree, rm_hdr);
+	/* Remove the emptied node from the parent. */
+	zfs_btree_remove_from_node(tree, parent, rm_hdr);
+	zfs_btree_verify(tree);
+}
+
+/* Remove the given value from the tree. */
+void
+zfs_btree_remove(zfs_btree_t *tree, const void *value)
+{
+	zfs_btree_index_t where = {0};
+	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
+	zfs_btree_remove_from(tree, &where);
+}
+
+/* Return the number of elements in the tree. */
+ulong_t
+zfs_btree_numnodes(zfs_btree_t *tree)
+{
+	return (tree->bt_num_elems);
+}
+
+/*
+ * This function is used to visit all the elements in the tree before
+ * destroying the tree. This allows the calling code to perform any cleanup it
+ * needs to do. This is more efficient than just removing the first element
+ * over and over, because it removes all rebalancing. Once the destroy_nodes()
+ * function has been called, no other btree operations are valid until it
+ * returns NULL, at which point the only valid operation is zfs_btree_destroy(). 
+ * + * example: + * + * zfs_btree_index_t *cookie = NULL; + * my_data_t *node; + * + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) + * free(node->ptr); + * zfs_btree_destroy(tree); + * + */ +void * +zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) +{ + if (*cookie == NULL) { + if (tree->bt_height == -1) + return (NULL); + *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); + return (zfs_btree_first(tree, *cookie)); + } + + void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, + zfs_btree_node_destroy); + if (rval == NULL) { + tree->bt_root = NULL; + tree->bt_height = -1; + tree->bt_num_elems = 0; + kmem_free(*cookie, sizeof (**cookie)); + tree->bt_bulk = NULL; + } + return (rval); +} + +static void +zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (hdr->bth_core) { + zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_clear_helper(tree, btc->btc_children[i]); + } + } + + zfs_btree_node_destroy(tree, hdr); +} + +void +zfs_btree_clear(zfs_btree_t *tree) +{ + if (tree->bt_root == NULL) { + ASSERT0(tree->bt_num_elems); + return; + } + + zfs_btree_clear_helper(tree, tree->bt_root); + tree->bt_num_elems = 0; + tree->bt_root = NULL; + tree->bt_num_nodes = 0; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +void +zfs_btree_destroy(zfs_btree_t *tree) +{ + ASSERT0(tree->bt_num_elems); + ASSERT3P(tree->bt_root, ==, NULL); +} + +/* Verify that every child of this node has the correct parent pointer. */ +static void +zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) + return; + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); + zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); + } +} + +/* Verify that every node has the correct parent pointer. 
*/ +static void +zfs_btree_verify_pointers(zfs_btree_t *tree) +{ + if (tree->bt_height == -1) { + VERIFY3P(tree->bt_root, ==, NULL); + return; + } + VERIFY3P(tree->bt_root->bth_parent, ==, NULL); + zfs_btree_verify_pointers_helper(tree, tree->bt_root); +} + +/* + * Verify that all the current node and its children satisfy the count + * invariants, and return the total count in the subtree rooted in this node. + */ +static uint64_t +zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) { + if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); + VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); + } + + return (hdr->bth_count); + } else { + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint64_t ret = hdr->bth_count; + if (tree->bt_root != hdr && tree->bt_bulk == NULL) + VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); + for (int i = 0; i <= hdr->bth_count; i++) { + ret += zfs_btree_verify_counts_helper(tree, + node->btc_children[i]); + } + + return (ret); + } +} + +/* + * Verify that all nodes satisfy the invariants and that the total number of + * elements is correct. + */ +static void +zfs_btree_verify_counts(zfs_btree_t *tree) +{ + EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); + if (tree->bt_height == -1) { + return; + } + VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, + tree->bt_num_elems); +} + +/* + * Check that the subtree rooted at this node has a uniform height. Returns + * the number of nodes under this node, to help verify bt_num_nodes. 
+ */
+static uint64_t
+zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+    int64_t height)
+{
+	if (!hdr->bth_core) {
+		VERIFY0(height);
+		return (1);
+	}
+
+	VERIFY(hdr->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	uint64_t ret = 1;
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		ret += zfs_btree_verify_height_helper(tree,
+		    node->btc_children[i], height - 1);
+	}
+	return (ret);
+}
+
+/*
+ * Check that the tree rooted at this node has a uniform height, and that the
+ * bt_height in the tree is correct.
+ */
+static void
+zfs_btree_verify_height(zfs_btree_t *tree)
+{
+	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+	if (tree->bt_height == -1) {
+		return;
+	}
+
+	VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
+	    tree->bt_height), ==, tree->bt_num_nodes);
+}
+
+/*
+ * Check that the elements in this node are sorted, and that if this is a core
+ * node, the separators are properly between the subtrees they separate and
+ * that the children also satisfy this requirement. 
+ */ +static void +zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + for (int i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) * + size, leaf->btl_elems + i * size), ==, -1); + } + return; + } + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, + node->btc_elems + i * size), ==, -1); + } + for (int i = 0; i < hdr->bth_count; i++) { + uint8_t *left_child_last = NULL; + zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; + if (left_child_hdr->bth_core) { + zfs_btree_core_t *left_child = + (zfs_btree_core_t *)left_child_hdr; + left_child_last = left_child->btc_elems + + (left_child_hdr->bth_count - 1) * size; + } else { + zfs_btree_leaf_t *left_child = + (zfs_btree_leaf_t *)left_child_hdr; + left_child_last = left_child->btl_elems + + (left_child_hdr->bth_count - 1) * size; + } + if (tree->bt_compar(node->btc_elems + i * size, + left_child_last) != 1) { + panic("btree: compar returned %d (expected 1) at " + "%px %d: compar(%px, %px)", tree->bt_compar( + node->btc_elems + i * size, left_child_last), + (void *)node, i, (void *)(node->btc_elems + i * + size), (void *)left_child_last); + } + + uint8_t *right_child_first = NULL; + zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; + if (right_child_hdr->bth_core) { + zfs_btree_core_t *right_child = + (zfs_btree_core_t *)right_child_hdr; + right_child_first = right_child->btc_elems; + } else { + zfs_btree_leaf_t *right_child = + (zfs_btree_leaf_t *)right_child_hdr; + right_child_first = right_child->btl_elems; + } + if (tree->bt_compar(node->btc_elems + i * size, + right_child_first) != -1) { + panic("btree: compar returned %d (expected -1) at " + "%px %d: compar(%px, %px)", tree->bt_compar( + node->btc_elems + i * size, 
right_child_first), + (void *)node, i, (void *)(node->btc_elems + i * + size), (void *)right_child_first); + } + } + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_verify_order_helper(tree, node->btc_children[i]); + } +} + +/* Check that all elements in the tree are in sorted order. */ +static void +zfs_btree_verify_order(zfs_btree_t *tree) +{ + EQUIV(tree->bt_height == -1, tree->bt_root == NULL); + if (tree->bt_height == -1) { + return; + } + + zfs_btree_verify_order_helper(tree, tree->bt_root); +} + +#ifdef ZFS_DEBUG +/* Check that all unused memory is poisoned correctly. */ +static void +zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t); i++) { + VERIFY3U(leaf->btl_elems[i], ==, val); + } + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; + i++) { + VERIFY3U(node->btc_elems[i], ==, val); + } + + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + VERIFY3P(node->btc_children[i], ==, + (zfs_btree_hdr_t *)BTREE_POISON); + } + + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_verify_poison_helper(tree, + node->btc_children[i]); + } + } +} +#endif + +/* Check that unused memory in the tree is still poisoned. 
*/ +static void +zfs_btree_verify_poison(zfs_btree_t *tree) +{ +#ifdef ZFS_DEBUG + if (tree->bt_height == -1) + return; + zfs_btree_verify_poison_helper(tree, tree->bt_root); +#endif +} + +void +zfs_btree_verify(zfs_btree_t *tree) +{ + if (zfs_btree_verify_intensity == 0) + return; + zfs_btree_verify_height(tree); + if (zfs_btree_verify_intensity == 1) + return; + zfs_btree_verify_pointers(tree); + if (zfs_btree_verify_intensity == 2) + return; + zfs_btree_verify_counts(tree); + if (zfs_btree_verify_intensity == 3) + return; + zfs_btree_verify_order(tree); + + if (zfs_btree_verify_intensity == 4) + return; + zfs_btree_verify_poison(tree); +}
--- a/usr/src/uts/common/fs/zfs/ddt.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/ddt.c Mon Dec 09 14:15:34 2019 +0000 @@ -794,7 +794,7 @@ break; } - return (AVL_ISIGN(cmp)); + return (TREE_ISIGN(cmp)); } static ddt_t *
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Mon Dec 09 14:15:34 2019 +0000 @@ -1829,7 +1829,7 @@ */ rv = strcmp(luqn->uqn_id, ruqn->uqn_id); - return (AVL_ISIGN(rv)); + return (TREE_ISIGN(rv)); } static void
--- a/usr/src/uts/common/fs/zfs/dmu_recv.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/dmu_recv.c Mon Dec 09 14:15:34 2019 +0000 @@ -883,14 +883,10 @@ static int guid_compare(const void *arg1, const void *arg2) { - const guid_map_entry_t *gmep1 = arg1; - const guid_map_entry_t *gmep2 = arg2; - - if (gmep1->guid < gmep2->guid) - return (-1); - else if (gmep1->guid > gmep2->guid) - return (1); - return (0); + const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; + const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; + + return (TREE_CMP(gmep1->guid, gmep2->guid)); } static void
--- a/usr/src/uts/common/fs/zfs/dnode.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/dnode.c Mon Dec 09 14:15:34 2019 +0000 @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 RackTop Systems. @@ -90,11 +90,11 @@ const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; - int cmp = AVL_CMP(d1->db_level, d2->db_level); + int cmp = TREE_CMP(d1->db_level, d2->db_level); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + cmp = TREE_CMP(d1->db_blkid, d2->db_blkid); if (likely(cmp)) return (cmp); @@ -106,7 +106,7 @@ return (1); } - return (AVL_PCMP(d1, d2)); + return (TREE_PCMP(d1, d2)); } /* ARGSUSED */ @@ -2197,7 +2197,8 @@ mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
--- a/usr/src/uts/common/fs/zfs/dsl_deadlist.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c Mon Dec 09 14:15:34 2019 +0000 @@ -58,7 +58,7 @@ const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; - return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); + return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } static void
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c Mon Dec 09 14:15:34 2019 +0000 @@ -390,7 +390,7 @@ val = strcmp(node1->p_setname, node2->p_setname); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } /*
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c Mon Dec 09 14:15:34 2019 +0000 @@ -295,7 +295,7 @@ /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; - avl_tree_t q_exts_by_size; + zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; @@ -653,7 +653,8 @@ mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); - ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, + NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } @@ -1239,7 +1240,7 @@ queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* # extents in exts_by_size = # in exts_by_addr */ - mused += avl_numnodes(&queue->q_exts_by_size) * + mused += zfs_btree_numnodes(&queue->q_exts_by_size) * sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); @@ -2769,7 +2770,7 @@ srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; - SIO_SET_OFFSET(srch_sio, rs->rs_start); + SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, @@ -2781,10 +2782,12 @@ if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && - SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { - ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); - ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); + while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr) && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, + queue->q_exts_by_addr)); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, + queue->q_exts_by_addr)); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); @@ -2802,16 +2805,19 @@ * in the segment we update it to reflect the work we 
were able to * complete. Otherwise, we remove it from the range tree entirely. */ - if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr)) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); + SIO_GET_OFFSET(sio), rs_get_end(rs, + queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { - range_tree_remove(queue->q_exts_by_addr, rs->rs_start, - rs->rs_end - rs->rs_start); + uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); + uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); + range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); return (B_FALSE); } } @@ -2832,6 +2838,7 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; + range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); @@ -2839,9 +2846,16 @@ /* handle tunable overrides */ if (scn->scn_checkpointing || scn->scn_clearing) { if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(queue->q_exts_by_addr)); + return (range_tree_first(rt)); } else if (zfs_scan_issue_strategy == 2) { - return (avl_first(&queue->q_exts_by_size)); + range_seg_t *size_rs = + zfs_btree_first(&queue->q_exts_by_size, NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, + size); + ASSERT3P(addr_rs, !=, NULL); + return (addr_rs); } } @@ -2855,9 +2869,15 @@ * In this case, we instead switch to issuing extents in LBA order. 
*/ if (scn->scn_checkpointing) { - return (range_tree_first(queue->q_exts_by_addr)); + return (range_tree_first(rt)); } else if (scn->scn_clearing) { - return (avl_first(&queue->q_exts_by_size)); + range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, + NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, size); + ASSERT3P(addr_rs, !=, NULL); + return (addr_rs); } else { return (NULL); } @@ -3946,9 +3966,10 @@ static int ext_size_compare(const void *x, const void *y) { - const range_seg_t *rsa = x, *rsb = y; - uint64_t sa = rsa->rs_end - rsa->rs_start, - sb = rsb->rs_end - rsb->rs_start; + const range_seg_gap_t *rsa = x, *rsb = y; + + uint64_t sa = rsa->rs_end - rsa->rs_start; + uint64_t sb = rsb->rs_end - rsb->rs_start; uint64_t score_a, score_b; score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * @@ -3977,7 +3998,7 @@ { const scan_io_t *a = x, *b = y; - return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); + return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. */ @@ -3991,8 +4012,8 @@ q->q_vd = vd; q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, - &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); + q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, + &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
--- a/usr/src/uts/common/fs/zfs/metaslab.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/metaslab.c Mon Dec 09 14:15:34 2019 +0000 @@ -38,6 +38,7 @@ #include <sys/zfeature.h> #include <sys/vdev_indirect_mapping.h> #include <sys/zap.h> +#include <sys/btree.h> #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) @@ -179,6 +180,13 @@ int metaslab_df_max_search = 16 * 1024 * 1024; /* + * Forces the metaslab_block_picker function to search for at least this many + * segments forwards until giving up on finding a segment that the allocation + * will fit into. + */ +uint32_t metaslab_min_search_count = 100; + +/* * If we are not searching forward (due to metaslab_df_max_search, * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable * controls what segment is used. If it is set, we will use the largest free @@ -274,17 +282,32 @@ int max_disabled_ms = 3; /* + * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. + * To avoid 64-bit overflow, don't set above UINT32_MAX. + */ +unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ + +/* * Maximum percentage of memory to use on storing loaded metaslabs. If loading * a metaslab would take it over this percentage, the oldest selected metaslab * is automatically unloaded. */ -int zfs_metaslab_mem_limit = 25; +int zfs_metaslab_mem_limit = 75; /* - * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. - * To avoid 64-bit overflow, don't set above UINT32_MAX. + * Force the per-metaslab range trees to use 64-bit integers to store + * segments. Used for debugging purposes. */ -unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ +boolean_t zfs_metaslab_force_large_segs = B_FALSE; + +/* + * By default we only store segments over a certain size in the size-sorted + * metaslab trees (ms_allocatable_by_size and + * ms_unflushed_frees_by_size). 
This dramatically reduces memory usage and + * improves load and unload times at the cost of causing us to use slightly + * larger segments than we would otherwise in some cases. + */ +uint32_t metaslab_by_size_min_shift = 14; static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); @@ -295,9 +318,56 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); +static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); kmem_cache_t *metaslab_alloc_trace_cache; +typedef struct metaslab_stats { + kstat_named_t metaslabstat_trace_over_limit; + kstat_named_t metaslabstat_df_find_under_floor; + kstat_named_t metaslabstat_reload_tree; +} metaslab_stats_t; + +static metaslab_stats_t metaslab_stats = { + { "trace_over_limit", KSTAT_DATA_UINT64 }, + { "df_find_under_floor", KSTAT_DATA_UINT64 }, + { "reload_tree", KSTAT_DATA_UINT64 }, +}; + +#define METASLABSTAT_BUMP(stat) \ + atomic_inc_64(&metaslab_stats.stat.value.ui64); + + +kstat_t *metaslab_ksp; + +void +metaslab_stat_init(void) +{ + ASSERT(metaslab_alloc_trace_cache == NULL); + metaslab_alloc_trace_cache = kmem_cache_create( + "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", + "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (metaslab_ksp != NULL) { + metaslab_ksp->ks_data = &metaslab_stats; + kstat_install(metaslab_ksp); + } +} + +void +metaslab_stat_fini(void) +{ + if (metaslab_ksp != NULL) { + kstat_delete(metaslab_ksp); + metaslab_ksp = NULL; + } + + kmem_cache_destroy(metaslab_alloc_trace_cache); + metaslab_alloc_trace_cache = NULL; +} + /* * ========================================================================== * Metaslab classes @@ -608,13 +678,13 @@ if (sort1 > sort2) 
return (1); - int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); - IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); - - return (AVL_CMP(m1->ms_start, m2->ms_start)); + IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); + + return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* @@ -711,17 +781,17 @@ const metaslab_t *a = va; const metaslab_t *b = vb; - int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; - cmp = AVL_CMP(a_vdev_id, b_vdev_id); + cmp = TREE_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); - return (AVL_CMP(a->ms_id, b->ms_id)); + return (TREE_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * @@ -1248,25 +1318,170 @@ */ /* - * Comparison function for the private size-ordered tree. Tree is sorted - * by size, larger sizes at the end of the tree. + * Comparison function for the private size-ordered tree using 32-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. + */ +static int +metaslab_rangesize32_compare(const void *x1, const void *x2) +{ + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + uint64_t rs_size1 = r1->rs_end - r1->rs_start; + uint64_t rs_size2 = r2->rs_end - r2->rs_start; + + int cmp = TREE_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(r1->rs_start, r2->rs_start)); +} + +/* + * Comparison function for the private size-ordered tree using 64-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
*/ static int -metaslab_rangesize_compare(const void *x1, const void *x2) +metaslab_rangesize64_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = x1; - const range_seg_t *r2 = x2; + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - int cmp = AVL_CMP(rs_size1, rs_size2); + int cmp = TREE_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); - return (AVL_CMP(r1->rs_start, r2->rs_start)); + return (TREE_CMP(r1->rs_start, r2->rs_start)); +} +typedef struct metaslab_rt_arg { + zfs_btree_t *mra_bt; + uint32_t mra_floor_shift; +} metaslab_rt_arg_t; + +struct mssa_arg { + range_tree_t *rt; + metaslab_rt_arg_t *mra; +}; + +static void +metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) +{ + struct mssa_arg *mssap = arg; + range_tree_t *rt = mssap->rt; + metaslab_rt_arg_t *mrap = mssap->mra; + range_seg_max_t seg = {0}; + rs_set_start(&seg, rt, start); + rs_set_end(&seg, rt, start + size); + metaslab_rt_add(rt, &seg, mrap); +} + +static void +metaslab_size_tree_full_load(range_tree_t *rt) +{ + metaslab_rt_arg_t *mrap = rt->rt_arg; +#ifdef _METASLAB_TRACING + METASLABSTAT_BUMP(metaslabstat_reload_tree); +#endif + ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); + mrap->mra_floor_shift = 0; + struct mssa_arg arg = {0}; + arg.rt = rt; + arg.mra = mrap; + range_tree_walk(rt, metaslab_size_sorted_add, &arg); } /* + * Create any block allocator specific components. The current allocators + * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
+ */ +/* ARGSUSED */ +static void +metaslab_rt_create(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + size_t size; + int (*compare) (const void *, const void *); + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = metaslab_rangesize32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = metaslab_rangesize64_compare; + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, compare, size); + mrap->mra_floor_shift = metaslab_by_size_min_shift; +} + +/* ARGSUSED */ +static void +metaslab_rt_destroy(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + zfs_btree_destroy(size_tree); + kmem_free(mrap, sizeof (*mrap)); +} + +/* ARGSUSED */ +static void +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < + (1 << mrap->mra_floor_shift)) + return; + + zfs_btree_add(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << + mrap->mra_floor_shift)) + return; + + zfs_btree_remove(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_vacate(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + metaslab_rt_create(rt, arg); +} + +static range_tree_ops_t metaslab_rt_ops = { + .rtop_create = metaslab_rt_create, + .rtop_destroy = metaslab_rt_destroy, + .rtop_add = metaslab_rt_add, + .rtop_remove = metaslab_rt_remove, + .rtop_vacate = metaslab_rt_vacate +}; + +/* * 
========================================================================== * Common allocator routines * ========================================================================== @@ -1278,16 +1493,20 @@ uint64_t metaslab_largest_allocatable(metaslab_t *msp) { - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL) return (0); - rs = avl_last(t); + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + + rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); - return (rs->rs_end - rs->rs_start); + return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, + msp->ms_allocatable)); } /* @@ -1302,7 +1521,10 @@ if (msp->ms_unflushed_frees == NULL) return (0); - range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_unflushed_frees); + range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, + NULL); if (rs == NULL) return (0); @@ -1329,8 +1551,8 @@ * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. 
*/ - uint64_t rstart = rs->rs_start; - uint64_t rsize = rs->rs_end - rstart; + uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); + uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; @@ -1354,44 +1576,52 @@ } static range_seg_t * -metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) +metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, + uint64_t size, zfs_btree_index_t *where) { - range_seg_t *rs, rsearch; - avl_index_t where; - - rsearch.rs_start = start; - rsearch.rs_end = start + size; - - rs = avl_find(t, &rsearch, &where); + range_seg_t *rs; + range_seg_max_t rsearch; + + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, start + size); + + rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { - rs = avl_nearest(t, where, AVL_AFTER); + rs = zfs_btree_next(t, where, where); } return (rs); } /* - * This is a helper function that can be used by the allocator to find - * a suitable block to allocate. This will search the specified AVL - * tree looking for a block that matches the specified criteria. + * This is a helper function that can be used by the allocator to find a + * suitable block to allocate. This will search the specified B-tree looking + * for a block that matches the specified criteria. 
*/ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, +metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, uint64_t max_search) { - range_seg_t *rs = metaslab_block_find(t, *cursor, size); + if (*cursor == 0) + *cursor = rt->rt_start; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; + range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); uint64_t first_found; + int count_searched = 0; if (rs != NULL) - first_found = rs->rs_start; - - while (rs != NULL && rs->rs_start - first_found <= max_search) { - uint64_t offset = rs->rs_start; - if (offset + size <= rs->rs_end) { + first_found = rs_get_start(rs, rt); + + while (rs != NULL && (rs_get_start(rs, rt) - first_found <= + max_search || count_searched < metaslab_min_search_count)) { + uint64_t offset = rs_get_start(rs, rt); + if (offset + size <= rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } - rs = AVL_NEXT(t, rs); + rs = zfs_btree_next(bt, &where, &where); + count_searched++; } *cursor = 0; @@ -1435,8 +1665,6 @@ uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(&rt->rt_root), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); /* * If we're running low on space, find a segment based on size, @@ -1446,22 +1674,33 @@ free_pct < metaslab_df_free_pct) { offset = -1; } else { - offset = metaslab_block_picker(&rt->rt_root, + offset = metaslab_block_picker(rt, cursor, size, metaslab_df_max_search); } if (offset == -1) { range_seg_t *rs; + if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ - rs = avl_last(&msp->ms_allocatable_by_size); + rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { + zfs_btree_index_t where; /* use segment of this size, or next largest */ +#ifdef _METASLAB_TRACING + metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; + if (size < 
(1 << mrap->mra_floor_shift)) { + METASLABSTAT_BUMP( + metaslabstat_df_find_under_floor); + } +#endif rs = metaslab_block_find(&msp->ms_allocatable_by_size, - 0, size); + rt, msp->ms_start, size, &where); } - if (rs != NULL && rs->rs_start + size <= rs->rs_end) { - offset = rs->rs_start; + if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, + rt)) { + offset = rs_get_start(rs, rt); *cursor = offset + size; } } @@ -1486,25 +1725,27 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + rs = zfs_btree_last(t, NULL); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < + size) return (-1ULL); - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; + *cursor = rs_get_start(rs, rt); + *cursor_end = rs_get_end(rs, rt); } offset = *cursor; @@ -1535,39 +1776,40 @@ static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &msp->ms_allocatable->rt_root; - avl_index_t where; - range_seg_t *rs, rsearch; + zfs_btree_t *t = &msp->ms_allocatable->rt_root; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < 
size) return (-1ULL); - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; - - rs = avl_find(t, &rsearch, &where); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { + rs_set_start(&rsearch, rt, *cursor); + rs_set_end(&rsearch, rt, *cursor + size); + + rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; - rsearch.rs_start = 0; - rsearch.rs_end = MIN(max_size, - 1ULL << (hbit + metaslab_ndf_clump_shift)); - rs = avl_find(t, &rsearch, &where); + rs_set_start(&rsearch, rt, 0); + rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + + metaslab_ndf_clump_shift))); + + rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); + rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } - if ((rs->rs_end - rs->rs_start) >= size) { - *cursor = rs->rs_start + size; - return (rs->rs_start); + if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { + *cursor = rs_get_start(rs, rt) + size; + return (rs_get_start(rs, rt)); } return (-1ULL); } @@ -1905,9 +2147,9 @@ { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); - extern kmem_cache_t *range_seg_cache; - uint64_t inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); - uint64_t size = kmem_cache_stat(range_seg_cache, "buf_size"); + extern kmem_cache_t *zfs_btree_leaf_cache; + uint64_t inuse = kmem_cache_stat(zfs_btree_leaf_cache, "buf_inuse"); + uint64_t size = kmem_cache_stat(zfs_btree_leaf_cache, "buf_size"); int tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; @@ -1944,7 +2186,7 @@ */ if (msp->ms_loading) { msp = next_msp; - inuse = kmem_cache_stat(range_seg_cache, + inuse = kmem_cache_stat(zfs_btree_leaf_cache, "buf_inuse"); continue; } @@ -1967,7 +2209,8 @@ } mutex_exit(&msp->ms_lock); msp = next_msp; - inuse = kmem_cache_stat(range_seg_cache, "buf_inuse"); + inuse = 
kmem_cache_stat(zfs_btree_leaf_cache, + "buf_inuse"); } } #endif @@ -2010,11 +2253,40 @@ mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); + metaslab_rt_arg_t *mrap; + if (msp->ms_allocatable->rt_arg == NULL) { + mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + } else { + mrap = msp->ms_allocatable->rt_arg; + msp->ms_allocatable->rt_ops = NULL; + msp->ms_allocatable->rt_arg = NULL; + } + mrap->mra_bt = &msp->ms_allocatable_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); + + /* Now, populate the size-sorted tree. */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; + + struct mssa_arg arg = {0}; + arg.rt = msp->ms_allocatable; + arg.mra = mrap; + range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, + &arg); } else { /* + * Add the size-sorted tree first, since we don't need to load + * the metaslab from the spacemap. + */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; + /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the * ms_allocatable tree. @@ -2276,6 +2548,29 @@ metaslab_recalculate_weight_and_sort(msp); } +/* + * We want to optimize the memory use of the per-metaslab range + * trees. To do this, we store the segments in the range trees in + * units of sectors, zero-indexing from the start of the metaslab. If + * the vdev_ms_shift - the vdev_ashift is less than 32, we can store + * the ranges using two uint32_ts, rather than two uint64_ts. 
+ */ +static range_seg_type_t +metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, + uint64_t *start, uint64_t *shift) +{ + if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && + !zfs_metaslab_force_large_segs) { + *shift = vdev->vdev_ashift; + *start = msp->ms_start; + return (RANGE_SEG32); + } else { + *shift = 0; + *start = 0; + return (RANGE_SEG64); + } +} + void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { @@ -2352,6 +2647,10 @@ ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); + /* * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves @@ -2360,10 +2659,9 @@ * we'd data fault on any attempt to use this metaslab before * it's ready. */ - ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, - &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); - - ms->ms_trim = range_tree_create(NULL, NULL); + ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); + + ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); @@ -2418,7 +2716,7 @@ { return ((range_tree_numsegs(ms->ms_unflushed_allocs) + range_tree_numsegs(ms->ms_unflushed_frees)) * - sizeof (range_seg_t)); + ms->ms_unflushed_allocs->rt_root.bt_elem_size); } void @@ -3207,7 +3505,7 @@ * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ - if (avl_is_empty(&msp->ms_allocatable_by_size) || + if (range_tree_numsegs(msp->ms_allocatable) == 0 || msp->ms_condense_wanted) return (B_TRUE); @@ -3253,28 +3551,29 @@ * So to truncate the space map to represent all the entries of * previous TXGs we do the following: * - * 1] We create a range tree (condense tree) that is 100% allocated. 
- * 2] We remove from it all segments found in the ms_defer trees + * 1] We create a range tree (condense tree) that is 100% empty. + * 2] We add to it all segments found in the ms_defer trees * as those segments are marked as free in the original space * map. We do the same with the ms_allocating trees for the same - * reason. Removing these segments should be a relatively + * reason. Adding these segments should be a relatively * inexpensive operation since we expect these trees to have a * small number of nodes. - * 3] We vacate any unflushed allocs as they should already exist - * in the condense tree. Then we vacate any unflushed frees as - * they should already be part of ms_allocatable. - * 4] At this point, we would ideally like to remove all segments + * 3] We vacate any unflushed allocs, since they are not frees we + * need to add to the condense tree. Then we vacate any + * unflushed frees as they should already be part of ms_allocatable. + * 4] At this point, we would ideally like to add all segments * in the ms_allocatable tree from the condense tree. This way * we would write all the entries of the condense tree as the - * condensed space map, which would only contain allocated - * segments with everything else assumed to be freed. + * condensed space map, which would only contain freeed + * segments with everything else assumed to be allocated. * * Doing so can be prohibitively expensive as ms_allocatable can - * be large, and therefore computationally expensive to subtract - * from the condense_tree. Instead we first sync out the - * condense_tree and then the ms_allocatable, in the condensed - * space map. While this is not optimal, it is typically close to - * optimal and more importantly much cheaper to compute. + * be large, and therefore computationally expensive to add to + * the condense_tree. 
Instead we first sync out an entry marking + * everything as allocated, then the condense_tree and then the + * ms_allocatable, in the condensed space map. While this is not + * optimal, it is typically close to optimal and more importantly + * much cheaper to compute. * * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed @@ -3288,22 +3587,26 @@ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, spa->spa_name, space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_allocatable->rt_root), + range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - condense_tree = range_tree_create(NULL, NULL); - range_tree_add(condense_tree, msp->ms_start, msp->ms_size); + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, + &start, &shift); + + condense_tree = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, @@ -3351,11 +3654,17 @@ * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. 
*/ - space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, + shift); + range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); + space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); + range_tree_vacate(tmp_tree, NULL, NULL); + range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; @@ -3598,7 +3907,7 @@ return; - VERIFY(txg <= spa_final_dirty_txg(spa)); + VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently @@ -3887,32 +4196,46 @@ * range trees and add its capacity to the vdev. */ if (msp->ms_freed == NULL) { + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(vd, msp, &start, + &shift); + for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_allocating[t] == NULL); - msp->ms_allocating[t] = range_tree_create(NULL, NULL); + msp->ms_allocating[t] = range_tree_create(NULL, type, + NULL, start, shift); } ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, NULL); + msp->ms_freeing = range_tree_create(NULL, type, NULL, start, + shift); ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, NULL); + msp->ms_freed = range_tree_create(NULL, type, NULL, start, + shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT3P(msp->ms_defer[t], ==, NULL); - msp->ms_defer[t] = range_tree_create(NULL, NULL); + msp->ms_defer[t] = range_tree_create(NULL, type, NULL, + start, shift); } ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, NULL); + msp->ms_checkpointing = range_tree_create(NULL, type, NULL, + start, shift); ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); - 
msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, + start, shift); + + metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + mrap->mra_bt = &msp->ms_unflushed_frees_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, - &msp->ms_unflushed_frees_by_size, - metaslab_rangesize_compare, 0); + msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + type, mrap, start, shift); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } @@ -4091,36 +4414,6 @@ * Metaslab allocation tracing facility * ========================================================================== */ -kstat_t *metaslab_trace_ksp; -kstat_named_t metaslab_trace_over_limit; - -void -metaslab_alloc_trace_init(void) -{ - ASSERT(metaslab_alloc_trace_cache == NULL); - metaslab_alloc_trace_cache = kmem_cache_create( - "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", - "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); - if (metaslab_trace_ksp != NULL) { - metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; - kstat_named_init(&metaslab_trace_over_limit, - "metaslab_trace_over_limit", KSTAT_DATA_UINT64); - kstat_install(metaslab_trace_ksp); - } -} - -void -metaslab_alloc_trace_fini(void) -{ - if (metaslab_trace_ksp != NULL) { - kstat_delete(metaslab_trace_ksp); - metaslab_trace_ksp = NULL; - } - kmem_cache_destroy(metaslab_alloc_trace_cache); - metaslab_alloc_trace_cache = NULL; -} /* * Add an allocation trace element to the allocation tracing list. 
@@ -4145,7 +4438,7 @@ #ifdef DEBUG panic("too many entries in allocation list"); #endif - atomic_inc_64(&metaslab_trace_over_limit.value.ui64); + METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next);
--- a/usr/src/uts/common/fs/zfs/range_tree.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/range_tree.c Mon Dec 09 14:15:34 2019 +0000 @@ -74,42 +74,38 @@ * support removing complete segments. */ -kmem_cache_t *range_seg_cache; - -/* Generic ops for managing an AVL tree alongside a range tree */ -struct range_tree_ops rt_avl_ops = { - .rtop_create = rt_avl_create, - .rtop_destroy = rt_avl_destroy, - .rtop_add = rt_avl_add, - .rtop_remove = rt_avl_remove, - .rtop_vacate = rt_avl_vacate, -}; - -void -range_tree_init(void) +static inline void +rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) { - ASSERT(range_seg_cache == NULL); - range_seg_cache = kmem_cache_create("range_seg_cache", - sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -range_tree_fini(void) -{ - kmem_cache_destroy(range_seg_cache); - range_seg_cache = NULL; + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + size_t size = 0; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + VERIFY(0); + } + bcopy(src, dest, size); } void range_tree_stat_verify(range_tree_t *rt) { range_seg_t *rs; + zfs_btree_index_t where; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; - for (rs = avl_first(&rt->rt_root); rs != NULL; - rs = AVL_NEXT(&rt->rt_root, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; hist[idx]++; @@ -128,7 +124,7 @@ static void range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -142,7 +138,7 @@ static 
void range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -153,14 +149,35 @@ rt->rt_histogram[idx]--; } -/* - * NOTE: caller is responsible for all locking. - */ +static int +range_tree_seg32_compare(const void *x1, const void *x2) +{ + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + static int -range_tree_seg_compare(const void *x1, const void *x2) +range_tree_seg64_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = (const range_seg_t *)x1; - const range_seg_t *r2 = (const range_seg_t *)x2; + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg_gap_compare(const void *x1, const void *x2) +{ + const range_seg_gap_t *r1 = x1; + const range_seg_gap_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); @@ -169,18 +186,42 @@ } range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap) +range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, + uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, const void *), + uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - avl_create(&rt->rt_root, range_tree_seg_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); + ASSERT3U(shift, <, 64); + ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); + size_t size; + int (*compare) (const void *, const void *); + switch (type) { + case 
RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = range_tree_seg32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = range_tree_seg64_compare; + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + compare = range_tree_seg_gap_compare; + break; + default: + panic("Invalid range seg type %d", type); + } + zfs_btree_create(&rt->rt_root, compare, size); rt->rt_ops = ops; rt->rt_arg = arg; rt->rt_gap = gap; - rt->rt_avl_compare = avl_compare; + rt->rt_type = type; + rt->rt_start = start; + rt->rt_shift = shift; + rt->rt_btree_compare = zfs_btree_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); @@ -189,9 +230,10 @@ } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) +range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift) { - return (range_tree_create_impl(ops, arg, NULL, 0)); + return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0)); } void @@ -202,19 +244,20 @@ if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); - avl_destroy(&rt->rt_root); + zfs_btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) { - ASSERT3U(rs->rs_fill + delta, !=, 0); - ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); + ASSERT3U(rs_get_fill(rs, rt) + delta, !=, 0); + ASSERT3U(rs_get_fill(rs, rt) + delta, <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_fill += delta; + rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } @@ -223,28 +266,20 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; - 
avl_index_t where; - range_seg_t rsearch, *rs_before, *rs_after, *rs; + zfs_btree_index_t where; + range_seg_t *rs_before, *rs_after, *rs; + range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; boolean_t merge_before, merge_after; ASSERT3U(size, !=, 0); ASSERT3U(fill, <=, size); - - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); + ASSERT3U(start + size, >, start); - if (gap == 0 && rs != NULL && - rs->rs_start <= start && rs->rs_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* * If this is a gap-supporting range tree, it is possible that we @@ -255,27 +290,28 @@ * the normal code paths. 
*/ if (rs != NULL) { + ASSERT3U(rt->rt_gap, !=, 0); + uint64_t rstart = rs_get_start(rs, rt); + uint64_t rend = rs_get_end(rs, rt); ASSERT3U(gap, !=, 0); - if (rs->rs_start <= start && rs->rs_end >= end) { + if (rstart <= start && rend >= end) { range_tree_adjust_fill(rt, rs, fill); return; } - avl_remove(&rt->rt_root, rs); + zfs_btree_remove(&rt->rt_root, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); range_tree_stat_decr(rt, rs); - rt->rt_space -= rs->rs_end - rs->rs_start; + rt->rt_space -= rend - rstart; - fill += rs->rs_fill; - start = MIN(start, rs->rs_start); - end = MAX(end, rs->rs_end); + fill += rs_get_fill(rs, rt); + start = MIN(start, rstart); + end = MAX(end, rend); size = end - start; range_tree_add_impl(rt, start, size, fill); - - kmem_cache_free(range_seg_cache, rs); return; } @@ -286,19 +322,21 @@ * If gap != 0, we might need to merge with our neighbors even if we * aren't directly touching. */ - rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); - rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); + zfs_btree_index_t where_before, where_after; + rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); + rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); - merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); - merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= + start - gap); + merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end + + gap); if (merge_before && gap != 0) - bridge_size += start - rs_before->rs_end; + bridge_size += start - rs_get_end(rs_before, rt); if (merge_after && gap != 0) - bridge_size += rs_after->rs_start - end; + bridge_size += rs_get_start(rs_after, rt) - end; if (merge_before && merge_after) { - avl_remove(&rt->rt_root, rs_before); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { 
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); @@ -307,9 +345,19 @@ range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += rs_before->rs_fill + fill; - rs_after->rs_start = rs_before->rs_start; - kmem_cache_free(range_seg_cache, rs_before); + rs_copy(rs_after, &tmp, rt); + uint64_t before_start = rs_get_start_raw(rs_before, rt); + uint64_t before_fill = rs_get_fill(rs_before, rt); + uint64_t after_fill = rs_get_fill(rs_after, rt); + zfs_btree_remove_from(&rt->rt_root, &where_before); + + /* + * We have to re-find the node because our old reference is + * invalid as soon as we do any mutating btree operations. + */ + rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); + rs_set_start_raw(rs_after, rt, before_start); + rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -317,8 +365,9 @@ range_tree_stat_decr(rt, rs_before); - rs_before->rs_fill += fill; - rs_before->rs_end = end; + uint64_t before_fill = rs_get_fill(rs_before, rt); + rs_set_end(rs_before, rt, end); + rs_set_fill(rs_before, rt, before_fill + fill); rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -326,22 +375,26 @@ range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += fill; - rs_after->rs_start = start; + uint64_t after_fill = rs_get_fill(rs_after, rt); + rs_set_start(rs_after, rt, start); + rs_set_fill(rs_after, rt, after_fill + fill); rs = rs_after; } else { - rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + rs = &tmp; - rs->rs_fill = fill; - rs->rs_start = start; - rs->rs_end = end; - avl_insert(&rt->rt_root, rs, where); + rs_set_start(rs, rt, start); + rs_set_end(rs, rt, end); + rs_set_fill(rs, rt, fill); + zfs_btree_insert(&rt->rt_root, rs, &where); } - if (gap != 0) - ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); 
- else - ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + if (gap != 0) { + ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } else { + ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); @@ -360,22 +413,25 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { - avl_index_t where; - range_seg_t rsearch, *rs, *newseg; + zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", + zfs_panic_recover("zfs: removing nonexistent segment from " + "range tree (offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size); return; } @@ -388,30 +444,32 @@ */ if (rt->rt_gap != 0) { if (do_fill) { - if (rs->rs_fill == size) { - start = rs->rs_start; - end = rs->rs_end; + if (rs_get_fill(rs, rt) == size) { + start = rs_get_start(rs, rt); + end = rs_get_end(rs, rt); size = end - start; } else { range_tree_adjust_fill(rt, rs, -size); return; } - } else if (rs->rs_start != start || rs->rs_end != end) { + } else if (rs_get_start(rs, rt) != start || + rs_get_end(rs, rt) != end) { zfs_panic_recover("zfs: freeing partial segment of " "gap tree (offset=%llu size=%llu) of " "(offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); + 
(longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs, + rt)); return; } } - VERIFY3U(rs->rs_start, <=, start); - VERIFY3U(rs->rs_end, >=, end); + VERIFY3U(rs_get_start(rs, rt), <=, start); + VERIFY3U(rs_get_end(rs, rt), >=, end); - left_over = (rs->rs_start != start); - right_over = (rs->rs_end != end); + left_over = (rs_get_start(rs, rt) != start); + right_over = (rs_get_end(rs, rt) != end); range_tree_stat_decr(rt, rs); @@ -419,24 +477,33 @@ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { - newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - newseg->rs_start = end; - newseg->rs_end = rs->rs_end; - newseg->rs_fill = newseg->rs_end - newseg->rs_start; - range_tree_stat_incr(rt, newseg); + range_seg_max_t newseg; + rs_set_start(&newseg, rt, end); + rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt)); + rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end); + range_tree_stat_incr(rt, &newseg); - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); - avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); + rs_copy(rs, &rs_tmp, rt); + if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) + zfs_btree_insert(&rt->rt_root, &newseg, &where); + else + zfs_btree_add(&rt->rt_root, &newseg); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); } else if (left_over) { - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + rs_copy(rs, &rs_tmp, rt); } else if (right_over) { - rs->rs_start = end; + // This modifies the buffer already inside the range tree + rs_set_start(rs, rt, end); + rs_copy(rs, &rs_tmp, rt); } else { - avl_remove(&rt->rt_root, rs); - kmem_cache_free(range_seg_cache, rs); + zfs_btree_remove_from(&rt->rt_root, &where); rs = NULL; } @@ -446,11 +513,12 @@ * the size, since we do not support 
removing partial segments * of range trees with gaps. */ - rs->rs_fill = rs->rs_end - rs->rs_start; - range_tree_stat_incr(rt, rs); + rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) - + rs_get_start_raw(rs, rt)); + range_tree_stat_incr(rt, &rs_tmp); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); } rt->rt_space -= size; @@ -472,14 +540,14 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize) { - int64_t delta = newsize - (rs->rs_end - rs->rs_start); + int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt)); range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_start = newstart; - rs->rs_end = newstart + newsize; + rs_set_start(rs, rt, newstart); + rs_set_end(rs, rt, newstart + newsize); range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) @@ -491,22 +559,27 @@ static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { - range_seg_t rsearch; + range_seg_max_t rsearch; uint64_t end = start + size; VERIFY(size != 0); - rsearch.rs_start = start; - rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, NULL)); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); } range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + range_seg_t *rs = range_tree_find_impl(rt, start, size); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) + if (rs != NULL && rs_get_start(rs, rt) <= start && + rs_get_end(rs, rt) >= start + size) { return (rs); + } return (NULL); } @@ -533,24 +606,28 @@ range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, 
uint64_t *osize) { - range_seg_t rsearch; - rsearch.rs_start = start; - rsearch.rs_end = start + 1; + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); - avl_index_t where; - range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + range_seg_max_t rsearch; + rs_set_start(&rsearch, rt, start); + rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); + + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); if (rs != NULL) { *ostart = start; - *osize = MIN(size, rs->rs_end - start); + *osize = MIN(size, rs_get_end(rs, rt) - start); return (B_TRUE); } - rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); - if (rs == NULL || rs->rs_start > start + size) + rs = zfs_btree_next(&rt->rt_root, &where, &where); + if (rs == NULL || rs_get_start(rs, rt) > start + size) return (B_FALSE); - *ostart = rs->rs_start; - *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + *ostart = rs_get_start(rs, rt); + *osize = MIN(start + size, rs_get_end(rs, rt)) - + rs_get_start(rs, rt); return (B_TRUE); } @@ -566,9 +643,12 @@ if (size == 0) return; + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { - uint64_t free_start = MAX(rs->rs_start, start); - uint64_t free_end = MIN(rs->rs_end, start + size); + uint64_t free_start = MAX(rs_get_start(rs, rt), start); + uint64_t free_end = MIN(rs_get_end(rs, rt), start + size); range_tree_remove(rt, free_start, free_end - free_start); } } @@ -579,7 +659,7 @@ range_tree_t *rt; ASSERT0(range_tree_space(*rtdst)); - ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); + ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; @@ -589,17 +669,21 @@ void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) { - range_seg_t *rs; - void *cookie = NULL; - if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); - while ((rs = 
avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { - if (func != NULL) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); - kmem_cache_free(range_seg_cache, rs); + if (func != NULL) { + range_seg_t *rs; + zfs_btree_index_t *cookie = NULL; + + while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != + NULL) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } + } else { + zfs_btree_clear(&rt->rt_root); } bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); @@ -609,16 +693,18 @@ void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { - for (range_seg_t *rs = avl_first(&rt->rt_root); rs; - rs = AVL_NEXT(&rt->rt_root, rs)) { - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); + rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); } } range_seg_t * range_tree_first(range_tree_t *rt) { - return (avl_first(&rt->rt_root)); + return (zfs_btree_first(&rt->rt_root, NULL)); } uint64_t @@ -630,52 +716,7 @@ uint64_t range_tree_numsegs(range_tree_t *rt) { - return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root)); -} - -/* Generic range tree functions for maintaining segments in an AVL tree. 
*/ -void -rt_avl_create(range_tree_t *rt, void *arg) -{ - avl_tree_t *tree = arg; - - avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), - offsetof(range_seg_t, rs_pp_node)); -} - -void -rt_avl_destroy(range_tree_t *rt, void *arg) -{ - avl_tree_t *tree = arg; - - ASSERT0(avl_numnodes(tree)); - avl_destroy(tree); -} - -void -rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - avl_tree_t *tree = arg; - avl_add(tree, rs); -} - -void -rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - avl_tree_t *tree = arg; - avl_remove(tree, rs); -} - -void -rt_avl_vacate(range_tree_t *rt, void *arg) -{ - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - rt_avl_create(rt, arg); + return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root)); } boolean_t @@ -685,26 +726,76 @@ return (range_tree_space(rt) == 0); } -uint64_t -range_tree_min(range_tree_t *rt) +/* ARGSUSED */ +void +rt_btree_create(range_tree_t *rt, void *arg) { - range_seg_t *rs = avl_first(&rt->rt_root); - return (rs != NULL ? 
rs->rs_start : 0); + zfs_btree_t *size_tree = arg; + + size_t size; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, rt->rt_btree_compare, size); +} + +/* ARGSUSED */ +void +rt_btree_destroy(range_tree_t *rt, void *arg) +{ + zfs_btree_t *size_tree = arg; + ASSERT0(zfs_btree_numnodes(size_tree)); + + zfs_btree_destroy(size_tree); } -uint64_t -range_tree_max(range_tree_t *rt) +/* ARGSUSED */ +void +rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) { - range_seg_t *rs = avl_last(&rt->rt_root); - return (rs != NULL ? rs->rs_end : 0); + zfs_btree_t *size_tree = arg; + + zfs_btree_add(size_tree, rs); +} + +/* ARGSUSED */ +void +rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + zfs_btree_t *size_tree = arg; + + zfs_btree_remove(size_tree, rs); } -uint64_t -range_tree_span(range_tree_t *rt) +/* ARGSUSED */ +void +rt_btree_vacate(range_tree_t *rt, void *arg) { - return (range_tree_max(rt) - range_tree_min(rt)); + zfs_btree_t *size_tree = arg; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + rt_btree_create(rt, arg); } +range_tree_ops_t rt_btree_ops = { + .rtop_create = rt_btree_create, + .rtop_destroy = rt_btree_destroy, + .rtop_add = rt_btree_add, + .rtop_remove = rt_btree_remove, + .rtop_vacate = rt_btree_vacate +}; + /* * Remove any overlapping ranges between the given segment [start, end) * from removefrom. Add non-overlapping leftovers to addto. 
@@ -713,42 +804,62 @@ range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, range_tree_t *removefrom, range_tree_t *addto) { - avl_index_t where; - range_seg_t starting_rs = { - .rs_start = start, - .rs_end = start + 1 - }; + zfs_btree_index_t where; + range_seg_max_t starting_rs; + rs_set_start(&starting_rs, removefrom, start); + rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, + removefrom) + 1); - range_seg_t *curr = avl_find(&removefrom->rt_root, + range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, &starting_rs, &where); if (curr == NULL) - curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER); + curr = zfs_btree_next(&removefrom->rt_root, &where, &where); range_seg_t *next; for (; curr != NULL; curr = next) { - next = AVL_NEXT(&removefrom->rt_root, curr); - if (start == end) return; VERIFY3U(start, <, end); /* there is no overlap */ - if (end <= curr->rs_start) { + if (end <= rs_get_start(curr, removefrom)) { range_tree_add(addto, start, end - start); return; } - uint64_t overlap_start = MAX(curr->rs_start, start); - uint64_t overlap_end = MIN(curr->rs_end, end); + uint64_t overlap_start = MAX(rs_get_start(curr, removefrom), + start); + uint64_t overlap_end = MIN(rs_get_end(curr, removefrom), + end); uint64_t overlap_size = overlap_end - overlap_start; ASSERT3S(overlap_size, >, 0); + range_seg_max_t rs; + rs_copy(curr, &rs, removefrom); + range_tree_remove(removefrom, overlap_start, overlap_size); if (start < overlap_start) range_tree_add(addto, start, overlap_start - start); start = overlap_end; + next = zfs_btree_find(&removefrom->rt_root, &rs, &where); + /* + * If we find something here, we only removed part of the + * curr segment. Either there's some left at the end + * because we've reached the end of the range we're removing, + * or there's some left at the start because we started + * partway through the range. Either way, we continue with + * the loop. 
If it's the former, we'll return at the start of + * the loop, and if it's the latter we'll see if there is more + * area to process. + */ + if (next != NULL) { + ASSERT(start == end || start == rs_get_end(&rs, + removefrom)); + } + + next = zfs_btree_next(&removefrom->rt_root, &where, &where); } VERIFY3P(curr, ==, NULL); @@ -768,9 +879,30 @@ range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto) { - for (range_seg_t *rs = avl_first(&rt->rt_root); rs; - rs = AVL_NEXT(&rt->rt_root, rs)) { - range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end, - removefrom, addto); + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + range_tree_remove_xor_add_segment(rs_get_start(rs, rt), + rs_get_end(rs, rt), removefrom, addto); } } + +uint64_t +range_tree_min(range_tree_t *rt) +{ + range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_start(rs, rt) : 0); +} + +uint64_t +range_tree_max(range_tree_t *rt) +{ + range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_end(rs, rt) : 0); +} + +uint64_t +range_tree_span(range_tree_t *rt) +{ + return (range_tree_max(rt) - range_tree_min(rt)); +}
--- a/usr/src/uts/common/fs/zfs/sa.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sa.c Mon Dec 09 14:15:34 2019 +0000 @@ -252,7 +252,7 @@ const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - return (AVL_CMP(node1->lot_num, node2->lot_num)); + return (TREE_CMP(node1->lot_num, node2->lot_num)); } static int @@ -261,11 +261,11 @@ const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(node1->lot_instance, node2->lot_instance)); + return (TREE_CMP(node1->lot_instance, node2->lot_instance)); } boolean_t
--- a/usr/src/uts/common/fs/zfs/spa.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/spa.c Mon Dec 09 14:15:34 2019 +0000 @@ -917,7 +917,7 @@ ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - return (AVL_ISIGN(ret)); + return (TREE_ISIGN(ret)); } /*
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Mon Dec 09 14:15:34 2019 +0000 @@ -57,6 +57,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include "zfs_prop.h" +#include <sys/btree.h> #include <sys/zfeature.h> /* @@ -601,7 +602,7 @@ const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; - return (AVL_CMP(a->sls_txg, b->sls_txg)); + return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* @@ -943,7 +944,7 @@ const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; - return (AVL_CMP(sa->aux_guid, sb->aux_guid)); + return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } void @@ -2058,7 +2059,7 @@ s = strcmp(s1->spa_name, s2->spa_name); - return (AVL_ISIGN(s)); + return (TREE_ISIGN(s)); } int @@ -2108,8 +2109,8 @@ zfs_refcount_init(); unique_init(); - range_tree_init(); - metaslab_alloc_trace_init(); + zfs_btree_init(); + metaslab_stat_init(); zio_init(); dmu_init(); zil_init(); @@ -2135,8 +2136,8 @@ zil_fini(); dmu_fini(); zio_fini(); - metaslab_alloc_trace_fini(); - range_tree_fini(); + metaslab_stat_fini(); + zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); scan_fini();
--- a/usr/src/uts/common/fs/zfs/space_map.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/space_map.c Mon Dec 09 14:15:34 2019 +0000 @@ -525,8 +525,9 @@ * dbuf must be dirty for the changes in sm_phys to take effect. */ static void -space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, - uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, + maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, + void *tag, dmu_tx_t *tx) { ASSERT3U(words, !=, 0); ASSERT3U(words, <=, 2); @@ -550,14 +551,14 @@ ASSERT3P(block_cursor, <=, block_end); - uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t size = (rend - rstart) >> sm->sm_shift; + uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift; uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; - ASSERT3U(rs->rs_start, >=, sm->sm_start); - ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); - ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); - ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + ASSERT3U(rstart, >=, sm->sm_start); + ASSERT3U(rstart, <, sm->sm_start + sm->sm_size); + ASSERT3U(rend - rstart, <=, sm->sm_size); + ASSERT3U(rend, <=, sm->sm_start + sm->sm_size); while (size != 0) { ASSERT3P(block_cursor, <=, block_end); @@ -675,10 +676,14 @@ dmu_buf_will_dirty(db, tx); - avl_tree_t *t = &rt->rt_root; - for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + zfs_btree_t *t = &rt->rt_root; + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL; + rs = zfs_btree_next(t, &where, &where)) { + uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >> + sm->sm_shift; + uint64_t length = 
(rs_get_end(rs, rt) - rs_get_start(rs, rt)) >> + sm->sm_shift; uint8_t words = 1; /* @@ -703,8 +708,8 @@ spa_get_random(100) == 0))) words = 2; - space_map_write_seg(sm, rs, maptype, vdev_id, words, - &db, FTAG, tx); + space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs, + rt), maptype, vdev_id, words, &db, FTAG, tx); } dmu_buf_rele(db, FTAG); @@ -753,7 +758,7 @@ else sm->sm_phys->smp_alloc -= range_tree_space(rt); - uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t nodes = zfs_btree_numnodes(&rt->rt_root); uint64_t rt_space = range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); @@ -762,7 +767,7 @@ * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. */ - VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); + VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); }
--- a/usr/src/uts/common/fs/zfs/space_reftree.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/space_reftree.c Mon Dec 09 14:15:34 2019 +0000 @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -57,11 +57,11 @@ const space_ref_t *sr1 = (const space_ref_t *)x1; const space_ref_t *sr2 = (const space_ref_t *)x2; - int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(sr1, sr2)); + return (TREE_PCMP(sr1, sr2)); } void @@ -109,10 +109,13 @@ void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) { - range_seg_t *rs; + zfs_btree_index_t where; - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = + zfs_btree_next(&rt->rt_root, &where, &where)) { + space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs, + rt), refcnt); + } } /*
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/bitops.h Mon Dec 09 14:15:34 2019 +0000 @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + */ + +#ifndef _SYS_BITOPS_H +#define _SYS_BITOPS_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. 
+ */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1U << (len)); \ + ASSERT3U(low + len, <=, 32); \ + (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ +_NOTE(CONSTCOND) } while (0) + +#define BF64_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1ULL << (len)); \ + ASSERT3U(low + len, <=, 64); \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ +_NOTE(CONSTCOND) } while (0) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +/* + * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in + * the case where val is a constant. We can't fix ASSERT because it's used as + * an expression in several places in the kernel; as a result, changing it to + * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option. + */ +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITOPS_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/btree.h Mon Dec 09 14:15:34 2019 +0000 @@ -0,0 +1,236 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#ifndef _BTREE_H +#define _BTREE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> + +/* + * This file defines the interface for a B-Tree implementation for ZFS. The + * tree can be used to store arbitrary sortable data types with low overhead + * and good operation performance. In addition the tree intelligently + * optimizes bulk in-order insertions to improve memory use and performance. + * + * Note that for all B-Tree functions, the values returned are pointers to the + * internal copies of the data in the tree. The internal data can only be + * safely mutated if the changes cannot change the ordering of the element + * with respect to any other elements in the tree. + * + * The major drawback of the B-Tree is that any returned elements or indexes + * are only valid until a side-effectful operation occurs, since these can + * result in reallocation or relocation of data. Side effectful operations are + * defined as insertion, removal, and zfs_btree_destroy_nodes. + * + * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core + * nodes have an array of children pointing to other nodes, and an array of + * elements that act as separators between the elements of the subtrees rooted + * at its children. 
Leaf nodes only contain data elements, and form the bottom + * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the + * elements in the core nodes are not copies of or references to leaf node + * elements. Each element occcurs only once in the tree, no matter what kind + * of node it is in. + * + * The tree's height is the same throughout, unlike many other forms of search + * tree. Each node (except for the root) must be between half minus one and + * completely full of elements (and children) at all times. Any operation that + * would put the node outside of that range results in a rebalancing operation + * (taking, merging, or splitting). + * + * This tree was implemented using descriptions from Wikipedia's articles on + * B-Trees and B+ Trees. + */ + +/* + * Decreasing these values results in smaller memmove operations, but more of + * them, and increased memory overhead. Increasing these values results in + * higher variance in operation time, and reduces memory overhead. + */ +#define BTREE_CORE_ELEMS 128 +#define BTREE_LEAF_SIZE 4096 + +typedef struct zfs_btree_hdr { + struct zfs_btree_core *bth_parent; + boolean_t bth_core; + /* + * For both leaf and core nodes, represents the number of elements in + * the node. For core nodes, they will have bth_count + 1 children. + */ + uint32_t bth_count; +} zfs_btree_hdr_t; + +typedef struct zfs_btree_core { + zfs_btree_hdr_t btc_hdr; + zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1]; + uint8_t btc_elems[]; +} zfs_btree_core_t; + +typedef struct zfs_btree_leaf { + zfs_btree_hdr_t btl_hdr; + uint8_t btl_elems[]; +} zfs_btree_leaf_t; + +typedef struct zfs_btree_index { + zfs_btree_hdr_t *bti_node; + uint64_t bti_offset; + /* + * True if the location is before the list offset, false if it's at + * the listed offset. 
+ */ + boolean_t bti_before; +} zfs_btree_index_t; + +typedef struct btree { + zfs_btree_hdr_t *bt_root; + int64_t bt_height; + size_t bt_elem_size; + uint64_t bt_num_elems; + uint64_t bt_num_nodes; + zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading + int (*bt_compar) (const void *, const void *); +} zfs_btree_t; + +/* + * Allocate and deallocate caches for btree nodes. + */ +void zfs_btree_init(void); +void zfs_btree_fini(void); + +/* + * Initialize an B-Tree. Arguments are: + * + * tree - the tree to be initialized + * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 + * -1 for <, 0 for ==, and +1 for > + * size - the value of sizeof(struct my_type) + */ +void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), + size_t); + +/* + * Find a node with a matching value in the tree. Returns the matching node + * found. If not found, it returns NULL and then if "where" is not NULL it sets + * "where" for use with zfs_btree_insert() or zfs_btree_nearest(). + * + * node - node that has the value being looked for + * where - position for use with zfs_btree_nearest() or zfs_btree_insert(), + * may be NULL + */ +void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *); + +/* + * Insert a node into the tree. + * + * node - the node to insert + * where - position as returned from zfs_btree_find() + */ +void zfs_btree_insert(zfs_btree_t *, const void *, const zfs_btree_index_t *); + +/* + * Return the first or last valued node in the tree. Will return NULL + * if the tree is empty. + */ +void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *); +void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Return the next or previous valued node in the tree. + */ +void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); +void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); + +/* + * Get a value from a tree and an index. 
+ */ +void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Add a single value to the tree. The value must not compare equal to any + * other node already in the tree. + */ +void zfs_btree_add(zfs_btree_t *, const void *); + +/* + * Remove a single value from the tree. The value must be in the tree. The + * pointer passed in may be a pointer into a tree-controlled buffer, but it + * need not be. + */ +void zfs_btree_remove(zfs_btree_t *, const void *); + +/* + * Remove the value at the given location from the tree. + */ +void zfs_btree_remove_from(zfs_btree_t *, zfs_btree_index_t *); + +/* + * Return the number of nodes in the tree + */ +ulong_t zfs_btree_numnodes(zfs_btree_t *); + +/* + * Used to destroy any remaining nodes in a tree. The cookie argument should + * be initialized to NULL before the first call. Returns a node that has been + * removed from the tree and may be free()'d. Returns NULL when the tree is + * empty. + * + * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it + * and finally zfs_btree_destroy(). No other B-Tree routines will be valid. + * + * cookie - an index used to save state between calls to + * zfs_btree_destroy_nodes() + * + * EXAMPLE: + * zfs_btree_t *tree; + * struct my_data *node; + * zfs_btree_index_t *cookie; + * + * cookie = NULL; + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) + * data_destroy(node); + * zfs_btree_destroy(tree); + */ +void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **); + +/* + * Destroys all nodes in the tree quickly. This doesn't give the caller an + * opportunity to iterate over each node and do its own cleanup; for that, use + * zfs_btree_destroy_nodes(). + */ +void zfs_btree_clear(zfs_btree_t *); + +/* + * Final destroy of an B-Tree. Arguments are: + * + * tree - the empty tree to destroy + */ +void zfs_btree_destroy(zfs_btree_t *tree); + +/* Runs a variety of self-checks on the btree to verify integrity. 
*/ +void zfs_btree_verify(zfs_btree_t *tree); + +#ifdef __cplusplus +} +#endif + +#endif /* _BTREE_H */
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h Mon Dec 09 14:15:34 2019 +0000 @@ -93,8 +93,8 @@ int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); -void metaslab_alloc_trace_init(void); -void metaslab_alloc_trace_fini(void); +void metaslab_stat_init(void); +void metaslab_stat_fini(void); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *);
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Mon Dec 09 14:15:34 2019 +0000 @@ -517,8 +517,8 @@ * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. */ - avl_tree_t ms_allocatable_by_size; - avl_tree_t ms_unflushed_frees_by_size; + zfs_btree_t ms_allocatable_by_size; + zfs_btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */
--- a/usr/src/uts/common/fs/zfs/sys/range_tree.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h Mon Dec 09 14:15:34 2019 +0000 @@ -30,7 +30,7 @@ #ifndef _SYS_RANGE_TREE_H #define _SYS_RANGE_TREE_H -#include <sys/avl.h> +#include <sys/btree.h> #include <sys/dmu.h> #ifdef __cplusplus @@ -41,19 +41,35 @@ typedef struct range_tree_ops range_tree_ops_t; +typedef enum range_seg_type { + RANGE_SEG32, + RANGE_SEG64, + RANGE_SEG_GAP, + RANGE_SEG_NUM_TYPES, +} range_seg_type_t; + /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct range_tree { - avl_tree_t rt_root; /* offset-ordered segment AVL tree */ + zfs_btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ - uint64_t rt_gap; /* allowable inter-segment gap */ + range_seg_type_t rt_type; /* type of range_seg_t in use */ + /* + * All data that is stored in the range tree must have a start higher + * than or equal to rt_start, and all sizes and offsets must be + * multiples of 1 << rt_shift. + */ + uint8_t rt_shift; + uint64_t rt_start; range_tree_ops_t *rt_ops; - void *rt_arg; - /* rt_avl_compare should only be set it rt_arg is an AVL tree */ - int (*rt_avl_compare)(const void *, const void *); + /* rt_btree_compare should only be set if rt_arg is a b-tree */ + void *rt_arg; + int (*rt_btree_compare)(const void *, const void *); + + uint64_t rt_gap; /* allowable inter-segment gap */ /* * The rt_histogram maintains a histogram of ranges. 
Each bucket, @@ -63,36 +79,217 @@ uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; } range_tree_t; -typedef struct range_seg { - avl_node_t rs_node; /* AVL node */ - avl_node_t rs_pp_node; /* AVL picker-private node */ +typedef struct range_seg32 { + uint32_t rs_start; /* starting offset of this segment */ + uint32_t rs_end; /* ending offset (non-inclusive) */ +} range_seg32_t; + +/* + * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may + * require 64-bit integers for ranges. + */ +typedef struct range_seg64 { + uint64_t rs_start; /* starting offset of this segment */ + uint64_t rs_end; /* ending offset (non-inclusive) */ +} range_seg64_t; + +typedef struct range_seg_gap { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_t; +} range_seg_gap_t; + +/* + * This type needs to be the largest of the range segs, since it will be stack + * allocated and then cast the actual type to do tree operations. + */ +typedef range_seg_gap_t range_seg_max_t; + +/* + * This is just for clarity of code purposes, so we can make it clear that a + * pointer is to a range seg of some type; when we need to do the actual math, + * we'll figure out the real type. 
+ */ +typedef void range_seg_t; struct range_tree_ops { void (*rtop_create)(range_tree_t *rt, void *arg); void (*rtop_destroy)(range_tree_t *rt, void *arg); - void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); + void (*rtop_add)(range_tree_t *rt, void *rs, void *arg); + void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg); void (*rtop_vacate)(range_tree_t *rt, void *arg); }; +static inline uint64_t +rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((range_seg32_t *)rs)->rs_start); + case RANGE_SEG64: + return (((range_seg64_t *)rs)->rs_start); + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_start); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((range_seg32_t *)rs)->rs_end); + case RANGE_SEG64: + return (((range_seg64_t *)rs)->rs_end); + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_end); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: { + const range_seg32_t *r32 = rs; + return (r32->rs_end - r32->rs_start); + } + case RANGE_SEG64: { + const range_seg64_t *r64 = rs; + return (r64->rs_end - r64->rs_start); + } + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_fill); + default: + VERIFY(0); + return (0); + } + +} + +static inline uint64_t +rs_get_start(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_end(const range_seg_t *rs, const range_tree_t 
*rt) +{ + return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_fill(const range_seg_t *rs, const range_tree_t *rt) +{ + return (rs_get_fill_raw(rs, rt) << rt->rt_shift); +} + +static inline void +rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(start, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_start = (uint32_t)start; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_start = start; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_start = start; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(end, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_end = (uint32_t)end; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_end = end; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_end = end; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + /* fall through */ + case RANGE_SEG64: + ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs, + rt)); + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_fill = fill; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(start, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift)); + rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(end, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift)); + rs_set_end_raw(rs, rt, 
(end - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); + rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); +} + typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); -void range_tree_init(void); -void range_tree_fini(void); -range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare)(const void*, const void*), uint64_t gap); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, + range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, const void *), uint64_t gap); +range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); -range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); @@ -119,19 +316,12 @@ void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto); -void rt_avl_create(range_tree_t *rt, void *arg); -void rt_avl_destroy(range_tree_t *rt, void *arg); -void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_vacate(range_tree_t *rt, void *arg); -extern struct range_tree_ops rt_avl_ops; - -void 
rt_avl_create(range_tree_t *rt, void *arg); -void rt_avl_destroy(range_tree_t *rt, void *arg); -void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_vacate(range_tree_t *rt, void *arg); -extern struct range_tree_ops rt_avl_ops; +void rt_btree_create(range_tree_t *rt, void *arg); +void rt_btree_destroy(range_tree_t *rt, void *arg); +void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_vacate(range_tree_t *rt, void *arg); +extern range_tree_ops_t rt_btree_ops; #ifdef __cplusplus }
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/spa.h Mon Dec 09 14:15:34 2019 +0000 @@ -42,6 +42,7 @@ #include <sys/fs/zfs.h> #include <sys/dmu.h> #include <sys/space_map.h> +#include <sys/bitops.h> #ifdef __cplusplus extern "C" { @@ -65,45 +66,6 @@ struct dsl_crypto_params; /* - * General-purpose 32-bit and 64-bit bitfield encodings. - */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1U << (len)); \ - ASSERT3U(low + len, <=, 32); \ - (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ -_NOTE(CONSTCOND) } while (0) - -#define BF64_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1ULL << (len)); \ - ASSERT3U(low + len, <=, 64); \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ -_NOTE(CONSTCOND) } while (0) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) -#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) - -/* * We currently support block sizes from 512 bytes to 16MB. 
* The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the
--- a/usr/src/uts/common/fs/zfs/sys/space_reftree.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/space_reftree.h Mon Dec 09 14:15:34 2019 +0000 @@ -31,7 +31,7 @@ #define _SYS_SPACE_REFTREE_H #include <sys/range_tree.h> - +#include <sys/avl.h> #ifdef __cplusplus extern "C" { #endif
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h Mon Dec 09 14:15:34 2019 +0000 @@ -96,8 +96,8 @@ extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd); -extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, - range_seg_t *physical_rs); +extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd);
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Mon Dec 09 14:15:34 2019 +0000 @@ -87,8 +87,8 @@ * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, - range_seg_t *res); +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in, + range_seg64_t *res); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -517,8 +517,8 @@ /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, - range_seg_t *out); +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, + range_seg64_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h Mon Dec 09 14:15:34 2019 +0000 @@ -26,6 +26,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_ZFS_CONTEXT_H @@ -84,6 +85,16 @@ #define CPU_SEQID (CPU->cpu_seqid) +/* + * In ZoL the following defines were added to their sys/avl.h header, but + * we want to limit these to the ZFS code on illumos. + */ +#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define TREE_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + + #ifdef __cplusplus } #endif
--- a/usr/src/uts/common/fs/zfs/unique.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/unique.c Mon Dec 09 14:15:34 2019 +0000 @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/avl.h> #include <sys/unique.h> @@ -45,7 +43,7 @@ const unique_t *una = (const unique_t *)a; const unique_t *unb = (const unique_t *)b; - return (AVL_CMP(una->un_value, unb->un_value)); + return (TREE_CMP(una->un_value, unb->un_value)); } void
--- a/usr/src/uts/common/fs/zfs/vdev.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev.c Mon Dec 09 14:15:34 2019 +0000 @@ -215,7 +215,7 @@ /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) { res->rs_start = in->rs_start; res->rs_end = in->rs_end; @@ -496,7 +496,8 @@ rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); @@ -517,7 +518,8 @@ cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL); + vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); @@ -2434,14 +2436,11 @@ static uint64_t vdev_dtl_min(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_start - 1); + return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* @@ -2450,14 +2449,11 @@ static uint64_t vdev_dtl_max(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_end); + return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* @@ -2768,7 +2764,7 @@ ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = range_tree_create(NULL, NULL); + rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); 
range_tree_walk(rt, range_tree_add, rtsync); @@ -4475,7 +4471,8 @@ * translation function to do the real conversion. */ void -vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs) { /* * Walk up the vdev tree @@ -4502,7 +4499,7 @@ * range into its physical components by calling the * vdev specific translate function. */ - range_seg_t intermediate = { { { 0, 0 } } }; + range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); physical_rs->rs_start = intermediate.rs_start;
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c Mon Dec 09 14:15:34 2019 +0000 @@ -111,7 +111,7 @@ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - return (AVL_CMP(ve1->ve_offset, ve2->ve_offset)); + return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); } static int @@ -120,7 +120,7 @@ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused); + int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); if (likely(cmp)) return (cmp);
--- a/usr/src/uts/common/fs/zfs/vdev_initialize.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c Mon Dec 09 14:15:34 2019 +0000 @@ -278,11 +278,13 @@ static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { - avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + range_tree_t *rt = vd->vdev_initialize_tree; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; + rs = zfs_btree_next(bt, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); /* Split range into legally-sized physical chunks */ uint64_t writes_required = @@ -292,7 +294,7 @@ int error; error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + rs->rs_start + + VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); @@ -328,7 +330,7 @@ * on our vdev. We use this to determine if we are * in the middle of this metaslab range. 
*/ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); @@ -352,10 +354,14 @@ */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; + zfs_btree_index_t where; + range_tree_t *rt = msp->ms_allocatable; + for (range_seg_t *rs = + zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, + &where)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - @@ -410,7 +416,7 @@ vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -469,7 +475,8 @@ abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) {
--- a/usr/src/uts/common/fs/zfs/vdev_label.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev_label.c Mon Dec 09 14:15:34 2019 +0000 @@ -1021,12 +1021,12 @@ static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); @@ -1050,7 +1050,7 @@ if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) seq2 = MMP_SEQ(ub2); - return (AVL_CMP(seq1, seq2)); + return (TREE_CMP(seq1, seq2)); } struct ubl_cbdata {
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c Mon Dec 09 14:15:34 2019 +0000 @@ -216,12 +216,12 @@ const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_offset, z2->io_offset); + int cmp = TREE_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } static inline avl_tree_t * @@ -248,12 +248,12 @@ const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp); + int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } void
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c Mon Dec 09 14:15:34 2019 +0000 @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -1911,7 +1911,7 @@ vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = zio->io_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, zio->io_size); @@ -2655,7 +2655,7 @@ } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
--- a/usr/src/uts/common/fs/zfs/vdev_removal.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev_removal.c Mon Dec 09 14:15:34 2019 +0000 @@ -189,11 +189,12 @@ spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = range_tree_create(NULL, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = range_tree_create(NULL, NULL); + svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -920,18 +921,15 @@ * the allocation at the end of a segment, thus avoiding * additional split blocks. */ - range_seg_t search; - avl_index_t where; - search.rs_start = start + maxalloc; - search.rs_end = search.rs_start; - range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); - if (rs == NULL) { - rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); - } else { - rs = AVL_PREV(&segs->rt_root, rs); - } + range_seg_max_t search; + zfs_btree_index_t where; + rs_set_start(&search, segs, start + maxalloc); + rs_set_end(&search, segs, start + maxalloc); + (void) zfs_btree_find(&segs->rt_root, &search, &where); + range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, + &where); if (rs != NULL) { - size = rs->rs_end - start; + size = rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. @@ -963,20 +961,22 @@ * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). 
*/ - range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); + range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); - range_seg_t *rs = avl_first(&segs->rt_root); - ASSERT3U(rs->rs_start, ==, start); - uint64_t prev_seg_end = rs->rs_end; - while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { - if (rs->rs_start >= start + size) { + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); + ASSERT3U(rs_get_start(rs, segs), ==, start); + uint64_t prev_seg_end = rs_get_end(rs, segs); + while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { + if (rs_get_start(rs, segs) >= start + size) { break; } else { range_tree_add(obsolete_segs, prev_seg_end - start, - rs->rs_start - prev_seg_end); + rs_get_start(rs, segs) - prev_seg_end); } - prev_seg_end = rs->rs_end; + prev_seg_end = rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); @@ -1222,9 +1222,11 @@ * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ - range_tree_t *segs = range_tree_create(NULL, NULL); + range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (;;) { - range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); + range_tree_t *rt = svr->svr_allocd_segs; + range_seg_t *rs = range_tree_first(rt); + if (rs == NULL) break; @@ -1232,17 +1234,17 @@ if (range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ - seg_length = - MIN(rs->rs_end - rs->rs_start, *max_alloc); + seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, + rt), *max_alloc); } else { - if (rs->rs_start - range_tree_max(segs) > + if (rs_get_start(rs, rt) - range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. 
*/ break; - } else if (rs->rs_end - range_tree_min(segs) > + } else if (rs_get_end(rs, rt) - range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past @@ -1251,13 +1253,14 @@ */ break; } else { - seg_length = rs->rs_end - rs->rs_start; + seg_length = rs_get_end(rs, rt) - + rs_get_start(rs, rt); } } - range_tree_add(segs, rs->rs_start, seg_length); + range_tree_add(segs, rs_get_start(rs, rt), seg_length); range_tree_remove(svr->svr_allocd_segs, - rs->rs_start, seg_length); + rs_get_start(rs, rt), seg_length); } if (range_tree_is_empty(segs)) { @@ -1420,7 +1423,7 @@ vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", - avl_numnodes(&svr->svr_allocd_segs->rt_root), + zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root), msp->ms_id); while (!svr->svr_thread_exit &&
--- a/usr/src/uts/common/fs/zfs/vdev_trim.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/vdev_trim.c Mon Dec 09 14:15:34 2019 +0000 @@ -534,7 +534,8 @@ vdev_trim_ranges(trim_args_t *ta) { vdev_t *vd = ta->trim_vdev; - avl_tree_t *rt = &ta->trim_tree->rt_root; + zfs_btree_t *t = &ta->trim_tree->rt_root; + zfs_btree_index_t idx; uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; @@ -542,9 +543,10 @@ ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs, + ta->trim_tree); if (extent_bytes_min && size < extent_bytes_min) { /* @@ -561,9 +563,9 @@ int error; error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + - rs->rs_start + (w * extent_bytes_max), - MIN(size - (w * extent_bytes_max), - extent_bytes_max)); + rs_get_start(rs, ta->trim_tree) + + (w *extent_bytes_max), MIN(size - + (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { return (error); } @@ -601,7 +603,7 @@ * on our vdev. We use this to determine if we are * in the middle of this metaslab range. 
*/ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); @@ -624,10 +626,13 @@ */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t idx; + for (range_seg_t *rs = zfs_btree_first(bt, &idx); + rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - @@ -719,7 +724,7 @@ { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -732,7 +737,7 @@ metaslab_t *msp = ta->trim_msp; VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); - VERIFY(range_tree_find(msp->ms_allocatable, start, size)); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -811,7 +816,7 @@ ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = range_tree_create(NULL, NULL); + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -1093,7 +1098,7 @@ VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY3U(msp->ms_disabled, >, 0); - VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } /* @@ -1191,7 +1196,8 @@ * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it 
is processed. */ - trim_tree = range_tree_create(NULL, NULL); + trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(range_tree_is_empty(msp->ms_trim)); @@ -1245,7 +1251,8 @@ if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = range_tree_create(NULL, NULL); + ta->trim_tree = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); range_tree_walk(trim_tree, vdev_trim_range_add, ta); }
--- a/usr/src/uts/common/fs/zfs/zap_micro.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/zap_micro.c Mon Dec 09 14:15:34 2019 +0000 @@ -281,11 +281,11 @@ const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); } static void
--- a/usr/src/uts/common/fs/zfs/zfs_fuid.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/zfs_fuid.c Mon Dec 09 14:15:34 2019 +0000 @@ -75,7 +75,7 @@ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - return (AVL_CMP(node1->f_idx, node2->f_idx)); + return (TREE_CMP(node1->f_idx, node2->f_idx)); } /* @@ -90,7 +90,7 @@ val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } void
--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c Mon Dec 09 14:15:34 2019 +0000 @@ -109,7 +109,7 @@ const locked_range_t *rl1 = (const locked_range_t *)arg1; const locked_range_t *rl2 = (const locked_range_t *)arg2; - return (AVL_CMP(rl1->lr_offset, rl2->lr_offset)); + return (TREE_CMP(rl1->lr_offset, rl2->lr_offset)); } /*
--- a/usr/src/uts/common/fs/zfs/zil.c Fri Dec 06 12:00:18 2019 -0600 +++ b/usr/src/uts/common/fs/zfs/zil.c Mon Dec 09 14:15:34 2019 +0000 @@ -139,11 +139,11 @@ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); - return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); + return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void @@ -526,7 +526,7 @@ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - return (AVL_CMP(v1, v2)); + return (TREE_CMP(v1, v2)); } static lwb_t * @@ -1759,7 +1759,7 @@ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - return (AVL_CMP(o1, o2)); + return (TREE_CMP(o1, o2)); } /*