Mercurial > illumos > illumos-gate
changeset 5530:4ed96167d864 onnv_79
6354519 stack overflow in zfs due to zio pipeline
6533726 single-threaded checksum & parity calculations limit write bandwidth
6547248 ztest detects a future leak when there is none
6604198 zfs only using single cpu for compression (part II)
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deleted_files/usr/src/uts/common/fs/zfs/rprwlock.c Tue Nov 27 22:58:05 2007 -0800 @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zfs_context.h> +#include <sys/refcount.h> +#include <sys/rprwlock.h> + +void +rprw_init(rprwlock_t *rwl) +{ + mutex_init(&rwl->rw_lock, NULL, MUTEX_DEFAULT, NULL); + rwl->rw_writer = NULL; + cv_init(&rwl->rw_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&rwl->rw_count); +} + +void +rprw_destroy(rprwlock_t *rwl) +{ + mutex_destroy(&rwl->rw_lock); + ASSERT(rwl->rw_writer == NULL); + cv_destroy(&rwl->rw_cv); + refcount_destroy(&rwl->rw_count); +} + +void +rprw_enter_read(rprwlock_t *rwl, void *tag) +{ + mutex_enter(&rwl->rw_lock); + + if (rwl->rw_writer != curthread) { + while (rwl->rw_writer != NULL) + cv_wait(&rwl->rw_cv, &rwl->rw_lock); + } + + (void) refcount_add(&rwl->rw_count, tag); + + mutex_exit(&rwl->rw_lock); +} + +void +rprw_enter_write(rprwlock_t *rwl, void *tag) +{ + mutex_enter(&rwl->rw_lock); + + if (rwl->rw_writer != curthread) { + while (!refcount_is_zero(&rwl->rw_count)) + cv_wait(&rwl->rw_cv, &rwl->rw_lock); + rwl->rw_writer = curthread; + } + + (void) refcount_add(&rwl->rw_count, tag); + + mutex_exit(&rwl->rw_lock); +} + +void +rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rprw_enter_read(rwl, tag); + else + rprw_enter_write(rwl, tag); +} + +void +rprw_exit(rprwlock_t *rwl, void *tag) +{ + mutex_enter(&rwl->rw_lock); + + ASSERT(!refcount_is_zero(&rwl->rw_count)); + ASSERT(rwl->rw_writer == NULL || curthread == rwl->rw_writer); + if (refcount_remove(&rwl->rw_count, tag) == 0) { + cv_broadcast(&rwl->rw_cv); + rwl->rw_writer = NULL; /* OK in either case */ + } + + mutex_exit(&rwl->rw_lock); +} + +boolean_t +rprw_held(rprwlock_t *rwl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rwl->rw_lock); + if (rw == RW_WRITER) + held = (rwl->rw_writer == curthread); + else + held = !rwl->rw_writer && !refcount_is_zero(&rwl->rw_count); + mutex_exit(&rwl->rw_lock); + + return (held); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deleted_files/usr/src/uts/common/fs/zfs/sys/rprwlock.h Tue Nov 27 22:58:05 2007 -0800 @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_RPRWLOCK_H +#define _SYS_RPRWLOCK_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/inttypes.h> +#include <sys/list.h> +#include <sys/zfs_context.h> +#include <sys/refcount.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct rprwlock { + kmutex_t rw_lock; + kthread_t *rw_writer; + kcondvar_t rw_cv; + refcount_t rw_count; +} rprwlock_t; + +void rprw_init(rprwlock_t *rwl); +void rprw_destroy(rprwlock_t *rwl); +void rprw_enter_read(rprwlock_t *rwl, void *tag); +void rprw_enter_write(rprwlock_t *rwl, void *tag); +void rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag); +void rprw_exit(rprwlock_t *rwl, void *tag); +boolean_t rprw_held(rprwlock_t *rwl, krw_t rw); +#define RPRW_READ_HELD(x) rprw_held(x, RW_READER) +#define RPRW_WRITE_HELD(x) rprw_held(x, RW_WRITER) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RPRWLOCK_H */
--- a/usr/src/cmd/zdb/zdb.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/cmd/zdb/zdb.c Tue Nov 27 22:58:05 2007 -0800 @@ -501,10 +501,8 @@ for (c = 0; c < rvd->vdev_children; c++) { vd = rvd->vdev_child[c]; - spa_config_enter(spa, RW_READER, FTAG); (void) printf("\n vdev %llu = %s\n\n", (u_longlong_t)vd->vdev_id, vdev_description(vd)); - spa_config_exit(spa, FTAG); if (dump_opt['d'] <= 5) { (void) printf("\t%10s %10s %5s\n", @@ -522,7 +520,6 @@ dump_dtl(vdev_t *vd, int indent) { avl_tree_t *t = &vd->vdev_dtl_map.sm_root; - spa_t *spa = vd->vdev_spa; space_seg_t *ss; vdev_t *pvd; int c; @@ -530,9 +527,7 @@ if (indent == 0) (void) printf("\nDirty time logs:\n\n"); - spa_config_enter(spa, RW_READER, FTAG); (void) printf("\t%*s%s\n", indent, "", vdev_description(vd)); - spa_config_exit(spa, FTAG); for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) { /* @@ -1730,6 +1725,8 @@ dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; + spa_config_enter(spa, RW_READER, FTAG); + if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock); @@ -1751,6 +1748,8 @@ if (dump_opt['s']) show_pool_stats(spa); + spa_config_exit(spa, FTAG); + if (rc != 0) exit(rc); }
--- a/usr/src/cmd/ztest/ztest.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/cmd/ztest/ztest.c Tue Nov 27 22:58:05 2007 -0800 @@ -127,8 +127,18 @@ static int zopt_maxfaults; static uint16_t zopt_write_fail_shift = 5; +typedef struct ztest_block_tag { + uint64_t bt_objset; + uint64_t bt_object; + uint64_t bt_offset; + uint64_t bt_txg; + uint64_t bt_thread; + uint64_t bt_seq; +} ztest_block_tag_t; + typedef struct ztest_args { - char *za_pool; + char za_pool[MAXNAMELEN]; + spa_t *za_spa; objset_t *za_os; zilog_t *za_zilog; thread_t za_thread; @@ -141,6 +151,13 @@ hrtime_t za_stop; hrtime_t za_kill; traverse_handle_t *za_th; + /* + * Thread-local variables can go here to aid debugging. + */ + ztest_block_tag_t za_rbt; + ztest_block_tag_t za_wbt; + dmu_object_info_t za_doi; + dmu_buf_t *za_dbuf; } ztest_args_t; typedef void ztest_func_t(ztest_args_t *); @@ -167,6 +184,7 @@ typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ + uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every <interval> seconds */ uint64_t zi_calls; /* per-pass count */ uint64_t zi_call_time; /* per-pass time */ @@ -180,22 +198,22 @@ uint64_t zopt_rarely = 60; /* every 60 seconds */ ztest_info_t ztest_info[] = { - { ztest_dmu_read_write, &zopt_always }, - { ztest_dmu_write_parallel, &zopt_always }, - { ztest_dmu_object_alloc_free, &zopt_always }, - { ztest_zap, &zopt_always }, - { ztest_zap_parallel, &zopt_always }, - { ztest_traverse, &zopt_often }, - { ztest_dsl_prop_get_set, &zopt_sometimes }, - { ztest_dmu_objset_create_destroy, &zopt_sometimes }, - { ztest_dmu_snapshot_create_destroy, &zopt_rarely }, - { ztest_spa_create_destroy, &zopt_sometimes }, - { ztest_fault_inject, &zopt_sometimes }, - { ztest_spa_rename, &zopt_rarely }, - { ztest_vdev_attach_detach, &zopt_rarely }, - { ztest_vdev_LUN_growth, &zopt_rarely }, - { ztest_vdev_add_remove, &zopt_vdevtime }, - { ztest_scrub, &zopt_vdevtime }, + { ztest_dmu_read_write, 1, &zopt_always }, + { ztest_dmu_write_parallel, 30, &zopt_always }, + { ztest_dmu_object_alloc_free, 1, &zopt_always }, + { ztest_zap, 30, &zopt_always }, + { ztest_zap_parallel, 100, &zopt_always }, + { ztest_traverse, 1, &zopt_often }, + { ztest_dsl_prop_get_set, 1, &zopt_sometimes }, + { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes }, + { ztest_dmu_snapshot_create_destroy, 1, &zopt_rarely }, + { ztest_spa_create_destroy, 1, &zopt_sometimes }, + { ztest_fault_inject, 1, &zopt_sometimes }, + { ztest_spa_rename, 1, &zopt_rarely }, + { ztest_vdev_attach_detach, 1, &zopt_rarely }, + { ztest_vdev_LUN_growth, 1, &zopt_rarely }, + { ztest_vdev_add_remove, 1, &zopt_vdevtime }, + { ztest_scrub, 1, &zopt_vdevtime }, }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -214,21 +232,11 @@ hrtime_t zs_stop_time; uint64_t zs_alloc; uint64_t zs_space; - uint64_t zs_txg; ztest_info_t zs_info[ZTEST_FUNCS]; mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; uint64_t zs_seq[ZTEST_SYNC_LOCKS]; } ztest_shared_t; -typedef struct ztest_block_tag { - uint64_t bt_objset; - uint64_t bt_object; - uint64_t bt_offset; - uint64_t bt_txg; - uint64_t bt_thread; - uint64_t bt_seq; -} ztest_block_tag_t; - static char ztest_dev_template[] = "%s/%s.%llua"; static ztest_shared_t *ztest_shared; @@ -237,7 +245,7 @@ static boolean_t ztest_exiting = B_FALSE; -extern uint64_t zio_gang_bang; +extern uint64_t metaslab_gang_bang; extern uint16_t zio_zil_fail_shift; extern uint16_t zio_io_fail_shift; @@ -359,7 +367,7 @@ FILE *fp = requested ? stdout : stderr; nicenum(zopt_vdev_size, nice_vdev_size); - nicenum(zio_gang_bang, nice_gang_bang); + nicenum(metaslab_gang_bang, nice_gang_bang); (void) fprintf(fp, "Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" @@ -432,7 +440,7 @@ uint64_t value; /* By default, test gang blocks for blocks 32K and greater */ - zio_gang_bang = 32 << 10; + metaslab_gang_bang = 32 << 10; /* Default value, fail every 32nd allocation */ zio_zil_fail_shift = 5; @@ -484,7 +492,7 @@ zopt_threads = MAX(1, value); break; case 'g': - zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value); + metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value); break; case 'i': zopt_init = value; @@ -835,7 +843,7 @@ void ztest_vdev_add_remove(ztest_args_t *za) { - spa_t *spa = dmu_objset_spa(za->za_os); + spa_t *spa = za->za_spa; uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; nvlist_t *nvroot; int error; @@ -906,7 +914,7 @@ void ztest_vdev_attach_detach(ztest_args_t *za) { - spa_t *spa = dmu_objset_spa(za->za_os); + spa_t *spa = za->za_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root, *file; @@ -1056,7 +1064,7 @@ void ztest_vdev_LUN_growth(ztest_args_t *za) { - spa_t *spa = dmu_objset_spa(za->za_os); + spa_t *spa = za->za_spa; char dev_name[MAXPATHLEN]; uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; uint64_t vdev; @@ -1106,7 +1114,7 @@ */ VERIFY(dmu_object_claim(os, ZTEST_DIROBJ, DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE, - DMU_OT_UINT64_OTHER, sizeof (ztest_block_tag_t), tx) == 0); + DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0); VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); @@ -1115,12 +1123,12 @@ DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); } -/* ARGSUSED */ static int ztest_destroy_cb(char *name, void *arg) { + ztest_args_t *za = arg; objset_t *os; - dmu_object_info_t doi; + dmu_object_info_t *doi = &za->za_doi; int error; /* @@ -1129,12 +1137,12 @@ error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_STANDARD | DS_MODE_READONLY, &os); ASSERT3U(error, ==, 0); - error = dmu_object_info(os, ZTEST_DIROBJ, &doi); + error = dmu_object_info(os, ZTEST_DIROBJ, doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT3U(error, ==, 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_UINT64_OTHER); - ASSERT3S(doi.doi_physical_blks, >=, 0); + ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER); + ASSERT3S(doi->doi_physical_blks, >=, 0); } dmu_objset_close(os); @@ -1215,7 +1223,7 @@ * create lying around from a previous run. If so, destroy it * and all of its snapshots. */ - (void) dmu_objset_find(name, ztest_destroy_cb, NULL, + (void) dmu_objset_find(name, ztest_destroy_cb, za, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* @@ -1428,7 +1436,7 @@ void ztest_traverse(ztest_args_t *za) { - spa_t *spa = dmu_objset_spa(za->za_os); + spa_t *spa = za->za_spa; traverse_handle_t *th = za->za_th; int rc, advance; uint64_t cbstart, cblimit; @@ -1500,7 +1508,7 @@ dmu_tx_t *tx; uint64_t batchobj, object, batchsize, endoff, temp; int b, c, error, bonuslen; - dmu_object_info_t doi; + dmu_object_info_t *doi = &za->za_doi; char osname[MAXNAMELEN]; dmu_objset_name(os, osname); @@ -1545,13 +1553,14 @@ * We expect the nth byte of the bonus buffer to be n. */ VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db)); - - dmu_object_info_from_db(db, &doi); - ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER); - ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER); - ASSERT3S(doi.doi_physical_blks, >=, 0); - - bonuslen = doi.doi_bonus_size; + za->za_dbuf = db; + + dmu_object_info_from_db(db, doi); + ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER); + ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER); + ASSERT3S(doi->doi_physical_blks, >=, 0); + + bonuslen = doi->doi_bonus_size; for (c = 0; c < bonuslen; c++) { if (((uint8_t *)db->db_data)[c] != @@ -1565,6 +1574,7 @@ } dmu_buf_rele(db, FTAG); + za->za_dbuf = NULL; /* * We expect the word at endoff to be our object number. @@ -1669,7 +1679,8 @@ /* * Write to both the bonus buffer and the regular data. */ - VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db)); + VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0); + za->za_dbuf = db; ASSERT3U(bonuslen, <=, db->db_size); dmu_object_size_from_db(db, &va_blksize, &va_nblocks); @@ -1685,6 +1696,7 @@ ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen); dmu_buf_rele(db, FTAG); + za->za_dbuf = NULL; /* * Write to a large offset to increase indirection. @@ -1939,244 +1951,229 @@ } void -ztest_dmu_check_future_leak(objset_t *os, uint64_t txg) +ztest_dmu_check_future_leak(ztest_args_t *za) { + objset_t *os = za->za_os; dmu_buf_t *db; - ztest_block_tag_t rbt; - - if (zopt_verbose >= 3) { - char osname[MAXNAMELEN]; - dmu_objset_name(os, osname); - (void) printf("checking %s for future leaks in txg %lld...\n", - osname, (u_longlong_t)txg); - } + ztest_block_tag_t *bt; + dmu_object_info_t *doi = &za->za_doi; /* * Make sure that, if there is a write record in the bonus buffer * of the ZTEST_DIROBJ, that the txg for this record is <= the * last synced txg of the pool. */ - - VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db)); - ASSERT3U(db->db_size, >=, sizeof (rbt)); - bcopy(db->db_data, &rbt, sizeof (rbt)); - if (rbt.bt_objset != 0) { - ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os)); - ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ); - ASSERT3U(rbt.bt_offset, ==, -1ULL); - if (rbt.bt_txg > txg) { - fatal(0, - "future leak: got %llx, last synced txg is %llx", - rbt.bt_txg, txg); - } + VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0); + za->za_dbuf = db; + VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0); + ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt)); + ASSERT3U(doi->doi_bonus_size, <=, db->db_size); + ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0); + bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt)); + if (bt->bt_objset != 0) { + ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); + ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ); + ASSERT3U(bt->bt_offset, ==, -1ULL); + ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa)); } dmu_buf_rele(db, FTAG); + za->za_dbuf = NULL; } void ztest_dmu_write_parallel(ztest_args_t *za) { objset_t *os = za->za_os; - dmu_tx_t *tx; + ztest_block_tag_t *rbt = &za->za_rbt; + ztest_block_tag_t *wbt = &za->za_wbt; + const size_t btsize = sizeof (ztest_block_tag_t); dmu_buf_t *db; - int i, b, error, do_free, bs; - uint64_t off, txg_how, txg; + int b, error; + int bs = ZTEST_DIROBJ_BLOCKSIZE; + int do_free = 0; + uint64_t off, txg_how; mutex_t *lp; char osname[MAXNAMELEN]; char iobuf[SPA_MAXBLOCKSIZE]; - ztest_block_tag_t rbt, wbt; + blkptr_t blk = { 0 }; + uint64_t blkoff; + zbookmark_t zb; + dmu_tx_t *tx = dmu_tx_create(os); dmu_objset_name(os, osname); - bs = ZTEST_DIROBJ_BLOCKSIZE; /* * Have multiple threads write to large offsets in ZTEST_DIROBJ * to verify that having multiple threads writing to the same object * in parallel doesn't cause any trouble. - * Also do parallel writes to the bonus buffer on occasion. */ - for (i = 0; i < 50; i++) { + if (ztest_random(4) == 0) { + /* + * Do the bonus buffer instead of a regular block. + * We need a lock to serialize resize vs. others, + * so we hash on the objset ID. + */ + b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS; + off = -1ULL; + dmu_tx_hold_bonus(tx, ZTEST_DIROBJ); + } else { b = ztest_random(ZTEST_SYNC_LOCKS); - lp = &ztest_shared->zs_sync_lock[b]; - - do_free = (ztest_random(4) == 0); - - off = za->za_diroff_shared + ((uint64_t)b << SPA_MAXBLOCKSHIFT); - + off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT); if (ztest_random(4) == 0) { - /* - * Do the bonus buffer instead of a regular block. - */ - do_free = 0; - off = -1ULL; - } - - tx = dmu_tx_create(os); - - if (off == -1ULL) - dmu_tx_hold_bonus(tx, ZTEST_DIROBJ); - else if (do_free) + do_free = 1; dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs); - else + } else { dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs); - - txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT; - error = dmu_tx_assign(tx, txg_how); - if (error) { - if (error == ERESTART) { - ASSERT(txg_how == TXG_NOWAIT); - dmu_tx_wait(tx); - dmu_tx_abort(tx); - continue; - } - dmu_tx_abort(tx); - ztest_record_enospc("dmu write parallel"); - return; - } - txg = dmu_tx_get_txg(tx); - - if (do_free) { - (void) mutex_lock(lp); - VERIFY(0 == dmu_free_range(os, ZTEST_DIROBJ, off, - bs, tx)); - (void) mutex_unlock(lp); - dmu_tx_commit(tx); - continue; - } - - wbt.bt_objset = dmu_objset_id(os); - wbt.bt_object = ZTEST_DIROBJ; - wbt.bt_offset = off; - wbt.bt_txg = txg; - wbt.bt_thread = za->za_instance; - - if (off == -1ULL) { - dmu_object_info_t doi; - char *off; - - wbt.bt_seq = 0; - VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, - FTAG, &db)); - dmu_object_info_from_db(db, &doi); - ASSERT3U(doi.doi_bonus_size, >=, sizeof (wbt)); - off = (char *)db->db_data + - doi.doi_bonus_size - sizeof (wbt); - bcopy(off, &rbt, sizeof (wbt)); - if (rbt.bt_objset != 0) { - ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset); - ASSERT3U(rbt.bt_object, ==, wbt.bt_object); - ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset); - ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg); - } - if (ztest_random(10) == 0) { - int newsize = (ztest_random( - db->db_size / sizeof (wbt)) + 1) * - sizeof (wbt); - - ASSERT3U(newsize, >=, sizeof (wbt)); - ASSERT3U(newsize, <=, db->db_size); - error = dmu_set_bonus(db, newsize, tx); - ASSERT3U(error, ==, 0); - off = (char *)db->db_data + newsize - - sizeof (wbt); - } - dmu_buf_will_dirty(db, tx); - bcopy(&wbt, off, db->db_size); - dmu_buf_rele(db, FTAG); - dmu_tx_commit(tx); - continue; - } - - (void) mutex_lock(lp); - - wbt.bt_seq = ztest_shared->zs_seq[b]++; - - dmu_write(os, ZTEST_DIROBJ, off, sizeof (wbt), &wbt, tx); - - (void) mutex_unlock(lp); - - if (ztest_random(100) == 0) - (void) poll(NULL, 0, 1); /* open dn_notxholds window */ - - dmu_tx_commit(tx); - - if (ztest_random(1000) == 0) - txg_wait_synced(dmu_objset_pool(os), txg); - - if (ztest_random(2) == 0) { - blkptr_t blk = { 0 }; - uint64_t blkoff; - zbookmark_t zb; - - (void) mutex_lock(lp); - blkoff = P2ALIGN_TYPED(off, bs, uint64_t); - error = dmu_buf_hold(os, - ZTEST_DIROBJ, blkoff, FTAG, &db); - if (error) { - dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n", - osname, ZTEST_DIROBJ, blkoff, error); - (void) mutex_unlock(lp); - continue; - } - blkoff = off - blkoff; - error = dmu_sync(NULL, db, &blk, txg, NULL, NULL); - dmu_buf_rele(db, FTAG); - (void) mutex_unlock(lp); - if (error) { - dprintf("dmu_sync(%s, %d, %llx) = %d\n", - osname, ZTEST_DIROBJ, off, error); - continue; - } - - if (blk.blk_birth == 0) { /* concurrent free */ - continue; - } - txg_suspend(dmu_objset_pool(os)); - - ASSERT(blk.blk_fill == 1); - ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER); - ASSERT3U(BP_GET_LEVEL(&blk), ==, 0); - ASSERT3U(BP_GET_LSIZE(&blk), ==, bs); - - /* - * Read the block that dmu_sync() returned to - * make sure its contents match what we wrote. - * We do this while still txg_suspend()ed to ensure - * that the block can't be reused before we read it. - */ - zb.zb_objset = dmu_objset_id(os); - zb.zb_object = ZTEST_DIROBJ; - zb.zb_level = 0; - zb.zb_blkid = off / bs; - error = zio_wait(zio_read(NULL, dmu_objset_spa(os), - &blk, iobuf, bs, NULL, NULL, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb)); - ASSERT(error == 0); - - txg_resume(dmu_objset_pool(os)); - - bcopy(&iobuf[blkoff], &rbt, sizeof (rbt)); - - if (rbt.bt_objset == 0) /* concurrent free */ - continue; - - ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset); - ASSERT3U(rbt.bt_object, ==, wbt.bt_object); - ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset); - - /* - * The semantic of dmu_sync() is that we always - * push the most recent version of the data, - * so in the face of concurrent updates we may - * see a newer version of the block. That's OK. - */ - ASSERT3U(rbt.bt_txg, >=, wbt.bt_txg); - if (rbt.bt_thread == wbt.bt_thread) - ASSERT3U(rbt.bt_seq, ==, wbt.bt_seq); - else - ASSERT3U(rbt.bt_seq, >, wbt.bt_seq); } } + + txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT; + error = dmu_tx_assign(tx, txg_how); + if (error) { + if (error == ERESTART) { + ASSERT(txg_how == TXG_NOWAIT); + dmu_tx_wait(tx); + } else { + ztest_record_enospc("dmu write parallel"); + } + dmu_tx_abort(tx); + return; + } + + lp = &ztest_shared->zs_sync_lock[b]; + (void) mutex_lock(lp); + + wbt->bt_objset = dmu_objset_id(os); + wbt->bt_object = ZTEST_DIROBJ; + wbt->bt_offset = off; + wbt->bt_txg = dmu_tx_get_txg(tx); + wbt->bt_thread = za->za_instance; + wbt->bt_seq = ztest_shared->zs_seq[b]++; /* protected by lp */ + + if (off == -1ULL) { + dmu_object_info_t *doi = &za->za_doi; + char *dboff; + + VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0); + za->za_dbuf = db; + dmu_object_info_from_db(db, doi); + ASSERT3U(doi->doi_bonus_size, <=, db->db_size); + ASSERT3U(doi->doi_bonus_size, >=, btsize); + ASSERT3U(doi->doi_bonus_size % btsize, ==, 0); + dboff = (char *)db->db_data + doi->doi_bonus_size - btsize; + bcopy(dboff, rbt, btsize); + if (rbt->bt_objset != 0) { + ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset); + ASSERT3U(rbt->bt_object, ==, wbt->bt_object); + ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset); + ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg); + } + if (ztest_random(10) == 0) { + int newsize = (ztest_random(db->db_size / + btsize) + 1) * btsize; + + ASSERT3U(newsize, >=, btsize); + ASSERT3U(newsize, <=, db->db_size); + VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0); + dboff = (char *)db->db_data + newsize - btsize; + } + dmu_buf_will_dirty(db, tx); + bcopy(wbt, dboff, btsize); + dmu_buf_rele(db, FTAG); + za->za_dbuf = NULL; + } else if (do_free) { + VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0); + } else { + dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx); + } + + (void) mutex_unlock(lp); + + if (ztest_random(1000) == 0) + (void) poll(NULL, 0, 1); /* open dn_notxholds window */ + + dmu_tx_commit(tx); + + if (ztest_random(10000) == 0) + txg_wait_synced(dmu_objset_pool(os), wbt->bt_txg); + + if (off == -1 || do_free) + return; + + if (ztest_random(2) != 0) + return; + + /* + * dmu_sync() the block we just wrote. + */ + (void) mutex_lock(lp); + + blkoff = P2ALIGN_TYPED(off, bs, uint64_t); + error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db); + za->za_dbuf = db; + if (error) { + dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n", + osname, ZTEST_DIROBJ, blkoff, error); + (void) mutex_unlock(lp); + return; + } + blkoff = off - blkoff; + error = dmu_sync(NULL, db, &blk, wbt->bt_txg, NULL, NULL); + dmu_buf_rele(db, FTAG); + za->za_dbuf = NULL; + + (void) mutex_unlock(lp); + + if (error) { + dprintf("dmu_sync(%s, %d, %llx) = %d\n", + osname, ZTEST_DIROBJ, off, error); + return; + } + + if (blk.blk_birth == 0) /* concurrent free */ + return; + + txg_suspend(dmu_objset_pool(os)); + + ASSERT(blk.blk_fill == 1); + ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER); + ASSERT3U(BP_GET_LEVEL(&blk), ==, 0); + ASSERT3U(BP_GET_LSIZE(&blk), ==, bs); + + /* + * Read the block that dmu_sync() returned to make sure its contents + * match what we wrote. We do this while still txg_suspend()ed + * to ensure that the block can't be reused before we read it. + */ + zb.zb_objset = dmu_objset_id(os); + zb.zb_object = ZTEST_DIROBJ; + zb.zb_level = 0; + zb.zb_blkid = off / bs; + error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs, + NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb)); + ASSERT3U(error, ==, 0); + + txg_resume(dmu_objset_pool(os)); + + bcopy(&iobuf[blkoff], rbt, btsize); + + if (rbt->bt_objset == 0) /* concurrent free */ + return; + + ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset); + ASSERT3U(rbt->bt_object, ==, wbt->bt_object); + ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset); + + /* + * The semantic of dmu_sync() is that we always push the most recent + * version of the data, so in the face of concurrent updates we may + * see a newer version of the block. That's OK. + */ + ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg); + if (rbt->bt_thread == wbt->bt_thread) + ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq); + else + ASSERT3U(rbt->bt_seq, >, wbt->bt_seq); } /* @@ -2195,7 +2192,6 @@ uint64_t value[ZTEST_ZAP_MAX_INTS]; uint64_t zl_ints, zl_intsize, prop; int i, ints; - int iters = 100; dmu_tx_t *tx; char propname[100], txgname[100]; int error; @@ -2259,122 +2255,113 @@ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); - while (--iters >= 0) { - prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); - (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); - bzero(value, sizeof (value)); - last_txg = 0; - - /* - * If these zap entries already exist, validate their contents. - */ - error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); - if (error == 0) { - ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); - ASSERT3U(zl_ints, ==, 1); - - error = zap_lookup(os, object, txgname, zl_intsize, - zl_ints, &last_txg); - - ASSERT3U(error, ==, 0); - - error = zap_length(os, object, propname, &zl_intsize, - &zl_ints); - - ASSERT3U(error, ==, 0); - ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); - ASSERT3U(zl_ints, ==, ints); - - error = zap_lookup(os, object, propname, zl_intsize, - zl_ints, value); - - ASSERT3U(error, ==, 0); - - for (i = 0; i < ints; i++) { - ASSERT3U(value[i], ==, last_txg + object + i); - } - } else { - ASSERT3U(error, ==, ENOENT); - } - - /* - * Atomically update two entries in our zap object. - * The first is named txg_%llu, and contains the txg - * in which the property was last updated. The second - * is named prop_%llu, and the nth element of its value - * should be txg + object + n. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create zap entry"); - dmu_tx_abort(tx); - return; + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + bzero(value, sizeof (value)); + last_txg = 0; + + /* + * If these zap entries already exist, validate their contents. + */ + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + if (error == 0) { + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + + VERIFY(zap_lookup(os, object, txgname, zl_intsize, + zl_ints, &last_txg) == 0); + + VERIFY(zap_length(os, object, propname, &zl_intsize, + &zl_ints) == 0); + + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, ints); + + VERIFY(zap_lookup(os, object, propname, zl_intsize, + zl_ints, value) == 0); + + for (i = 0; i < ints; i++) { + ASSERT3U(value[i], ==, last_txg + object + i); } - txg = dmu_tx_get_txg(tx); - - if (last_txg > txg) - fatal(0, "zap future leak: old %llu new %llu", - last_txg, txg); - - for (i = 0; i < ints; i++) - value[i] = txg + object + i; - - error = zap_update(os, object, txgname, sizeof (uint64_t), - 1, &txg, tx); - if (error) - fatal(0, "zap_update('%s', %llu, '%s') = %d", - osname, object, txgname, error); - - error = zap_update(os, object, propname, sizeof (uint64_t), - ints, value, tx); - if (error) - fatal(0, "zap_update('%s', %llu, '%s') = %d", - osname, object, propname, error); - - dmu_tx_commit(tx); - - /* - * Remove a random pair of entries. - */ - prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); - (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); - - error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); - - if (error == ENOENT) - continue; - - ASSERT3U(error, ==, 0); - - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("remove zap entry"); - dmu_tx_abort(tx); - return; - } - error = zap_remove(os, object, txgname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, txgname, error); - - error = zap_remove(os, object, propname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, propname, error); - - dmu_tx_commit(tx); + } else { + ASSERT3U(error, ==, ENOENT); + } + + /* + * Atomically update two entries in our zap object. + * The first is named txg_%llu, and contains the txg + * in which the property was last updated. The second + * is named prop_%llu, and the nth element of its value + * should be txg + object + n. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + ztest_record_enospc("create zap entry"); + dmu_tx_abort(tx); + return; } + txg = dmu_tx_get_txg(tx); + + if (last_txg > txg) + fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); + + for (i = 0; i < ints; i++) + value[i] = txg + object + i; + + error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx); + if (error) + fatal(0, "zap_update('%s', %llu, '%s') = %d", + osname, object, txgname, error); + + error = zap_update(os, object, propname, sizeof (uint64_t), + ints, value, tx); + if (error) + fatal(0, "zap_update('%s', %llu, '%s') = %d", + osname, object, propname, error); + + dmu_tx_commit(tx); + + /* + * Remove a random pair of entries. + */ + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + + if (error == ENOENT) + return; + + ASSERT3U(error, ==, 0); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + ztest_record_enospc("remove zap entry"); + dmu_tx_abort(tx); + return; + } + error = zap_remove(os, object, txgname, tx); + if (error) + fatal(0, "zap_remove('%s', %llu, '%s') = %d", + osname, object, txgname, error); + + error = zap_remove(os, object, propname, tx); + if (error) + fatal(0, "zap_remove('%s', %llu, '%s') = %d", + osname, object, propname, error); + + dmu_tx_commit(tx); /* * Once in a while, destroy the object. */ - if (ztest_random(100) != 0) + if (ztest_random(1000) != 0) return; tx = dmu_tx_create(os); @@ -2401,111 +2388,107 @@ { objset_t *os = za->za_os; uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; - int iters = 100; dmu_tx_t *tx; int i, namelen, error; char name[20], string_value[20]; void *data; - while (--iters >= 0) { - /* - * Generate a random name of the form 'xxx.....' where each - * x is a random printable character and the dots are dots. - * There are 94 such characters, and the name length goes from - * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. - */ - namelen = ztest_random(sizeof (name) - 5) + 5 + 1; - - for (i = 0; i < 3; i++) - name[i] = '!' + ztest_random('~' - '!' + 1); - for (; i < namelen - 1; i++) - name[i] = '.'; - name[i] = '\0'; - - if (ztest_random(2) == 0) - object = ZTEST_MICROZAP_OBJ; - else - object = ZTEST_FATZAP_OBJ; - - if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) { - wsize = sizeof (txg); - wc = 1; - data = &txg; - } else { - wsize = 1; - wc = namelen; - data = string_value; + /* + * Generate a random name of the form 'xxx.....' where each + * x is a random printable character and the dots are dots. + * There are 94 such characters, and the name length goes from + * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + */ + namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + + for (i = 0; i < 3; i++) + name[i] = '!' + ztest_random('~' - '!' + 1); + for (; i < namelen - 1; i++) + name[i] = '.'; + name[i] = '\0'; + + if (ztest_random(2) == 0) + object = ZTEST_MICROZAP_OBJ; + else + object = ZTEST_FATZAP_OBJ; + + if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) { + wsize = sizeof (txg); + wc = 1; + data = &txg; + } else { + wsize = 1; + wc = namelen; + data = string_value; + } + + count = -1ULL; + VERIFY(zap_count(os, object, &count) == 0); + ASSERT(count != -1ULL); + + /* + * Select an operation: length, lookup, add, update, remove. + */ + i = ztest_random(5); + + if (i >= 2) { + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + ztest_record_enospc("zap parallel"); + dmu_tx_abort(tx); + return; } - - count = -1ULL; - VERIFY(zap_count(os, object, &count) == 0); - ASSERT(count != -1ULL); - - /* - * Select an operation: length, lookup, add, update, remove. - */ - i = ztest_random(5); - - if (i >= 2) { - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("zap parallel"); - dmu_tx_abort(tx); - return; - } - txg = dmu_tx_get_txg(tx); - bcopy(name, string_value, namelen); + txg = dmu_tx_get_txg(tx); + bcopy(name, string_value, namelen); + } else { + tx = NULL; + txg = 0; + bzero(string_value, namelen); + } + + switch (i) { + + case 0: + error = zap_length(os, object, name, &zl_wsize, &zl_wc); + if (error == 0) { + ASSERT3U(wsize, ==, zl_wsize); + ASSERT3U(wc, ==, zl_wc); } else { - tx = NULL; - txg = 0; - bzero(string_value, namelen); + ASSERT3U(error, ==, ENOENT); } - - switch (i) { - - case 0: - error = zap_length(os, object, name, &zl_wsize, &zl_wc); - if (error == 0) { - ASSERT3U(wsize, ==, zl_wsize); - ASSERT3U(wc, ==, zl_wc); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; - - case 1: - error = zap_lookup(os, object, name, wsize, wc, data); - if (error == 0) { - if (data == string_value && - bcmp(name, data, namelen) != 0) - fatal(0, "name '%s' != val '%s' len %d", - name, data, namelen); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; - - case 2: - error = zap_add(os, object, name, wsize, wc, data, tx); - ASSERT(error == 0 || error == EEXIST); - break; - - case 3: - VERIFY(zap_update(os, object, name, wsize, wc, - data, tx) == 0); - break; - - case 4: - error = zap_remove(os, object, name, tx); - ASSERT(error == 0 || error == ENOENT); - break; + break; + + case 1: + error = zap_lookup(os, object, name, wsize, wc, data); + if (error == 0) { + if (data == string_value && + bcmp(name, data, namelen) != 0) + fatal(0, "name '%s' != val '%s' len %d", + name, data, namelen); + } else { + ASSERT3U(error, ==, ENOENT); } - - if (tx != NULL) - dmu_tx_commit(tx); + break; + + case 2: + error = zap_add(os, object, name, wsize, wc, data, tx); + ASSERT(error == 0 || error == EEXIST); + break; + + case 3: + VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + break; + + case 4: + error = zap_remove(os, object, name, tx); + ASSERT(error == 0 || error == ENOENT); + break; } + + if (tx != NULL) + dmu_tx_commit(tx); } void @@ -2590,7 +2573,7 @@ char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; - spa_t *spa = dmu_objset_spa(za->za_os); + spa_t *spa = za->za_spa; int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ int iters = 1000; vdev_t *vd0; @@ -2689,7 +2672,7 @@ void ztest_scrub(ztest_args_t *za) { - spa_t *spa = dmu_objset_spa(za->za_os); + spa_t *spa = za->za_spa; mutex_enter(&spa_namespace_lock); (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE); @@ -2739,7 +2722,7 @@ if (error != 0) fatal(0, "spa_open('%s') = %d", newname, error); - ASSERT(spa == dmu_objset_spa(za->za_os)); + ASSERT(spa == za->za_spa); spa_close(spa, FTAG); /* @@ -2757,7 +2740,7 @@ if (error != 0) fatal(0, "spa_open('%s') = %d", oldname, error); - ASSERT(spa == dmu_objset_spa(za->za_os)); + ASSERT(spa == za->za_spa); spa_close(spa, FTAG); umem_free(newname, strlen(newname) + 1); @@ -3038,29 +3021,15 @@ ztest_shared_t *zs = ztest_shared; hrtime_t now, functime; ztest_info_t *zi; - int f; + int f, i; while ((now = gethrtime()) < za->za_stop) { /* * See if it's time to force a crash. */ if (now > za->za_kill) { - dmu_tx_t *tx; - uint64_t txg; - - mutex_enter(&spa_namespace_lock); - tx = dmu_tx_create(za->za_os); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - txg = dmu_tx_get_txg(tx); - dmu_tx_commit(tx); - zs->zs_txg = txg; - if (zopt_verbose >= 3) - (void) printf( - "killing process after txg %lld\n", - (u_longlong_t)txg); - txg_wait_synced(dmu_objset_pool(za->za_os), txg); - zs->zs_alloc = spa_get_alloc(dmu_objset_spa(za->za_os)); - zs->zs_space = spa_get_space(dmu_objset_spa(za->za_os)); + zs->zs_alloc = spa_get_alloc(za->za_spa); + zs->zs_space = spa_get_space(za->za_spa); (void) kill(getpid(), SIGKILL); } @@ -3085,9 +3054,8 @@ ZTEST_DIRSIZE; za->za_diroff_shared = (1ULL << 63); - ztest_dmu_write_parallel(za); - - zi->zi_func(za); + for (i = 0; i < zi->zi_iters; i++) + zi->zi_func(za); functime = gethrtime() - now; @@ -3234,6 +3202,17 @@ for (t = 0; t < zopt_threads; t++) { d = t % zopt_datasets; + + (void) strcpy(za[t].za_pool, pool); + za[t].za_os = za[d].za_os; + za[t].za_spa = spa; + za[t].za_zilog = za[d].za_zilog; + za[t].za_instance = t; + za[t].za_random = ztest_random(-1ULL); + za[t].za_start = za[0].za_start; + za[t].za_stop = za[0].za_stop; + za[t].za_kill = za[0].za_kill; + if (t < zopt_datasets) { ztest_replay_t zr; int test_future = FALSE; @@ -3243,13 +3222,11 @@ ztest_create_cb, NULL); if (error == EEXIST) { test_future = TRUE; + } else if (error == ENOSPC) { + zs->zs_enospc_count++; + (void) rw_unlock(&ztest_shared->zs_name_lock); + break; } else if (error != 0) { - if (error == ENOSPC) { - zs->zs_enospc_count++; - (void) rw_unlock( - &ztest_shared->zs_name_lock); - break; - } fatal(0, "dmu_objset_create(%s) = %d", name, error); } @@ -3259,22 +3236,13 @@ fatal(0, "dmu_objset_open('%s') = %d", name, error); (void) rw_unlock(&ztest_shared->zs_name_lock); - if (test_future && ztest_shared->zs_txg > 0) - ztest_dmu_check_future_leak(za[d].za_os, - ztest_shared->zs_txg); + if (test_future) + ztest_dmu_check_future_leak(&za[t]); zr.zr_os = za[d].za_os; zil_replay(zr.zr_os, &zr, &zr.zr_assign, ztest_replay_vector); za[d].za_zilog = zil_open(za[d].za_os, NULL); } - za[t].za_pool = spa_strdup(pool); - za[t].za_os = za[d].za_os; - za[t].za_zilog = za[d].za_zilog; - za[t].za_instance = t; - za[t].za_random = ztest_random(-1ULL); - za[t].za_start = za[0].za_start; - za[t].za_stop = za[0].za_stop; - za[t].za_kill = za[0].za_kill; error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND, &za[t].za_thread); @@ -3282,7 +3250,6 @@ fatal(0, "can't create thread %d: error %d", t, error); } - ztest_shared->zs_txg = 0; while (--t >= 0) { error = thr_join(za[t].za_thread, NULL, NULL); @@ -3294,11 +3261,8 @@ zil_close(za[t].za_zilog); dmu_objset_close(za[t].za_os); } - spa_strfree(za[t].za_pool); } - umem_free(za, zopt_threads * sizeof (ztest_args_t)); - if (zopt_verbose >= 3) show_pool_stats(spa); @@ -3308,15 +3272,15 @@ zs->zs_space = spa_get_space(spa); /* - * Did we have out-of-space errors? If so, destroy a random objset. + * If we had out-of-space errors, destroy a random objset. */ if (zs->zs_enospc_count != 0) { (void) rw_rdlock(&ztest_shared->zs_name_lock); - (void) snprintf(name, 100, "%s/%s_%d", pool, pool, - (int)ztest_random(zopt_datasets)); + d = (int)ztest_random(zopt_datasets); + (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); if (zopt_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); - (void) dmu_objset_find(name, ztest_destroy_cb, NULL, + (void) dmu_objset_find(name, ztest_destroy_cb, &za[d], DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); (void) rw_unlock(&ztest_shared->zs_name_lock); } @@ -3330,8 +3294,6 @@ for (t = 1; t < 50; t++) dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15); - spa_close(spa, FTAG); - /* Shutdown the suspend monitor thread */ zio_io_fail_shift = 0; ztest_exiting = B_TRUE; @@ -3342,6 +3304,10 @@ if (error) fatal(0, "thr_join(%d) = %d", tid, error); + umem_free(za, zopt_threads * sizeof (ztest_args_t)); + + spa_close(spa, FTAG); + kernel_fini(); }
--- a/usr/src/lib/libzpool/common/llib-lzpool Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/lib/libzpool/common/llib-lzpool Tue Nov 27 22:58:05 2007 -0800 @@ -48,6 +48,6 @@ #include <sys/bplist.h> #include <sys/zfs_znode.h> -extern uint64_t zio_gang_bang; +extern uint64_t metaslab_gang_bang; extern uint16_t zio_zil_fail_shift; extern uint16_t zio_io_fail_shift;
--- a/usr/src/uts/common/Makefile.files Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/Makefile.files Tue Nov 27 22:58:05 2007 -0800 @@ -1077,7 +1077,6 @@ lzjb.o \ metaslab.o \ refcount.o \ - rprwlock.o \ sha256.o \ spa.o \ spa_config.o \
--- a/usr/src/uts/common/fs/zfs/metaslab.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/metaslab.c Tue Nov 27 22:58:05 2007 -0800 @@ -35,6 +35,7 @@ #include <sys/zio.h> uint64_t metaslab_aliquot = 512ULL << 10; +uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ /* * ========================================================================== @@ -728,6 +729,12 @@ ASSERT(!DVA_IS_VALID(&dva[d])); /* + * For testing, make some blocks above a certain size be gang blocks. + */ + if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) + return (ENOSPC); + + /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mc_rotor or mc_allocated because * nothing actually breaks if we miss a few updates -- we just won't
--- a/usr/src/uts/common/fs/zfs/rprwlock.c Tue Nov 27 17:41:22 2007 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/zfs_context.h> -#include <sys/refcount.h> -#include <sys/rprwlock.h> - -void -rprw_init(rprwlock_t *rwl) -{ - mutex_init(&rwl->rw_lock, NULL, MUTEX_DEFAULT, NULL); - rwl->rw_writer = NULL; - cv_init(&rwl->rw_cv, NULL, CV_DEFAULT, NULL); - refcount_create(&rwl->rw_count); -} - -void -rprw_destroy(rprwlock_t *rwl) -{ - mutex_destroy(&rwl->rw_lock); - ASSERT(rwl->rw_writer == NULL); - cv_destroy(&rwl->rw_cv); - refcount_destroy(&rwl->rw_count); -} - -void -rprw_enter_read(rprwlock_t *rwl, void *tag) -{ - mutex_enter(&rwl->rw_lock); - - if (rwl->rw_writer != curthread) { - while (rwl->rw_writer != NULL) - cv_wait(&rwl->rw_cv, &rwl->rw_lock); - } - - (void) refcount_add(&rwl->rw_count, tag); - - mutex_exit(&rwl->rw_lock); -} - -void -rprw_enter_write(rprwlock_t *rwl, void *tag) -{ - mutex_enter(&rwl->rw_lock); - - if (rwl->rw_writer != curthread) { - while (!refcount_is_zero(&rwl->rw_count)) - cv_wait(&rwl->rw_cv, &rwl->rw_lock); - rwl->rw_writer = curthread; - } - - (void) refcount_add(&rwl->rw_count, tag); - - mutex_exit(&rwl->rw_lock); -} - -void -rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag) -{ - if (rw == RW_READER) - rprw_enter_read(rwl, tag); - else - rprw_enter_write(rwl, tag); -} - -void -rprw_exit(rprwlock_t *rwl, void *tag) -{ - mutex_enter(&rwl->rw_lock); - - ASSERT(!refcount_is_zero(&rwl->rw_count)); - ASSERT(rwl->rw_writer == NULL || curthread == rwl->rw_writer); - if (refcount_remove(&rwl->rw_count, tag) == 0) { - cv_broadcast(&rwl->rw_cv); - rwl->rw_writer = NULL; /* OK in either case */ - } - - mutex_exit(&rwl->rw_lock); -} - -boolean_t -rprw_held(rprwlock_t *rwl, krw_t rw) -{ - boolean_t held; - - mutex_enter(&rwl->rw_lock); - if (rw == RW_WRITER) - held = (rwl->rw_writer == curthread); - else - held = !rwl->rw_writer && !refcount_is_zero(&rwl->rw_count); - mutex_exit(&rwl->rw_lock); - - return (held); -}
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Tue Nov 27 22:58:05 2007 -0800 @@ -144,16 +144,9 @@ * zero. Must be called with spa_namespace_lock * held. * - * The spa_config_lock is manipulated using the following functions: - * - * spa_config_enter() Acquire the config lock as RW_READER or - * RW_WRITER. At least one reference on the spa_t - * must exist. - * - * spa_config_exit() Release the config lock. - * - * spa_config_held() Returns true if the config lock is currently - * held in the given state. + * The spa_config_lock is a form of rwlock. It must be held as RW_READER + * to perform I/O to the pool, and as RW_WRITER to change the vdev config. + * The spa_config_lock is manipulated with spa_config_{enter,exit,held}(). * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * @@ -202,6 +195,80 @@ /* * ========================================================================== + * SPA config locking + * ========================================================================== + */ +static void +spa_config_lock_init(spa_config_lock_t *scl) +{ + mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); + scl->scl_writer = NULL; + cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&scl->scl_count); +} + +static void +spa_config_lock_destroy(spa_config_lock_t *scl) +{ + mutex_destroy(&scl->scl_lock); + ASSERT(scl->scl_writer == NULL); + cv_destroy(&scl->scl_cv); + refcount_destroy(&scl->scl_count); +} + +void +spa_config_enter(spa_t *spa, krw_t rw, void *tag) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + if (rw == RW_READER) { + while (scl->scl_writer != NULL && scl->scl_writer != curthread) + cv_wait(&scl->scl_cv, &scl->scl_lock); + } else { + while (!refcount_is_zero(&scl->scl_count) && + scl->scl_writer != curthread) + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_writer = curthread; + } + + (void) refcount_add(&scl->scl_count, tag); + + mutex_exit(&scl->scl_lock); +} + +void +spa_config_exit(spa_t *spa, void *tag) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + mutex_enter(&scl->scl_lock); + + ASSERT(!refcount_is_zero(&scl->scl_count)); + + if (refcount_remove(&scl->scl_count, tag) == 0) { + cv_broadcast(&scl->scl_cv); + ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); + scl->scl_writer = NULL; /* OK in either case */ + } + + mutex_exit(&scl->scl_lock); +} + +boolean_t +spa_config_held(spa_t *spa, krw_t rw) +{ + spa_config_lock_t *scl = &spa->spa_config_lock; + + if (rw == RW_READER) + return (!refcount_is_zero(&scl->scl_count)); + else + return (scl->scl_writer == curthread); +} + +/* + * ========================================================================== * SPA namespace functions * ========================================================================== */ @@ -275,7 +342,7 @@ spa->spa_final_txg = UINT64_MAX; refcount_create(&spa->spa_refcount); - rprw_init(&spa->spa_config_lock); + spa_config_lock_init(&spa->spa_config_lock); avl_add(&spa_namespace_avl, spa); @@ -324,7 +391,7 @@ refcount_destroy(&spa->spa_refcount); - rprw_destroy(&spa->spa_config_lock); + spa_config_lock_destroy(&spa->spa_config_lock); rw_destroy(&spa->spa_traverse_lock); @@ -639,29 +706,6 @@ /* * ========================================================================== - * SPA config locking - * ========================================================================== - */ -void -spa_config_enter(spa_t *spa, krw_t rw, void *tag) -{ - rprw_enter(&spa->spa_config_lock, rw, tag); -} - -void -spa_config_exit(spa_t *spa, void *tag) -{ - rprw_exit(&spa->spa_config_lock, tag); -} - -boolean_t -spa_config_held(spa_t *spa, krw_t rw) -{ - return (rprw_held(&spa->spa_config_lock, rw)); -} - -/* - * ========================================================================== * SPA vdev locking * ========================================================================== */ @@ -1003,7 +1047,7 @@ * config lock, both of which are required to do a rename. */ ASSERT(MUTEX_HELD(&spa_namespace_lock) || - spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER)); + spa_config_held(spa, RW_READER)); return (spa->spa_name); }
--- a/usr/src/uts/common/fs/zfs/sys/rprwlock.h Tue Nov 27 17:41:22 2007 -0800 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_RPRWLOCK_H -#define _SYS_RPRWLOCK_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/inttypes.h> -#include <sys/list.h> -#include <sys/zfs_context.h> -#include <sys/refcount.h> - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct rprwlock { - kmutex_t rw_lock; - kthread_t *rw_writer; - kcondvar_t rw_cv; - refcount_t rw_count; -} rprwlock_t; - -void rprw_init(rprwlock_t *rwl); -void rprw_destroy(rprwlock_t *rwl); -void rprw_enter_read(rprwlock_t *rwl, void *tag); -void rprw_enter_write(rprwlock_t *rwl, void *tag); -void rprw_enter(rprwlock_t *rwl, krw_t rw, void *tag); -void rprw_exit(rprwlock_t *rwl, void *tag); -boolean_t rprw_held(rprwlock_t *rwl, krw_t rw); -#define RPRW_READ_HELD(x) rprw_held(x, RW_READER) -#define RPRW_WRITE_HELD(x) rprw_held(x, RW_WRITER) - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_RPRWLOCK_H */
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Tue Nov 27 22:58:05 2007 -0800 @@ -37,7 +37,6 @@ #include <sys/zfs_context.h> #include <sys/avl.h> #include <sys/refcount.h> -#include <sys/rprwlock.h> #include <sys/bplist.h> #ifdef __cplusplus @@ -68,6 +67,14 @@ uint_t sav_npending; /* # pending devices */ }; +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + uint16_t scl_write_wanted; + kcondvar_t scl_cv; + refcount_t scl_count; +} spa_config_lock_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -157,7 +164,7 @@ * In order for the MDB module to function correctly, the other * fields must remain in the same location. */ - rprwlock_t spa_config_lock; /* configuration changes */ + spa_config_lock_t spa_config_lock; /* configuration changes */ refcount_t spa_refcount; /* number of opens */ };
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h Tue Nov 27 22:58:05 2007 -0800 @@ -83,9 +83,6 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern void vdev_io_start(zio_t *zio); -extern void vdev_io_done(zio_t *zio); - extern int vdev_fault(spa_t *spa, uint64_t guid); extern int vdev_degrade(spa_t *spa, uint64_t guid); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Tue Nov 27 22:58:05 2007 -0800 @@ -62,8 +62,8 @@ typedef void vdev_close_func_t(vdev_t *vd); typedef int vdev_probe_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); -typedef void vdev_io_start_func_t(zio_t *zio); -typedef void vdev_io_done_func_t(zio_t *zio); +typedef int vdev_io_start_func_t(zio_t *zio); +typedef int vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef struct vdev_ops {
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/zio.h Tue Nov 27 22:58:05 2007 -0800 @@ -153,6 +153,7 @@ (ZIO_FLAG_CANFAIL | \ ZIO_FLAG_FAILFAST | \ ZIO_FLAG_CONFIG_HELD | \ + ZIO_FLAG_DONT_CACHE | \ ZIO_FLAG_DONT_RETRY | \ ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_SPECULATIVE | \ @@ -164,9 +165,11 @@ #define ZIO_FLAG_VDEV_INHERIT \ (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_DONT_CACHE | \ ZIO_FLAG_PHYSICAL) +#define ZIO_PIPELINE_CONTINUE 0x100 +#define ZIO_PIPELINE_STOP 0x101 + /* * We'll take the unused errno 'EBADE' (from the Convergent graveyard) * to indicate checksum errors. @@ -262,7 +265,6 @@ uint32_t io_numerrors; uint32_t io_pipeline; uint32_t io_orig_pipeline; - uint32_t io_async_stages; uint64_t io_children_notready; uint64_t io_children_notdone; void *io_waiter; @@ -319,21 +321,18 @@ extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); + +extern int zio_wait_for_children_ready(zio_t *zio); +extern int zio_wait_for_children_done(zio_t *zio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); -/* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. - */ -extern void zio_next_stage(zio_t *zio); -extern void zio_next_stage_async(zio_t *zio); extern void zio_resubmit_stage_async(void *); -extern void zio_wait_children_done(zio_t *zio); /* * Delegate I/O to a child vdev.
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h Tue Nov 27 22:58:05 2007 -0800 @@ -38,16 +38,15 @@ /* * I/O Groups: pipeline stage definitions. */ - typedef enum zio_stage { ZIO_STAGE_OPEN = 0, /* RWFCI */ - ZIO_STAGE_WAIT_CHILDREN_READY, /* RWFCI */ + ZIO_STAGE_WAIT_FOR_CHILDREN_READY, /* RWFCI */ + ZIO_STAGE_READ_INIT, /* R---- */ + ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ ZIO_STAGE_WRITE_COMPRESS, /* -W--- */ ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ - ZIO_STAGE_GANG_PIPELINE, /* -WFC- */ - ZIO_STAGE_GET_GANG_HEADER, /* -WFC- */ ZIO_STAGE_REWRITE_GANG_MEMBERS, /* -W--- */ ZIO_STAGE_FREE_GANG_MEMBERS, /* --F-- */ @@ -61,13 +60,11 @@ ZIO_STAGE_READY, /* RWFCI */ - ZIO_STAGE_READ_INIT, /* R---- */ - ZIO_STAGE_VDEV_IO_START, /* RW--I */ ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ - ZIO_STAGE_WAIT_CHILDREN_DONE, /* RWFCI */ + ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, /* RWFCI */ ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ ZIO_STAGE_READ_GANG_MEMBERS, /* R---- */ @@ -77,30 +74,22 @@ ZIO_STAGE_DONE /* RWFCI */ } zio_stage_t; -/* - * The stages for which there's some performance value in going async. - * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well. - */ -#define ZIO_ASYNC_PIPELINE_STAGES \ - ((1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_READ_DECOMPRESS)) +#define ZIO_INTERLOCK_STAGES \ + ((1U << ZIO_STAGE_WAIT_FOR_CHILDREN_READY) | \ + (1U << ZIO_STAGE_READY) | \ + (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \ + (1U << ZIO_STAGE_ASSESS) | \ + (1U << ZIO_STAGE_DONE)) -#define ZIO_VDEV_IO_PIPELINE \ +#define ZIO_VDEV_IO_STAGES \ ((1U << ZIO_STAGE_VDEV_IO_START) | \ (1U << ZIO_STAGE_VDEV_IO_DONE) | \ (1U << ZIO_STAGE_VDEV_IO_ASSESS)) #define ZIO_READ_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_CHECKSUM_VERIFY)) #define ZIO_READ_GANG_PIPELINE \ ZIO_READ_PHYS_PIPELINE @@ -109,97 +98,66 @@ (1U << ZIO_STAGE_READ_INIT) | \ ZIO_READ_PHYS_PIPELINE +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + (1U << ZIO_STAGE_ISSUE_ASYNC) | \ + (1U << ZIO_STAGE_CHECKSUM_GENERATE)) + #define ZIO_WRITE_PHYS_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_WRITE_COMMON_PIPELINE \ - ZIO_WRITE_PHYS_PIPELINE + ZIO_WRITE_COMMON_STAGES #define ZIO_WRITE_PIPELINE \ - ((1U << ZIO_STAGE_WRITE_COMPRESS) | \ - ZIO_WRITE_COMMON_PIPELINE) + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_WRITE_COMPRESS)) -#define ZIO_GANG_STAGES \ +#define ZIO_GANG_REWRITE_STAGES \ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - (1U << ZIO_STAGE_READ_GANG_MEMBERS)) - -#define ZIO_REWRITE_PIPELINE \ - ((1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) | \ - ZIO_WRITE_COMMON_PIPELINE) - -#define ZIO_WRITE_ALLOCATE_PIPELINE \ - ((1U << ZIO_STAGE_DVA_ALLOCATE) | \ - ZIO_WRITE_COMMON_PIPELINE) + (1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE)) #define ZIO_GANG_FREE_STAGES \ ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ (1U << ZIO_STAGE_FREE_GANG_MEMBERS)) -#define ZIO_FREE_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_FREE_GANG_MEMBERS) | \ - (1U << ZIO_STAGE_DVA_FREE) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_GANG_CLAIM_STAGES \ + ((1U << ZIO_STAGE_GET_GANG_HEADER) | \ + (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS)) + +#define ZIO_REWRITE_PIPELINE(bp) \ + (ZIO_WRITE_COMMON_STAGES | \ + (BP_IS_GANG(bp) ? ZIO_GANG_REWRITE_STAGES : 0)) -#define ZIO_CLAIM_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_GANG_PIPELINE) | \ - (1U << ZIO_STAGE_GET_GANG_HEADER) | \ - (1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) | \ +#define ZIO_WRITE_ALLOCATE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + (1U << ZIO_STAGE_DVA_ALLOCATE)) + +#define ZIO_FREE_PIPELINE(bp) \ + (ZIO_INTERLOCK_STAGES | \ + (1U << ZIO_STAGE_DVA_FREE) | \ + (BP_IS_GANG(bp) ? ZIO_GANG_FREE_STAGES : 0)) + +#define ZIO_CLAIM_PIPELINE(bp) \ + (ZIO_INTERLOCK_STAGES | \ (1U << ZIO_STAGE_DVA_CLAIM) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + (BP_IS_GANG(bp) ? ZIO_GANG_CLAIM_STAGES : 0)) #define ZIO_IOCTL_PIPELINE \ - ((1U << ZIO_STAGE_OPEN) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - ZIO_VDEV_IO_PIPELINE | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES) + #define ZIO_WAIT_FOR_CHILDREN_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_READY) | \ - (1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ + ZIO_INTERLOCK_STAGES + +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ (1U << ZIO_STAGE_ASSESS) | \ + (1U << ZIO_STAGE_WAIT_FOR_CHILDREN_DONE) | \ (1U << ZIO_STAGE_DONE)) -#define ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE \ - ((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) | \ - (1U << ZIO_STAGE_ASSESS) | \ - (1U << ZIO_STAGE_DONE)) - -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE | \ - ZIO_VDEV_IO_PIPELINE) - #define ZIO_ERROR_PIPELINE_MASK \ - ZIO_WAIT_FOR_CHILDREN_PIPELINE + ZIO_INTERLOCK_STAGES typedef struct zio_transform zio_transform_t; struct zio_transform {
--- a/usr/src/uts/common/fs/zfs/vdev.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev.c Tue Nov 27 22:58:05 2007 -0800 @@ -136,6 +136,9 @@ { vdev_t *rvd = spa->spa_root_vdev; + ASSERT(spa_config_held(spa, RW_READER) || + curthread == spa->spa_scrub_thread); + if (vdev < rvd->vdev_children) return (rvd->vdev_child[vdev]); @@ -1459,18 +1462,6 @@ return (vd->vdev_ops->vdev_op_asize(vd, psize)); } -void -vdev_io_start(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_start(zio); -} - -void -vdev_io_done(zio_t *zio) -{ - zio->io_vd->vdev_ops->vdev_op_io_done(zio); -} - const char * vdev_description(vdev_t *vd) {
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c Tue Nov 27 22:58:05 2007 -0800 @@ -231,7 +231,7 @@ zio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = zio->io_error; - zio_next_stage(dio); + zio_execute(dio); } } @@ -286,15 +286,10 @@ zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); - zio_next_stage(zio); + zio_execute(zio); return (0); } - if (!(zio->io_flags & ZIO_FLAG_METADATA)) { - mutex_exit(&vc->vc_lock); - return (EINVAL); - } - ve = vdev_cache_allocate(zio); if (ve == NULL) {
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c Tue Nov 27 22:58:05 2007 -0800 @@ -386,7 +386,7 @@ kmem_free(vdb, sizeof (vdev_disk_buf_t)); - zio_next_stage_async(zio); + zio_interrupt(zio); } static void @@ -396,10 +396,10 @@ zio->io_error = error; - zio_next_stage_async(zio); + zio_interrupt(zio); } -static void +static int vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -414,8 +414,7 @@ /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { @@ -444,8 +443,10 @@ * and will call vdev_disk_ioctl_done() * upon completion. */ - return; - } else if (error == ENOTSUP || error == ENOTTY) { + return (ZIO_PIPELINE_STOP); + } + + if (error == ENOTSUP || error == ENOTTY) { /* * If we get ENOTSUP or ENOTTY, we know that * no future attempts will ever succeed. @@ -463,15 +464,26 @@ zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; + return (ZIO_PIPELINE_STOP); if ((zio = vdev_queue_io(zio)) == NULL) - return; + return (ZIO_PIPELINE_STOP); + + if (zio->io_type == ZIO_TYPE_WRITE) + error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + else + error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; + + if (error) { + zio->io_error = error; + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); + } flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); flags |= B_BUSY | B_NOCACHE; @@ -491,26 +503,14 @@ bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; - /* XXPOLICY */ - if (zio->io_type == ZIO_TYPE_WRITE) - error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; - else - error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; - error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; - if (error) { - zio->io_error = error; - bioerror(bp, error); - bp->b_resid = bp->b_bcount; - bp->b_iodone(bp); - return; - } - error = ldi_strategy(dvd->vd_lh, bp); /* ldi_strategy() will return non-zero only on programming errors */ ASSERT(error == 0); + + return (ZIO_PIPELINE_STOP); } -static void +static int vdev_disk_io_done(zio_t *zio) { vdev_queue_io_done(zio); @@ -544,7 +544,7 @@ } } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } vdev_ops_t vdev_disk_ops = {
--- a/usr/src/uts/common/fs/zfs/vdev_file.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_file.c Tue Nov 27 22:58:05 2007 -0800 @@ -215,7 +215,7 @@ return (error); } -static void +static int vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -229,8 +229,7 @@ /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = ENXIO; - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } switch (zio->io_cmd) { @@ -244,8 +243,7 @@ zio->io_error = ENOTSUP; } - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } /* @@ -254,11 +252,11 @@ */ #ifndef _KERNEL if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return; + return (ZIO_PIPELINE_STOP); #endif if ((zio = vdev_queue_io(zio)) == NULL) - return; + return (ZIO_PIPELINE_STOP); /* XXPOLICY */ if (zio->io_type == ZIO_TYPE_WRITE) @@ -268,8 +266,8 @@ error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error; if (error) { zio->io_error = error; - zio_next_stage_async(zio); - return; + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? @@ -280,26 +278,25 @@ if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; - zio_next_stage_async(zio); + zio_interrupt(zio); + + return (ZIO_PIPELINE_STOP); } -static void +static int vdev_file_io_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + zio->io_error = zio_handle_device_injection(vd, EIO); /* * If an error has been encountered then attempt to probe the device * to determine if it's still accessible. */ - if (zio->io_error == EIO) { - vdev_t *vd = zio->io_vd; - - if (vdev_probe(vd) != 0) - vd->vdev_is_failing = B_TRUE; - } + if (zio->io_error == EIO && vdev_probe(vd) != 0) + vd->vdev_is_failing = B_TRUE; vdev_queue_io_done(zio); @@ -308,7 +305,7 @@ vdev_cache_write(zio); #endif - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } vdev_ops_t vdev_file_ops = {
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c Tue Nov 27 22:58:05 2007 -0800 @@ -253,7 +253,7 @@ return (-1); } -static void +static int vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; @@ -279,8 +279,7 @@ ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done, mc)); } - zio_wait_children_done(zio); - return; + return (zio_wait_for_children_done(zio)); } /* * For normal reads just pick one child. @@ -316,10 +315,10 @@ c++; } - zio_wait_children_done(zio); + return (zio_wait_for_children_done(zio)); } -static void +static int vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; @@ -362,8 +361,7 @@ if (good_copies != 0) zio->io_error = 0; vdev_mirror_map_free(zio); - zio_next_stage(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -383,8 +381,7 @@ mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc)); - zio_wait_children_done(zio); - return; + return (zio_wait_for_children_done(zio)); } /* XXPOLICY */ @@ -441,12 +438,13 @@ } zio_nowait(rio); - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } vdev_mirror_map_free(zio); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } static void
--- a/usr/src/uts/common/fs/zfs/vdev_missing.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_missing.c Tue Nov 27 22:58:05 2007 -0800 @@ -62,18 +62,18 @@ } /* ARGSUSED */ -static void +static int vdev_missing_io_start(zio_t *zio) { zio->io_error = ENOTSUP; - zio_next_stage_async(zio); + return (ZIO_PIPELINE_CONTINUE); } /* ARGSUSED */ -static void +static int vdev_missing_io_done(zio_t *zio) { - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* ARGSUSED */
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c Tue Nov 27 22:58:05 2007 -0800 @@ -162,7 +162,7 @@ aio->io_delegate_list = dio->io_delegate_next; dio->io_delegate_next = NULL; dio->io_error = aio->io_error; - zio_next_stage(dio); + zio_execute(dio); } ASSERT3U(offset, ==, aio->io_size); @@ -172,11 +172,8 @@ #define IS_ADJACENT(io, nio) \ ((io)->io_offset + (io)->io_size == (nio)->io_offset) -typedef void zio_issue_func_t(zio_t *); - static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, - zio_issue_func_t **funcp) +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { zio_t *fio, *lio, *aio, *dio; avl_tree_t *tree; @@ -184,8 +181,6 @@ ASSERT(MUTEX_HELD(&vq->vq_lock)); - *funcp = NULL; - if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || avl_numnodes(&vq->vq_deadline_tree) == 0) return (NULL); @@ -245,7 +240,6 @@ avl_add(&vq->vq_pending_tree, aio); - *funcp = zio_nowait; return (aio); } @@ -254,8 +248,6 @@ avl_add(&vq->vq_pending_tree, fio); - *funcp = zio_next_stage; - return (fio); } @@ -264,7 +256,6 @@ { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - zio_issue_func_t *func; ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); @@ -285,15 +276,19 @@ vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); + nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); mutex_exit(&vq->vq_lock); - if (nio == NULL || func != zio_nowait) - return (nio); + if (nio == NULL) + return (NULL); - func(nio); - return (NULL); + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + return (NULL); + } + + return (nio); } void @@ -301,7 +296,6 @@ { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - zio_issue_func_t *func; int i; mutex_enter(&vq->vq_lock); @@ -309,13 +303,16 @@ avl_remove(&vq->vq_pending_tree, zio); for (i = 0; i < zfs_vdev_ramp_rate; i++) { - nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); + nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) break; mutex_exit(&vq->vq_lock); - if (func == zio_next_stage) + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + } else { zio_vdev_io_reissue(nio); - func(nio); + zio_execute(nio); + } mutex_enter(&vq->vq_lock); }
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c Tue Nov 27 22:58:05 2007 -0800 @@ -639,7 +639,7 @@ vdev_raidz_map_free(zio->io_private); } -static void +static int vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -672,8 +672,8 @@ zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, vdev_raidz_child_done, rc)); } - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -714,7 +714,7 @@ } } - zio_wait_children_done(zio); + return (zio_wait_for_children_done(zio)); } /* @@ -783,7 +783,7 @@ static uint64_t raidz_corrected_q; static uint64_t raidz_corrected_pq; -static void +static int vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -840,8 +840,8 @@ zio->io_error = 0; vdev_raidz_map_free(zio); - zio_next_stage(zio); - return; + + return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -1022,8 +1022,8 @@ vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); dprintf("rereading\n"); - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } /* @@ -1205,12 +1205,13 @@ } zio_nowait(rio); - zio_wait_children_done(zio); - return; + + return (zio_wait_for_children_done(zio)); } vdev_raidz_map_free(zio); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } static void
--- a/usr/src/uts/common/fs/zfs/zio.c Tue Nov 27 17:41:22 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/zio.c Tue Nov 27 22:58:05 2007 -0800 @@ -61,9 +61,6 @@ char *zio_type_name[ZIO_TYPES] = { "null", "read", "write", "free", "claim", "ioctl" }; -/* At or above this size, force gang blocking - for testing */ -uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1; - /* Force an allocation failure when non-zero */ uint16_t zio_zil_fail_shift = 0; uint16_t zio_io_fail_shift = 0; @@ -170,8 +167,6 @@ align, NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NODEBUG); - dprintf("creating cache for size %5lx align %5lx\n", - size, align); } } @@ -356,9 +351,6 @@ zio->io_bp = bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; - if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata || - BP_GET_LEVEL(bp) != 0) - zio->io_flags |= ZIO_FLAG_METADATA; } zio->io_done = done; zio->io_private = private; @@ -366,10 +358,7 @@ zio->io_priority = priority; zio->io_stage = stage; zio->io_pipeline = pipeline; - zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES; zio->io_timestamp = lbolt64; - if (pio != NULL) - zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); zio_push_transform(zio, data, size, size); @@ -395,7 +384,7 @@ if (pio == NULL) { if (type != ZIO_TYPE_NULL && !(flags & ZIO_FLAG_CONFIG_HELD)) { - spa_config_enter(zio->io_spa, RW_READER, zio); + spa_config_enter(spa, RW_READER, zio); zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; } zio->io_root = zio; @@ -409,7 +398,7 @@ !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; - spa_config_enter(zio->io_spa, RW_READER, pio); + spa_config_enter(spa, RW_READER, pio); } if (stage < ZIO_STAGE_READY) pio->io_children_notready++; @@ -524,9 +513,6 @@ zio->io_compress = compress; zio->io_ndvas = ncopies; - if (compress != ZIO_COMPRESS_OFF) - zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS; - if (bp->blk_birth != txg) { /* XXX the bp usually (always?) gets re-zeroed later */ BP_ZERO(bp); @@ -551,7 +537,7 @@ zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); zio->io_bookmark = *zb; zio->io_checksum = checksum; @@ -612,7 +598,7 @@ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, - ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); + ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); zio->io_bp = &zio->io_bp_copy; @@ -641,7 +627,7 @@ zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, - ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp)); zio->io_bp = &zio->io_bp_copy; @@ -820,7 +806,7 @@ zio->io_waiter = curthread; - zio_next_stage_async(zio); + zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_stalled != ZIO_STAGE_DONE) @@ -838,7 +824,23 @@ void zio_nowait(zio_t *zio) { - zio_next_stage_async(zio); + zio_execute(zio); +} + +void +zio_interrupt(zio_t *zio) +{ + (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type], + (task_func_t *)zio_execute, zio, TQ_SLEEP); +} + +static int +zio_issue_async(zio_t *zio) +{ + (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type], + (task_func_t *)zio_execute, zio, TQ_SLEEP); + + return (ZIO_PIPELINE_STOP); } /* @@ -846,18 +848,20 @@ * I/O pipeline interlocks: parent/child dependency scoreboarding * ========================================================================== */ -static void +static int zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) { + int rv = ZIO_PIPELINE_CONTINUE; + mutex_enter(&zio->io_lock); - if (*countp == 0) { - ASSERT(zio->io_stalled == 0); - mutex_exit(&zio->io_lock); - zio_next_stage(zio); - } else { + ASSERT(zio->io_stalled == 0); + if (*countp != 0) { zio->io_stalled = stage; - mutex_exit(&zio->io_lock); + rv = ZIO_PIPELINE_STOP; } + mutex_exit(&zio->io_lock); + + return (rv); } static void @@ -872,48 +876,54 @@ if (--*countp == 0 && pio->io_stalled == stage) { pio->io_stalled = 0; mutex_exit(&pio->io_lock); - zio_next_stage_async(pio); + zio_execute(pio); } else { mutex_exit(&pio->io_lock); } } -static void -zio_wait_children_ready(zio_t *zio) +int +zio_wait_for_children_ready(zio_t *zio) { - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY, - &zio->io_children_notready); + return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, + &zio->io_children_notready)); } -void -zio_wait_children_done(zio_t *zio) +int +zio_wait_for_children_done(zio_t *zio) { - zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, - &zio->io_children_notdone); + return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, + &zio->io_children_notdone)); } -static void +static int zio_read_init(zio_t *zio) { - if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) { - uint64_t csize = BP_GET_PSIZE(zio->io_bp); + blkptr_t *bp = zio->io_bp; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + uint64_t csize = BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(csize); zio_push_transform(zio, cbuf, csize, csize); zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; } - if (BP_IS_GANG(zio->io_bp)) { + if (BP_IS_GANG(bp)) { uint64_t gsize = SPA_GANGBLOCKSIZE; void *gbuf = zio_buf_alloc(gsize); zio_push_transform(zio, gbuf, gsize, gsize); zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; } - zio_next_stage(zio); + + if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_ready(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -922,16 +932,16 @@ zio->io_ready(zio); if (pio != NULL) - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, + zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, &pio->io_children_notready); if (zio->io_bp) zio->io_bp_copy = *zio->io_bp; - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_vdev_retry_io(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -967,7 +977,7 @@ if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) pio->io_flags |= ZIO_FLAG_WRITE_RETRY; - ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE); + ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE); mutex_exit(&pio->io_lock); } @@ -977,7 +987,8 @@ */ zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; zio->io_error = 0; - zio_next_stage_async(zio); + + return (ZIO_PIPELINE_CONTINUE); } int @@ -1029,7 +1040,7 @@ zio->io_stage = ZIO_STAGE_READY; } - (void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async, + (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute, zio, TQ_SLEEP); } mutex_exit(&spa->spa_zio_lock); @@ -1049,7 +1060,7 @@ return (0); } -static void +static int zio_vdev_suspend_io(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1069,9 +1080,11 @@ cv_broadcast(&spa->spa_zio_cv); #endif mutex_exit(&spa->spa_zio_lock); + + return (ZIO_PIPELINE_STOP); } -static void +static int zio_assess(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1138,10 +1151,9 @@ * property. */ if (zio_write_retry && zio->io_error != ENOSPC && - IO_IS_ALLOCATING(zio)) { - zio_vdev_retry_io(zio); - return; - } + IO_IS_ALLOCATING(zio)) + return (zio_vdev_retry_io(zio)); + ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); /* @@ -1175,22 +1187,20 @@ "uncorrectable I/O failure and the " "failure mode property for this pool " "is set to panic.", spa_name(spa)); - } else { - cmn_err(CE_WARN, "Pool '%s' has encountered " - "an uncorrectable I/O error. Manual " - "intervention is required.", - spa_name(spa)); - zio_vdev_suspend_io(zio); } - return; + cmn_err(CE_WARN, "Pool '%s' has encountered " + "an uncorrectable I/O error. " + "Manual intervention is required.", spa_name(spa)); + return (zio_vdev_suspend_io(zio)); } } ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); ASSERT(zio->io_children_notready == 0); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_done(zio_t *zio) { zio_t *pio = zio->io_parent; @@ -1221,7 +1231,7 @@ pio->io_child = next; mutex_exit(&pio->io_lock); - zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE, + zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, &pio->io_children_notdone); } @@ -1243,6 +1253,8 @@ cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } + + return (ZIO_PIPELINE_STOP); } /* @@ -1250,7 +1262,7 @@ * Compression support * ========================================================================== */ -static void +static int zio_write_compress(zio_t *zio) { int compress = zio->io_compress; @@ -1300,7 +1312,7 @@ ASSERT(csize != 0); BP_SET_LSIZE(bp, lsize); BP_SET_COMPRESS(bp, compress); - zio->io_pipeline = ZIO_REWRITE_PIPELINE; + zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp); } else { if (bp->blk_birth == zio->io_txg) BP_ZERO(bp); @@ -1316,10 +1328,10 @@ } } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_read_decompress(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1338,7 +1350,7 @@ zio_buf_free(data, bufsize); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1347,19 +1359,6 @@ * ========================================================================== */ static void -zio_gang_pipeline(zio_t *zio) -{ - /* - * By default, the pipeline assumes that we're dealing with a gang - * block. If we're not, strip out any gang-specific stages. - */ - if (!BP_IS_GANG(zio->io_bp)) - zio->io_pipeline &= ~ZIO_GANG_STAGES; - - zio_next_stage(zio); -} - -static void zio_gang_byteswap(zio_t *zio) { ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); @@ -1368,7 +1367,7 @@ byteswap_uint64_array(zio->io_data, zio->io_size); } -static void +static int zio_get_gang_header(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1384,10 +1383,10 @@ zio->io_flags & ZIO_FLAG_GANG_INHERIT, ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); - zio_wait_children_done(zio); + return (zio_wait_for_children_done(zio)); } -static void +static int zio_read_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1410,16 +1409,17 @@ ASSERT(!BP_IS_HOLE(gbp)); zio_nowait(zio_read(zio, zio->io_spa, gbp, - (char *)zio->io_data + loff, lsize, NULL, NULL, - zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, - &zio->io_bookmark)); + (char *)zio->io_data + loff, lsize, + NULL, NULL, zio->io_priority, + zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); } zio_buf_free(gbh, gbufsize); - zio_wait_children_done(zio); + + return (zio_wait_for_children_done(zio)); } -static void +static int zio_rewrite_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1446,15 +1446,16 @@ zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, - NULL, NULL, zio->io_priority, zio->io_flags, - &zio->io_bookmark)); + NULL, NULL, zio->io_priority, + zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); } zio_push_transform(zio, gbh, gsize, gbufsize); - zio_wait_children_ready(zio); + + return (zio_wait_for_children_ready(zio)); } -static void +static int zio_free_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1476,10 +1477,11 @@ } zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_claim_gang_members(zio_t *zio) { zio_gbh_phys_t *gbh; @@ -1500,7 +1502,8 @@ } zio_buf_free(gbh, gbufsize); - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } static void @@ -1549,8 +1552,10 @@ error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE); - if (error) - return (error); + if (error) { + zio->io_error = error; + return (ZIO_PIPELINE_CONTINUE); + } for (d = 0; d < gbh_ndvas; d++) DVA_SET_GANG(&dva[d], 1); @@ -1560,10 +1565,6 @@ gbh = zio_buf_alloc(gsize); bzero(gbh, gsize); - /* We need to test multi-level gang blocks */ - if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0) - maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE); - for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, resid -= lsize, gbps_left--, i++) { blkptr_t *gbp = &gbh->zg_blkptr[i]; @@ -1579,8 +1580,10 @@ break; ASSERT3U(error, ==, ENOSPC); /* XXX - free up previous allocations? */ - if (maxalloc == SPA_MINBLOCKSIZE) - return (error); + if (maxalloc == SPA_MINBLOCKSIZE) { + zio->io_error = error; + return (ZIO_PIPELINE_CONTINUE); + } maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); } @@ -1614,14 +1617,14 @@ zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; zio_push_transform(zio, gbh, gsize, gsize); + /* - * As much as we'd like this to be zio_wait_children_ready(), + * As much as we'd like this to be 'ready' instead of 'done', * updating our ASIZE doesn't happen until the io_done callback, * so we have to wait for that to finish in order for our BP * to be stable. */ - zio_wait_children_done(zio); - return (0); + return (zio_wait_for_children_done(zio)); } /* @@ -1629,7 +1632,7 @@ * Allocate and free blocks * ========================================================================== */ -static void +static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1642,14 +1645,6 @@ ASSERT3U(zio->io_ndvas, >, 0); ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); - /* For testing, make some blocks above a certain size be gang blocks */ - if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) { - error = zio_write_allocate_gang_members(zio, mc); - if (error) - zio->io_error = error; - return; - } - /* * For testing purposes, we force I/Os to retry. We don't allow * retries beyond the first pass since those I/Os are non-allocating @@ -1668,17 +1663,15 @@ if (error == 0) { bp->blk_birth = zio->io_txg; } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { - error = zio_write_allocate_gang_members(zio, mc); - if (error == 0) - return; - zio->io_error = error; + return (zio_write_allocate_gang_members(zio, mc)); } else { zio->io_error = error; } - zio_next_stage(zio); + + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_dva_free(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1687,15 +1680,15 @@ BP_ZERO(bp); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_dva_claim(zio_t *zio) { zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1704,7 +1697,7 @@ * ========================================================================== */ -static void +static int zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1719,24 +1712,21 @@ * at that time. */ if (spa_state(spa) == POOL_STATE_IO_FAILURE && - zio->io_type == ZIO_TYPE_WRITE) { - zio_vdev_suspend_io(zio); - return; - } + zio->io_type == ZIO_TYPE_WRITE) + return (zio_vdev_suspend_io(zio)); - if (vd == NULL) { - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_start(zio); - return; - } + /* + * The mirror_ops handle multiple DVAs in a single BP + */ + if (vd == NULL) + return (vdev_mirror_ops.vdev_op_io_start(zio)); align = 1ULL << tvd->vdev_ashift; if (zio->io_retries == 0 && vd == tvd) zio->io_flags |= ZIO_FLAG_FAILFAST; - if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && - vd->vdev_children == 0) { + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { zio->io_flags |= ZIO_FLAG_PHYSICAL; zio->io_offset += VDEV_LABEL_START_SIZE; } @@ -1760,19 +1750,16 @@ P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); - vdev_io_start(zio); - - /* zio_next_stage_async() gets called from io completion interrupt */ + return (vd->vdev_ops->vdev_op_io_start(zio)); } -static void +static int zio_vdev_io_done(zio_t *zio) { if (zio->io_vd == NULL) - /* The mirror_ops handle multiple DVAs in a single BP */ - vdev_mirror_ops.vdev_op_io_done(zio); - else - vdev_io_done(zio); + return (vdev_mirror_ops.vdev_op_io_done(zio)); + + return (zio->io_vd->vdev_ops->vdev_op_io_done(zio)); } /* XXPOLICY */ @@ -1795,7 +1782,7 @@ return (B_TRUE); } -static void +static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1833,15 +1820,10 @@ zio->io_flags |= ZIO_FLAG_DONT_CACHE; zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; - dprintf("retry #%d for %s to %s offset %llx\n", - zio->io_retries, zio_type_name[zio->io_type], - vdev_description(vd), zio->io_offset); - - zio_next_stage_async(zio); - return; + return (ZIO_PIPELINE_CONTINUE); } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } void @@ -1876,7 +1858,7 @@ * Generate and verify checksums * ========================================================================== */ -static void +static int zio_checksum_generate(zio_t *zio) { int checksum = zio->io_checksum; @@ -1889,10 +1871,10 @@ zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_gang_checksum_generate(zio_t *zio) { zio_cksum_t zc; @@ -1905,10 +1887,10 @@ zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } -static void +static int zio_checksum_verify(zio_t *zio) { if (zio->io_bp != NULL) { @@ -1918,7 +1900,7 @@ zio->io_spa, zio->io_vd, zio, 0, 0); } - zio_next_stage(zio); + return (ZIO_PIPELINE_CONTINUE); } /* @@ -1949,20 +1931,15 @@ * Define the pipeline * ========================================================================== */ -typedef void zio_pipe_stage_t(zio_t *zio); - -static void -zio_badop(zio_t *zio) -{ - panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio); -} +typedef int zio_pipe_stage_t(zio_t *zio); zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { - zio_badop, - zio_wait_children_ready, + NULL, + zio_wait_for_children_ready, + zio_read_init, + zio_issue_async, zio_write_compress, zio_checksum_generate, - zio_gang_pipeline, zio_get_gang_header, zio_rewrite_gang_members, zio_free_gang_members, @@ -1972,116 +1949,63 @@ zio_dva_claim, zio_gang_checksum_generate, zio_ready, - zio_read_init, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, - zio_wait_children_done, + zio_wait_for_children_done, zio_checksum_verify, zio_read_gang_members, zio_read_decompress, zio_assess, zio_done, - zio_badop + NULL }; /* - * Move an I/O to the next stage of the pipeline and execute that stage. - * There's no locking on io_stage because there's no legitimate way for - * multiple threads to be attempting to process the same I/O. + * Execute the I/O pipeline until one of the following occurs: + * (1) the I/O completes; (2) the pipeline stalls waiting for + * dependent child I/Os; (3) the I/O issues, so we're waiting + * for an I/O completion interrupt; (4) the I/O is delegated by + * vdev-level caching or aggregation; (5) the I/O is deferred + * due to vdev-level queueing; (6) the I/O is handed off to + * another thread. In all cases, the pipeline stops whenever + * there's no CPU work; it never burns a thread in cv_wait(). + * + * There's no locking on io_stage because there's no legitimate way + * for multiple threads to be attempting to process the same I/O. */ void -zio_next_stage(zio_t *zio) +zio_execute(zio_t *zio) { - uint32_t pipeline = zio->io_pipeline; - - ASSERT(!MUTEX_HELD(&zio->io_lock)); - - if (zio->io_error) { - dprintf("zio %p vdev %s offset %llx stage %d error %d\n", - zio, vdev_description(zio->io_vd), - zio->io_offset, zio->io_stage, zio->io_error); - if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) - pipeline &= ZIO_ERROR_PIPELINE_MASK; - } - - while (((1U << ++zio->io_stage) & pipeline) == 0) - continue; - - ASSERT(zio->io_stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stalled == 0); + while (zio->io_stage < ZIO_STAGE_DONE) { + uint32_t pipeline = zio->io_pipeline; + int rv; - /* - * See the comment in zio_next_stage_async() about per-CPU taskqs. - */ - if (((1U << zio->io_stage) & zio->io_async_stages) && - (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) && - !(zio->io_flags & ZIO_FLAG_METADATA)) { - taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); - } -} - -void -zio_next_stage_async(zio_t *zio) -{ - taskq_t *tq; - uint32_t pipeline = zio->io_pipeline; - - ASSERT(!MUTEX_HELD(&zio->io_lock)); + ASSERT(!MUTEX_HELD(&zio->io_lock)); - if (zio->io_error) { - dprintf("zio %p vdev %s offset %llx stage %d error %d\n", - zio, vdev_description(zio->io_vd), - zio->io_offset, zio->io_stage, zio->io_error); - if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0) + /* + * If an error occurred outside the vdev stack, + * just execute the interlock stages to clean up. + */ + if (zio->io_error && + ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0) pipeline &= ZIO_ERROR_PIPELINE_MASK; - } - while (((1U << ++zio->io_stage) & pipeline) == 0) - continue; - - ASSERT(zio->io_stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stalled == 0); + while (((1U << ++zio->io_stage) & pipeline) == 0) + continue; - /* - * For performance, we'll probably want two sets of task queues: - * per-CPU issue taskqs and per-CPU completion taskqs. The per-CPU - * part is for read performance: since we have to make a pass over - * the data to checksum it anyway, we want to do this on the same CPU - * that issued the read, because (assuming CPU scheduling affinity) - * that thread is probably still there. Getting this optimization - * right avoids performance-hostile cache-to-cache transfers. - * - * Note that having two sets of task queues is also necessary for - * correctness: if all of the issue threads get bogged down waiting - * for dependent reads (e.g. metaslab freelist) to complete, then - * there won't be any threads available to service I/O completion - * interrupts. - */ - if ((1U << zio->io_stage) & zio->io_async_stages) { - if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE) - tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type]; - else - tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type]; - (void) taskq_dispatch(tq, - (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP); - } else { - zio_pipeline[zio->io_stage](zio); + ASSERT(zio->io_stage <= ZIO_STAGE_DONE); + ASSERT(zio->io_stalled == 0); + + rv = zio_pipeline[zio->io_stage](zio); + + if (rv == ZIO_PIPELINE_STOP) + return; + + ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } -void -zio_resubmit_stage_async(void *arg) -{ - zio_t *zio = (zio_t *)(uintptr_t)arg; - - zio_next_stage_async(zio); -} - static boolean_t zio_io_should_fail(uint16_t range) {