Mercurial > illumos > illumos-gate
changeset 3547:e396e0a440b1
6512391 DMU should leverage ZIO dependencies to achieve greater parallelism
line wrap: on
line diff
--- a/usr/src/cmd/zdb/zdb.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/cmd/zdb/zdb.c Fri Feb 02 15:36:58 2007 -0800 @@ -1017,21 +1017,21 @@ if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; - usedobjs = os->os->os_rootbp.blk_fill; + usedobjs = os->os->os_rootbp->blk_fill; refdbytes = os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } - ASSERT3U(usedobjs, ==, os->os->os_rootbp.blk_fill); + ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill); nicenum(refdbytes, numbuf); if (verbosity >= 4) { (void) strcpy(blkbuf, ", rootbp "); sprintf_blkptr(blkbuf + strlen(blkbuf), - BP_SPRINTF_LEN - strlen(blkbuf), &os->os->os_rootbp); + BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp); } else { blkbuf[0] = '\0'; }
--- a/usr/src/uts/common/fs/zfs/arc.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/arc.c Fri Feb 02 15:36:58 2007 -0800 @@ -315,14 +315,23 @@ typedef struct arc_callback arc_callback_t; struct arc_callback { + void *acb_private; arc_done_func_t *acb_done; - void *acb_private; arc_byteswap_func_t *acb_byteswap; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; @@ -2357,6 +2366,7 @@ atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); } hdr->b_datacnt -= 1; + arc_cksum_verify(buf); mutex_exit(hash_lock); @@ -2369,11 +2379,7 @@ nhdr->b_arc_access = 0; nhdr->b_flags = 0; nhdr->b_datacnt = 1; - if (hdr->b_freeze_cksum != NULL) { - nhdr->b_freeze_cksum = - kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - *nhdr->b_freeze_cksum = *hdr->b_freeze_cksum; - } + nhdr->b_freeze_cksum = NULL; buf->b_hdr = nhdr; buf->b_next = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); @@ -2390,10 +2396,10 @@ bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_cksum0 = 0; + arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; - arc_buf_thaw(buf); } int @@ -2417,17 +2423,26 @@ #endif static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + + if (callback->awcb_ready) { + ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + callback->awcb_ready(zio, buf, callback->awcb_private); + } + arc_cksum_compute(buf); +} + +static void arc_write_done(zio_t *zio) { - arc_buf_t *buf; - arc_buf_hdr_t *hdr; - arc_callback_t *acb; + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; - buf = zio->io_private; - hdr = buf->b_hdr; - acb = hdr->b_acb; hdr->b_acb = NULL; - ASSERT(acb != NULL); /* this buffer is on no lists and is not in the hash table */ ASSERT3P(hdr->b_state, ==, arc_anon); @@ -2469,7 +2484,7 @@ hdr->b_flags &= ~ARC_IO_IN_PROGRESS; arc_access(hdr, hash_lock); mutex_exit(hash_lock); - } else if (acb->acb_done == NULL) { + } else if (callback->awcb_done == NULL) { int destroy_hdr; /* * This is an anonymous buffer with no user callback, @@ -2485,23 +2500,23 @@ hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } - if (acb->acb_done) { + if (callback->awcb_done) { ASSERT(!refcount_is_zero(&hdr->b_refcnt)); - acb->acb_done(zio, buf, acb->acb_private); + callback->awcb_done(zio, buf, callback->awcb_private); } - kmem_free(acb, sizeof (arc_callback_t)); + kmem_free(callback, sizeof (arc_write_callback_t)); } -int +zio_t * arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t arc_flags, zbookmark_t *zb) + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; - arc_callback_t *acb; - zio_t *rzio; + arc_write_callback_t *callback; + zio_t *zio; /* this is a private buffer - no locking required */ ASSERT3P(hdr->b_state, ==, arc_anon); @@ -2509,23 +2524,17 @@ ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); ASSERT(hdr->b_acb == 0); - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - acb->acb_byteswap = (arc_byteswap_func_t *)-1; - hdr->b_acb = acb; + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; hdr->b_flags |= ARC_IO_IN_PROGRESS; - arc_cksum_compute(buf); - rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, - buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb); + zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, + buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, + priority, flags, zb); - if (arc_flags & ARC_WAIT) - return (zio_wait(rzio)); - - ASSERT(arc_flags & ARC_NOWAIT); - zio_nowait(rzio); - - return (0); + return (zio); } int
--- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,6 +39,9 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, + int compress, dmu_tx_t *tx); +static arc_done_func_t dbuf_write_ready; static arc_done_func_t dbuf_write_done; int zfs_mdcomp_disable = 0; @@ -46,7 +49,6 @@ /* * Global data structures and functions for the dbuf cache. */ -taskq_t *dbuf_tq; static kmem_cache_t *dbuf_cache; /* ARGSUSED */ @@ -210,31 +212,24 @@ { ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_d.db_evict_func == NULL) + if (db->db_level != 0 || db->db_evict_func == NULL) return; - if (db->db_d.db_user_data_ptr_ptr) - *db->db_d.db_user_data_ptr_ptr = db->db.db_data; - db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr); - db->db_d.db_user_ptr = NULL; - db->db_d.db_user_data_ptr_ptr = NULL; - db->db_d.db_evict_func = NULL; + if (db->db_user_data_ptr_ptr) + *db->db_user_data_ptr_ptr = db->db.db_data; + db->db_evict_func(&db->db, db->db_user_ptr); + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; } void dbuf_evict(dmu_buf_impl_t *db) { - int i; - ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL); + ASSERT(db->db_data_pending == NULL); -#ifdef ZFS_DEBUG - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&db->db_dirty_node[i])); - ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL); - } -#endif dbuf_clear(db); dbuf_destroy(db); } @@ -267,8 +262,6 @@ dbuf_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX, - TASKQ_PREPOPULATE); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); @@ -280,9 +273,6 @@ dbuf_hash_table_t *h = &dbuf_hash_table; int i; - taskq_destroy(dbuf_tq); - dbuf_tq = NULL; - for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); @@ -297,7 +287,6 @@ static void dbuf_verify(dmu_buf_impl_t *db) { - int i; dnode_t *dn = db->db_dnode; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -330,15 +319,13 @@ ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); } if (db->db.db_object == DMU_META_DNODE_OBJECT) { - for (i = 0; i < TXG_SIZE; i++) { - /* - * it should only be modified in syncing - * context, so make sure we only have - * one copy of the data. - */ - ASSERT(db->db_d.db_data_old[i] == NULL || - db->db_d.db_data_old[i] == db->db_buf); - } + dbuf_dirty_record_t *dr = db->db_data_pending; + /* + * it should only be modified in syncing + * context, so make sure we only have + * one copy of the data. + */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } } @@ -395,9 +382,9 @@ dbuf_update_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) { + if (db->db_level == 0 && db->db_user_data_ptr_ptr) { ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_d.db_user_data_ptr_ptr = db->db.db_data; + *db->db_user_data_ptr_ptr = db->db.db_data; } } @@ -444,12 +431,12 @@ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_d.db_freed_in_flight) { + if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); - db->db_d.db_freed_in_flight = FALSE; + db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else if (zio == NULL || zio->io_error == 0) { @@ -646,120 +633,69 @@ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { - arc_buf_t **quiescing, **syncing; - arc_buf_contents_t type; + dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); - ASSERT(db->db_blkid != DB_BONUS_BLKID); - - quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK]; - syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK]; + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); - /* - * If this buffer is referenced from the current quiescing - * transaction group: either make a copy and reset the reference - * to point to the copy, or (if there a no active holders) just - * null out the current db_data pointer. - */ - if (*quiescing == db->db_buf) { - /* - * If the quiescing txg is "dirty", then we better not - * be referencing the same buffer from the syncing txg. - */ - ASSERT(*syncing != db->db_buf); - if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = db->db.db_size; - type = DBUF_GET_BUFC_TYPE(db); - *quiescing = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db, type); - bcopy(db->db.db_data, (*quiescing)->b_data, size); - } else { - dbuf_set_data(db, NULL); - } + if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) return; - } /* - * If this buffer is referenced from the current syncing - * transaction group: either - * 1 - make a copy and reset the reference, or - * 2 - if there are no holders, just null the current db_data. + * If the last dirty record for this dbuf has not yet synced + * and its referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there a no active holders) + * just null out the current db_data pointer. */ - if (*syncing == db->db_buf) { - ASSERT3P(*quiescing, ==, NULL); - ASSERT3U(db->db_dirtycnt, ==, 1); - if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - int size = db->db.db_size; - type = DBUF_GET_BUFC_TYPE(db); - /* we can't copy if we have already started a write */ - ASSERT(*syncing != db->db_data_pending); - *syncing = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db, type); - bcopy(db->db.db_data, (*syncing)->b_data, size); - } else { - dbuf_set_data(db, NULL); - } - } -} - -/* - * This is the "bonus buffer" version of the above routine - */ -static void -dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg) -{ - arc_buf_t **quiescing, **syncing; - - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_blkid == DB_BONUS_BLKID); - - quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK]; - syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK]; - - if (*quiescing == db->db.db_data) { - ASSERT(*syncing != db->db.db_data); - *quiescing = zio_buf_alloc(DN_MAX_BONUSLEN); - bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN); - } else if (*syncing == db->db.db_data) { - ASSERT3P(*quiescing, ==, NULL); - ASSERT3U(db->db_dirtycnt, ==, 1); - *syncing = zio_buf_alloc(DN_MAX_BONUSLEN); - bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN); + ASSERT(dr->dr_txg >= txg - 2); + if (db->db_blkid == DB_BONUS_BLKID) { + /* Note that the data bufs here are zio_bufs */ + dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { + int size = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dr->dt.dl.dr_data = arc_buf_alloc( + db->db_dnode->dn_objset->os_spa, size, db, type); + bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + } else { + dbuf_set_data(db, NULL); } } void -dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg) +dbuf_unoverride(dbuf_dirty_record_t *dr) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dr->dr_txg; + ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC); + ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); + ASSERT(db->db_level == 0); + + if (db->db_blkid == DB_BONUS_BLKID || + dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) + return; - if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { - /* free this block */ - ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) || - db->db_dnode->dn_free_txg == txg); - if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) { - /* XXX can get silent EIO here */ - (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, - txg, db->db_d.db_overridden_by[txg&TXG_MASK], - NULL, NULL, ARC_WAIT); - } - kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK], - sizeof (blkptr_t)); - db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; - /* - * Release the already-written buffer, so we leave it in - * a consistent dirty state. Note that all callers are - * modifying the buffer, so they will immediately do - * another (redundant) arc_release(). Therefore, leave - * the buf thawed to save the effort of freezing & - * immediately re-thawing it. - */ - arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); + /* free this block */ + if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { + /* XXX can get silent EIO here */ + (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, + txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); } + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + /* + * Release the already-written buffer, so we leave it in + * a consistent dirty state. Note that all callers are + * modifying the buffer, so they will immediately do + * another (redundant) arc_release(). Therefore, leave + * the buf thawed to save the effort of freezing & + * immediately re-thawing it. + */ + arc_release(dr->dt.dl.dr_data, db); } void @@ -793,7 +729,7 @@ } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ - db->db_d.db_freed_in_flight = TRUE; + db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } @@ -802,26 +738,31 @@ dbuf_clear(db); continue; } - /* The dbuf is CACHED and referenced */ + /* The dbuf is referenced */ + + if (db->db_last_dirty != NULL) { + dbuf_dirty_record_t *dr = db->db_last_dirty; - if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) { - /* - * This dbuf is not currently dirty. Either - * uncache it (if its not referenced in the open - * context) or reset its contents to empty. - */ - dbuf_fix_old_data(db, txg); - } else { - if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) { + if (dr->dr_txg == txg) { /* - * This dbuf is overridden. Clear that state. + * This buffer is "in-use", re-adjust the file + * size to reflect that this buffer may + * contain new data when we sync. */ - dbuf_unoverride(db, txg); + if (db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + dbuf_unoverride(dr); + } else { + /* + * This dbuf is not dirty in the open context. + * Either uncache it (if its not referenced in + * the open context) or reset its contents to + * empty. + */ + dbuf_fix_old_data(db, txg); } - if (db->db_blkid > dn->dn_maxblkid) - dn->dn_maxblkid = db->db_blkid; } - /* fill in with appropriate data */ + /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); @@ -846,13 +787,13 @@ /* * We don't need any locking to protect db_blkptr: - * If it's syncing, then db_dirtied will be set so we'll - * ignore db_blkptr. + * If it's syncing, then db_last_dirty will be set + * so we'll ignore db_blkptr. */ - ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */ + ASSERT(MUTEX_HELD(&db->db_mtx)); /* If we have been dirtied since the last snapshot, its not new */ - if (db->db_dirtied) - birth_txg = db->db_dirtied; + if (db->db_last_dirty) + birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; @@ -901,18 +842,21 @@ VERIFY(arc_buf_remove_ref(obuf, db) == 1); db->db.db_size = size; - if (db->db_level == 0) - db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf; + if (db->db_level == 0) { + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + db->db_last_dirty->dt.dl.dr_data = buf; + } mutex_exit(&db->db_mtx); dnode_willuse_space(db->db_dnode, size-osize, tx); } -void +dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; + dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; int txgoff = tx->tx_txg & TXG_MASK; @@ -927,12 +871,11 @@ * XXX We may want to prohibit dirtying in syncing context even * if they did pre-dirty. */ - ASSERT(!(dmu_tx_is_syncing(tx) && - !BP_IS_HOLE(&dn->dn_objset->os_rootbp) && - dn->dn_object != DMU_META_DNODE_OBJECT && - dn->dn_objset->os_dsl_dataset != NULL && - !dsl_dir_is_private( - dn->dn_objset->os_dsl_dataset->ds_dir))); + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_objset->os_dsl_dataset == NULL || + dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); /* * We make this assert for private objects as well, but after we @@ -940,23 +883,17 @@ * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || - dn->dn_dirtyctx == + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); - /* XXX make this true for indirects too? */ - ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || - db->db_state == DB_FILL); - /* - * If this buffer is currently part of an "overridden" region, - * we now need to remove it from that region. + * XXX make this true for indirects too? The problem is that + * transactions created with dmu_tx_create_assigned() from + * syncing context don't bother holding ahead. */ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - db->db_d.db_overridden_by[txgoff] != NULL) { - dbuf_unoverride(db, tx->tx_txg); - } + ASSERT(db->db_level != 0 || + db->db_state == DB_CACHED || db->db_state == DB_FILL); mutex_enter(&dn->dn_mtx); /* @@ -964,7 +901,7 @@ * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED && - !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) { + !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); @@ -975,13 +912,23 @@ /* * If this buffer is already dirty, we're done. */ - if (list_link_active(&db->db_dirty_node[txgoff])) { - if (db->db_blkid != DB_BONUS_BLKID && db->db_level == 0 && - db->db.db_object != DMU_META_DNODE_OBJECT) - arc_buf_thaw(db->db_buf); - + drp = &db->db_last_dirty; + ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + while (*drp && (*drp)->dr_txg > tx->tx_txg) + drp = &(*drp)->dr_next; + if (*drp && (*drp)->dr_txg == tx->tx_txg) { + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(*drp); + if (db->db.db_object != DMU_META_DNODE_OBJECT) + arc_buf_thaw(db->db_buf); + } mutex_exit(&db->db_mtx); - return; + return (*drp); } /* @@ -1007,7 +954,7 @@ ASSERT(!dmu_tx_is_syncing(tx) || os->os_dsl_dataset == NULL || !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || - !BP_IS_HOLE(&os->os_rootbp)); + !BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -1017,44 +964,50 @@ * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ - if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT(db->db.db_data != NULL); - ASSERT(db->db_d.db_data_old[txgoff] == NULL); - dbuf_fix_old_bonus_data(db, tx->tx_txg); - db->db_d.db_data_old[txgoff] = db->db.db_data; - } else if (db->db_level == 0) { - /* - * Release the data buffer from the cache so that we - * can modify it without impacting possible other users - * of this cached data block. Note that indirect blocks - * and private objects are not released until the syncing - * state (since they are only modified then). - */ - ASSERT(db->db_buf != NULL); - ASSERT(db->db_d.db_data_old[txgoff] == NULL); - if (db->db.db_object != DMU_META_DNODE_OBJECT) { + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + if (db->db_level == 0) { + void *data_old = db->db_buf; + + if (db->db_blkid == DB_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so that we + * can modify it without impacting possible other users + * of this cached data block. Note that indirect + * blocks and private objects are not released until the + * syncing state (since they are only modified then). + */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); - ASSERT(db->db_buf != NULL); + data_old = db->db_buf; } - db->db_d.db_data_old[txgoff] = db->db_buf; + ASSERT(data_old != NULL); + dr->dt.dl.dr_data = data_old; + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); } + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + dr->dr_next = *drp; + *drp = dr; - mutex_enter(&dn->dn_mtx); /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + mutex_enter(&dn->dn_mtx); dnode_clear_range(dn, db->db_blkid, 1, tx); - db->db_d.db_freed_in_flight = FALSE; + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; } - db->db_dirtied = tx->tx_txg; - list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db); - mutex_exit(&dn->dn_mtx); - if (db->db_blkid != DB_BONUS_BLKID) { /* * Update the accounting. @@ -1084,8 +1037,12 @@ mutex_exit(&db->db_mtx); if (db->db_blkid == DB_BONUS_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); - return; + return (dr); } if (db->db_level == 0) { @@ -1099,30 +1056,61 @@ } if (db->db_level+1 < dn->dn_nlevels) { - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - dmu_buf_impl_t *parent; - parent = dbuf_hold_level(dn, db->db_level+1, - db->db_blkid >> epbs, FTAG); + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + parent = dbuf_hold_level(dn, db->db_level+1, + db->db_blkid >> epbs, FTAG); + parent_held = TRUE; + } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); - dbuf_dirty(parent, tx); - dbuf_rele(parent, FTAG); + ASSERT3U(db->db_level+1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + mutex_enter(&db->db_mtx); + /* possible race with dbuf_undirty() */ + if (db->db_last_dirty == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); } else { + ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || + db->db_parent == db->db_dnode->dn_dbuf); + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); + return (dr); } static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; - int txgoff = tx->tx_txg & TXG_MASK; - int64_t holds; + uint64_t txg = tx->tx_txg; + dbuf_dirty_record_t *dr; - ASSERT(tx->tx_txg != 0); + ASSERT(txg != 0); ASSERT(db->db_blkid != DB_BONUS_BLKID); mutex_enter(&db->db_mtx); @@ -1130,10 +1118,14 @@ /* * If this buffer is not dirty, we're done. */ - if (!list_link_active(&db->db_dirty_node[txgoff])) { + for (dr = db->db_last_dirty; dr; dr = dr->dr_next) + if (dr->dr_txg <= txg) + break; + if (dr == NULL || dr->dr_txg < txg) { mutex_exit(&db->db_mtx); return (0); } + ASSERT(dr->dr_txg == txg); /* * If this buffer is currently held, we cannot undirty @@ -1152,31 +1144,41 @@ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - dbuf_unoverride(db, tx->tx_txg); + ASSERT(db->db.db_size != 0); + + /* XXX would be nice to fix up dn_towrite_space[] */ + + db->db_last_dirty = dr->dr_next; - ASSERT(db->db.db_size != 0); - if (db->db_level == 0) { - ASSERT(db->db_buf != NULL); - ASSERT(db->db_d.db_data_old[txgoff] != NULL); - if (db->db_d.db_data_old[txgoff] != db->db_buf) - VERIFY(arc_buf_remove_ref( - db->db_d.db_data_old[txgoff], db) == 1); - db->db_d.db_data_old[txgoff] = NULL; + if (dr->dr_parent) { + mutex_enter(&dr->dr_parent->dt.di.dr_mtx); + list_remove(&dr->dr_parent->dt.di.dr_children, dr); + mutex_exit(&dr->dr_parent->dt.di.dr_mtx); + } else if (db->db_level+1 == dn->dn_nlevels) { + ASSERT3P(db->db_parent, ==, dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); + mutex_exit(&dn->dn_mtx); } - /* XXX would be nice to fix up dn_towrite_space[] */ - /* XXX undo db_dirtied? but how? */ - /* db->db_dirtied = tx->tx_txg; */ + if (db->db_level == 0) { + dbuf_unoverride(dr); - mutex_enter(&dn->dn_mtx); - list_remove(&dn->dn_dirty_dbufs[txgoff], db); - mutex_exit(&dn->dn_mtx); + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + } else { + ASSERT(db->db_buf != NULL); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + /* XXX - mutex and list destroy? */ + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - if ((holds = refcount_remove(&db->db_holds, - (void *)(uintptr_t)tx->tx_txg)) == 0) { + if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; ASSERT(arc_released(buf)); @@ -1185,7 +1187,6 @@ dbuf_evict(db); return (1); } - ASSERT(holds > 0); mutex_exit(&db->db_mtx); return (0); @@ -1203,7 +1204,7 @@ if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; (void) dbuf_read(db, NULL, rf); - dbuf_dirty(db, tx); + (void) dbuf_dirty(db, tx); } void @@ -1220,7 +1221,7 @@ dmu_tx_private_ok(tx)); dbuf_noread(db); - dbuf_dirty(db, tx); + (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done @@ -1232,12 +1233,12 @@ DBUF_VERIFY(db); if (db->db_state == DB_FILL) { - if (db->db_level == 0 && db->db_d.db_freed_in_flight) { + if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DB_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); - db->db_d.db_freed_in_flight = FALSE; + db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); @@ -1374,13 +1375,17 @@ db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; - db->db_dirtied = 0; + db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode = dn; db->db_parent = parent; db->db_blkptr = blkptr; - bzero(&db->db_d, sizeof (db->db_d)); + db->db_user_ptr = NULL; + db->db_user_data_ptr_ptr = NULL; + db->db_evict_func = NULL; + db->db_immediate_evict = 0; + db->db_freed_in_flight = 0; if (blkid == DB_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -1586,22 +1591,24 @@ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* - * If this buffer is currently syncing out, and we are - * are still referencing it from db_data, we need to make - * a copy of it in case we decide we want to dirty it - * again in this txg. + * If this buffer is currently syncing out, and we are are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg. */ - if (db->db_level == 0 && db->db_state == DB_CACHED && + if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && - db->db_data_pending == db->db_buf) { - int size = (db->db_blkid == DB_BONUS_BLKID) ? - DN_MAX_BONUSLEN : db->db.db_size; - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + + if (dr->dt.dl.dr_data == db->db_buf) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - size, db, type)); - bcopy(db->db_data_pending->b_data, db->db.db_data, - db->db.db_size); + dbuf_set_data(db, + arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + db->db.db_size, db, type)); + bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, + db->db.db_size); + } } (void) refcount_add(&db->db_holds, tag); @@ -1669,11 +1676,15 @@ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); - if (db->db_buf && holds == db->db_dirtycnt) + /* + * We can't freeze indirects if there is a possibility that they + * may be modified in the current syncing context. + */ + if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && - db->db_level == 0 && db->db_d.db_immediate_evict) + db->db_level == 0 && db->db_immediate_evict) dbuf_evict_user(db); if (holds == 0) { @@ -1725,7 +1736,7 @@ { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - db->db_d.db_immediate_evict = TRUE; + db->db_immediate_evict = TRUE; return (dmu_buf_update_user(db_fake, NULL, user_ptr, user_data_ptr_ptr, evict_func)); } @@ -1741,14 +1752,14 @@ mutex_enter(&db->db_mtx); - if (db->db_d.db_user_ptr == old_user_ptr) { - db->db_d.db_user_ptr = user_ptr; - db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr; - db->db_d.db_evict_func = evict_func; + if (db->db_user_ptr == old_user_ptr) { + db->db_user_ptr = user_ptr; + db->db_user_data_ptr_ptr = user_data_ptr_ptr; + db->db_evict_func = evict_func; dbuf_update_data(db); } else { - old_user_ptr = db->db_d.db_user_ptr; + old_user_ptr = db->db_user_ptr; } mutex_exit(&db->db_mtx); @@ -1761,21 +1772,106 @@ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(!refcount_is_zero(&db->db_holds)); - return (db->db_d.db_user_ptr); + return (db->db_user_ptr); +} + +static void +dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) +{ + /* ASSERT(dmu_tx_is_syncing(tx) */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_blkptr != NULL) + return; + + if (db->db_level == dn->dn_phys->dn_nlevels-1) { + /* + * This buffer was allocated at a time when there was + * no available blkptrs from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mis-match). + */ + ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + DBUF_VERIFY(db); + } else { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_hold_impl(dn, db->db_level+1, + db->db_blkid >> epbs, FALSE, db, &parent); + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + db->db_parent = parent; + } + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + DBUF_VERIFY(db); + } } -void -dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx) +static void +dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { - arc_buf_t **data; - uint64_t txg = tx->tx_txg; + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + zio_t *zio; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + + ASSERT(db->db_level > 0); + DBUF_VERIFY(db); + + if (db->db_buf == NULL) { + mutex_exit(&db->db_mtx); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + mutex_enter(&db->db_mtx); + } + ASSERT3U(db->db_state, ==, DB_CACHED); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + ASSERT(db->db_buf != NULL); + + dbuf_check_blkptr(dn, db); + + db->db_data_pending = dr; + mutex_exit(&db->db_mtx); + + arc_release(db->db_buf, db); + + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, + zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); + + zio = dr->dr_zio; + mutex_enter(&dr->dt.di.dr_mtx); + dbuf_sync_list(&dr->dt.di.dr_children, tx); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + mutex_exit(&dr->dt.di.dr_mtx); + zio_nowait(zio); +} + +static void +dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + arc_buf_t **datap = &dr->dt.dl.dr_data; + dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; - int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t txg = tx->tx_txg; int checksum, compress; - zbookmark_t zb; int blksz; - arc_buf_contents_t type; ASSERT(dmu_tx_is_syncing(tx)); @@ -1791,25 +1887,20 @@ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]); + ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } DBUF_VERIFY(db); /* - * Don't need a lock on db_dirty (dn_mtx), because it can't - * be modified yet. + * If this is a bonus buffer, simply copy the bonus data into the + * dnode. It will be written out when the dnode is synced (and it + * will be synced, since it must have been dirty for dbuf_sync to + * be called). */ - if (db->db_blkid == DB_BONUS_BLKID) { - arc_buf_t **datap = &db->db_d.db_data_old[txg&TXG_MASK]; - /* - * Simply copy the bonus data into the dnode. It will - * be written out when the dnode is synced (and it will - * be synced, since it must have been dirty for dbuf_sync - * to be called). - */ + dbuf_dirty_record_t **drp; /* * Use dn_phys->dn_bonuslen since db.db_size is the length * of the bonus buffer in the open transaction rather than @@ -1821,10 +1912,13 @@ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); if (*datap != db->db.db_data) zio_buf_free(*datap, DN_MAX_BONUSLEN); - db->db_d.db_data_old[txg&TXG_MASK] = NULL; db->db_data_pending = NULL; - if (db->db_dirtied == txg) - db->db_dirtied = 0; + drp = &db->db_last_dirty; + while (*drp != dr) + drp = &(*drp)->dr_next; + ASSERT((*drp)->dr_next == NULL); + *drp = NULL; + kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; mutex_exit(&db->db_mtx); @@ -1832,20 +1926,51 @@ return; } - if (db->db_level == 0) { - type = DBUF_GET_BUFC_TYPE(db); - data = &db->db_d.db_data_old[txg&TXG_MASK]; - blksz = arc_buf_size(*data); + /* + * If this buffer is in the middle of an immdiate write, + * wait for the synchronous IO to complete. + */ + while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + cv_wait(&db->db_changed, &db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); + } + + dbuf_check_blkptr(dn, db); + + /* + * If this dbuf has already been written out via an immediate write, + * just complete the write by copying over the new block pointer and + * updating the accounting via the write-completion functions. + */ + if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + zio_t zio_fake; - /* - * This buffer is in the middle of an immdiate write. - * Wait for the synchronous IO to complete. - */ - while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) { - ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); - cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]); - } + zio_fake.io_private = &db; + zio_fake.io_error = 0; + zio_fake.io_bp = db->db_blkptr; + zio_fake.io_bp_orig = *db->db_blkptr; + zio_fake.io_txg = txg; + + *db->db_blkptr = dr->dt.dl.dr_overridden_by; + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + db->db_data_pending = dr; + dr->dr_zio = &zio_fake; + mutex_exit(&db->db_mtx); + + if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) + dsl_dataset_block_kill(os->os_dsl_dataset, + &zio_fake.io_bp_orig, dn->dn_zio, tx); + + dbuf_write_ready(&zio_fake, db->db_buf, db); + dbuf_write_done(&zio_fake, db->db_buf, db); + + return; + } + + blksz = arc_buf_size(*datap); + + if (dn->dn_object != DMU_META_DNODE_OBJECT) { /* * If this buffer is currently "in use" (i.e., there are * active holds and db_data still references it), then make @@ -1853,326 +1978,154 @@ * from the open txg will not leak into this write. * * NOTE: this copy does not need to be made for objects only - * modified in the syncing context (e.g. DNONE_DNODE blocks) - * or if there is no actual write involved (bonus blocks). + * modified in the syncing context (e.g. DNONE_DNODE blocks). */ - if (dn->dn_object != DMU_META_DNODE_OBJECT && - db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) { - if (refcount_count(&db->db_holds) > 1 && - *data == db->db_buf) { - *data = arc_buf_alloc(os->os_spa, blksz, db, - type); - bcopy(db->db.db_data, (*data)->b_data, blksz); - } - db->db_data_pending = *data; - } else if (dn->dn_object == DMU_META_DNODE_OBJECT) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(db->db_buf, db); + if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) { + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); } } else { - data = &db->db_buf; - if (*data == NULL) { - /* - * This can happen if we dirty and then free - * the level-0 data blocks in the same txg. So - * this indirect remains unchanged. - */ - if (db->db_dirtied == txg) - db->db_dirtied = 0; - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); - return; - } - blksz = db->db.db_size; - ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift); + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(db->db_buf, db); + } + + ASSERT(*datap != NULL); + db->db_data_pending = dr; + + mutex_exit(&db->db_mtx); + + /* + * Allow dnode settings to override objset settings, + * except for metadata checksums. + */ + if (dmu_ot[dn->dn_type].ot_metadata) { + checksum = os->os_md_checksum; + compress = zio_compress_select(dn->dn_compress, + os->os_md_compress); + } else { + checksum = zio_checksum_select(dn->dn_checksum, + os->os_checksum); + compress = zio_compress_select(dn->dn_compress, + os->os_compress); } - ASSERT(*data != NULL); + dbuf_write(dr, *datap, checksum, compress, tx); - if (db->db_level > 0 && !arc_released(db->db_buf)) { - /* - * This indirect buffer was marked dirty, but - * never modified (if it had been modified, then - * we would have released the buffer). There is - * no reason to write anything. - */ - db->db_data_pending = NULL; - if (db->db_dirtied == txg) - db->db_dirtied = 0; - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); - return; - } else if (db->db_blkptr == NULL && - db->db_level == dn->dn_phys->dn_nlevels-1 && - db->db_blkid < dn->dn_phys->dn_nblkptr) { - /* - * This buffer was allocated at a time when there was - * no available blkptrs from the dnode, or it was - * inappropriate to hook it in (i.e., nlevels mis-match). - */ - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_parent == NULL); - db->db_parent = dn->dn_dbuf; - db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; - DBUF_VERIFY(db); - mutex_exit(&db->db_mtx); - } else if (db->db_blkptr == NULL) { - dmu_buf_impl_t *parent = db->db_parent; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + if (dn->dn_object == DMU_META_DNODE_OBJECT) + list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); + else + zio_nowait(dr->dr_zio); +} - mutex_exit(&db->db_mtx); - ASSERT(dn->dn_phys->dn_nlevels > 1); - if (parent == NULL) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - (void) dbuf_hold_impl(dn, db->db_level+1, - db->db_blkid >> epbs, FALSE, FTAG, &parent); - rw_exit(&dn->dn_struct_rwlock); - dbuf_add_ref(parent, db); - db->db_parent = parent; - dbuf_rele(parent, FTAG); - } - (void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED); - } else { - mutex_exit(&db->db_mtx); - } - - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL); +void +dbuf_sync_list(list_t *list, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; - if (db->db_level > 0 && - db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) { - /* - * Don't write indirect blocks past EOF. - * We get these when we truncate a file *after* dirtying - * blocks in the truncate range (we undirty the level 0 - * blocks in dbuf_free_range(), but not the indirects). - */ -#ifdef ZFS_DEBUG - /* - * Verify that this indirect block is empty. - */ - blkptr_t *bplist; - int i; - - mutex_enter(&db->db_mtx); - bplist = db->db.db_data; - for (i = 0; i < (1 << epbs); i++) { - if (!BP_IS_HOLE(&bplist[i])) { - panic("data past EOF: " - "db=%p level=%d id=%llu i=%d\n", - db, db->db_level, - (u_longlong_t)db->db_blkid, i); - } + while (dr = list_head(list)) { + if (dr->dr_zio != NULL) { + /* + * If we find an already initialized zio then we + * are processing the meta-dnode, and we have finished. + * The dbufs for all dnodes are put back on the list + * during processing, so that we can zio_wait() + * these IOs after initiating all child IOs. + */ + ASSERT3U(dr->dr_dbuf->db.db_object, ==, + DMU_META_DNODE_OBJECT); + break; } - mutex_exit(&db->db_mtx); -#endif - ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)); - mutex_enter(&db->db_mtx); - db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); - return; + list_remove(list, dr); + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); } +} - if (db->db_parent != dn->dn_dbuf) { - dmu_buf_impl_t *parent = db->db_parent; +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, + int compress, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + objset_impl_t *os = dn->dn_objset; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + zio_t *zio; - mutex_enter(&db->db_mtx); + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); ASSERT(db->db_level == parent->db_level-1); - ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK])); - /* - * We may have read this indirect block after we dirtied it, - * so never released it from the cache. - */ - arc_release(parent->db_buf, parent); - - db->db_blkptr = (blkptr_t *)parent->db.db_data + - (db->db_blkid & ((1ULL << epbs) - 1)); - DBUF_VERIFY(db); - mutex_exit(&db->db_mtx); -#ifdef ZFS_DEBUG + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; } else { - /* - * We don't need to dnode_setdirty(dn) because if we got - * here then the parent is already dirty. - */ ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); -#endif - } - ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); - - if (db->db_level == 0 && - db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { - arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; - blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK]; - int old_size = bp_get_dasize(os->os_spa, db->db_blkptr); - int new_size = bp_get_dasize(os->os_spa, *bpp); - - ASSERT(db->db_blkid != DB_BONUS_BLKID); - - dnode_diduse_space(dn, new_size-old_size); - mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid) - dn->dn_phys->dn_maxblkid = db->db_blkid; - mutex_exit(&dn->dn_mtx); - - dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx); - if (!BP_IS_HOLE(db->db_blkptr)) - dsl_dataset_block_kill(os->os_dsl_dataset, - db->db_blkptr, os->os_synctx); - - mutex_enter(&db->db_mtx); - *db->db_blkptr = **bpp; - kmem_free(*bpp, sizeof (blkptr_t)); - *bpp = NULL; - - if (*old != db->db_buf) - VERIFY(arc_buf_remove_ref(*old, db) == 1); - else if (!BP_IS_HOLE(db->db_blkptr)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); - else - ASSERT(arc_released(db->db_buf)); - *old = NULL; - db->db_data_pending = NULL; - - cv_broadcast(&db->db_changed); - - ASSERT(db->db_dirtycnt > 0); - db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); - return; + zio = dn->dn_zio; } - if (db->db_level > 0) { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - checksum = ZIO_CHECKSUM_FLETCHER_4; - if (zfs_mdcomp_disable) - compress = ZIO_COMPRESS_EMPTY; - else - compress = ZIO_COMPRESS_LZJB; - } else { - /* - * Allow dnode settings to override objset settings, - * except for metadata checksums. - */ - if (dmu_ot[dn->dn_type].ot_metadata) { - checksum = os->os_md_checksum; - compress = zio_compress_select(dn->dn_compress, - os->os_md_compress); - } else { - checksum = zio_checksum_select(dn->dn_checksum, - os->os_checksum); - compress = zio_compress_select(dn->dn_compress, - os->os_compress); - } - } -#ifdef ZFS_DEBUG - if (db->db_parent) { - ASSERT(list_link_active( - &db->db_parent->db_dirty_node[txg&TXG_MASK])); - ASSERT(db->db_parent == dn->dn_dbuf || - db->db_parent->db_level > 0); - if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0) - ASSERT(*data == db->db_buf); - } -#endif - ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg); + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - (void) arc_write(zio, os->os_spa, checksum, compress, - dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg, - db->db_blkptr, *data, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb); - /* - * We can't access db after arc_write, since it could finish - * and be freed, and we have no locks on it. - */ -} - -struct dbuf_arg { - objset_impl_t *os; - blkptr_t bp; -}; + if (BP_IS_OLDER(db->db_blkptr, txg)) + dsl_dataset_block_kill( + os->os_dsl_dataset, db->db_blkptr, zio, tx); -static void -dbuf_do_born(void *arg) -{ - struct dbuf_arg *da = arg; - dsl_dataset_block_born(da->os->os_dsl_dataset, - &da->bp, da->os->os_synctx); - kmem_free(da, sizeof (struct dbuf_arg)); -} - -static void -dbuf_do_kill(void *arg) -{ - struct dbuf_arg *da = arg; - dsl_dataset_block_kill(da->os->os_dsl_dataset, - &da->bp, da->os->os_synctx); - kmem_free(da, sizeof (struct dbuf_arg)); + dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, + dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg, + db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } /* ARGSUSED */ static void -dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; - uint64_t txg = zio->io_txg; + blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t fill = 0; - int i; - int old_size, new_size; + int old_size, new_size, i; - ASSERT3U(zio->io_error, ==, 0); + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); - dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", ""); - - old_size = bp_get_dasize(os->os_spa, &zio->io_bp_orig); + old_size = bp_get_dasize(os->os_spa, bp_orig); new_size = bp_get_dasize(os->os_spa, zio->io_bp); dnode_diduse_space(dn, new_size-old_size); - mutex_enter(&db->db_mtx); + if (BP_IS_HOLE(zio->io_bp)) { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; - ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL); + if (bp_orig->blk_birth == tx->tx_txg) + dsl_dataset_block_kill(ds, bp_orig, NULL, tx); + ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + return; + } - if (db->db_dirtied == txg) - db->db_dirtied = 0; + mutex_enter(&db->db_mtx); if (db->db_level == 0) { - arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; - - ASSERT(db->db_blkid != DB_BONUS_BLKID); - - if (*old != db->db_buf) - VERIFY(arc_buf_remove_ref(*old, db) == 1); - else if (!BP_IS_HOLE(db->db_blkptr)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); - else - ASSERT(arc_released(db->db_buf)); - *old = NULL; - db->db_data_pending = NULL; - mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid && - !BP_IS_HOLE(db->db_blkptr)) + if (db->db_blkid > dn->dn_phys->dn_maxblkid) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); @@ -2184,22 +2137,11 @@ fill++; } } else { - if (!BP_IS_HOLE(db->db_blkptr)) - fill = 1; + fill = 1; } } else { blkptr_t *bp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - if (!BP_IS_HOLE(db->db_blkptr)) { - int epbs = - dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size); - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, - db->db.db_size); - ASSERT3U(dn->dn_phys->dn_maxblkid - >> (db->db_level * epbs), >=, db->db_blkid); - arc_set_callback(db->db_buf, dbuf_do_evict, db); - } for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { if (BP_IS_HOLE(bp)) continue; @@ -2210,40 +2152,78 @@ } } - if (!BP_IS_HOLE(db->db_blkptr)) { - db->db_blkptr->blk_fill = fill; - BP_SET_TYPE(db->db_blkptr, dn->dn_type); - BP_SET_LEVEL(db->db_blkptr, db->db_level); + db->db_blkptr->blk_fill = fill; + BP_SET_TYPE(db->db_blkptr, dn->dn_type); + BP_SET_LEVEL(db->db_blkptr, db->db_level); + + mutex_exit(&db->db_mtx); + + /* We must do this after we've set the bp's type and level */ + if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + if (bp_orig->blk_birth == tx->tx_txg) + dsl_dataset_block_kill(ds, bp_orig, NULL, tx); + dsl_dataset_block_born(ds, zio->io_bp, tx); + } +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + uint64_t txg = zio->io_txg; + dbuf_dirty_record_t **drp, *dr; + + ASSERT3U(zio->io_error, ==, 0); + + mutex_enter(&db->db_mtx); + + drp = &db->db_last_dirty; + while (*drp != db->db_data_pending) + drp = &(*drp)->dr_next; + ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); + ASSERT((*drp)->dr_txg == txg); + ASSERT((*drp)->dr_next == NULL); + dr = *drp; + *drp = NULL; + + if (db->db_level == 0) { + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + + if (dr->dt.dl.dr_data != db->db_buf) + VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); + else if (!BP_IS_HOLE(db->db_blkptr)) + arc_set_callback(db->db_buf, dbuf_do_evict, db); + else + ASSERT(arc_released(db->db_buf)); } else { - ASSERT3U(fill, ==, 0); - ASSERT3U(db->db_blkptr->blk_fill, ==, 0); - } + dnode_t *dn = db->db_dnode; - dprintf_dbuf_bp(db, db->db_blkptr, - "wrote %llu bytes to blkptr:", zio->io_size); + ASSERT(list_head(&dr->dt.di.dr_children) == NULL); + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + if (!BP_IS_HOLE(db->db_blkptr)) { + int epbs = + dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + db->db.db_size); + ASSERT3U(dn->dn_phys->dn_maxblkid + >> (db->db_level * epbs), >=, db->db_blkid); + arc_set_callback(db->db_buf, dbuf_do_evict, db); + } + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); - ASSERT(db->db_parent == NULL || - list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK])); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; + db->db_data_pending = NULL; mutex_exit(&db->db_mtx); - /* We must do this after we've set the bp's type and level */ - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), - BP_IDENTITY(&zio->io_bp_orig))) { - struct dbuf_arg *da; - da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); - da->os = os; - da->bp = *zio->io_bp; - (void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0); - if (!BP_IS_HOLE(&zio->io_bp_orig)) { - da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); - da->os = os; - da->bp = zio->io_bp_orig; - (void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0); - } - } + dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); dbuf_rele(db, (void *)(uintptr_t)txg); }
--- a/usr/src/uts/common/fs/zfs/dmu.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dmu.c Fri Feb 02 15:36:58 2007 -0800 @@ -567,27 +567,19 @@ #endif typedef struct { - uint64_t txg; - dmu_buf_impl_t *db; - dmu_sync_cb_t *done; - void *arg; -} dmu_sync_cbin_t; - -typedef union { - dmu_sync_cbin_t data; - blkptr_t blk; -} dmu_sync_cbarg_t; + dbuf_dirty_record_t *dr; + dmu_sync_cb_t *done; + void *arg; +} dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { - dmu_sync_cbin_t *in = (dmu_sync_cbin_t *)varg; - dmu_buf_impl_t *db = in->db; - uint64_t txg = in->txg; + dmu_sync_arg_t *in = varg; + dbuf_dirty_record_t *dr = in->dr; + dmu_buf_impl_t *db = dr->dr_dbuf; dmu_sync_cb_t *done = in->done; - void *arg = in->arg; - blkptr_t *blk = (blkptr_t *)varg; if (!BP_IS_HOLE(zio->io_bp)) { zio->io_bp->blk_fill = 1; @@ -595,16 +587,17 @@ BP_SET_LEVEL(zio->io_bp, 0); } - *blk = *zio->io_bp; /* structure assignment */ - mutex_enter(&db->db_mtx); - ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC); - db->db_d.db_overridden_by[txg&TXG_MASK] = blk; + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); + dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); if (done) - done(&(db->db), arg); + done(&(db->db), in->arg); + + kmem_free(in, sizeof (dmu_sync_arg_t)); } /* @@ -637,10 +630,10 @@ objset_impl_t *os = db->db_objset; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; tx_state_t *tx = &dp->dp_tx; - dmu_sync_cbin_t *in; - blkptr_t *blk; + dbuf_dirty_record_t *dr; + dmu_sync_arg_t *in; zbookmark_t zb; - uint32_t arc_flag; + zio_t *zio; int err; ASSERT(BP_IS_HOLE(bp)); @@ -674,25 +667,6 @@ mutex_enter(&db->db_mtx); - blk = db->db_d.db_overridden_by[txg&TXG_MASK]; - if (blk == IN_DMU_SYNC) { - /* - * We have already issued a sync write for this buffer. - */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (EALREADY); - } else if (blk != NULL) { - /* - * This buffer had already been synced. It could not - * have been dirtied since, or we would have cleared blk. - */ - *bp = *blk; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } - if (txg == tx->tx_syncing_txg) { while (db->db_data_pending) { /* @@ -726,7 +700,10 @@ } } - if (db->db_d.db_data_old[txg&TXG_MASK] == NULL) { + dr = db->db_last_dirty; + while (dr && dr->dr_txg > txg) + dr = dr->dr_next; + if (dr == NULL || dr->dr_txg < txg) { /* * This dbuf isn't dirty, must have been free_range'd. * There's no need to log writes to freed blocks, so we're done. @@ -736,35 +713,52 @@ return (ENOENT); } - ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL); - db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC; - /* - * XXX - a little ugly to stash the blkptr in the callback - * buffer. We always need to make sure the following is true: - * ASSERT(sizeof(blkptr_t) >= sizeof(dmu_sync_cbin_t)); - */ - in = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - in->db = db; - in->txg = txg; + ASSERT(dr->dr_txg == txg); + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + /* + * We have already issued a sync write for this buffer. + */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (EALREADY); + } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * This buffer has already been synced. It could not + * have been dirtied since, or we would have cleared the state. + */ + *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ + mutex_exit(&db->db_mtx); + txg_resume(dp); + return (0); + } + + dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + in->dr = dr; in->done = done; in->arg = arg; mutex_exit(&db->db_mtx); txg_resume(dp); - arc_flag = pio == NULL ? ARC_WAIT : ARC_NOWAIT; zb.zb_objset = os->os_dsl_dataset->ds_object; zb.zb_object = db->db.db_object; zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - err = arc_write(pio, os->os_spa, + zio = arc_write(pio, os->os_spa, zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), zio_compress_select(db->db_dnode->dn_compress, os->os_compress), dmu_get_replication_level(os->os_spa, &zb, db->db_dnode->dn_type), - txg, bp, db->db_d.db_data_old[txg&TXG_MASK], dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, arc_flag, &zb); - ASSERT(err == 0); + txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - return (arc_flag == ARC_NOWAIT ? EINPROGRESS : 0); + if (pio) { + zio_nowait(zio); + err = EINPROGRESS; + } else { + err = zio_wait(zio); + ASSERT(err == 0); + } + return (err); } int
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -139,10 +139,8 @@ osi->os.os = osi; osi->os_dsl_dataset = ds; osi->os_spa = spa; - if (bp) - osi->os_rootbp = *bp; - osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t)); - if (!BP_IS_HOLE(&osi->os_rootbp)) { + osi->os_rootbp = bp; + if (!BP_IS_HOLE(osi->os_rootbp)) { uint32_t aflags = ARC_WAIT; zbookmark_t zb; zb.zb_objset = ds ? ds->ds_object : 0; @@ -150,17 +148,21 @@ zb.zb_level = -1; zb.zb_blkid = 0; - dprintf_bp(&osi->os_rootbp, "reading %s", ""); - err = arc_read(NULL, spa, &osi->os_rootbp, + dprintf_bp(osi->os_rootbp, "reading %s", ""); + err = arc_read(NULL, spa, osi->os_rootbp, dmu_ot[DMU_OT_OBJSET].ot_byteswap, - arc_bcopy_func, osi->os_phys, + arc_getbuf_func, &osi->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { - zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); kmem_free(osi, sizeof (objset_impl_t)); return (err); } + osi->os_phys = osi->os_phys_buf->b_data; + arc_release(osi->os_phys_buf, &osi->os_phys_buf); } else { + osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), + &osi->os_phys_buf, ARC_BUFC_METADATA); + osi->os_phys = osi->os_phys_buf->b_data; bzero(osi->os_phys, sizeof (objset_phys_t)); } @@ -177,7 +179,8 @@ err = dsl_prop_register(ds, "compression", compression_changed_cb, osi); if (err) { - zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, + &osi->os_phys_buf) == 1); kmem_free(osi, sizeof (objset_impl_t)); return (err); } @@ -252,11 +255,8 @@ osi = dsl_dataset_get_user_ptr(ds); if (osi == NULL) { - blkptr_t bp; - - dsl_dataset_get_blkptr(ds, &bp); err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &bp, &osi); + ds, &ds->ds_phys->ds_bp, &osi); if (err) { dsl_dataset_close(ds, mode, os); kmem_free(os, sizeof (objset_t)); @@ -364,7 +364,7 @@ dnode_special_close(osi->os_meta_dnode); zil_free(osi->os_zil); - zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); + VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); mutex_destroy(&osi->os_lock); mutex_destroy(&osi->os_obj_lock); kmem_free(osi, sizeof (objset_impl_t)); @@ -372,14 +372,14 @@ /* called from dsl for meta-objset */ objset_impl_t * -dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type, - dmu_tx_t *tx) +dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, dmu_tx_t *tx) { objset_impl_t *osi; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); - VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi)); + VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); mdn = osi->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, @@ -467,7 +467,7 @@ dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; dsl_dataset_t *ds; - blkptr_t bp; + blkptr_t *bp; uint64_t dsobj; ASSERT(dmu_tx_is_syncing(tx)); @@ -477,13 +477,13 @@ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); - dsl_dataset_get_blkptr(ds, &bp); - if (BP_IS_HOLE(&bp)) { + bp = dsl_dataset_get_blkptr(ds); + if (BP_IS_HOLE(bp)) { objset_impl_t *osi; /* This is an empty dmu_objset; not a clone. */ osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, oa->type, tx); + ds, bp, oa->type, tx); if (oa->userfunc) oa->userfunc(&osi->os, oa->userarg, tx); @@ -660,41 +660,41 @@ } static void -dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx) +dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) { - dnode_t *dn = list_head(list); - int level, err; + dnode_t *dn; - for (level = 0; dn = list_head(list); level++) { - zio_t *zio; - zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - - ASSERT3U(level, <=, DN_MAX_LEVELS); - - while (dn) { - dnode_t *next = list_next(list, dn); + while (dn = list_head(list)) { + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(dn->dn_dbuf->db_data_pending); + /* + * Initialize dn_zio outside dnode_sync() + * to accomodate meta-dnode + */ + dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; + ASSERT(dn->dn_zio); - list_remove(list, dn); - if (dnode_sync(dn, level, zio, tx) == 0) { - /* - * This dnode requires syncing at higher - * levels; put it back onto the list. - */ - if (next) - list_insert_before(list, next, dn); - else - list_insert_tail(list, dn); - } - dn = next; - } + ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); + list_remove(list, dn); + dnode_sync(dn, tx); + } +} - DTRACE_PROBE1(wait__begin, zio_t *, zio); - err = zio_wait(zio); - DTRACE_PROBE4(wait__end, zio_t *, zio, - uint64_t, tx->tx_txg, objset_impl_t *, os, int, level); +/* ARGSUSED */ +static void +ready(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + objset_impl_t *os = arg; + blkptr_t *bp = os->os_rootbp; + dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + int i; - ASSERT(err == 0); - } + /* + * Update rootbp fill count. + */ + bp->blk_fill = 1; /* count the meta-dnode */ + for (i = 0; i < dnp->dn_nblkptr; i++) + bp->blk_fill += dnp->dn_blkptr[i].blk_fill; } /* ARGSUSED */ @@ -702,90 +702,81 @@ killer(zio_t *zio, arc_buf_t *abuf, void *arg) { objset_impl_t *os = arg; - objset_phys_t *osphys = zio->io_data; - dnode_phys_t *dnp = &osphys->os_meta_dnode; - int i; ASSERT3U(zio->io_error, ==, 0); - /* - * Update rootbp fill count. - */ - os->os_rootbp.blk_fill = 1; /* count the meta-dnode */ - for (i = 0; i < dnp->dn_nblkptr; i++) - os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill; - BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET); BP_SET_LEVEL(zio->io_bp, 0); if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(&zio->io_bp_orig))) { - dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig, - os->os_synctx); + if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) + dsl_dataset_block_kill(os->os_dsl_dataset, + &zio->io_bp_orig, NULL, os->os_synctx); dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp, os->os_synctx); } + arc_release(os->os_phys_buf, &os->os_phys_buf); + + if (os->os_dsl_dataset) + dmu_buf_rele(os->os_dsl_dataset->ds_dbuf, os->os_dsl_dataset); } /* called from dsl */ void -dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx) +dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) { - extern taskq_t *dbuf_tq; int txgoff; - list_t *dirty_list; - int err; zbookmark_t zb; - arc_buf_t *abuf = - arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG, - ARC_BUFC_METADATA); + zio_t *zio; + list_t *list; + dbuf_dirty_record_t *dr; + + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(os->os_synctx == NULL); /* XXX the write_done callback should really give us the tx... */ os->os_synctx = tx; - dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + /* + * Create the root block IO + */ + zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; + if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) + dsl_dataset_block_kill(os->os_dsl_dataset, + os->os_rootbp, pio, tx); + zio = arc_write(pio, os->os_spa, os->os_md_checksum, + os->os_md_compress, + dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET), + tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + + /* + * Sync meta-dnode - the parent IO for the sync is the root block + */ + os->os_meta_dnode->dn_zio = zio; + dnode_sync(os->os_meta_dnode, tx); txgoff = tx->tx_txg & TXG_MASK; - dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx); - dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx); + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + list = &os->os_meta_dnode->dn_dirty_records[txgoff]; + while (dr = list_head(list)) { + ASSERT(dr->dr_dbuf->db_level == 0); + list_remove(list, dr); + if (dr->dr_zio) + zio_nowait(dr->dr_zio); + } /* * Free intent log blocks up to this tx. */ zil_sync(os->os_zil, tx); - - /* - * Sync meta-dnode - */ - dirty_list = &os->os_dirty_dnodes[txgoff]; - ASSERT(list_head(dirty_list) == NULL); - list_insert_tail(dirty_list, os->os_meta_dnode); - dmu_objset_sync_dnodes(os, dirty_list, tx); - - /* - * Sync the root block. - */ - bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t)); - zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - err = arc_write(NULL, os->os_spa, os->os_md_checksum, - os->os_md_compress, - dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET), - tx->tx_txg, &os->os_rootbp, abuf, killer, os, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); - ASSERT(err == 0); - VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1); - - dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx); - - ASSERT3P(os->os_synctx, ==, tx); - taskq_wait(dbuf_tq); - os->os_synctx = NULL; + zio_nowait(zio); } void
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dmu_send.c Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -382,7 +382,7 @@ DS_MODE_EXCLUSIVE, FTAG, &ds)); (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, drrb->drr_type, tx); + ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
--- a/usr/src/uts/common/fs/zfs/dnode.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dnode.c Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -65,9 +65,9 @@ avl_create(&dn->dn_ranges[i], free_range_compar, sizeof (free_range_t), offsetof(struct free_range, fr_node)); - list_create(&dn->dn_dirty_dbufs[i], - sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_dirty_node[i])); + list_create(&dn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); } list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), @@ -91,7 +91,7 @@ for (i = 0; i < TXG_SIZE; i++) { avl_destroy(&dn->dn_ranges[i]); - list_destroy(&dn->dn_dirty_dbufs[i]); + list_destroy(&dn->dn_dirty_records[i]); } list_destroy(&dn->dn_dbufs); @@ -296,7 +296,7 @@ for (i = 0; i < TXG_SIZE; i++) { ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT(NULL == list_head(&dn->dn_dirty_dbufs[i])); + ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); } ASSERT(NULL == list_head(&dn->dn_dbufs)); @@ -362,7 +362,7 @@ ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); ASSERT3U(dn->dn_next_blksz[i], ==, 0); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL); + ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); } @@ -461,7 +461,7 @@ ASSERT(db->db.db_data != NULL); db->db.db_size = bonuslen; mutex_exit(&db->db_mtx); - dbuf_dirty(db, tx); + (void) dbuf_dirty(db, tx); } /* change bonus size and type */ @@ -714,7 +714,7 @@ */ dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); - dbuf_dirty(dn->dn_dbuf, tx); + (void) dbuf_dirty(dn->dn_dbuf, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } @@ -855,17 +855,35 @@ if (new_nlevels > dn->dn_nlevels) { int old_nlevels = dn->dn_nlevels; dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); dn->dn_next_nlevels[txgoff] = new_nlevels; - /* Dirty the left indirects. */ + /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); - dbuf_dirty(db, tx); + new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); + /* transfer the dirty records to the new indirect */ + mutex_enter(&dn->dn_mtx); + mutex_enter(&new->dt.di.dr_mtx); + list = &dn->dn_dirty_records[txgoff]; + for (dr = list_head(list); dr; dr = dr_next) { + dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); + if (dr->dr_dbuf->db_level != new_nlevels-1 && + dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) { + ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + list_remove(&dn->dn_dirty_records[txgoff], dr); + list_insert_tail(&new->dt.di.dr_children, dr); + dr->dr_parent = new; + } + } + mutex_exit(&new->dt.di.dr_mtx); + mutex_exit(&dn->dn_mtx); } out: @@ -973,7 +991,7 @@ caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_dirtied || + if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dbuf_will_dirty(db, tx); @@ -1023,7 +1041,7 @@ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), TRUE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ - if (db->db_dirtied || + if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock);
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Feb 02 15:36:58 2007 -0800 @@ -33,78 +33,81 @@ #include <sys/dmu_objset.h> #include <sys/dsl_dataset.h> #include <sys/spa.h> -#include <sys/zio.h> static void dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) { dmu_buf_impl_t *db; + int txgoff = tx->tx_txg & TXG_MASK; + int nblkptr = dn->dn_phys->dn_nblkptr; + int old_toplvl = dn->dn_phys->dn_nlevels - 1; + int new_level = dn->dn_next_nlevels[txgoff]; int i; - uint64_t txg = tx->tx_txg; + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - /* this dnode can't be paged out because it's dirty */ + ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); ASSERT(db != NULL); - for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) - break; - if (i != dn->dn_phys->dn_nblkptr) { - ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK])); - (void) dbuf_read(db, NULL, - DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED); - arc_release(db->db_buf, db); - /* copy dnode's block pointers to new indirect block */ - ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=, - db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, - sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr); - arc_buf_freeze(db->db_buf); - } - - dn->dn_phys->dn_nlevels += 1; + dn->dn_phys->dn_nlevels = new_level; dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); + /* check for existing blkptrs in the dnode */ + for (i = 0; i < nblkptr; i++) + if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) + break; + if (i != nblkptr) { + /* transfer dnode's block pointers to new indirect block */ + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + ASSERT(db->db.db_data); + ASSERT(arc_released(db->db_buf)); + ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); + bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + sizeof (blkptr_t) * nblkptr); + arc_buf_freeze(db->db_buf); + } + /* set dbuf's parent pointers to new indirect buf */ - for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) { - dmu_buf_impl_t *child = - dbuf_find(dn, dn->dn_phys->dn_nlevels-2, i); + for (i = 0; i < nblkptr; i++) { + dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i); + if (child == NULL) continue; - if (child->db_dnode == NULL) { + ASSERT3P(child->db_dnode, ==, dn); + if (child->db_parent && child->db_parent != dn->dn_dbuf) { + ASSERT(child->db_parent->db_level == db->db_level); + ASSERT(child->db_blkptr != + &dn->dn_phys->dn_blkptr[child->db_blkid]); mutex_exit(&child->db_mtx); continue; } + ASSERT(child->db_parent == NULL || + child->db_parent == dn->dn_dbuf); - if (child->db_parent == NULL || - child->db_parent == dn->dn_dbuf) { - dprintf_dbuf_bp(child, child->db_blkptr, - "changing db_blkptr to new indirect %s", ""); - child->db_parent = db; - dbuf_add_ref(db, child); - if (db->db.db_data) { - child->db_blkptr = - (blkptr_t *)db->db.db_data + i; - } else { - child->db_blkptr = NULL; - } - dprintf_dbuf_bp(child, child->db_blkptr, - "changed db_blkptr to new indirect %s", ""); - } - ASSERT3P(child->db_parent, ==, db); + child->db_parent = db; + dbuf_add_ref(db, child); + if (db->db.db_data) + child->db_blkptr = (blkptr_t *)db->db.db_data + i; + else + child->db_blkptr = NULL; + dprintf_dbuf_bp(child, child->db_blkptr, + "changed db_blkptr to new indirect %s", ""); mutex_exit(&child->db_mtx); } - bzero(dn->dn_phys->dn_blkptr, - sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr); + bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); } static void @@ -122,7 +125,8 @@ bytesfreed += bp_get_dasize(os->os_spa, bp); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); - dsl_dataset_block_kill(os->os_dsl_dataset, bp, tx); + dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx); + bzero(bp, sizeof (blkptr_t)); } dnode_diduse_space(dn, -bytesfreed); } @@ -148,8 +152,9 @@ for (i = off; i < off+num; i++) { uint64_t *buf; + dmu_buf_impl_t *child; + dbuf_dirty_record_t *dr; int j; - dmu_buf_impl_t *child; ASSERT(db->db_level == 1); @@ -161,11 +166,14 @@ continue; ASSERT(err == 0); ASSERT(child->db_level == 0); - ASSERT(!list_link_active(&child->db_dirty_node[txg&TXG_MASK])); + dr = child->db_last_dirty; + while (dr && dr->dr_txg > txg) + dr = dr->dr_next; + ASSERT(dr == NULL || dr->dr_txg == txg); - /* db_data_old better be zeroed */ - if (child->db_d.db_data_old[txg & TXG_MASK]) { - buf = child->db_d.db_data_old[txg & TXG_MASK]->b_data; + /* data_old better be zeroed */ + if (dr) { + buf = dr->dt.dl.dr_data->b_data; for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -182,10 +190,7 @@ mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && - !list_link_active(&child->db_dirty_node - [(txg+1) & TXG_MASK]) && - !list_link_active(&child->db_dirty_node - [(txg+2) & TXG_MASK])) { + child->db_last_dirty == NULL) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -210,7 +215,6 @@ dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; int epbs, shift, err; - int txgoff = tx->tx_txg & TXG_MASK; int all = TRUE; (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); @@ -239,7 +243,7 @@ FREE_VERIFY(db, start, end, tx); free_blocks(dn, bp, end-start+1, tx); arc_buf_freeze(db->db_buf); - ASSERT(all || list_link_active(&db->db_dirty_node[txgoff])); + ASSERT(all || db->db_last_dirty); return (all); } @@ -270,7 +274,7 @@ ASSERT3U(bp->blk_birth, ==, 0); } #endif - ASSERT(all || list_link_active(&db->db_dirty_node[txgoff])); + ASSERT(all || db->db_last_dirty); return (all); } @@ -418,31 +422,43 @@ return (0); } -static int +static void +dnode_undirty_dbufs(list_t *list) +{ + dbuf_dirty_record_t *dr; + + while (dr = list_head(list)) { + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dr->dr_txg; + + mutex_enter(&db->db_mtx); + /* XXX - use dbuf_undirty()? */ + list_remove(list, dr); + ASSERT(db->db_last_dirty == dr); + db->db_last_dirty = NULL; + db->db_dirtycnt -= 1; + if (db->db_level == 0) { + ASSERT(db->db_blkid == DB_BONUS_BLKID || + dr->dt.dl.dr_data == db->db_buf); + dbuf_unoverride(dr); + mutex_exit(&db->db_mtx); + } else { + mutex_exit(&db->db_mtx); + dnode_undirty_dbufs(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + dbuf_rele(db, (void *)(uintptr_t)txg); + } +} + +static void dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) { - dmu_buf_impl_t *db; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(dmu_tx_is_syncing(tx)); - /* Undirty all buffers */ - while (db = list_head(&dn->dn_dirty_dbufs[txgoff])) { - mutex_enter(&db->db_mtx); - /* XXX - use dbuf_undirty()? */ - list_remove(&dn->dn_dirty_dbufs[txgoff], db); - if (db->db_level == 0) { - ASSERT(db->db_blkid == DB_BONUS_BLKID || - db->db_d.db_data_old[txgoff] == db->db_buf); - if (db->db_d.db_overridden_by[txgoff]) - dbuf_unoverride(db, tx->tx_txg); - db->db_d.db_data_old[txgoff] = NULL; - } - db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)tx->tx_txg); - } - + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); (void) dnode_evict_dbufs(dn, 0); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); @@ -487,32 +503,27 @@ * Now that we've released our hold, the dnode may * be evicted, so we musn't access it. */ - return (1); } /* - * Write out the dnode's dirty buffers at the specified level. - * This may create more dirty buffers at the next level up. + * Write out the dnode's dirty buffers. * * NOTE: The dnode is kept in memory by being dirty. Once the * dirty bit is cleared, it may be evicted. Beware of this! */ -int -dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx) +void +dnode_sync(dnode_t *dn, dmu_tx_t *tx) { free_range_t *rp; + dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; - dnode_phys_t *dnp = dn->dn_phys; + list_t *list = &dn->dn_dirty_records[txgoff]; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); DNODE_VERIFY(dn); - /* - * Make sure the dbuf for the dn_phys is released before we modify it. - */ - if (dn->dn_dbuf) - arc_release(dn->dn_dbuf->db_buf, dn->dn_dbuf); + ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { @@ -536,7 +547,7 @@ dnp->dn_nblkptr = dn->dn_nblkptr; } - ASSERT(level != 0 || dnp->dn_nlevels > 1 || + ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); @@ -545,7 +556,7 @@ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || - list_head(&dn->dn_dirty_dbufs[txgoff]) != NULL || + list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec); dnp->dn_datablkszsec = @@ -586,68 +597,25 @@ mutex_exit(&dn->dn_mtx); if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { - ASSERT3U(level, ==, 0); - return (dnode_sync_free(dn, tx)); + dnode_sync_free(dn, tx); + return; } if (dn->dn_next_nlevels[txgoff]) { - int new_lvl = dn->dn_next_nlevels[txgoff]; - - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - while (new_lvl > dnp->dn_nlevels) - dnode_increase_indirection(dn, tx); - rw_exit(&dn->dn_struct_rwlock); + dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; } - if (level == dnp->dn_nlevels) { - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); - - /* we've already synced out all data and indirect blocks */ - /* there are no more dirty dbufs under this dnode */ - ASSERT3P(list_head(&dn->dn_dirty_dbufs[txgoff]), ==, NULL); - ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= tx->tx_txg); + dbuf_sync_list(list, tx); - /* NB: the "off < maxblkid" is to catch overflow */ - /* - * NB: if blocksize is changing, we could get confused, - * so only bother if there are multiple blocks and thus - * it can't be changing. - */ - ASSERT(off < dn->dn_phys->dn_maxblkid || - dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0); - - ASSERT(dnp->dn_nlevels > 1 || - BP_IS_HOLE(&dnp->dn_blkptr[0]) || - BP_GET_LSIZE(&dnp->dn_blkptr[0]) == - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + if (dn->dn_object != DMU_META_DNODE_OBJECT) { + ASSERT3P(list_head(list), ==, NULL); + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + } - if (dn->dn_object != DMU_META_DNODE_OBJECT) { - dbuf_will_dirty(dn->dn_dbuf, tx); - dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); - } - - /* - * Now that we've dropped the reference, the dnode may - * be evicted, so we musn't access it. - */ - return (1); - } else { - dmu_buf_impl_t *db, *db_next; - list_t *list = &dn->dn_dirty_dbufs[txgoff]; - /* - * Iterate over the list, removing and sync'ing dbufs - * which are on the level we want, and leaving others. - */ - for (db = list_head(list); db; db = db_next) { - db_next = list_next(list, db); - if (db->db_level == level) { - list_remove(list, db); - dbuf_sync(db, zio, tx); - } - } - return (0); - } + /* + * Although we have dropped our reference to the dnode, it + * can't be evicted until its written, and we haven't yet + * initiated the IO for the dnode's dbuf. + */ }
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri Feb 02 15:36:58 2007 -0800 @@ -105,26 +105,28 @@ } void -dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, + dmu_tx_t *tx) { int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); ASSERT(dmu_tx_is_syncing(tx)); + /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) return; ASSERT(used > 0); if (ds == NULL) { + int err; /* * Account for the meta-objset space in its placeholder * dataset. */ - /* XXX this can fail, what do we do when it does? */ - (void) arc_free(NULL, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, ARC_WAIT); - bzero(bp, sizeof (blkptr_t)); + err = arc_free(pio, tx->tx_pool->dp_spa, + tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + ASSERT(err == 0); dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, -used, -compressed, -uncompressed, tx); @@ -136,10 +138,12 @@ dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { + int err; + dprintf_bp(bp, "freeing: %s", ""); - /* XXX check return code? */ - (void) arc_free(NULL, tx->tx_pool->dp_spa, - tx->tx_txg, bp, NULL, NULL, ARC_WAIT); + err = arc_free(pio, tx->tx_pool->dp_spa, + tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + ASSERT(err == 0); mutex_enter(&ds->ds_lock); /* XXX unique_bytes is not accurate for head datasets */ @@ -167,7 +171,6 @@ } } } - bzero(bp, sizeof (blkptr_t)); mutex_enter(&ds->ds_lock); ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); ds->ds_phys->ds_used_bytes -= used; @@ -539,7 +542,8 @@ VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); - (void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx); + (void) dmu_objset_create_impl(dp->dp_spa, ds, + &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); } @@ -829,10 +833,10 @@ } -void -dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp) +blkptr_t * +dsl_dataset_get_blkptr(dsl_dataset_t *ds) { - *bp = ds->ds_phys->ds_bp; + return (&ds->ds_phys->ds_bp); } void @@ -1403,17 +1407,15 @@ } void -dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx) +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_user_ptr != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); - dmu_objset_sync(ds->ds_user_ptr, tx); dsl_dir_dirty(ds->ds_dir, tx); - bplist_close(&ds->ds_deadlist); - - dmu_buf_rele(ds->ds_dbuf, ds); + dmu_objset_sync(ds->ds_user_ptr, zio, tx); + /* Unneeded? bplist_close(&ds->ds_deadlist); */ } void
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +33,7 @@ #include <sys/dmu_objset.h> #include <sys/arc.h> #include <sys/zap.h> +#include <sys/zio.h> #include <sys/zfs_context.h> #include <sys/fs/zfs.h> @@ -143,7 +144,7 @@ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); dp->dp_meta_objset = &dmu_objset_create_impl(spa, - NULL, DMU_OST_META, tx)->os; + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -167,36 +168,36 @@ void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { + zio_t *zio; dmu_tx_t *tx; + dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_sync_task_group_t *dstg; objset_impl_t *mosi = dp->dp_meta_objset->os; + int err; tx = dmu_tx_create_assigned(dp, txg); - do { - dsl_dir_t *dd; - dsl_dataset_t *ds; - dsl_sync_task_group_t *dstg; + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + if (!list_link_active(&ds->ds_synced_link)) + list_insert_tail(&dp->dp_synced_objsets, ds); + dsl_dataset_sync(ds, zio, tx); + } + err = zio_wait(zio); + ASSERT(err == 0); - while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { - if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_objsets, ds); - dsl_dataset_sync(ds, tx); - } - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) - dsl_sync_task_group_sync(dstg, tx); - while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) - dsl_dir_sync(dd, tx); - /* - * We need to loop since dsl_sync_task_group_sync() - * could create a new (dirty) objset. - * XXX - isn't this taken care of by the spa's sync to - * convergence loop? - */ - } while (!txg_list_empty(&dp->dp_dirty_datasets, txg)); + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) + dsl_sync_task_group_sync(dstg, tx); + while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) + dsl_dir_sync(dd, tx); if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { - dmu_objset_sync(mosi, tx); + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dmu_objset_sync(mosi, zio, tx); + err = zio_wait(zio); + ASSERT(err == 0); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } @@ -216,18 +217,15 @@ } } +/* + * TRUE if the current thread is the tx_sync_thread or if we + * are being called from SPA context during pool initialization. + */ int dsl_pool_sync_context(dsl_pool_t *dp) { - /* - * Yeah, this is cheesy. But the SPA needs some way to let - * the sync threads invoke spa_open() and spa_close() while - * it holds the namespace lock. I'm certainly open to better - * ideas for how to determine whether the current thread is - * operating on behalf of spa_sync(). This works for now. - */ return (curthread == dp->dp_tx.tx_sync_thread || - BP_IS_HOLE(&dp->dp_meta_rootbp)); + spa_get_dsl(dp->dp_spa) == NULL); } uint64_t
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/arc.h Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -84,10 +84,10 @@ int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, arc_done_func_t *done, void *private, int priority, int flags, uint32_t *arc_flags, zbookmark_t *zb); -int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, - uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *done, void *private, int priority, int flags, - uint32_t arc_flags, zbookmark_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, + int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb); int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_done_func_t *done, void *private, uint32_t arc_flags); int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,7 +41,7 @@ #endif #define DB_BONUS_BLKID (-1ULL) -#define IN_DMU_SYNC ((blkptr_t *)-1) +#define IN_DMU_SYNC 2 /* * define flags for dbuf_read @@ -86,6 +86,56 @@ #define LIST_LINK_INACTIVE(link) \ ((link)->list_next == NULL && (link)->list_prev == NULL) +struct dmu_buf_impl; + +typedef enum override_states { + DR_NOT_OVERRIDDEN, + DR_IN_DMU_SYNC, + DR_OVERRIDDEN +} override_states_t; + +typedef struct dbuf_dirty_record { + /* link on our parents dirty list */ + list_node_t dr_dirty_node; + + /* transaction group this data will sync in */ + uint64_t dr_txg; + + /* zio of outstanding write IO */ + zio_t *dr_zio; + + /* pointer back to our dbuf */ + struct dmu_buf_impl *dr_dbuf; + + /* pointer to next dirty record */ + struct dbuf_dirty_record *dr_next; + + /* pointer to parent dirty record */ + struct dbuf_dirty_record *dr_parent; + + union dirty_types { + struct dirty_indirect { + + /* protect access to list */ + kmutex_t dr_mtx; + + /* Our list of dirty children */ + list_t dr_children; + } di; + struct dirty_leaf { + + /* + * dr_data is set when we dirty the buffer + * so that we can retain the pointer even if it + * gets COW'd in a subsequent transaction group. + */ + arc_buf_t *dr_data; + blkptr_t dr_overridden_by; + override_states_t dr_override_state; + } dl; + } dt; +} dbuf_dirty_record_t; + typedef struct dmu_buf_impl { /* * The following members are immutable, with the exception of @@ -152,53 +202,28 @@ arc_buf_t *db_buf; kcondvar_t db_changed; - arc_buf_t *db_data_pending; + dbuf_dirty_record_t *db_data_pending; + + /* pointer to most recent dirty record for this buffer */ + dbuf_dirty_record_t *db_last_dirty; /* - * Last time (transaction group) this buffer was dirtied. - */ - uint64_t db_dirtied; - - /* - * If db_dnode != NULL, our link on the owner dnodes's dn_dbufs list. + * Our link on the owner dnodes's dn_dbufs list. * Protected by its dn_dbufs_mtx. */ list_node_t db_link; - /* Our link on dn_dirty_dbufs[txg] */ - list_node_t db_dirty_node[TXG_SIZE]; - uint8_t db_dirtycnt; - - /* - * Data which is unique to data (leaf) blocks: - */ - struct { - /* stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - void **db_user_data_ptr_ptr; - dmu_buf_evict_func_t *db_evict_func; - uint8_t db_immediate_evict; - uint8_t db_freed_in_flight; + /* Data which is unique to data (leaf) blocks: */ - /* - * db_data_old[txg&TXG_MASK] is set when we - * dirty the buffer, so that we can retain the - * pointer even if it gets COW'd in a subsequent - * transaction group. - * - * If the buffer is dirty in any txg, it can't - * be destroyed. - */ - /* - * XXX Protected by db_mtx and dn_dirty_mtx. - * db_mtx must be held to read db_dirty[], and - * both db_mtx and dn_dirty_mtx must be held to - * modify (dirty or clean). db_mtx must be held - * before dn_dirty_mtx. - */ - arc_buf_t *db_data_old[TXG_SIZE]; - blkptr_t *db_overridden_by[TXG_SIZE]; - } db_d; + /* stuff we store for the user (see dmu_buf_set_user) */ + void *db_user_ptr; + void **db_user_data_ptr_ptr; + dmu_buf_evict_func_t *db_evict_func; + + uint8_t db_immediate_evict; + uint8_t db_freed_in_flight; + + uint8_t db_dirtycnt; } dmu_buf_impl_t; /* Note: the dbuf hash table is exposed only for the mdb module */ @@ -237,14 +262,14 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); -void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx); -void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg); +void dbuf_unoverride(dbuf_dirty_record_t *dr); +void dbuf_sync_list(list_t *list, dmu_tx_t *tx); void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, struct dmu_tx *);
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h Fri Feb 02 15:36:58 2007 -0800 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -218,6 +217,14 @@ * held from: * dsl_dataset_* * + * dr_mtx (leaf) + * protects: + * dr_children + * held from: + * dbuf_dirty + * dbuf_undirty + * dbuf_sync_indirect + * dnode_new_blkid */ struct objset;
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -29,6 +29,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/spa.h> +#include <sys/arc.h> #include <sys/txg.h> #include <sys/zfs_context.h> #include <sys/dnode.h> @@ -60,6 +61,7 @@ /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; + arc_buf_t *os_phys_buf; objset_phys_t *os_phys; dnode_t *os_meta_dnode; zilog_t *os_zil; @@ -71,7 +73,7 @@ /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ - blkptr_t os_rootbp; + blkptr_t *os_rootbp; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -108,9 +110,9 @@ int dmu_objset_evict_dbufs(objset_t *os, int try); /* called from dsl */ -void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx); +void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, - dmu_objset_type_t type, dmu_tx_t *tx); + blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_impl_t **osip); void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,6 +32,7 @@ #include <sys/avl.h> #include <sys/spa.h> #include <sys/txg.h> +#include <sys/zio.h> #include <sys/refcount.h> #include <sys/dmu_zfetch.h> @@ -162,7 +163,7 @@ /* protected by dn_mtx: */ kmutex_t dn_mtx; - list_t dn_dirty_dbufs[TXG_SIZE]; + list_t dn_dirty_records[TXG_SIZE]; avl_tree_t dn_ranges[TXG_SIZE]; uint64_t dn_allocated_txg; uint64_t dn_free_txg; @@ -179,6 +180,9 @@ list_t dn_dbufs; /* linked list of descendent dbuf_t's */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + /* parent IO for current sync write */ + zio_t *dn_zio; + /* holds prefetch structure */ struct zfetch dn_zfetch; } dnode_t; @@ -200,7 +204,7 @@ void dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); -int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx); +void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri Feb 02 15:36:58 2007 -0800 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,6 +31,7 @@ #include <sys/dmu.h> #include <sys/spa.h> #include <sys/txg.h> +#include <sys/zio.h> #include <sys/bplist.h> #include <sys/dsl_synctask.h> #include <sys/zfs_context.h> @@ -138,15 +139,16 @@ void *p, dsl_dataset_evict_func_t func); void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); -void dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp); +blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); -void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx); +void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); +void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, + dmu_tx_t *tx); int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/spa.h Fri Feb 02 15:36:58 2007 -0800 @@ -272,6 +272,7 @@ #define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) #define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) #define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) +#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) #define BP_ZERO(bp) \ { \
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/sys/zio.h Fri Feb 02 15:36:58 2007 -0800 @@ -207,6 +207,7 @@ zio_t *io_logical; /* Callback info */ + zio_done_func_t *io_ready; zio_done_func_t *io_done; void *io_private; blkptr_t io_bp_orig; @@ -262,8 +263,8 @@ extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb); + zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
--- a/usr/src/uts/common/fs/zfs/zio.c Fri Feb 02 15:36:29 2007 -0800 +++ b/usr/src/uts/common/fs/zfs/zio.c Fri Feb 02 15:36:58 2007 -0800 @@ -435,8 +435,8 @@ zio_t * zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags, - zbookmark_t *zb) + zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, + int flags, zbookmark_t *zb) { zio_t *zio; @@ -450,6 +450,8 @@ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + zio->io_ready = ready; + zio->io_bookmark = *zb; zio->io_logical = zio; @@ -810,6 +812,9 @@ { zio_t *pio = zio->io_parent; + if (zio->io_ready) + zio->io_ready(zio); + if (pio != NULL) zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY, &pio->io_children_notready);