# HG changeset patch # User Matthew Ahrens # Date 1346423165 25200 # Node ID 4be8368f41e43a5c4e6e0214fd21c8f2fb69c02a # Parent 40cea5d62fa3fee9942003ad98c338e84fc597c2 3122 zfs destroy filesystem should prefetch blocks Reviewed by: Christopher Siden Reviewed by: George Wilson Reviewed by: Adam Leventhal Approved by: Garrett D'Amore diff -r 40cea5d62fa3 -r 4be8368f41e4 usr/src/uts/common/fs/zfs/bptree.c --- a/usr/src/uts/common/fs/zfs/bptree.c Thu Aug 30 13:37:54 2012 +0000 +++ b/usr/src/uts/common/fs/zfs/bptree.c Fri Aug 31 07:26:05 2012 -0700 @@ -189,7 +189,8 @@ break; err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, - bte.be_birth_txg, &bte.be_zb, TRAVERSE_POST, + bte.be_birth_txg, &bte.be_zb, + TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST, bptree_visit_cb, &ba); if (free) { ASSERT(err == 0 || err == ERESTART); diff -r 40cea5d62fa3 -r 4be8368f41e4 usr/src/uts/common/fs/zfs/dmu_traverse.c --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Thu Aug 30 13:37:54 2012 +0000 +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Fri Aug 31 07:26:05 2012 -0700 @@ -63,6 +63,8 @@ static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf, uint64_t objset, uint64_t object); +static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, + arc_buf_t *buf, uint64_t objset, uint64_t object); static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) @@ -178,9 +180,34 @@ bcopy(zb, td->td_resume, sizeof (*td->td_resume)); } +static void +traverse_prefetch_metadata(traverse_data_t *td, + arc_buf_t *pbuf, const blkptr_t *bp, const zbookmark_t *zb) +{ + uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + + if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) + return; + /* + * If we are in the process of resuming, don't prefetch, because + * some children will not be needed (and in fact may have already + * been freed). 
+ */ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) + return; + if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + return; + if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) + return; + + (void) arc_read(NULL, td->td_spa, bp, + pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &flags, zb); +} + static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) + arc_buf_t *pbuf, const blkptr_t *bp, const zbookmark_t *zb) { zbookmark_t czb; int err = 0, lasterr = 0; @@ -243,14 +270,21 @@ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); + cbp = buf->b_data; - /* recursively visitbp() blocks below this */ - cbp = buf->b_data; - for (i = 0; i < epb; i++, cbp++) { + for (i = 0; i < epb; i++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); - err = traverse_visitbp(td, dnp, buf, cbp, &czb); + traverse_prefetch_metadata(td, buf, &cbp[i], &czb); + } + + /* recursively visitbp() blocks below this */ + for (i = 0; i < epb; i++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = traverse_visitbp(td, dnp, buf, &cbp[i], &czb); if (err) { if (!hard) break; @@ -267,11 +301,16 @@ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); + dnp = buf->b_data; + + for (i = 0; i < epb; i++) { + prefetch_dnode_metadata(td, &dnp[i], buf, zb->zb_objset, + zb->zb_blkid * epb + i); + } /* recursively visitbp() blocks below this */ - dnp = buf->b_data; - for (i = 0; i < epb; i++, dnp++) { - err = traverse_dnode(td, dnp, buf, zb->zb_objset, + for (i = 0; i < epb; i++) { + err = traverse_dnode(td, &dnp[i], buf, zb->zb_objset, zb->zb_blkid * epb + i); if (err) { if (!hard) @@ -292,6 +331,15 @@ osp = buf->b_data; dnp = &osp->os_meta_dnode; + prefetch_dnode_metadata(td, dnp, buf, zb->zb_objset, + DMU_META_DNODE_OBJECT); + if 
(arc_buf_size(buf) >= sizeof (objset_phys_t)) { + prefetch_dnode_metadata(td, &osp->os_userused_dnode, + buf, zb->zb_objset, DMU_USERUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_groupused_dnode, + buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); + } + err = traverse_dnode(td, dnp, buf, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err && hard) { @@ -334,6 +382,24 @@ return (err != 0 ? err : lasterr); } +static void +prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object) +{ + int j; + zbookmark_t czb; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + traverse_prefetch_metadata(td, buf, &dnp->dn_blkptr[j], &czb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + traverse_prefetch_metadata(td, buf, &dnp->dn_spill, &czb); + } +} + static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, arc_buf_t *buf, uint64_t objset, uint64_t object) @@ -344,8 +410,7 @@ for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_blkptr[j], &czb); + err = traverse_visitbp(td, dnp, buf, &dnp->dn_blkptr[j], &czb); if (err) { if (!hard) break; @@ -354,10 +419,8 @@ } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { - SET_BOOKMARK(&czb, objset, - object, 0, DMU_SPILL_BLKID); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_spill, &czb); + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, buf, &dnp->dn_spill, &czb); if (err) { if (!hard) return (err); @@ -438,6 +501,12 @@ ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + /* + * The data prefetching mechanism (the prefetch thread) is incompatible + * with resuming from a bookmark. 
+ */ + ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); + td.td_spa = spa; td.td_objset = objset; td.td_rootbp = rootbp; @@ -464,7 +533,7 @@ traverse_zil(&td, &os->os_zil_header); } - if (!(flags & TRAVERSE_PREFETCH) || + if (!(flags & TRAVERSE_PREFETCH_DATA) || 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; diff -r 40cea5d62fa3 -r 4be8368f41e4 usr/src/uts/common/fs/zfs/sys/dnode.h --- a/usr/src/uts/common/fs/zfs/sys/dnode.h Thu Aug 30 13:37:54 2012 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Aug 31 07:26:05 2012 -0700 @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_DNODE_H @@ -276,7 +277,6 @@ void dnode_buf_byteswap(void *buf, size_t size); void dnode_verify(dnode_t *dn); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); -uint64_t dnode_current_max_length(dnode_t *dn); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx);