Mercurial > illumos > fmac
changeset 5378:111aa1baa84a onnv_77
PSARC 2007/555 zfs fs-only quotas and reservations
6431277 want filesystem-only quotas
6483677 need immediate reservation
line wrap: on
line diff
--- a/usr/src/cmd/fs.d/df.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/cmd/fs.d/df.c Mon Oct 29 22:45:33 2007 -0700 @@ -1224,55 +1224,60 @@ adjust_total_blocks(struct df_request *dfrp, fsblkcnt64_t *total, uint64_t blocksize) { - zfs_handle_t *zhp; char *dataset, *slash; - uint64_t quota; + boolean_t first = TRUE; + uint64_t quota = 0; - if (strcmp(DFR_FSTYPE(dfrp), MNTTYPE_ZFS) != 0 || - !load_libzfs()) + if (strcmp(DFR_FSTYPE(dfrp), MNTTYPE_ZFS) != 0 || !load_libzfs()) return; /* * We want to get the total size for this filesystem as bounded by any * quotas. In order to do this, we start at the current filesystem and - * work upwards until we find a dataset with a quota. If we reach the - * pool itself, then the total space is the amount used plus the amount + * work upwards looking for the smallest quota. When we reach the + * pool itself, the quota is the amount used plus the amount * available. */ if ((dataset = strdup(DFR_SPECIAL(dfrp))) == NULL) return; slash = dataset + strlen(dataset); - do { + while (slash != NULL) { + zfs_handle_t *zhp; + uint64_t this_quota; + *slash = '\0'; - if ((zhp = _zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) - == NULL) { - free(dataset); - return; + zhp = _zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET); + if (zhp == NULL) + break; + + /* true at first iteration of loop */ + if (first) { + quota = _zfs_prop_get_int(zhp, ZFS_PROP_REFQUOTA); + if (quota == 0) + quota = UINT64_MAX; + first = FALSE; } - if ((quota = _zfs_prop_get_int(zhp, ZFS_PROP_QUOTA)) != 0) { - *total = quota / blocksize; - _zfs_close(zhp); - free(dataset); - return; + this_quota = _zfs_prop_get_int(zhp, ZFS_PROP_QUOTA); + if (this_quota && this_quota < quota) + quota = this_quota; + + /* true at last iteration of loop */ + if ((slash = strrchr(dataset, '/')) == NULL) { + uint64_t size; + + size = _zfs_prop_get_int(zhp, ZFS_PROP_USED) + + _zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE); + if (size < quota) + quota = size; } _zfs_close(zhp); - - } while ((slash = 
strrchr(dataset, '/')) != NULL); - - - if ((zhp = _zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) { - free(dataset); - return; } - *total = (_zfs_prop_get_int(zhp, ZFS_PROP_USED) + - _zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE)) / blocksize; - - _zfs_close(zhp); + *total = quota / blocksize; free(dataset); }
--- a/usr/src/cmd/zfs/zfs_main.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/cmd/zfs/zfs_main.c Mon Oct 29 22:45:33 2007 -0700 @@ -281,7 +281,7 @@ { FILE *fp = cb; - (void) fprintf(fp, "\t%-13s ", zfs_prop_to_name(prop)); + (void) fprintf(fp, "\t%-14s ", zfs_prop_to_name(prop)); if (prop == ZFS_PROP_CASE) (void) fprintf(fp, "NO "); @@ -348,7 +348,7 @@ (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-13s %s %s %s\n\n", + (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ @@ -1270,7 +1270,9 @@ (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || - prop == ZFS_PROP_RESERVATION) + prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); return (1);
--- a/usr/src/cmd/zpool/zpool_main.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/cmd/zpool/zpool_main.c Mon Oct 29 22:45:33 2007 -0700 @@ -3301,8 +3301,8 @@ (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); - (void) printf(gettext(" 9 Case insensitive support and " - "File system unique identifiers (FUID)\n")); + (void) printf(gettext(" 9 refquota and refreservation " + "properties\n")); (void) printf(gettext("For more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" @@ -3385,6 +3385,8 @@ "rollback", "snapshot", "filesystem version upgrade", + "refquota set", + "refreservation set", }; /*
--- a/usr/src/common/zfs/zfs_prop.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/common/zfs/zfs_prop.c Mon Oct 29 22:45:33 2007 -0700 @@ -250,6 +250,11 @@ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV"); register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, ZFS_TYPE_VOLUME, "<size>", "VOLSIZE"); + register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA"); + register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, + PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "<size> | none", "REFRESERV"); /* inherit number properties */ register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c Mon Oct 29 22:45:33 2007 -0700 @@ -772,6 +772,7 @@ switch (prop) { case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: if (intval > volsize) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is greater than current " @@ -1627,6 +1628,7 @@ */ switch (prop) { case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is less than current used or " "reserved space")); @@ -1634,6 +1636,7 @@ break; case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is greater than available space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); @@ -1953,7 +1956,9 @@ break; case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: *val = getprop_uint64(zhp, prop, source); if (*val == 0) *source = ""; /* default */ @@ -2122,7 +2127,10 @@ break; case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1);
--- a/usr/src/lib/libzfs/common/libzfs_util.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/lib/libzfs/common/libzfs_util.c Mon Oct 29 22:45:33 2007 -0700 @@ -1065,7 +1065,6 @@ const char *propname; char *value; boolean_t isnone = B_FALSE; - boolean_t boolval; if (type == ZFS_TYPE_POOL) { proptype = zpool_prop_get_type(prop); @@ -1116,34 +1115,23 @@ /* * Quota special: force 'none' and don't allow 0. */ - if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && - !isnone && prop == ZFS_PROP_QUOTA) { + if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && + (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "use 'none' to disable quota")); + "use 'none' to disable quota/refquota")); goto error; } break; case PROP_TYPE_INDEX: - switch (datatype) { - case DATA_TYPE_STRING: - (void) nvpair_value_string(elem, &value); - break; - - case DATA_TYPE_BOOLEAN_VALUE: - (void) nvpair_value_boolean_value(elem, &boolval); - if (boolval) - value = "on"; - else - value = "off"; - break; - - default: + if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } + (void) nvpair_value_string(elem, &value); + if (zprop_string_to_index(prop, value, ivalp, type) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be one of '%s'"), propname,
--- a/usr/src/lib/libzfs_jni/common/libzfs_jni_property.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_property.c Mon Oct 29 22:45:33 2007 -0700 @@ -100,6 +100,8 @@ ZFS_PROP_RESERVATION, ZFS_PROP_USED, ZFS_PROP_VOLSIZE, + ZFS_PROP_REFQUOTA, + ZFS_PROP_REFRESERVATION, ZPROP_INVAL };
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_send.c Mon Oct 29 22:45:33 2007 -0700 @@ -498,6 +498,10 @@ VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_EXCLUSIVE, dmu_recv_tag, &cds)); + /* copy the refquota from the target fs to the clone */ + if (ohds->ds_quota > 0) + dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); + dmu_buf_will_dirty(cds->ds_dbuf, tx); cds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; @@ -513,6 +517,7 @@ recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; + dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Mon Oct 29 22:45:33 2007 -0700 @@ -294,6 +294,8 @@ txh->txh_space_tooverwrite += space; } else { txh->txh_space_towrite += space; + if (dn && dn->dn_dbuf->db_blkptr) + txh->txh_space_tounref += space; } } @@ -319,7 +321,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { uint64_t blkid, nblks; - uint64_t space = 0; + uint64_t space = 0, unref = 0; dnode_t *dn = txh->txh_dnode; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; @@ -383,6 +385,7 @@ dprintf_bp(bp, "can free old%s", ""); space += bp_get_dasize(spa, bp); } + unref += BP_GET_ASIZE(bp); } nblks = 0; } @@ -418,6 +421,7 @@ "can free old%s", ""); space += bp_get_dasize(spa, &bp[i]); } + unref += BP_GET_ASIZE(bp); } dbuf_rele(dbuf, FTAG); } @@ -432,6 +436,7 @@ rw_exit(&dn->dn_struct_rwlock); txh->txh_space_tofree += space; + txh->txh_space_tounref += unref; } void @@ -550,10 +555,13 @@ * the size will change between now and the dbuf dirty call. */ if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) + dn->dn_phys->dn_blkptr[0].blk_birth)) { txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; - else + } else { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + txh->txh_space_tounref += + BP_GET_ASIZE(dn->dn_phys->dn_blkptr); + } return; } @@ -733,8 +741,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { dmu_tx_hold_t *txh; - uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite; spa_t *spa = tx->tx_pool->dp_spa; + uint64_t lsize, asize, fsize, usize; + uint64_t towrite, tofree, tooverwrite, tounref; ASSERT3U(tx->tx_txg, ==, 0); @@ -767,7 +776,7 @@ * dmu_tx_unassign() logic. 
*/ - towrite = tofree = tooverwrite = 0; + towrite = tofree = tooverwrite = tounref = 0; for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -787,6 +796,7 @@ towrite += txh->txh_space_towrite; tofree += txh->txh_space_tofree; tooverwrite += txh->txh_space_tooverwrite; + tounref += txh->txh_space_tounref; } /* @@ -813,16 +823,18 @@ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; lsize = towrite + tooverwrite; asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); #ifdef ZFS_DEBUG tx->tx_space_towrite = asize; tx->tx_space_tofree = tofree; tx->tx_space_tooverwrite = tooverwrite; + tx->tx_space_tounref = tounref; #endif if (tx->tx_dir && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, - lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); + lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); if (err) return (err); }
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Mon Oct 29 22:45:33 2007 -0700 @@ -45,6 +45,7 @@ static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; static dsl_checkfunc_t dsl_dataset_rollback_check; static dsl_syncfunc_t dsl_dataset_rollback_sync; +static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DS_REF_MAX (1ULL << 62) @@ -67,6 +68,25 @@ DS_REF_MAX /* DS_MODE_EXCLUSIVE - no other opens */ }; +/* + * Figure out how much of this delta should be propogated to the dsl_dir + * layer. If there's a refreservation, that space has already been + * partially accounted for in our ancestors. + */ +static int64_t +parent_delta(dsl_dataset_t *ds, int64_t delta) +{ + uint64_t old_bytes, new_bytes; + + if (ds->ds_reserved == 0) + return (delta); + + old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + + ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); + return (new_bytes - old_bytes); +} void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) @@ -74,6 +94,7 @@ int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); + int64_t delta; dprintf_bp(bp, "born, ds=%p\n", ds); @@ -96,13 +117,13 @@ } dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); + delta = parent_delta(ds, used); ds->ds_phys->ds_used_bytes += used; ds->ds_phys->ds_compressed_bytes += compressed; ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, - used, compressed, uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx); } void @@ -140,6 +161,7 @@ if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { int err; + int64_t delta; dprintf_bp(bp, "freeing: %s", ""); err = arc_free(pio, 
tx->tx_pool->dp_spa, @@ -147,12 +169,13 @@ ASSERT(err == 0); mutex_enter(&ds->ds_lock); - /* XXX unique_bytes is not accurate for head datasets */ - /* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */ + ASSERT(ds->ds_phys->ds_unique_bytes >= used || + !DS_UNIQUE_IS_ACCURATE(ds)); + delta = parent_delta(ds, -used); ds->ds_phys->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, - -used, -compressed, -uncompressed, tx); + delta, -compressed, -uncompressed, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); @@ -375,6 +398,24 @@ ds->ds_fsid_guid = unique_insert(ds->ds_phys->ds_fsid_guid); } + + if (!dsl_dataset_is_snapshot(ds)) { + boolean_t need_lock = + !RW_LOCK_HELD(&dp->dp_config_rwlock); + + if (need_lock) + rw_enter(&dp->dp_config_rwlock, RW_READER); + VERIFY(0 == dsl_prop_get_ds_locked(ds->ds_dir, + "refreservation", sizeof (uint64_t), 1, + &ds->ds_reserved, NULL)); + VERIFY(0 == dsl_prop_get_ds_locked(ds->ds_dir, + "refquota", sizeof (uint64_t), 1, &ds->ds_quota, + NULL)); + if (need_lock) + rw_exit(&dp->dp_config_rwlock); + } else { + ds->ds_reserved = ds->ds_quota = 0; + } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(ds->ds_phys, ==, dbuf->db_data); @@ -591,6 +632,8 @@ dsphys->ds_creation_txg = tx->tx_txg; dsphys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); @@ -633,6 +676,9 @@ dsphys->ds_creation_txg = tx->tx_txg; dsphys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + if (origin) { dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = @@ -943,10 +989,53 @@ } } +/* + * The unique space in the head dataset can be calculated 
by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. + */ +static void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) +{ + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; + + ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); + + if (ds->ds_phys->ds_prev_snap_obj != 0) + mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; + else + mrs_used = 0; + + VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, + &dluncomp)); + + ASSERT3U(dlused, <=, mrs_used); + ds->ds_phys->ds_unique_bytes = + ds->ds_phys->ds_used_bytes - (mrs_used - dlused); + + if (!DS_UNIQUE_IS_ACCURATE(ds) && + spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; +} + +static uint64_t +dsl_dataset_unique(dsl_dataset_t *ds) +{ + if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) + dsl_dataset_recalc_head_uniq(ds); + + return (ds->ds_phys->ds_unique_bytes); +} + struct killarg { - uint64_t *usedp; - uint64_t *compressedp; - uint64_t *uncompressedp; + int64_t *usedp; + int64_t *compressedp; + int64_t *uncompressedp; zio_t *zio; dmu_tx_t *tx; }; @@ -1042,7 +1131,7 @@ { /* Free blkptrs that we gave birth to */ zio_t *zio; - uint64_t used = 0, compressed = 0, uncompressed = 0; + int64_t used = 0, compressed = 0, uncompressed = 0; struct killarg ka; zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, @@ -1175,7 +1264,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t used = 0, compressed = 0, uncompressed = 0; + int64_t used = 0, compressed = 0, uncompressed = 0; zio_t *zio; int err; int after_branch_point = FALSE; @@ -1190,6 +1279,13 
@@ ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + /* Remove our reservation */ + if (ds->ds_reserved != 0) { + uint64_t val = 0; + dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + ASSERT3U(ds->ds_reserved, ==, 0); + } + ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); obj = ds->ds_object; @@ -1223,6 +1319,7 @@ blkptr_t bp; dsl_dataset_t *ds_next; uint64_t itor = 0; + uint64_t old_unique; spa_scrub_restart(dp->dp_spa, tx->tx_txg); @@ -1231,6 +1328,8 @@ DS_MODE_NONE, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); + old_unique = dsl_dataset_unique(ds_next); + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); ds_next->ds_phys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; @@ -1312,13 +1411,6 @@ dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); } else { - /* - * It would be nice to update the head dataset's - * unique. To do so we would have to traverse - * it for blocks born after ds_prev, which is - * pretty expensive just to maintain something - * for debugging purposes. - */ ASSERT3P(ds_next->ds_prev, ==, ds); dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, ds_next); @@ -1329,13 +1421,32 @@ } else { ds_next->ds_prev = NULL; } + + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + ds_next->ds_phys->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, -mrsdelta, + 0, 0, tx); + } } dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG); /* - * NB: unique_bytes is not accurate for head objsets - * because we don't update it when we delete the most - * recent snapshot -- see above comment. 
+ * NB: unique_bytes might not be accurate for the head objset. + * Before SPA_VERSION 9, we didn't update its value when we + * deleted the most recent snapshot. */ ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); } else { @@ -1366,6 +1477,9 @@ err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); + ASSERT(spa_version(dp->dp_spa) < + SPA_VERSION_UNIQUE_ACCURATE || + used == ds->ds_phys->ds_unique_bytes); } err = zio_wait(zio); @@ -1421,6 +1535,33 @@ } +static int +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t asize; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. + */ + asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) + return (ENOSPC); + + /* + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. + */ + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); + + return (0); +} + /* ARGSUSED */ int dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) @@ -1455,6 +1596,10 @@ if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) return (ENAMETOOLONG); + err = dsl_dataset_snapshot_reserve_space(ds, tx); + if (err) + return (err); + ds->ds_trysnap_txg = tx->tx_txg; return (0); } @@ -1510,12 +1655,24 @@ } } + /* + * If we have a reference-reservation on this dataset, we will + * need to increase the amount of refreservation being charged + * since our unique space is going to zero. 
+ */ + if (ds->ds_reserved) { + int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + dsl_dir_diduse_space(ds->ds_dir, add, 0, 0, tx); + } + bplist_close(&ds->ds_deadlist); dmu_buf_will_dirty(ds->ds_dbuf, tx); ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg; ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, @@ -1557,14 +1714,22 @@ void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { + uint64_t refd, avail, uobjs, aobjs; + dsl_dir_stats(ds->ds_dir, nv); + dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, ds->ds_phys->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, ds->ds_phys->ds_creation_txg); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, - ds->ds_phys->ds_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, + ds->ds_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, + ds->ds_reserved); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1618,6 +1783,18 @@ { *refdbytesp = ds->ds_phys->ds_used_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) + *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; + if (ds->ds_quota != 0) { + /* + * Adjust available bytes according to refquota + */ + if (*refdbytesp < ds->ds_quota) + *availbytesp = MIN(*availbytesp, + ds->ds_quota - *refdbytesp); + else + *availbytesp = 0; + } *usedobjsp = ds->ds_phys->ds_bp.blk_fill; *availobjsp = DN_MAX_OBJECT - *usedobjsp; } @@ -2198,6 
+2375,9 @@ uint64_t unique = 0; int err; + if (csa->ohds->ds_reserved) + panic("refreservation and clone swap are incompatible"); + dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); @@ -2221,6 +2401,13 @@ } VERIFY(err == ENOENT); + /* undo any accounting due to a refreservation */ + if (csa->ohds->ds_reserved > csa->ohds->ds_phys->ds_unique_bytes) { + dsl_dir_diduse_space(csa->ohds->ds_dir, + csa->ohds->ds_phys->ds_unique_bytes - + csa->ohds->ds_reserved, 0, 0, tx); + } + /* reset origin's unique bytes */ csa->cds->ds_prev->ds_phys->ds_unique_bytes = unique; @@ -2263,6 +2450,13 @@ (y) = __tmp; \ } + /* redo any accounting due to a refreservation */ + if (csa->ohds->ds_reserved > csa->ohds->ds_phys->ds_unique_bytes) { + dsl_dir_diduse_space(csa->ohds->ds_dir, + csa->ohds->ds_reserved - + csa->ohds->ds_phys->ds_unique_bytes, 0, 0, tx); + } + /* swap ds_*_bytes */ SWITCH64(csa->ohds->ds_phys->ds_used_bytes, csa->cds->ds_phys->ds_used_bytes); @@ -2280,6 +2474,9 @@ csa->cds->ds_phys->ds_deadlist_obj)); VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, csa->ohds->ds_phys->ds_deadlist_obj)); + /* fix up clone's unique */ + dsl_dataset_recalc_head_uniq(csa->cds); + } /* @@ -2331,3 +2528,195 @@ return (0); } + +int +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used) +{ + int error = 0; + + ASSERT3S(asize, >, 0); + + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. 
+ */ + if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); + } + + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); + } + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { + if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) + error = ERESTART; + else + error = EDQUOT; + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +/* ARGSUSED */ +static int +dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) + return (ENOTSUP); + + if (new_quota == 0) + return (0); + + if (new_quota < ds->ds_phys->ds_used_bytes || + new_quota < ds->ds_reserved) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +void +dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *quotap = arg2; + uint64_t new_quota = *quotap; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + mutex_enter(&ds->ds_lock); + ds->ds_quota = new_quota; + mutex_exit(&ds->ds_lock); + + dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + + spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, + tx, cr, "%lld dataset = %llu ", + (longlong_t)new_quota, ds->ds_dir->dd_phys->dd_head_dataset_obj); +} + +int +dsl_dataset_set_quota(const char *dsname, uint64_t quota) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds); + if (err) + return (err); + + /* + * If someone removes a file, then tries 
to set the quota, we + * want to make sure the file freeing takes effect. + */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_set_quota_check, + dsl_dataset_set_quota_sync, ds, &quota, 0); + dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + return (err); +} + +static int +dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + int64_t delta; + uint64_t unique; + + if (new_reservation > INT64_MAX) + return (EOVERFLOW); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_REFRESERVATION) + return (ENOTSUP); + + if (dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) + return (0); + + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); + mutex_exit(&ds->ds_lock); + + if (delta > 0 && + delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (delta > 0 && ds->ds_quota > 0 && + new_reservation > ds->ds_quota) + return (ENOSPC); + + return (0); +} + +/* ARGSUSED */ +static void +dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + uint64_t *reservationp = arg2; + uint64_t new_reservation = *reservationp; + uint64_t unique; + int64_t delta; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + mutex_enter(&ds->ds_lock); + unique = dsl_dataset_unique(ds); + delta = MAX(0, (int64_t)(new_reservation - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = new_reservation; + mutex_exit(&ds->ds_lock); + + dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", + new_reservation, cr, tx); + + dsl_dir_diduse_space(ds->ds_dir, delta, 0, 0, tx); + +
spa_history_internal_log(LOG_DS_REFRESERV, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", + (longlong_t)new_reservation, + ds->ds_dir->dd_phys->dd_head_dataset_obj); +} + +int +dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds); + if (err) + return (err); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_reservation_check, + dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); + return (err); +}
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c Mon Oct 29 22:45:33 2007 -0700 @@ -26,6 +26,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/dmu.h> +#include <sys/dmu_objset.h> #include <sys/dmu_tx.h> #include <sys/dsl_dataset.h> #include <sys/dsl_dir.h> @@ -39,7 +40,7 @@ #include <sys/sunddi.h> #include "zfs_namecheck.h" -static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd); +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); @@ -518,13 +519,9 @@ void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, - dsl_dir_space_available(dd, NULL, 0, TRUE)); - mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, - dd->dd_phys->dd_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dd->dd_phys->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, @@ -590,15 +587,13 @@ } static uint64_t -dsl_dir_estimated_space(dsl_dir_t *dd) +dsl_dir_space_towrite(dsl_dir_t *dd) { - int64_t space; + uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); - space = dd->dd_phys->dd_used_bytes; - ASSERT(space >= 0); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); @@ -632,11 +627,9 @@ mutex_enter(&dd->dd_lock); if (dd->dd_phys->dd_quota != 0) quota = dd->dd_phys->dd_quota; - if (ondiskonly) { - used = dd->dd_used_bytes; - } else { - used = dsl_dir_estimated_space(dd); - } + used = dd->dd_used_bytes; + if (!ondiskonly) + used += dsl_dir_space_towrite(dd); if (dd == ancestor) used += delta; @@ -684,40 +677,50 @@ uint64_t tr_size; }; -/* - * Reserve space in this dsl_dir, to be used in this tx's txg. 
- * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). - */ static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, - boolean_t netfree, boolean_t noquota, list_t *tr_list, dmu_tx_t *tx) +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, + boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, + dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; - uint64_t est_used, quota, parent_rsrv; - int edquot = EDQUOT; + uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + struct tempreserve *tr; + int error = EDQUOT; int txgidx = txg & TXG_MASK; int i; - struct tempreserve *tr; ASSERT3U(txg, !=, 0); - ASSERT3S(asize, >=, 0); + ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); + /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ - est_used = dsl_dir_estimated_space(dd); + est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) - est_used += dd->dd_tempreserved[i]; + est_inflight += dd->dd_tempreserved[i]; + used_on_disk = dd->dd_used_bytes; /* - * If this transaction will result in a net free of space, we want - * to let it through. + * Check for dataset reference quota on first iteration. */ - if (netfree || noquota || dd->dd_phys->dd_quota == 0) + if (list_head(tr_list) == NULL && tx->tx_objset) { + dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + error = dsl_dataset_check_quota(ds, checkrefquota, + asize, est_inflight, &used_on_disk); + if (error) { + mutex_exit(&dd->dd_lock); + return (error); + } + } + + /* + * If this transaction will result in a net free of space, + * we want to let it through. 
+ */ + if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) quota = UINT64_MAX; else quota = dd->dd_phys->dd_quota; @@ -735,34 +738,31 @@ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); if (poolsize < quota) { quota = poolsize; - edquot = ENOSPC; + error = ENOSPC; } } /* * If they are requesting more space, and our current estimate - * is over quota. They get to try again unless the actual + * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ - if (asize > 0 && est_used > quota) { - if (dd->dd_space_towrite[txg & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 || - dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 || - dd->dd_used_bytes < quota) - edquot = ERESTART; - dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " + if (used_on_disk + est_inflight > quota) { + if (est_inflight > 0 || used_on_disk < quota) + error = ERESTART; + dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", - dd->dd_used_bytes>>10, est_used>>10, - quota>>10, asize>>10, edquot); + used_on_disk>>10, est_inflight>>10, + quota>>10, asize>>10, error); mutex_exit(&dd->dd_lock); - return (edquot); + return (error); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txgidx] += asize; - parent_rsrv = parent_delta(dd, est_used, asize); + parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize); mutex_exit(&dd->dd_lock); tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP); @@ -775,7 +775,7 @@ boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, ismos, tr_list, tx)); + parent_rsrv, netfree, ismos, TRUE, tr_list, tx)); } else { return (0); } @@ -783,25 +783,30 @@ /* * Reserve space in this dsl_dir, to be used in this tx's txg. 
- * After the space has been dirtied (and thus - * dsl_dir_willuse_space() has been called), the reservation should - * be canceled, using dsl_dir_tempreserve_clear(). + * After the space has been dirtied (and dsl_dir_willuse_space() + * has been called), the reservation should be canceled, using + * dsl_dir_tempreserve_clear(). */ int -dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx) +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, + uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) { int err = 0; list_t *tr_list; + if (asize == 0) { + *tr_cookiep = NULL; + return (0); + } + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); - ASSERT3S(asize, >=, 0); + ASSERT3S(asize, >, 0); ASSERT3S(fsize, >=, 0); err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE, - tr_list, tx); + asize > usize, tr_list, tx); if (err == 0) { struct tempreserve *tr; @@ -835,6 +840,9 @@ ASSERT3U(tx->tx_txg, !=, 0); + if (tr_cookie == NULL) + return; + while (tr = list_head(tr_list)) { if (tr->tr_ds == NULL) { arc_tempreserve_clear(tr->tr_size); @@ -867,7 +875,7 @@ if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - est_used = dsl_dir_estimated_space(dd); + est_used = dsl_dir_space_towrite(dd) + dd->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); @@ -924,14 +932,13 @@ /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the - * pending changes could under-estimat the amount of space to be + * pending changes could under-estimate the amount of space to be * freed up. 
*/ - towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] + - dd->dd_space_towrite[2] + dd->dd_space_towrite[3]; + towrite = dsl_dir_space_towrite(dd); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (new_quota < dd->dd_phys->dd_reserved || - new_quota < dsl_dir_estimated_space(dd))) { + new_quota < dd->dd_used_bytes + towrite)) { err = ENOSPC; } mutex_exit(&dd->dd_lock); @@ -978,7 +985,7 @@ return (err); } -static int +int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; @@ -1028,15 +1035,15 @@ uint64_t used; int64_t delta; + dmu_buf_will_dirty(dd->dd_dbuf, tx); + mutex_enter(&dd->dd_lock); used = dd->dd_used_bytes; delta = MAX(used, new_reservation) - MAX(used, dd->dd_phys->dd_reserved); + dd->dd_phys->dd_reserved = new_reservation; mutex_exit(&dd->dd_lock); - dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_reserved = new_reservation; - if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_prop.c Mon Oct 29 22:45:33 2007 -0700 @@ -375,6 +375,24 @@ dd->dd_phys->dd_head_dataset_obj); } +void +dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t zapobj = dd->dd_phys->dd_props_zapobj; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); + + dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); + + spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, + "%s=%llu dataset = %llu", name, (u_longlong_t)val, + dd->dd_phys->dd_head_dataset_obj); +} + int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, const void *buf)
--- a/usr/src/uts/common/fs/zfs/dsl_synctask.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c Mon Oct 29 22:45:33 2007 -0700 @@ -158,7 +158,7 @@ * Check for sufficient space. */ dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, - dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx); + dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx); /* don't bother trying again */ if (dstg->dstg_err == ERESTART) dstg->dstg_err = EAGAIN;
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h Mon Oct 29 22:45:33 2007 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -65,6 +65,7 @@ uint64_t tx_space_towrite; uint64_t tx_space_tofree; uint64_t tx_space_tooverwrite; + uint64_t tx_space_tounref; refcount_t tx_space_written; refcount_t tx_space_freed; #endif @@ -87,6 +88,7 @@ uint64_t txh_space_towrite; uint64_t txh_space_tofree; uint64_t txh_space_tooverwrite; + uint64_t txh_space_tounref; #ifdef ZFS_DEBUG enum dmu_tx_hold_type txh_type; uint64_t txh_arg1;
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Mon Oct 29 22:45:33 2007 -0700 @@ -55,6 +55,13 @@ */ #define DS_FLAG_NOPROMOTE (1ULL<<1) +/* + * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly + * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, + * refquota/refreservations). + */ +#define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) + typedef struct dsl_dataset_phys { uint64_t ds_dir_obj; uint64_t ds_prev_snap_obj; @@ -114,6 +121,9 @@ /* for objset_open() */ kmutex_t ds_opening_lock; + uint64_t ds_reserved; /* cached refreservation */ + uint64_t ds_quota; /* cached refquota */ + /* Protected by ds_lock; keep at end of struct for better locality */ char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; @@ -121,6 +131,9 @@ #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) +#define DS_UNIQUE_IS_ACCURATE(ds) \ + (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, void *tag, dsl_dataset_t **dsp); int dsl_dataset_open(const char *name, int mode, void *tag, @@ -179,6 +192,13 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); +int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used); +int dsl_dataset_set_quota(const char *dsname, uint64_t quota); +void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, + dmu_tx_t *tx); +int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); + #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h Mon Oct 29 22:45:33 2007 -0700 @@ -110,7 +110,8 @@ void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx); + uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, + dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); void dsl_dir_diduse_space(dsl_dir_t *dd, @@ -119,6 +120,7 @@ int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); +int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS"
--- a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h Mon Oct 29 22:45:33 2007 -0700 @@ -67,6 +67,8 @@ int intsz, int numints, const void *buf); int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, const void *buf); +void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + cred_t *cr, dmu_tx_t *tx); void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv,
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Mon Oct 29 22:45:33 2007 -0700 @@ -1411,6 +1411,12 @@ return (error); break; + case ZFS_PROP_REFQUOTA: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_quota(name, intval)) != 0) + return (error); + break; + case ZFS_PROP_RESERVATION: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = dsl_dir_set_reservation(name, @@ -1418,6 +1424,13 @@ return (error); break; + case ZFS_PROP_REFRESERVATION: + if ((error = nvpair_value_uint64(elem, &intval)) != 0 || + (error = dsl_dataset_set_reservation(name, + intval)) != 0) + return (error); + break; + case ZFS_PROP_VOLSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || (error = zvol_set_volsize(name,
--- a/usr/src/uts/common/sys/fs/zfs.h Mon Oct 29 22:26:03 2007 -0700 +++ b/usr/src/uts/common/sys/fs/zfs.h Mon Oct 29 22:45:33 2007 -0700 @@ -98,6 +98,8 @@ ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, + ZFS_PROP_REFQUOTA, + ZFS_PROP_REFRESERVATION, ZFS_NUM_PROPS } zfs_prop_t; @@ -251,6 +253,9 @@ #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_NORMALIZATION SPA_VERSION_9 +#define SPA_VERSION_REFRESERVATION SPA_VERSION_9 +#define SPA_VERSION_REFQUOTA SPA_VERSION_9 +#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -619,6 +624,8 @@ LOG_DS_ROLLBACK, LOG_DS_SNAPSHOT, LOG_DS_UPGRADE, + LOG_DS_REFQUOTA, + LOG_DS_REFRESERV, LOG_END } history_internal_events_t;