Mercurial > illumos > illumos-gate
changeset 4309:3dfde0f4662d
6542676 ARC needs to track meta-data memory overhead
6544743 state->arcs_size >= state->arcs_lsize (0xe8046200 >= 0xe8066200)
author:   maybee
date:     Thu, 24 May 2007 11:30:57 -0700
parents:  854a761722c7
children: 127d7ee782ad
files:    usr/src/uts/common/fs/zfs/arc.c usr/src/uts/common/fs/zfs/dbuf.c usr/src/uts/common/fs/zfs/dnode.c usr/src/uts/common/fs/zfs/spa.c usr/src/uts/common/fs/zfs/sys/arc.h
diffstat: 5 files changed, 222 insertions(+), 110 deletions(-)
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/arc.c Thu May 24 11:05:09 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/arc.c Thu May 24 11:30:57 2007 -0700 @@ -156,15 +156,19 @@ uint64_t zfs_arc_min; /* - * Note that buffers can be on one of 5 states: + * Note that buffers can be in one of 5 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache - * When there are no active references to the buffer, they - * are linked onto one of the lists in arc. These are the - * only buffers that can be evicted or deleted. + * When there are no active references to the buffer, they are + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitely. * * Anonymous buffers are buffers that are not associated with * a DVA. 
These are buffers that hold dirty block copies @@ -175,9 +179,9 @@ */ typedef struct arc_state { - list_t arcs_list; /* linked list of evictable buffer in state */ - uint64_t arcs_lsize; /* total size of buffers in the linked list */ - uint64_t arcs_size; /* total size of all buffers in this state */ + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; } arc_state_t; @@ -311,6 +315,9 @@ static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_meta_used; +static uint64_t arc_meta_limit; +static uint64_t arc_meta_max = 0; typedef struct arc_callback arc_callback_t; @@ -370,6 +377,7 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); +static int arc_evict_needed(arc_buf_contents_t type); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) @@ -723,19 +731,21 @@ if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { uint64_t delta = ab->b_size * ab->b_datacnt; + list_t *list = &ab->b_state->arcs_list[ab->b_type]; + uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); mutex_enter(&ab->b_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&ab->b_state->arcs_list, ab); + list_remove(list, ab); if (GHOST_STATE(ab->b_state)) { ASSERT3U(ab->b_datacnt, ==, 0); ASSERT3P(ab->b_buf, ==, NULL); delta = ab->b_size; } ASSERT(delta > 0); - ASSERT3U(ab->b_state->arcs_lsize, >=, delta); - atomic_add_64(&ab->b_state->arcs_lsize, -delta); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); mutex_exit(&ab->b_state->arcs_mtx); /* remove the prefetch flag is we get a reference */ if (ab->b_flags & ARC_PREFETCH) @@ -754,13 +764,14 @@ if 
(((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { + uint64_t *size = &state->arcs_lsize[ab->b_type]; + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list, ab); + list_insert_head(&state->arcs_list[ab->b_type], ab); ASSERT(ab->b_datacnt > 0); - atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); - ASSERT3U(state->arcs_size, >=, state->arcs_lsize); + atomic_add_64(size, ab->b_size * ab->b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -791,12 +802,13 @@ if (refcnt == 0) { if (old_state != arc_anon) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + uint64_t *size = &old_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list, ab); + list_remove(&old_state->arcs_list[ab->b_type], ab); /* * If prefetching out of the ghost cache, @@ -807,19 +819,20 @@ ASSERT(ab->b_buf == NULL); from_delta = ab->b_size; } - ASSERT3U(old_state->arcs_lsize, >=, from_delta); - atomic_add_64(&old_state->arcs_lsize, -from_delta); + ASSERT3U(*size, >=, from_delta); + atomic_add_64(size, -from_delta); if (use_mutex) mutex_exit(&old_state->arcs_mtx); } if (new_state != arc_anon) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + uint64_t *size = &new_state->arcs_lsize[ab->b_type]; if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list, ab); + list_insert_head(&new_state->arcs_list[ab->b_type], ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -827,9 +840,8 @@ ASSERT(ab->b_buf == NULL); to_delta = ab->b_size; } - atomic_add_64(&new_state->arcs_lsize, to_delta); - ASSERT3U(new_state->arcs_size + to_delta, >=, - new_state->arcs_lsize); + atomic_add_64(size, to_delta); + ASSERT3U(new_state->arcs_size + to_delta, >=, *size); if (use_mutex) mutex_exit(&new_state->arcs_mtx); @@ 
-851,6 +863,41 @@ ab->b_state = new_state; } +void +arc_space_consume(uint64_t space) +{ + atomic_add_64(&arc_meta_used, space); + atomic_add_64(&arc_size, space); +} + +void +arc_space_return(uint64_t space) +{ + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + atomic_add_64(&arc_meta_used, -space); + ASSERT(arc_size >= space); + atomic_add_64(&arc_size, -space); +} + +void * +arc_data_buf_alloc(uint64_t size) +{ + if (arc_evict_needed(ARC_BUFC_DATA)) + cv_signal(&arc_reclaim_thr_cv); + atomic_add_64(&arc_size, size); + return (zio_data_buf_alloc(size)); +} + +void +arc_data_buf_free(void *buf, uint64_t size) +{ + zio_data_buf_free(buf, size); + ASSERT(arc_size >= size); + atomic_add_64(&arc_size, -size); +} + arc_buf_t * arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) { @@ -955,17 +1002,21 @@ if (!recycle) { if (type == ARC_BUFC_METADATA) { zio_buf_free(buf->b_data, size); + arc_space_return(size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf->b_data, size); + atomic_add_64(&arc_size, -size); } - atomic_add_64(&arc_size, -size); } if (list_link_active(&buf->b_hdr->b_arc_node)) { + uint64_t *cnt = &state->arcs_lsize[type]; + ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); ASSERT(state != arc_anon); - ASSERT3U(state->arcs_lsize, >=, size); - atomic_add_64(&state->arcs_lsize, -size); + + ASSERT3U(*cnt, >=, size); + atomic_add_64(cnt, -size); } ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); @@ -1125,6 +1176,7 @@ arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; arc_buf_hdr_t *ab, *ab_prev = NULL; + list_t *list = &state->arcs_list[type]; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; @@ -1136,8 +1188,8 @@ mutex_enter(&state->arcs_mtx); mutex_enter(&evicted_state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = 
list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && @@ -1216,6 +1268,7 @@ arc_evict_ghost(arc_state_t *state, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; + list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; @@ -1223,8 +1276,8 @@ ASSERT(GHOST_STATE(state)); top: mutex_enter(&state->arcs_mtx); - for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { - ab_prev = list_prev(&state->arcs_list, ab); + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); hash_lock = HDR_LOCK(ab); if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); @@ -1249,6 +1302,12 @@ } mutex_exit(&state->arcs_mtx); + if (list == &state->arcs_list[ARC_BUFC_DATA] && + (bytes < 0 || bytes_deleted < bytes)) { + list = &state->arcs_list[ARC_BUFC_METADATA]; + goto top; + } + if (bufs_skipped) { ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); ASSERT(bytes >= 0); @@ -1266,17 +1325,25 @@ top_sz = arc_anon->arcs_size + arc_mru->arcs_size; - if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); - (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); + (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_DATA); + top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + } + + if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); + (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_METADATA); top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; if (mru_over > 0) { - if (arc_mru_ghost->arcs_lsize > 0) { - todelete 
= MIN(arc_mru_ghost->arcs_lsize, mru_over); + if (arc_mru_ghost->arcs_size > 0) { + todelete = MIN(arc_mru_ghost->arcs_size, mru_over); arc_evict_ghost(arc_mru_ghost, todelete); } } @@ -1284,17 +1351,28 @@ if ((arc_over = arc_size - arc_c) > 0) { int64_t tbl_over; - if (arc_mfu->arcs_lsize > 0) { - int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); + if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); (void) arc_evict(arc_mfu, toevict, FALSE, - ARC_BUFC_UNDEF); + ARC_BUFC_DATA); + arc_over = arc_size - arc_c; } - tbl_over = arc_size + arc_mru_ghost->arcs_lsize + - arc_mfu_ghost->arcs_lsize - arc_c*2; - - if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { - todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); + if (arc_over > 0 && + arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t toevict = + MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], + arc_over); + (void) arc_evict(arc_mfu, toevict, FALSE, + ARC_BUFC_METADATA); + } + + tbl_over = arc_size + arc_mru_ghost->arcs_size + + arc_mfu_ghost->arcs_size - arc_c * 2; + + if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { + todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); arc_evict_ghost(arc_mfu_ghost, todelete); } } @@ -1328,10 +1406,14 @@ void arc_flush(void) { - while (list_head(&arc_mru->arcs_list)) - (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); - while (list_head(&arc_mfu->arcs_list)) - (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) + (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_DATA); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) + (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_METADATA); + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) + (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_DATA); + while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) + (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_METADATA); arc_evict_ghost(arc_mru_ghost, -1); 
arc_evict_ghost(arc_mfu_ghost, -1); @@ -1408,23 +1490,6 @@ if (availrmem < swapfs_minfree + swapfs_reserve + extra) return (1); - /* - * If zio data pages are being allocated out of a separate heap segment, - * then check that the size of available vmem for this area remains - * above 1/4th free. This needs to be done when the size of the - * non-default segment is smaller than physical memory, so we could - * conceivably run out of VA in that segment before running out of - * physical memory. - */ - if (zio_arena != NULL) { - size_t arc_ziosize = - btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); - - if ((physmem > arc_ziosize) && - (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) - return (1); - } - #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the @@ -1459,12 +1524,13 @@ extern kmem_cache_t *zio_data_buf_cache[]; #ifdef _KERNEL - /* - * First purge some DNLC entries, in case the DNLC is using - * up too much memory. - */ - dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); - + if (arc_meta_used >= arc_meta_limit) { + /* + * We are exceeding our meta-data cache limit. + * Purge some DNLC entries to release holds on meta-data. + */ + dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); + } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. @@ -1521,11 +1587,10 @@ /* reset the growth delay for every reclaim */ growtime = lbolt + (arc_grow_retry * hz); - ASSERT(growtime > 0); arc_kmem_reap_now(last_reclaim); - } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { + } else if (arc_no_grow && lbolt >= growtime) { arc_no_grow = FALSE; } @@ -1613,8 +1678,23 @@ * prior to insert. 
*/ static int -arc_evict_needed() +arc_evict_needed(arc_buf_contents_t type) { + if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) + return (1); + +#ifdef _KERNEL + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this area remains + * above about 1/32nd free. + */ + if (type == ARC_BUFC_DATA && zio_arena != NULL && + vmem_size(zio_arena, VMEM_FREE) < + (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) + return (1); +#endif + if (arc_reclaim_needed()) return (1); @@ -1657,14 +1737,15 @@ * We have not yet reached cache maximum size, * just allocate a new buffer. */ - if (!arc_evict_needed()) { + if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); goto out; } @@ -1679,20 +1760,23 @@ if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_p > mru_used) ? arc_mfu : arc_mru; + state = (arc_mfu->arcs_lsize[type] > 0 && + arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; + state = (arc_mru->arcs_lsize[type] > 0 && + mfu_space > arc_mfu->arcs_size) ? 
arc_mru : arc_mfu; } if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); + arc_space_consume(size); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + atomic_add_64(&arc_size, size); } - atomic_add_64(&arc_size, size); ARCSTAT_BUMP(arcstat_recycle_miss); } ASSERT(buf->b_data != NULL); @@ -1707,7 +1791,7 @@ atomic_add_64(&hdr->b_state->arcs_size, size); if (list_link_active(&hdr->b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize, size); + atomic_add_64(&hdr->b_state->arcs_lsize[type], size); } /* * If we are growing the cache, and we are adding anonymous @@ -1752,10 +1836,6 @@ if ((buf->b_flags & ARC_PREFETCH) != 0) { if (refcount_count(&buf->b_refcnt) == 0) { ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mru->arcs_mtx); - list_remove(&arc_mru->arcs_list, buf); - list_insert_head(&arc_mru->arcs_list, buf); - mutex_exit(&arc_mru->arcs_mtx); } else { buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); @@ -1815,10 +1895,6 @@ if ((buf->b_flags & ARC_PREFETCH) != 0) { ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); - mutex_enter(&arc_mfu->arcs_mtx); - list_remove(&arc_mfu->arcs_list, buf); - list_insert_head(&arc_mfu->arcs_list, buf); - mutex_exit(&arc_mfu->arcs_mtx); } ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = lbolt; @@ -1858,7 +1934,7 @@ VERIFY(arc_buf_remove_ref(buf, arg) == 1); } -/* a generic arc_done_func_t which you can use */ +/* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { @@ -2368,8 +2444,9 @@ ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); if (refcount_is_zero(&hdr->b_refcnt)) { - ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); + uint64_t *size = 
&hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3U(*size, >=, hdr->b_size); + atomic_add_64(size, -hdr->b_size); } hdr->b_datacnt -= 1; arc_cksum_verify(buf); @@ -2650,9 +2727,11 @@ if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { - dprintf("failing, arc_tempreserve=%lluK anon=%lluK " - "tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, arc_anon->arcs_lsize>>10, + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve>>10, + arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, + arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, tempreserve>>10, arc_c>>10); return (ERESTART); } @@ -2702,6 +2781,11 @@ arc_c = arc_c_max; arc_p = (arc_c >> 1); + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) + arc_c_min = arc_meta_limit / 2; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -2721,14 +2805,22 @@ mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + 
list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); buf_init(); @@ -2773,10 +2865,14 @@ mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); - list_destroy(&arc_mru->arcs_list); - list_destroy(&arc_mru_ghost->arcs_list); - list_destroy(&arc_mfu->arcs_list); - list_destroy(&arc_mfu_ghost->arcs_list); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); mutex_destroy(&arc_anon->arcs_mtx); mutex_destroy(&arc_mru->arcs_mtx);
--- a/usr/src/uts/common/fs/zfs/dbuf.c Thu May 24 11:05:09 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Thu May 24 11:30:57 2007 -0700 @@ -470,6 +470,7 @@ if (db->db_blkid == DB_BONUS_BLKID) { ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(512); if (db->db.db_size < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, @@ -657,6 +658,7 @@ if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -1277,8 +1279,10 @@ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) + if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } @@ -1396,6 +1400,7 @@ db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t)); return (db); } else { int blocksize = @@ -1422,6 +1427,7 @@ list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t)); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1489,6 +1495,7 @@ ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t)); } void @@ -1913,8 +1920,10 @@ ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - if (*datap != db->db.db_data) + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); + 
arc_space_return(DN_MAX_BONUSLEN); + } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr)
--- a/usr/src/uts/common/fs/zfs/dnode.c Thu May 24 11:05:09 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dnode.c Thu May 24 11:30:57 2007 -0700 @@ -284,6 +284,7 @@ list_insert_head(&os->os_dnodes, dn); mutex_exit(&os->os_lock); + arc_space_consume(sizeof (dnode_t)); return (dn); } @@ -318,6 +319,7 @@ dn->dn_bonus = NULL; } kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t)); } void @@ -601,9 +603,10 @@ } if ((dn = children_dnodes[idx]) == NULL) { + dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; dnode_t *winner; - dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx, - db, object); + + dn = dnode_create(os, dnp, db, object); winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); if (winner != NULL) { dnode_destroy(dn);
--- a/usr/src/uts/common/fs/zfs/spa.c Thu May 24 11:05:09 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/spa.c Thu May 24 11:30:57 2007 -0700 @@ -2230,7 +2230,7 @@ { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + arc_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2266,7 +2266,7 @@ spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - data = zio_data_buf_alloc(size); + data = arc_data_buf_alloc(size); if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Thu May 24 11:05:09 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/arc.h Thu May 24 11:30:57 2007 -0700 @@ -55,9 +55,9 @@ }; typedef enum arc_buf_contents { - ARC_BUFC_UNDEF, /* buffer contents undefined */ ARC_BUFC_DATA, /* buffer contains data */ - ARC_BUFC_METADATA /* buffer contains metadata */ + ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_NUMTYPES } arc_buf_contents_t; /* * These are the flags we pass into calls to the arc @@ -67,6 +67,10 @@ #define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ #define ARC_CACHED (1 << 4) /* I/O was already in cache */ +void arc_space_consume(uint64_t space); +void arc_space_return(uint64_t space); +void *arc_data_buf_alloc(uint64_t space); +void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); void arc_buf_add_ref(arc_buf_t *buf, void *tag);