changeset 6987:877c018eb06c

6628282 (arc_stats.arcstat_l2_writes_sent.value.ui64) == (arc_stats.arcstat_l2_writes_done.value.ui64)
6672464 L2ARC doesn't need startup grace interval
6693918 l2arc_fini() list->list_head.list_next == node, file: ../../common/os/list.c, line: 77
6701480 zpool command hang while add or remove cache from storage pool
6703384 assertion failed: lold->list_next != 0L, file: ../../common/os/list.c, line: 116
6709301 An empty L2ARC cache device is slow to warm up
6710331 L2ARC can hang during heavy error load
author brendan
date Tue, 01 Jul 2008 10:31:58 -0700
parents 1e7638a44ce6
children 6e7ccf1f1466
files usr/src/uts/common/fs/zfs/arc.c usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/spa_impl.h
diffstat 3 files changed, 219 insertions(+), 129 deletions(-)
--- a/usr/src/uts/common/fs/zfs/arc.c	Tue Jul 01 10:11:12 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Tue Jul 01 10:31:58 2008 -0700
@@ -163,6 +163,11 @@
 static int arc_dead;
 
 /*
+ * The arc has filled available memory and has now warmed up.
+ */
+static boolean_t arc_warm;
+
+/*
  * These tunables are for performance analysis.
  */
 uint64_t zfs_arc_max;
@@ -466,10 +471,9 @@
 #define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
 #define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
 #define	ARC_DONT_L2CACHE	(1 << 16)	/* originated by prefetch */
-#define	ARC_L2_READING		(1 << 17)	/* L2ARC read in progress */
-#define	ARC_L2_WRITING		(1 << 18)	/* L2ARC write in progress */
-#define	ARC_L2_EVICTED		(1 << 19)	/* evicted during I/O */
-#define	ARC_L2_WRITE_HEAD	(1 << 20)	/* head of write list */
+#define	ARC_L2_WRITING		(1 << 17)	/* L2ARC write in progress */
+#define	ARC_L2_EVICTED		(1 << 18)	/* evicted during I/O */
+#define	ARC_L2_WRITE_HEAD	(1 << 19)	/* head of write list */
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -478,7 +482,8 @@
 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
 #define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 #define	HDR_DONT_L2CACHE(hdr)	((hdr)->b_flags & ARC_DONT_L2CACHE)
-#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_L2_READING)
+#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
+				    (hdr)->b_l2hdr != NULL)
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
@@ -527,7 +532,6 @@
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		4		/* num of writes */
-#define	L2ARC_FEED_DELAY	180		/* starting grace */
 #define	L2ARC_FEED_SECS		1		/* caching interval */
 
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
@@ -537,6 +541,7 @@
  * L2ARC Performance Tunables
  */
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
@@ -549,6 +554,7 @@
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_write;	/* desired write size, bytes */
+	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
 	uint64_t		l2ad_evict;	/* last addr eviction reached */
@@ -1885,6 +1891,7 @@
 			growtime = lbolt + (arc_grow_retry * hz);
 
 			arc_kmem_reap_now(last_reclaim);
+			arc_warm = B_TRUE;
 
 		} else if (arc_no_grow && lbolt >= growtime) {
 			arc_no_grow = FALSE;
@@ -2282,7 +2289,7 @@
 	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 	    (found == hdr && HDR_L2_READING(hdr)));
 
-	hdr->b_flags &= ~(ARC_L2_READING|ARC_L2_EVICTED);
+	hdr->b_flags &= ~ARC_L2_EVICTED;
 	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
 		hdr->b_flags |= ARC_DONT_L2CACHE;
 
@@ -2471,6 +2478,8 @@
 	} else {
 		uint64_t size = BP_GET_LSIZE(bp);
 		arc_callback_t	*acb;
+		vdev_t *vd = NULL;
+		daddr_t addr;
 
 		if (hdr == NULL) {
 			/* this block is not in the cache */
@@ -2544,6 +2553,13 @@
 		if (GHOST_STATE(hdr->b_state))
 			arc_access(hdr, hash_lock);
 
+		if (hdr->b_l2hdr != NULL) {
+			vd = hdr->b_l2hdr->b_dev->l2ad_vdev;
+			addr = hdr->b_l2hdr->b_daddr;
+		}
+
+		mutex_exit(hash_lock);
+
 		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
 		    zbookmark_t *, zb);
@@ -2554,20 +2570,24 @@
 
 		if (l2arc_ndev != 0) {
 			/*
-			 * Read from the L2ARC if the following are true:
-			 * 1. This buffer has L2ARC metadata.
-			 * 2. This buffer isn't currently writing to the L2ARC.
+			 * Lock out device removal.
 			 */
-			if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr)) {
-				vdev_t *vd = hdr->b_l2hdr->b_dev->l2ad_vdev;
-				daddr_t addr = hdr->b_l2hdr->b_daddr;
+			spa_config_enter(spa, RW_READER, FTAG);
+
+			/*
+			 * Read from the L2ARC if the following are true:
+			 * 1. The L2ARC vdev was previously cached.
+			 * 2. This buffer still has L2ARC metadata.
+			 * 3. This buffer isn't currently writing to the L2ARC.
+			 * 4. The L2ARC entry wasn't evicted, which may
+			 *    also have invalidated the vdev.
+			 */
+			if (vd != NULL && hdr->b_l2hdr != NULL &&
+			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
 				l2arc_read_callback_t *cb;
 
 				if (vdev_is_dead(vd))
-					goto skip_l2arc;
-
-				hdr->b_flags |= ARC_L2_READING;
-				mutex_exit(hash_lock);
+					goto l2skip;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
@@ -2585,29 +2605,34 @@
 				 */
 				rzio = zio_read_phys(pio, vd, addr, size,
 				    buf->b_data, ZIO_CHECKSUM_OFF,
-				    l2arc_read_done, cb, priority,
-				    flags | ZIO_FLAG_DONT_CACHE, B_FALSE);
+				    l2arc_read_done, cb, priority, flags |
+				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL,
+				    B_FALSE);
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
-
-				if (*arc_flags & ARC_WAIT)
-					return (zio_wait(rzio));
-
-				ASSERT(*arc_flags & ARC_NOWAIT);
-				zio_nowait(rzio);
-				return (0);
+				spa_config_exit(spa, FTAG);
+
+				if (*arc_flags & ARC_NOWAIT) {
+					zio_nowait(rzio);
+					return (0);
+				}
+
+				ASSERT(*arc_flags & ARC_WAIT);
+				if (zio_wait(rzio) == 0)
+					return (0);
+
+				/* l2arc read error; goto zio_read() */
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
+l2skip:
+				spa_config_exit(spa, FTAG);
 			}
 		}
 
-skip_l2arc:
-		mutex_exit(hash_lock);
-
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
 		    arc_read_done, buf, priority, flags, zb);
 
@@ -3326,6 +3351,7 @@
 	    TS_RUN, minclsyspri);
 
 	arc_dead = FALSE;
+	arc_warm = B_FALSE;
 
 	if (zfs_write_limit_max == 0)
 		zfs_write_limit_max = physmem * PAGESIZE >>
@@ -3466,21 +3492,33 @@
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
- * 5. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static.  Instead of searching from tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
  * device is written to in a rotor fashion, sweeping writes through
  * available space then repeating.
  *
- * 6. The L2ARC does not store dirty content.  It never needs to flush
+ * 7. The L2ARC does not store dirty content.  It never needs to flush
  * write buffers back to disk based storage.
  *
- * 7. If an ARC buffer is written (and dirtied) which also exists in the
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
  * L2ARC, the now stale L2ARC buffer is immediately dropped.
  *
  * The performance of the L2ARC can be tweaked by a number of tunables, which
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
+ *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_feed_secs		seconds between L2ARC writing
@@ -3505,16 +3543,24 @@
 
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
- * This is called with l2arc_dev_mtx held, which also locks out spa removal.
+ * If a device is returned, this also returns holding the spa config lock.
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
-	l2arc_dev_t *next, *first;
+	l2arc_dev_t *first, *next = NULL;
+
+	/*
+	 * Lock out the removal of spas (spa_namespace_lock), then removal
+	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
+	 * both locks will be dropped and a spa config lock held instead.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	mutex_enter(&l2arc_dev_mtx);
 
 	/* if there are no vdevs, there is nothing to do */
 	if (l2arc_ndev == 0)
-		return (NULL);
+		goto out;
 
 	first = NULL;
 	next = l2arc_dev_last;
@@ -3538,14 +3584,49 @@
 
 	/* if we were unable to find any usable vdevs, return NULL */
 	if (vdev_is_dead(next->l2ad_vdev))
-		return (NULL);
+		next = NULL;
 
 	l2arc_dev_last = next;
 
+out:
+	mutex_exit(&l2arc_dev_mtx);
+
+	/*
+	 * Grab the config lock to prevent the 'next' device from being
+	 * removed while we are writing to it.
+	 */
+	if (next != NULL)
+		spa_config_enter(next->l2ad_spa, RW_READER, next);
+	mutex_exit(&spa_namespace_lock);
+
 	return (next);
 }
 
 /*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write()
+{
+	list_t *buflist;
+	l2arc_data_free_t *df, *df_prev;
+
+	mutex_enter(&l2arc_free_on_write_mtx);
+	buflist = l2arc_free_on_write;
+
+	for (df = list_tail(buflist); df; df = df_prev) {
+		df_prev = list_prev(buflist, df);
+		ASSERT(df->l2df_data != NULL);
+		ASSERT(df->l2df_func != NULL);
+		df->l2df_func(df->l2df_data, df->l2df_size);
+		list_remove(buflist, df);
+		kmem_free(df, sizeof (l2arc_data_free_t));
+	}
+
+	mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
  */
@@ -3555,8 +3636,8 @@
 	l2arc_write_callback_t *cb;
 	l2arc_dev_t *dev;
 	list_t *buflist;
-	l2arc_data_free_t *df, *df_prev;
 	arc_buf_hdr_t *head, *ab, *ab_prev;
+	l2arc_buf_hdr_t *abl2;
 	kmutex_t *hash_lock;
 
 	cb = zio->io_private;
@@ -3594,9 +3675,13 @@
 
 		if (zio->io_error != 0) {
 			/*
-			 * Error - invalidate L2ARC entry.
+			 * Error - drop L2ARC entry.
 			 */
+			list_remove(buflist, ab);
+			abl2 = ab->b_l2hdr;
 			ab->b_l2hdr = NULL;
+			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
 		}
 
 		/*
@@ -3612,20 +3697,7 @@
 	kmem_cache_free(hdr_cache, head);
 	mutex_exit(&l2arc_buflist_mtx);
 
-	/*
-	 * Free buffers that were tagged for destruction.
-	 */
-	mutex_enter(&l2arc_free_on_write_mtx);
-	buflist = l2arc_free_on_write;
-	for (df = list_tail(buflist); df; df = df_prev) {
-		df_prev = list_prev(buflist, df);
-		ASSERT(df->l2df_data != NULL);
-		ASSERT(df->l2df_func != NULL);
-		df->l2df_func(df->l2df_data, df->l2df_size);
-		list_remove(buflist, df);
-		kmem_free(df, sizeof (l2arc_data_free_t));
-	}
-	mutex_exit(&l2arc_free_on_write_mtx);
+	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
@@ -3642,7 +3714,7 @@
 	arc_buf_t *buf;
 	zio_t *rzio;
 	kmutex_t *hash_lock;
-	int equal, err = 0;
+	int equal;
 
 	cb = zio->io_private;
 	ASSERT(cb != NULL);
@@ -3668,29 +3740,27 @@
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
-		if (zio->io_error != 0)
+		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
+		} else {
+			zio->io_error = EIO;
+		}
 		if (!equal)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
-		zio->io_flags &= ~ZIO_FLAG_DONT_CACHE;
-		rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp,
-		    buf->b_data, zio->io_size, arc_read_done, buf,
-		    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb);
-
-		/*
-		 * Since this is a seperate thread, we can wait on this
-		 * I/O whether there is an io_waiter or not.
-		 */
-		err = zio_wait(rzio);
-
-		/*
-		 * Let the resent I/O call arc_read_done() instead.
-		 * io_error is set to the reissued I/O error status.
-		 */
-		zio->io_done = NULL;
-		zio->io_waiter = NULL;
-		zio->io_error = err;
+		if (zio->io_waiter == NULL) {
+			/*
+			 * Let the resent I/O call arc_read_done() instead.
+			 */
+			zio->io_done = NULL;
+			zio->io_flags &= ~ZIO_FLAG_DONT_CACHE;
+
+			rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp,
+			    buf->b_data, zio->io_size, arc_read_done, buf,
+			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb);
+
+			(void) zio_nowait(rzio);
+		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
@@ -3752,8 +3822,6 @@
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 
-	ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
-
 	buflist = dev->l2ad_buflist;
 
 	if (buflist == NULL)
@@ -3767,7 +3835,7 @@
 		return;
 	}
 
-	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * dev->l2ad_write))) {
+	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
 		/*
 		 * When nearing the end of the device, evict to the end
 		 * before the device write hand jumps to the start.
@@ -3836,6 +3904,16 @@
 			arc_hdr_destroy(ab);
 		} else {
 			/*
+			 * Invalidate issued or about to be issued
+			 * reads, since we may be about to write
+			 * over this location.
+			 */
+			if (HDR_L2_READING(ab)) {
+				ARCSTAT_BUMP(arcstat_l2_evict_reading);
+				ab->b_flags |= ARC_L2_EVICTED;
+			}
+
+			/*
 			 * Tell ARC this no longer exists in L2ARC.
 			 */
 			if (ab->b_l2hdr != NULL) {
@@ -3851,16 +3929,6 @@
 			 * failed write.
 			 */
 			ab->b_flags &= ~ARC_L2_WRITING;
-
-			/*
-			 * Invalidate issued or about to be issued
-			 * reads, since we may be about to write
-			 * over this location.
-			 */
-			if (HDR_L2_READING(ab)) {
-				ARCSTAT_BUMP(arcstat_l2_evict_reading);
-				ab->b_flags |= ARC_L2_EVICTED;
-			}
 		}
 		mutex_exit(hash_lock);
 	}
@@ -3877,21 +3945,18 @@
  * for reading until they have completed writing.
  */
 static void
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t *ab, *ab_prev, *head;
 	l2arc_buf_hdr_t *hdrl2;
 	list_t *list;
-	uint64_t passed_sz, write_sz, buf_sz;
-	uint64_t target_sz = dev->l2ad_write;
-	uint64_t headroom = dev->l2ad_write * l2arc_headroom;
+	uint64_t passed_sz, write_sz, buf_sz, headroom;
 	void *buf_data;
 	kmutex_t *hash_lock, *list_lock;
 	boolean_t have_lock, full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
 
-	ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
 	ASSERT(dev->l2ad_vdev != NULL);
 
 	pio = NULL;
@@ -3908,8 +3973,23 @@
 		list = l2arc_list_locked(try, &list_lock);
 		passed_sz = 0;
 
-		for (ab = list_tail(list); ab; ab = ab_prev) {
-			ab_prev = list_prev(list, ab);
+		/*
+		 * L2ARC fast warmup.
+		 *
+		 * Until the ARC is warm and starts to evict, read from the
+		 * head of the ARC lists rather than the tail.
+		 */
+		headroom = target_sz * l2arc_headroom;
+		if (arc_warm == B_FALSE)
+			ab = list_head(list);
+		else
+			ab = list_tail(list);
+
+		for (; ab; ab = ab_prev) {
+			if (arc_warm == B_FALSE)
+				ab_prev = list_next(list, ab);
+			else
+				ab_prev = list_prev(list, ab);
 
 			hash_lock = HDR_LOCK(ab);
 			have_lock = MUTEX_HELD(hash_lock);
@@ -4032,7 +4112,7 @@
 	 * Bump device hand to the device start if it is approaching the end.
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
-	if (dev->l2ad_hand >= (dev->l2ad_end - dev->l2ad_write)) {
+	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
 		spa_l2cache_space_update(dev->l2ad_vdev, 0,
 		    dev->l2ad_end - dev->l2ad_hand);
 		dev->l2ad_hand = dev->l2ad_start;
@@ -4053,8 +4133,7 @@
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
-	int interval;
-	boolean_t startup = B_TRUE;
+	uint64_t size;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4062,57 +4141,64 @@
 
 	while (l2arc_thread_exit == 0) {
 		/*
-		 * Initially pause for L2ARC_FEED_DELAY seconds as a grace
-		 * interval during boot, followed by l2arc_feed_secs seconds
-		 * thereafter.
+		 * Pause for l2arc_feed_secs seconds between writes.
 		 */
 		CALLB_CPR_SAFE_BEGIN(&cpr);
-		if (startup) {
-			interval = L2ARC_FEED_DELAY;
-			startup = B_FALSE;
-		} else {
-			interval = l2arc_feed_secs;
+		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
+		    lbolt + (hz * l2arc_feed_secs));
+		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+
+		/*
+		 * Quick check for L2ARC devices.
+		 */
+		mutex_enter(&l2arc_dev_mtx);
+		if (l2arc_ndev == 0) {
+			mutex_exit(&l2arc_dev_mtx);
+			continue;
 		}
-		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-		    lbolt + (hz * interval));
-		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
-
-		mutex_enter(&l2arc_dev_mtx);
+		mutex_exit(&l2arc_dev_mtx);
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
-		 * will return NULL if there are no l2arc devices or if they
-		 * are all faulted.
+		 * will return NULL if there are now no l2arc devices or if
+		 * they are all faulted.
+		 *
+		 * If a device is returned, its spa's config lock is also
+		 * held to prevent device removal.  l2arc_dev_get_next()
+		 * will grab and release l2arc_dev_mtx.
 		 */
-		if ((dev = l2arc_dev_get_next()) == NULL) {
-			mutex_exit(&l2arc_dev_mtx);
+		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
-		}
+
+		spa = dev->l2ad_spa;
+		ASSERT(spa != NULL);
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (arc_reclaim_needed()) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
-			mutex_exit(&l2arc_dev_mtx);
+			spa_config_exit(spa, dev);
 			continue;
 		}
 
-		spa = dev->l2ad_spa;
-		ASSERT(spa != NULL);
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
+		size = dev->l2ad_write;
+		if (arc_warm == B_FALSE)
+			size += dev->l2ad_boost;
+
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
-		l2arc_evict(dev, dev->l2ad_write, B_FALSE);
+		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
-		l2arc_write_buffers(spa, dev);
-		mutex_exit(&l2arc_dev_mtx);
+		l2arc_write_buffers(spa, dev, size);
+		spa_config_exit(spa, dev);
 	}
 
 	l2arc_thread_exit = 0;
@@ -4155,6 +4241,7 @@
 	adddev->l2ad_spa = spa;
 	adddev->l2ad_vdev = vd;
 	adddev->l2ad_write = l2arc_write_max;
+	adddev->l2ad_boost = l2arc_write_boost;
 	adddev->l2ad_start = start;
 	adddev->l2ad_end = end;
 	adddev->l2ad_hand = adddev->l2ad_start;
@@ -4190,12 +4277,6 @@
 	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
 
 	/*
-	 * We can only grab the spa config lock when cache device writes
-	 * complete.
-	 */
-	ASSERT3U(l2arc_writes_sent, ==, l2arc_writes_done);
-
-	/*
 	 * Find the device by vdev
 	 */
 	mutex_enter(&l2arc_dev_mtx);
@@ -4213,6 +4294,8 @@
 	 */
 	list_remove(l2arc_dev_list, remdev);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
+	atomic_dec_64(&l2arc_ndev);
+	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Clear all buflists and ARC references.  L2ARC device flush.
@@ -4221,9 +4304,6 @@
 	list_destroy(remdev->l2ad_buflist);
 	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
 	kmem_free(remdev, sizeof (l2arc_dev_t));
-
-	atomic_dec_64(&l2arc_ndev);
-	mutex_exit(&l2arc_dev_mtx);
 }
 
 void
@@ -4254,6 +4334,12 @@
 void
 l2arc_fini()
 {
+	/*
+	 * This is called from dmu_fini(), which is called from spa_fini();
+	 * Because of this, we can assume that all l2arc devices have
+	 * already been removed when the pools themselves were removed.
+	 */
+
 	mutex_enter(&l2arc_feed_thr_lock);
 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
 	l2arc_thread_exit = 1;
@@ -4261,6 +4347,8 @@
 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 	mutex_exit(&l2arc_feed_thr_lock);
 
+	l2arc_do_free_on_write();
+
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Tue Jul 01 10:11:12 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Tue Jul 01 10:31:58 2008 -0700
@@ -380,6 +380,9 @@
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
+/* spa namespace global mutex */
+extern kmutex_t spa_namespace_lock;
+
 /*
  * SPA configuration functions in spa_config.c
  */
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Jul 01 10:11:12 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Tue Jul 01 10:31:58 2008 -0700
@@ -175,7 +175,6 @@
 };
 
 extern const char *spa_config_path;
-extern kmutex_t spa_namespace_lock;
 
 #ifdef	__cplusplus
 }