Mercurial > illumos > illumos-gate
annotate usr/src/uts/common/fs/zfs/dmu.c @ 2237:45affe88ed99
6416482 filebench oltp workload hangs in zfs
6440499 zil should avoid txg_wait_synced() and use dmu_sync() to issue parallel IOs when fsyncing
author | maybee |
---|---|
date | Mon, 19 Jun 2006 19:31:35 -0700 |
parents | 712a788c2dfd |
children | 2fa3fd1db808 |
rev | line source |
---|---|
789 | 1 /* |
2 * CDDL HEADER START | |
3 * | |
4 * The contents of this file are subject to the terms of the | |
1544 | 5 * Common Development and Distribution License (the "License"). |
6 * You may not use this file except in compliance with the License. | |
789 | 7 * |
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 * or http://www.opensolaris.org/os/licensing. | |
10 * See the License for the specific language governing permissions | |
11 * and limitations under the License. | |
12 * | |
13 * When distributing Covered Code, include this CDDL HEADER in each | |
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 * If applicable, add the following below this CDDL HEADER, with the | |
16 * fields enclosed by brackets "[]" replaced with your own identifying | |
17 * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 * | |
19 * CDDL HEADER END | |
20 */ | |
21 /* | |
1544 | 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. |
789 | 23 * Use is subject to license terms. |
24 */ | |
25 | |
26 #pragma ident "%Z%%M% %I% %E% SMI" | |
27 | |
28 #include <sys/dmu.h> | |
29 #include <sys/dmu_impl.h> | |
30 #include <sys/dmu_tx.h> | |
31 #include <sys/dbuf.h> | |
32 #include <sys/dnode.h> | |
33 #include <sys/zfs_context.h> | |
34 #include <sys/dmu_objset.h> | |
35 #include <sys/dmu_traverse.h> | |
36 #include <sys/dsl_dataset.h> | |
37 #include <sys/dsl_dir.h> | |
38 #include <sys/dsl_pool.h> | |
2199 | 39 #include <sys/dsl_synctask.h> |
789 | 40 #include <sys/dmu_zfetch.h> |
41 #include <sys/zfs_ioctl.h> | |
42 #include <sys/zap.h> | |
1544 | 43 #include <sys/zio_checksum.h> |
789 | 44 |
/*
 * Per-object-type information table.  It is indexed by a
 * dmu_object_type_t value (backup_cb() looks up dmu_ot[type].ot_byteswap)
 * and has exactly DMU_OT_NUMTYPES slots.
 *
 * Each entry holds, in order: the routine used to byteswap a block of
 * that type, a boolean flag, and a human-readable type name.
 *
 * NOTE(review): the boolean is presumably the "is metadata" flag of
 * dmu_object_type_info_t, and the entry order presumably has to match
 * the object type enumeration -- confirm both against sys/dmu.h.
 */
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{ byteswap_uint8_array, TRUE, "unallocated" },
	{ zap_byteswap, TRUE, "object directory" },
	{ byteswap_uint64_array, TRUE, "object array" },
	{ byteswap_uint8_array, TRUE, "packed nvlist" },
	{ byteswap_uint64_array, TRUE, "packed nvlist size" },
	{ byteswap_uint64_array, TRUE, "bplist" },
	{ byteswap_uint64_array, TRUE, "bplist header" },
	{ byteswap_uint64_array, TRUE, "SPA space map header" },
	{ byteswap_uint64_array, TRUE, "SPA space map" },
	{ byteswap_uint64_array, TRUE, "ZIL intent log" },
	{ dnode_buf_byteswap, TRUE, "DMU dnode" },
	{ dmu_objset_byteswap, TRUE, "DMU objset" },
	{ byteswap_uint64_array, TRUE, "DSL directory" },
	{ zap_byteswap, TRUE, "DSL directory child map"},
	{ zap_byteswap, TRUE, "DSL dataset snap map" },
	{ zap_byteswap, TRUE, "DSL props" },
	{ byteswap_uint64_array, TRUE, "DSL dataset" },
	{ zfs_znode_byteswap, TRUE, "ZFS znode" },
	{ zfs_acl_byteswap, TRUE, "ZFS ACL" },
	{ byteswap_uint8_array, FALSE, "ZFS plain file" },
	{ zap_byteswap, TRUE, "ZFS directory" },
	{ zap_byteswap, TRUE, "ZFS master node" },
	{ zap_byteswap, TRUE, "ZFS delete queue" },
	{ byteswap_uint8_array, FALSE, "zvol object" },
	{ zap_byteswap, TRUE, "zvol prop" },
	{ byteswap_uint8_array, FALSE, "other uint8[]" },
	{ byteswap_uint64_array, FALSE, "other uint64[]" },
	{ zap_byteswap, TRUE, "other ZAP" },
	{ zap_byteswap, TRUE, "persistent error log" },
};
76 | |
77 int | |
1544 | 78 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, |
79 void *tag, dmu_buf_t **dbp) | |
789 | 80 { |
81 dnode_t *dn; | |
82 uint64_t blkid; | |
83 dmu_buf_impl_t *db; | |
1544 | 84 int err; |
789 | 85 |
1544 | 86 err = dnode_hold(os->os, object, FTAG, &dn); |
87 if (err) | |
88 return (err); | |
789 | 89 blkid = dbuf_whichblock(dn, offset); |
90 rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
1544 | 91 db = dbuf_hold(dn, blkid, tag); |
789 | 92 rw_exit(&dn->dn_struct_rwlock); |
1544 | 93 if (db == NULL) { |
94 err = EIO; | |
95 } else { | |
96 err = dbuf_read(db, NULL, DB_RF_CANFAIL); | |
97 if (err) { | |
98 dbuf_rele(db, tag); | |
99 db = NULL; | |
100 } | |
101 } | |
102 | |
789 | 103 dnode_rele(dn, FTAG); |
1544 | 104 *dbp = &db->db; |
105 return (err); | |
789 | 106 } |
107 | |
/*
 * Return DN_MAX_BONUSLEN, the largest bonus buffer length accepted
 * (restore_object() rejects records whose bonus length exceeds it).
 */
int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}
113 | |
/*
 * Hold the bonus buffer of 'object' under 'tag' and return it in *dbp.
 *
 * Returns 0 on success, or the error from dnode_hold() when the dnode
 * cannot be held (the original note here listed ENOENT and EIO); *dbp
 * is not written in that case.  A failed read of the bonus buffer is
 * never returned: the read is issued with DB_RF_MUST_SUCCEED and the
 * result is VERIFY()ed.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	int err, count;
	dmu_buf_impl_t *db;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		/*
		 * No bonus dbuf yet.  Drop the reader lock and retake it
		 * as writer; dn_bonus is tested again because it may have
		 * been set while the lock was not held.
		 */
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dn->dn_bonus = dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;
	rw_exit(&dn->dn_struct_rwlock);
	mutex_enter(&db->db_mtx);
	count = refcount_add(&db->db_holds, tag);
	mutex_exit(&db->db_mtx);
	/*
	 * NOTE(review): count == 1 presumably means this is the only hold
	 * on the bonus dbuf, in which case the dbuf also takes a reference
	 * on its dnode -- confirm against refcount_add().
	 */
	if (count == 1)
		dnode_add_ref(dn, db);
	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));

	*dbp = &db->db;
	return (0);
}
149 | |
1544 | 150 int |
151 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, | |
152 uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) | |
789 | 153 { |
1544 | 154 dnode_t *dn; |
789 | 155 dmu_buf_t **dbp; |
156 uint64_t blkid, nblks, i; | |
1544 | 157 uint32_t flags; |
158 int err; | |
159 zio_t *zio; | |
160 | |
161 ASSERT(length <= DMU_MAX_ACCESS); | |
789 | 162 |
163 if (length == 0) { | |
164 if (numbufsp) | |
165 *numbufsp = 0; | |
1544 | 166 *dbpp = NULL; |
167 return (0); | |
789 | 168 } |
169 | |
1544 | 170 flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; |
1731
1efa8b3d1296
6402598 'zfs destroy <fs>' can take a long time, stopping up the txg train
bonwick
parents:
1630
diff
changeset
|
171 if (length > zfetch_array_rd_sz) |
1544 | 172 flags |= DB_RF_NOPREFETCH; |
173 | |
174 err = dnode_hold(os->os, object, FTAG, &dn); | |
175 if (err) | |
176 return (err); | |
177 | |
789 | 178 rw_enter(&dn->dn_struct_rwlock, RW_READER); |
179 if (dn->dn_datablkshift) { | |
180 int blkshift = dn->dn_datablkshift; | |
181 nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - | |
182 P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; | |
183 } else { | |
184 ASSERT3U(offset + length, <=, dn->dn_datablksz); | |
185 nblks = 1; | |
186 } | |
1544 | 187 dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); |
789 | 188 |
1544 | 189 zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); |
789 | 190 blkid = dbuf_whichblock(dn, offset); |
191 for (i = 0; i < nblks; i++) { | |
1544 | 192 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); |
193 if (db == NULL) { | |
194 rw_exit(&dn->dn_struct_rwlock); | |
195 dmu_buf_rele_array(dbp, nblks, tag); | |
196 dnode_rele(dn, FTAG); | |
197 zio_nowait(zio); | |
198 return (EIO); | |
199 } | |
200 /* initiate async i/o */ | |
201 if (read && db->db_state == DB_UNCACHED) { | |
202 rw_exit(&dn->dn_struct_rwlock); | |
203 (void) dbuf_read(db, zio, flags); | |
204 rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
205 } | |
206 dbp[i] = &db->db; | |
789 | 207 } |
208 rw_exit(&dn->dn_struct_rwlock); | |
1544 | 209 dnode_rele(dn, FTAG); |
789 | 210 |
1544 | 211 /* wait for async i/o */ |
212 err = zio_wait(zio); | |
213 if (err) { | |
214 dmu_buf_rele_array(dbp, nblks, tag); | |
215 return (err); | |
789 | 216 } |
217 | |
1544 | 218 /* wait for other io to complete */ |
219 if (read) { | |
220 for (i = 0; i < nblks; i++) { | |
221 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; | |
222 mutex_enter(&db->db_mtx); | |
223 while (db->db_state == DB_READ || | |
224 db->db_state == DB_FILL) | |
225 cv_wait(&db->db_changed, &db->db_mtx); | |
226 if (db->db_state == DB_UNCACHED) | |
227 err = EIO; | |
228 mutex_exit(&db->db_mtx); | |
229 if (err) { | |
230 dmu_buf_rele_array(dbp, nblks, tag); | |
231 return (err); | |
232 } | |
233 } | |
234 } | |
789 | 235 |
1544 | 236 *numbufsp = nblks; |
237 *dbpp = dbp; | |
238 return (0); | |
789 | 239 } |
240 | |
241 void | |
1544 | 242 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) |
789 | 243 { |
244 int i; | |
245 dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; | |
246 | |
247 if (numbufs == 0) | |
248 return; | |
249 | |
1544 | 250 for (i = 0; i < numbufs; i++) { |
251 if (dbp[i]) | |
252 dbuf_rele(dbp[i], tag); | |
253 } | |
789 | 254 |
255 kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); | |
256 } | |
257 | |
258 void | |
259 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |
260 { | |
261 dnode_t *dn; | |
262 uint64_t blkid; | |
1544 | 263 int nblks, i, err; |
789 | 264 |
265 if (len == 0) { /* they're interested in the bonus buffer */ | |
266 dn = os->os->os_meta_dnode; | |
267 | |
268 if (object == 0 || object >= DN_MAX_OBJECT) | |
269 return; | |
270 | |
271 rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
272 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); | |
273 dbuf_prefetch(dn, blkid); | |
274 rw_exit(&dn->dn_struct_rwlock); | |
275 return; | |
276 } | |
277 | |
278 /* | |
279 * XXX - Note, if the dnode for the requested object is not | |
280 * already cached, we will do a *synchronous* read in the | |
281 * dnode_hold() call. The same is true for any indirects. | |
282 */ | |
1544 | 283 err = dnode_hold(os->os, object, FTAG, &dn); |
284 if (err != 0) | |
789 | 285 return; |
286 | |
287 rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
288 if (dn->dn_datablkshift) { | |
289 int blkshift = dn->dn_datablkshift; | |
290 nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - | |
291 P2ALIGN(offset, 1<<blkshift)) >> blkshift; | |
292 } else { | |
293 nblks = (offset < dn->dn_datablksz); | |
294 } | |
295 | |
296 if (nblks != 0) { | |
297 blkid = dbuf_whichblock(dn, offset); | |
298 for (i = 0; i < nblks; i++) | |
299 dbuf_prefetch(dn, blkid+i); | |
300 } | |
301 | |
302 rw_exit(&dn->dn_struct_rwlock); | |
303 | |
304 dnode_rele(dn, FTAG); | |
305 } | |
306 | |
1544 | 307 int |
789 | 308 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, |
309 uint64_t size, dmu_tx_t *tx) | |
310 { | |
1544 | 311 dnode_t *dn; |
312 int err = dnode_hold(os->os, object, FTAG, &dn); | |
313 if (err) | |
314 return (err); | |
789 | 315 ASSERT(offset < UINT64_MAX); |
316 ASSERT(size == -1ULL || size <= UINT64_MAX - offset); | |
317 dnode_free_range(dn, offset, size, tx); | |
318 dnode_rele(dn, FTAG); | |
1544 | 319 return (0); |
789 | 320 } |
321 | |
1544 | 322 int |
323 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
324 void *buf) | |
789 | 325 { |
326 dnode_t *dn; | |
327 dmu_buf_t **dbp; | |
1544 | 328 int numbufs, i, err; |
789 | 329 |
1544 | 330 /* |
331 * Deal with odd block sizes, where there can't be data past the | |
332 * first block. | |
333 */ | |
334 err = dnode_hold(os->os, object, FTAG, &dn); | |
335 if (err) | |
336 return (err); | |
789 | 337 if (dn->dn_datablkshift == 0) { |
338 int newsz = offset > dn->dn_datablksz ? 0 : | |
339 MIN(size, dn->dn_datablksz - offset); | |
340 bzero((char *)buf + newsz, size - newsz); | |
341 size = newsz; | |
342 } | |
343 dnode_rele(dn, FTAG); | |
344 | |
345 while (size > 0) { | |
346 uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); | |
347 int err; | |
348 | |
349 /* | |
350 * NB: we could do this block-at-a-time, but it's nice | |
351 * to be reading in parallel. | |
352 */ | |
1544 | 353 err = dmu_buf_hold_array(os, object, offset, mylen, |
354 TRUE, FTAG, &numbufs, &dbp); | |
355 if (err) | |
789 | 356 return (err); |
357 | |
358 for (i = 0; i < numbufs; i++) { | |
359 int tocpy; | |
360 int bufoff; | |
361 dmu_buf_t *db = dbp[i]; | |
362 | |
363 ASSERT(size > 0); | |
364 | |
365 bufoff = offset - db->db_offset; | |
366 tocpy = (int)MIN(db->db_size - bufoff, size); | |
367 | |
368 bcopy((char *)db->db_data + bufoff, buf, tocpy); | |
369 | |
370 offset += tocpy; | |
371 size -= tocpy; | |
372 buf = (char *)buf + tocpy; | |
373 } | |
1544 | 374 dmu_buf_rele_array(dbp, numbufs, FTAG); |
789 | 375 } |
376 return (0); | |
377 } | |
378 | |
379 void | |
380 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
381 const void *buf, dmu_tx_t *tx) | |
382 { | |
383 dmu_buf_t **dbp; | |
384 int numbufs, i; | |
385 | |
1544 | 386 VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, |
387 FALSE, FTAG, &numbufs, &dbp)); | |
789 | 388 |
389 for (i = 0; i < numbufs; i++) { | |
390 int tocpy; | |
391 int bufoff; | |
392 dmu_buf_t *db = dbp[i]; | |
393 | |
394 ASSERT(size > 0); | |
395 | |
396 bufoff = offset - db->db_offset; | |
397 tocpy = (int)MIN(db->db_size - bufoff, size); | |
398 | |
399 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); | |
400 | |
401 if (tocpy == db->db_size) | |
402 dmu_buf_will_fill(db, tx); | |
403 else | |
404 dmu_buf_will_dirty(db, tx); | |
405 | |
406 bcopy(buf, (char *)db->db_data + bufoff, tocpy); | |
407 | |
408 if (tocpy == db->db_size) | |
409 dmu_buf_fill_done(db, tx); | |
410 | |
411 offset += tocpy; | |
412 size -= tocpy; | |
413 buf = (char *)buf + tocpy; | |
414 } | |
1544 | 415 dmu_buf_rele_array(dbp, numbufs, FTAG); |
789 | 416 } |
417 | |
418 #ifdef _KERNEL | |
419 int | |
420 dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
421 uio_t *uio, dmu_tx_t *tx) | |
422 { | |
423 dmu_buf_t **dbp; | |
424 int numbufs, i; | |
425 int err = 0; | |
426 | |
1544 | 427 err = dmu_buf_hold_array(os, object, offset, size, |
428 FALSE, FTAG, &numbufs, &dbp); | |
429 if (err) | |
430 return (err); | |
789 | 431 |
432 for (i = 0; i < numbufs; i++) { | |
433 int tocpy; | |
434 int bufoff; | |
435 dmu_buf_t *db = dbp[i]; | |
436 | |
437 ASSERT(size > 0); | |
438 | |
439 bufoff = offset - db->db_offset; | |
440 tocpy = (int)MIN(db->db_size - bufoff, size); | |
441 | |
442 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); | |
443 | |
444 if (tocpy == db->db_size) | |
445 dmu_buf_will_fill(db, tx); | |
446 else | |
447 dmu_buf_will_dirty(db, tx); | |
448 | |
449 /* | |
450 * XXX uiomove could block forever (eg. nfs-backed | |
451 * pages). There needs to be a uiolockdown() function | |
452 * to lock the pages in memory, so that uiomove won't | |
453 * block. | |
454 */ | |
455 err = uiomove((char *)db->db_data + bufoff, tocpy, | |
456 UIO_WRITE, uio); | |
457 | |
458 if (tocpy == db->db_size) | |
459 dmu_buf_fill_done(db, tx); | |
460 | |
461 if (err) | |
462 break; | |
463 | |
464 offset += tocpy; | |
465 size -= tocpy; | |
466 } | |
1544 | 467 dmu_buf_rele_array(dbp, numbufs, FTAG); |
789 | 468 return (err); |
469 } | |
470 #endif | |
471 | |
2199 | 472 /* |
473 * XXX move send/recv stuff to its own new file! | |
474 */ | |
475 | |
/*
 * State shared by dmu_sendbackup() and the dump_*() helpers while a
 * backup stream is being written.
 */
struct backuparg {
	dmu_replay_record_t *drr;	/* scratch record, rebuilt per write */
	vnode_t *vp;			/* vnode the stream is written to */
	objset_t *os;			/* objset being sent (the "to" snap) */
	zio_cksum_t zc;			/* running fletcher-4 of bytes sent */
	int err;			/* result of the last vn_rdwr() */
};
483 | |
484 static int | |
485 dump_bytes(struct backuparg *ba, void *buf, int len) | |
486 { | |
487 ssize_t resid; /* have to get resid to get detailed errno */ | |
488 ASSERT3U(len % 8, ==, 0); | |
1544 | 489 |
490 fletcher_4_incremental_native(buf, len, &ba->zc); | |
789 | 491 ba->err = vn_rdwr(UIO_WRITE, ba->vp, |
492 (caddr_t)buf, len, | |
1630
4803baf78b7f
6398622 'zfs backup > file' can get 'file too large' error on 32-bit systems
ahrens
parents:
1596
diff
changeset
|
493 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); |
789 | 494 return (ba->err); |
495 } | |
496 | |
497 static int | |
498 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, | |
499 uint64_t length) | |
500 { | |
501 /* write a FREE record */ | |
502 bzero(ba->drr, sizeof (dmu_replay_record_t)); | |
503 ba->drr->drr_type = DRR_FREE; | |
504 ba->drr->drr_u.drr_free.drr_object = object; | |
505 ba->drr->drr_u.drr_free.drr_offset = offset; | |
506 ba->drr->drr_u.drr_free.drr_length = length; | |
507 | |
508 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) | |
509 return (EINTR); | |
510 return (0); | |
511 } | |
512 | |
513 static int | |
514 dump_data(struct backuparg *ba, dmu_object_type_t type, | |
515 uint64_t object, uint64_t offset, int blksz, void *data) | |
516 { | |
517 /* write a DATA record */ | |
518 bzero(ba->drr, sizeof (dmu_replay_record_t)); | |
519 ba->drr->drr_type = DRR_WRITE; | |
520 ba->drr->drr_u.drr_write.drr_object = object; | |
521 ba->drr->drr_u.drr_write.drr_type = type; | |
522 ba->drr->drr_u.drr_write.drr_offset = offset; | |
523 ba->drr->drr_u.drr_write.drr_length = blksz; | |
524 | |
525 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) | |
526 return (EINTR); | |
527 if (dump_bytes(ba, data, blksz)) | |
528 return (EINTR); | |
529 return (0); | |
530 } | |
531 | |
532 static int | |
533 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) | |
534 { | |
535 /* write a FREEOBJECTS record */ | |
536 bzero(ba->drr, sizeof (dmu_replay_record_t)); | |
537 ba->drr->drr_type = DRR_FREEOBJECTS; | |
538 ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; | |
539 ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; | |
540 | |
541 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) | |
542 return (EINTR); | |
543 return (0); | |
544 } | |
545 | |
546 static int | |
547 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) | |
548 { | |
549 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) | |
550 return (dump_freeobjects(ba, object, 1)); | |
551 | |
552 /* write an OBJECT record */ | |
553 bzero(ba->drr, sizeof (dmu_replay_record_t)); | |
554 ba->drr->drr_type = DRR_OBJECT; | |
555 ba->drr->drr_u.drr_object.drr_object = object; | |
556 ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; | |
557 ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; | |
558 ba->drr->drr_u.drr_object.drr_blksz = | |
559 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; | |
560 ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; | |
561 ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; | |
562 ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; | |
563 | |
564 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) | |
565 return (EINTR); | |
566 | |
567 if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) | |
568 return (EINTR); | |
569 | |
570 /* free anything past the end of the file */ | |
571 if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * | |
572 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) | |
573 return (EINTR); | |
574 if (ba->err) | |
575 return (EINTR); | |
576 return (0); | |
577 } | |
578 | |
/*
 * Number of bytes of an object covered by one block pointer at 'level'
 * of the dnode 'dnp': the data block size in bytes, shifted left once
 * per level by (dn_indblkshift - SPA_BLKPTRSHIFT).
 *
 * Both arguments are parenthesized so that any expression (for example
 * "p + 1") can be passed safely.  'dnp' is evaluated twice.
 */
#define	BP_SPAN(dnp, level) \
	(((uint64_t)(dnp)->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * ((dnp)->dn_indblkshift - SPA_BLKPTRSHIFT)))
582 | |
/*
 * Traversal callback used by dmu_sendbackup(): converts one visited
 * block (or hole) into backup stream records.
 *
 *	bc	traversal cache entry: bookmark, block pointer, optional
 *		data and the dnode the block belongs to.
 *	spa	pool handle, used when the block has to be read here.
 *	arg	the struct backuparg for this send.
 *
 * Returns 0 or EINTR.  EINTR is returned both when issig() reports a
 * signal and when a dump_*() helper fails; in the latter case the real
 * error is in ba->err.
 */
static int
backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
	struct backuparg *ba = arg;
	uint64_t object = bc->bc_bookmark.zb_object;
	int level = bc->bc_bookmark.zb_level;
	uint64_t blkid = bc->bc_bookmark.zb_blkid;
	/* a block pointer with a zero birth txg is treated as a hole */
	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	void *data = bc->bc_data;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	ASSERT(data || bp == NULL);

	if (bp == NULL && object == 0) {
		/*
		 * Hole in object 0: report every object number whose
		 * dnode would live in the missing span as free.
		 */
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		/* hole in some other object: free that byte range */
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		err = dump_free(ba, object, blkid * span, span);
	} else if (data && level == 0 && type == DMU_OT_DNODE) {
		/* a block of dnodes: emit a record per dnode slot */
		dnode_phys_t *blk = data;
		int i;
		int blksz = BP_GET_LSIZE(bp);

		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj =
			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(ba, dnobj, blk+i);
			if (err)
				break;
		}
	} else if (level == 0 &&
	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
		/* ordinary level-0 data block: emit its contents */
		int blksz = BP_GET_LSIZE(bp);
		if (data == NULL) {
			/* the traversal did not supply the data; read it */
			arc_buf_t *abuf;
			zbookmark_t zb;

			zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
			zb.zb_object = object;
			zb.zb_level = level;
			zb.zb_blkid = blkid;
			(void) arc_read(NULL, spa, bp,
			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
			    ARC_WAIT, &zb);

			if (abuf) {
				err = dump_data(ba, type, object, blkid * blksz,
				    blksz, abuf->b_data);
				(void) arc_buf_remove_ref(abuf, &abuf);
			}
		} else {
			err = dump_data(ba, type, object, blkid * blksz,
			    blksz, data);
		}
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
649 | |
650 int | |
651 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp) | |
652 { | |
653 dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; | |
654 dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; | |
655 dmu_replay_record_t *drr; | |
656 struct backuparg ba; | |
657 int err; | |
658 | |
659 /* tosnap must be a snapshot */ | |
660 if (ds->ds_phys->ds_next_snap_obj == 0) | |
661 return (EINVAL); | |
662 | |
663 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ | |
664 if (fromds && (ds->ds_dir != fromds->ds_dir || | |
665 fromds->ds_phys->ds_creation_txg >= | |
666 ds->ds_phys->ds_creation_txg)) | |
667 return (EXDEV); | |
668 | |
669 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); | |
670 drr->drr_type = DRR_BEGIN; | |
671 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; | |
672 drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; | |
673 drr->drr_u.drr_begin.drr_creation_time = | |
674 ds->ds_phys->ds_creation_time; | |
675 drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; | |
676 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; | |
677 if (fromds) | |
678 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; | |
679 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); | |
680 | |
681 ba.drr = drr; | |
682 ba.vp = vp; | |
683 ba.os = tosnap; | |
1544 | 684 ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); |
789 | 685 |
686 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { | |
687 kmem_free(drr, sizeof (dmu_replay_record_t)); | |
688 return (ba.err); | |
689 } | |
690 | |
691 err = traverse_dsl_dataset(ds, | |
692 fromds ? fromds->ds_phys->ds_creation_txg : 0, | |
693 ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, | |
694 backup_cb, &ba); | |
695 | |
696 if (err) { | |
697 if (err == EINTR && ba.err) | |
698 err = ba.err; | |
699 return (err); | |
700 } | |
701 | |
702 bzero(drr, sizeof (dmu_replay_record_t)); | |
703 drr->drr_type = DRR_END; | |
1544 | 704 drr->drr_u.drr_end.drr_checksum = ba.zc; |
789 | 705 |
706 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) | |
707 return (ba.err); | |
708 | |
709 kmem_free(drr, sizeof (dmu_replay_record_t)); | |
710 | |
711 return (0); | |
712 } | |
713 | |
/*
 * State of a backup stream that is being read back in; restore_read()
 * maintains the buffer, offset, error and checksum fields.
 */
struct restorearg {
	int err;	/* last read error, or EINVAL if no bytes arrived */
	int byteswap;	/* nonzero: checksum with the byteswap variant */
	vnode_t *vp;	/* vnode the stream is read from */
	char *buf;	/* read buffer */
	uint64_t voff;	/* offset in vp of the next read */
	int buflen; /* number of valid bytes in buf */
	int bufoff; /* next offset to read */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t zc;	/* running fletcher-4 of the bytes handed out */
};
725 | |
2199 | 726 /* ARGSUSED */ |
789 | 727 static int |
2199 | 728 replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) |
789 | 729 { |
2199 | 730 dsl_dataset_t *ds = arg1; |
731 struct drr_begin *drrb = arg2; | |
789 | 732 const char *snapname; |
2199 | 733 int err; |
789 | 734 uint64_t val; |
735 | |
736 /* must already be a snapshot of this fs */ | |
2199 | 737 if (ds->ds_phys->ds_prev_snap_obj == 0) |
738 return (ENODEV); | |
789 | 739 |
740 /* most recent snapshot must match fromguid */ | |
2199 | 741 if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) |
742 return (ENODEV); | |
789 | 743 /* must not have any changes since most recent snapshot */ |
744 if (ds->ds_phys->ds_bp.blk_birth > | |
2199 | 745 ds->ds_prev->ds_phys->ds_creation_txg) |
746 return (ETXTBSY); | |
789 | 747 |
748 /* new snapshot name must not exist */ | |
749 snapname = strrchr(drrb->drr_toname, '@'); | |
2199 | 750 if (snapname == NULL) |
751 return (EEXIST); | |
752 | |
789 | 753 snapname++; |
2199 | 754 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, |
789 | 755 ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val); |
2199 | 756 if (err == 0) |
757 return (EEXIST); | |
758 if (err != ENOENT) | |
789 | 759 return (err); |
2199 | 760 |
761 return (0); | |
762 } | |
789 | 763 |
2199 | 764 /* ARGSUSED */ |
765 static void | |
766 replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx) | |
767 { | |
768 dsl_dataset_t *ds = arg1; | |
789 | 769 dmu_buf_will_dirty(ds->ds_dbuf, tx); |
2082 | 770 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; |
789 | 771 } |
772 | |
2199 | 773 /* ARGSUSED */ |
789 | 774 static int |
2199 | 775 replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx) |
789 | 776 { |
2199 | 777 dsl_dir_t *dd = arg1; |
778 struct drr_begin *drrb = arg2; | |
779 objset_t *mos = dd->dd_pool->dp_meta_objset; | |
780 char *cp; | |
781 uint64_t val; | |
789 | 782 int err; |
783 | |
2199 | 784 cp = strchr(drrb->drr_toname, '@'); |
789 | 785 *cp = '\0'; |
2199 | 786 err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, |
787 strrchr(drrb->drr_toname, '/') + 1, | |
788 sizeof (uint64_t), 1, &val); | |
789 *cp = '@'; | |
790 | |
791 if (err != ENOENT) | |
792 return (err ? err : EEXIST); | |
793 | |
794 return (0); | |
795 } | |
789 | 796 |
/*
 * Sync half of starting a full restore.  Under the dsl_dir 'arg1' it
 * creates a dataset named after the last path component of drr_toname
 * (the text between the final '/' and the '@'), sets up an objset of
 * type drrb->drr_type in it, and flags the dataset as inconsistent.
 *
 * NOTE(review): the strchr()/strrchr() results are used unchecked, so
 * drr_toname must contain an '@' and a '/' before it -- confirm that
 * the caller or the matching check routine guarantees this.
 */
static void
replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct drr_begin *drrb = arg2;
	char *cp;
	dsl_dataset_t *ds;
	uint64_t dsobj;

	/* temporarily cut the name at '@' so strrchr() sees only the fs */
	cp = strchr(drrb->drr_toname, '@');
	*cp = '\0';
	dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
	    NULL, tx);
	*cp = '@';

	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
	    DS_MODE_EXCLUSIVE, FTAG, &ds));

	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
	    ds, drrb->drr_type, tx);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
}
823 | |
824 static int | |
2199 | 825 replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) |
789 | 826 { |
2199 | 827 objset_t *os = arg1; |
828 struct drr_begin *drrb = arg2; | |
789 | 829 char *snapname; |
830 | |
831 /* XXX verify that drr_toname is in dd */ | |
832 | |
833 snapname = strchr(drrb->drr_toname, '@'); | |
834 if (snapname == NULL) | |
835 return (EINVAL); | |
836 snapname++; | |
837 | |
2199 | 838 return (dsl_dataset_snapshot_check(os, snapname, tx)); |
839 } | |
840 | |
841 static void | |
842 replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) | |
843 { | |
844 objset_t *os = arg1; | |
845 struct drr_begin *drrb = arg2; | |
846 char *snapname; | |
847 dsl_dataset_t *ds, *hds; | |
848 | |
849 snapname = strchr(drrb->drr_toname, '@') + 1; | |
850 | |
851 dsl_dataset_snapshot_sync(os, snapname, tx); | |
789 | 852 |
853 /* set snapshot's creation time and guid */ | |
2199 | 854 hds = os->os->os_dsl_dataset; |
855 VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool, | |
856 hds->ds_phys->ds_prev_snap_obj, NULL, | |
1731
1efa8b3d1296
6402598 'zfs destroy <fs>' can take a long time, stopping up the txg train
bonwick
parents:
1630
diff
changeset
|
857 DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT, |
1efa8b3d1296
6402598 'zfs destroy <fs>' can take a long time, stopping up the txg train
bonwick
parents:
1630
diff
changeset
|
858 FTAG, &ds)); |
789 | 859 |
860 dmu_buf_will_dirty(ds->ds_dbuf, tx); | |
861 ds->ds_phys->ds_creation_time = drrb->drr_creation_time; | |
862 ds->ds_phys->ds_guid = drrb->drr_toguid; | |
2082 | 863 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; |
789 | 864 |
865 dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); | |
866 | |
2199 | 867 dmu_buf_will_dirty(hds->ds_dbuf, tx); |
868 hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; | |
789 | 869 } |
870 | |
871 void * | |
872 restore_read(struct restorearg *ra, int len) | |
873 { | |
874 void *rv; | |
875 | |
876 /* some things will require 8-byte alignment, so everything must */ | |
877 ASSERT3U(len % 8, ==, 0); | |
878 | |
879 while (ra->buflen - ra->bufoff < len) { | |
880 ssize_t resid; | |
881 int leftover = ra->buflen - ra->bufoff; | |
882 | |
883 (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover); | |
884 ra->err = vn_rdwr(UIO_READ, ra->vp, | |
885 (caddr_t)ra->buf + leftover, ra->bufsize - leftover, | |
886 ra->voff, UIO_SYSSPACE, FAPPEND, | |
1630
4803baf78b7f
6398622 'zfs backup > file' can get 'file too large' error on 32-bit systems
ahrens
parents:
1596
diff
changeset
|
887 RLIM64_INFINITY, CRED(), &resid); |
789 | 888 |
889 ra->voff += ra->bufsize - leftover - resid; | |
890 ra->buflen = ra->bufsize - resid; | |
891 ra->bufoff = 0; | |
892 if (resid == ra->bufsize - leftover) | |
893 ra->err = EINVAL; | |
894 if (ra->err) | |
895 return (NULL); | |
1544 | 896 /* Could compute checksum here? */ |
789 | 897 } |
898 | |
899 ASSERT3U(ra->bufoff % 8, ==, 0); | |
900 ASSERT3U(ra->buflen - ra->bufoff, >=, len); | |
901 rv = ra->buf + ra->bufoff; | |
902 ra->bufoff += len; | |
1544 | 903 if (ra->byteswap) |
904 fletcher_4_incremental_byteswap(rv, len, &ra->zc); | |
905 else | |
906 fletcher_4_incremental_native(rv, len, &ra->zc); | |
789 | 907 return (rv); |
908 } | |
909 | |
910 static void | |
911 backup_byteswap(dmu_replay_record_t *drr) | |
912 { | |
913 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) | |
914 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) | |
915 drr->drr_type = BSWAP_32(drr->drr_type); | |
916 switch (drr->drr_type) { | |
917 case DRR_BEGIN: | |
918 DO64(drr_begin.drr_magic); | |
919 DO64(drr_begin.drr_version); | |
920 DO64(drr_begin.drr_creation_time); | |
921 DO32(drr_begin.drr_type); | |
922 DO64(drr_begin.drr_toguid); | |
923 DO64(drr_begin.drr_fromguid); | |
924 break; | |
925 case DRR_OBJECT: | |
926 DO64(drr_object.drr_object); | |
927 /* DO64(drr_object.drr_allocation_txg); */ | |
928 DO32(drr_object.drr_type); | |
929 DO32(drr_object.drr_bonustype); | |
930 DO32(drr_object.drr_blksz); | |
931 DO32(drr_object.drr_bonuslen); | |
932 break; | |
933 case DRR_FREEOBJECTS: | |
934 DO64(drr_freeobjects.drr_firstobj); | |
935 DO64(drr_freeobjects.drr_numobjs); | |
936 break; | |
937 case DRR_WRITE: | |
938 DO64(drr_write.drr_object); | |
939 DO32(drr_write.drr_type); | |
940 DO64(drr_write.drr_offset); | |
941 DO64(drr_write.drr_length); | |
942 break; | |
943 case DRR_FREE: | |
944 DO64(drr_free.drr_object); | |
945 DO64(drr_free.drr_offset); | |
946 DO64(drr_free.drr_length); | |
947 break; | |
948 case DRR_END: | |
1544 | 949 DO64(drr_end.drr_checksum.zc_word[0]); |
950 DO64(drr_end.drr_checksum.zc_word[1]); | |
951 DO64(drr_end.drr_checksum.zc_word[2]); | |
952 DO64(drr_end.drr_checksum.zc_word[3]); | |
789 | 953 break; |
954 } | |
955 #undef DO64 | |
956 #undef DO32 | |
957 } | |
958 | |
959 static int | |
960 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) | |
961 { | |
962 int err; | |
963 dmu_tx_t *tx; | |
964 | |
965 err = dmu_object_info(os, drro->drr_object, NULL); | |
966 | |
967 if (err != 0 && err != ENOENT) | |
968 return (EINVAL); | |
969 | |
970 if (drro->drr_type == DMU_OT_NONE || | |
971 drro->drr_type >= DMU_OT_NUMTYPES || | |
972 drro->drr_bonustype >= DMU_OT_NUMTYPES || | |
973 drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || | |
974 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || | |
975 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || | |
976 drro->drr_blksz < SPA_MINBLOCKSIZE || | |
977 drro->drr_blksz > SPA_MAXBLOCKSIZE || | |
978 drro->drr_bonuslen > DN_MAX_BONUSLEN) { | |
979 return (EINVAL); | |
980 } | |
981 | |
982 tx = dmu_tx_create(os); | |
983 | |
984 if (err == ENOENT) { | |
985 /* currently free, want to be allocated */ | |
986 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); | |
987 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); | |
988 err = dmu_tx_assign(tx, TXG_WAIT); | |
989 if (err) { | |
990 dmu_tx_abort(tx); | |
991 return (err); | |
992 } | |
993 err = dmu_object_claim(os, drro->drr_object, | |
994 drro->drr_type, drro->drr_blksz, | |
995 drro->drr_bonustype, drro->drr_bonuslen, tx); | |
996 } else { | |
997 /* currently allocated, want to be allocated */ | |
998 dmu_tx_hold_bonus(tx, drro->drr_object); | |
999 /* | |
1000 * We may change blocksize, so need to | |
1001 * hold_write | |
1002 */ | |
1003 dmu_tx_hold_write(tx, drro->drr_object, 0, 1); | |
1004 err = dmu_tx_assign(tx, TXG_WAIT); | |
1005 if (err) { | |
1006 dmu_tx_abort(tx); | |
1007 return (err); | |
1008 } | |
1009 | |
1010 err = dmu_object_reclaim(os, drro->drr_object, | |
1011 drro->drr_type, drro->drr_blksz, | |
1012 drro->drr_bonustype, drro->drr_bonuslen, tx); | |
1013 } | |
1014 if (err) { | |
1015 dmu_tx_commit(tx); | |
1016 return (EINVAL); | |
1017 } | |
1018 | |
1019 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); | |
1020 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); | |
1021 | |
1022 if (drro->drr_bonuslen) { | |
1023 dmu_buf_t *db; | |
1024 void *data; | |
1544 | 1025 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); |
789 | 1026 dmu_buf_will_dirty(db, tx); |
1027 | |
1028 ASSERT3U(db->db_size, ==, drro->drr_bonuslen); | |
1029 data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); | |
1030 if (data == NULL) { | |
1031 dmu_tx_commit(tx); | |
1032 return (ra->err); | |
1033 } | |
1034 bcopy(data, db->db_data, db->db_size); | |
1035 if (ra->byteswap) { | |
1036 dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, | |
1037 drro->drr_bonuslen); | |
1038 } | |
1544 | 1039 dmu_buf_rele(db, FTAG); |
789 | 1040 } |
1041 dmu_tx_commit(tx); | |
1042 return (0); | |
1043 } | |
1044 | |
1045 /* ARGSUSED */ | |
1046 static int | |
1047 restore_freeobjects(struct restorearg *ra, objset_t *os, | |
1048 struct drr_freeobjects *drrfo) | |
1049 { | |
1050 uint64_t obj; | |
1051 | |
1052 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) | |
1053 return (EINVAL); | |
1054 | |
1055 for (obj = drrfo->drr_firstobj; | |
1056 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) { | |
1057 dmu_tx_t *tx; | |
1058 int err; | |
1059 | |
1060 if (dmu_object_info(os, obj, NULL) != 0) | |
1061 continue; | |
1062 | |
1063 tx = dmu_tx_create(os); | |
1064 dmu_tx_hold_bonus(tx, obj); | |
1065 err = dmu_tx_assign(tx, TXG_WAIT); | |
1066 if (err) { | |
1067 dmu_tx_abort(tx); | |
1068 return (err); | |
1069 } | |
1070 err = dmu_object_free(os, obj, tx); | |
1071 dmu_tx_commit(tx); | |
1072 if (err && err != ENOENT) | |
1073 return (EINVAL); | |
1074 } | |
1075 return (0); | |
1076 } | |
1077 | |
1078 static int | |
1079 restore_write(struct restorearg *ra, objset_t *os, | |
1080 struct drr_write *drrw) | |
1081 { | |
1082 dmu_tx_t *tx; | |
1083 void *data; | |
1084 int err; | |
1085 | |
1086 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || | |
1087 drrw->drr_type >= DMU_OT_NUMTYPES) | |
1088 return (EINVAL); | |
1089 | |
1090 data = restore_read(ra, drrw->drr_length); | |
1091 if (data == NULL) | |
1092 return (ra->err); | |
1093 | |
1094 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) | |
1095 return (EINVAL); | |
1096 | |
1097 tx = dmu_tx_create(os); | |
1098 | |
1099 dmu_tx_hold_write(tx, drrw->drr_object, | |
1100 drrw->drr_offset, drrw->drr_length); | |
1101 err = dmu_tx_assign(tx, TXG_WAIT); | |
1102 if (err) { | |
1103 dmu_tx_abort(tx); | |
1104 return (err); | |
1105 } | |
1106 if (ra->byteswap) | |
1107 dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); | |
1108 dmu_write(os, drrw->drr_object, | |
1109 drrw->drr_offset, drrw->drr_length, data, tx); | |
1110 dmu_tx_commit(tx); | |
1111 return (0); | |
1112 } | |
1113 | |
1114 /* ARGSUSED */ | |
1115 static int | |
1116 restore_free(struct restorearg *ra, objset_t *os, | |
1117 struct drr_free *drrf) | |
1118 { | |
1119 dmu_tx_t *tx; | |
1120 int err; | |
1121 | |
1122 if (drrf->drr_length != -1ULL && | |
1123 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) | |
1124 return (EINVAL); | |
1125 | |
1126 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) | |
1127 return (EINVAL); | |
1128 | |
1129 tx = dmu_tx_create(os); | |
1130 | |
1131 dmu_tx_hold_free(tx, drrf->drr_object, | |
1132 drrf->drr_offset, drrf->drr_length); | |
1133 err = dmu_tx_assign(tx, TXG_WAIT); | |
1134 if (err) { | |
1135 dmu_tx_abort(tx); | |
1136 return (err); | |
1137 } | |
1544 | 1138 err = dmu_free_range(os, drrf->drr_object, |
789 | 1139 drrf->drr_offset, drrf->drr_length, tx); |
1140 dmu_tx_commit(tx); | |
1544 | 1141 return (err); |
789 | 1142 } |
1143 | |
1144 int | |
1544 | 1145 dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, |
789 | 1146 vnode_t *vp, uint64_t voffset) |
1147 { | |
1148 struct restorearg ra; | |
1149 dmu_replay_record_t *drr; | |
1544 | 1150 char *cp; |
789 | 1151 objset_t *os = NULL; |
1544 | 1152 zio_cksum_t pzc; |
789 | 1153 |
1154 bzero(&ra, sizeof (ra)); | |
1155 ra.vp = vp; | |
1156 ra.voff = voffset; | |
1157 ra.bufsize = 1<<20; | |
1158 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); | |
1159 | |
1160 if (drrb->drr_magic == DMU_BACKUP_MAGIC) { | |
1161 ra.byteswap = FALSE; | |
1162 } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { | |
1163 ra.byteswap = TRUE; | |
1164 } else { | |
1165 ra.err = EINVAL; | |
1166 goto out; | |
1167 } | |
1168 | |
1544 | 1169 /* |
1170 * NB: this assumes that struct drr_begin will be the largest in | |
1171 * dmu_replay_record_t's drr_u, and thus we don't need to pad it | |
1172 * with zeros to make it the same length as we wrote out. | |
1173 */ | |
1174 ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; | |
1175 ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; | |
1176 ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; | |
1177 if (ra.byteswap) { | |
1178 fletcher_4_incremental_byteswap(ra.buf, | |
1179 sizeof (dmu_replay_record_t), &ra.zc); | |
1180 } else { | |
1181 fletcher_4_incremental_native(ra.buf, | |
1182 sizeof (dmu_replay_record_t), &ra.zc); | |
1183 } | |
1184 (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ | |
1185 | |
789 | 1186 if (ra.byteswap) { |
1187 drrb->drr_magic = BSWAP_64(drrb->drr_magic); | |
1188 drrb->drr_version = BSWAP_64(drrb->drr_version); | |
1189 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); | |
1190 drrb->drr_type = BSWAP_32(drrb->drr_type); | |
1191 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); | |
1192 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); | |
1193 } | |
1194 | |
1195 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); | |
1196 | |
1197 if (drrb->drr_version != DMU_BACKUP_VERSION || | |
1198 drrb->drr_type >= DMU_OST_NUMTYPES || | |
1199 strchr(drrb->drr_toname, '@') == NULL) { | |
1200 ra.err = EINVAL; | |
1201 goto out; | |
1202 } | |
1203 | |
1204 /* | |
1205 * Process the begin in syncing context. | |
1206 */ | |
1207 if (drrb->drr_fromguid) { | |
1208 /* incremental backup */ | |
2199 | 1209 dsl_dataset_t *ds = NULL; |
789 | 1210 |
1211 cp = strchr(tosnap, '@'); | |
1212 *cp = '\0'; | |
2199 | 1213 ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds); |
789 | 1214 *cp = '@'; |
1544 | 1215 if (ra.err) |
789 | 1216 goto out; |
1217 | |
2199 | 1218 ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, |
1219 replay_incremental_check, replay_incremental_sync, | |
1220 ds, drrb, 1); | |
1221 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); | |
789 | 1222 } else { |
1223 /* full backup */ | |
2199 | 1224 dsl_dir_t *dd = NULL; |
789 | 1225 const char *tail; |
1226 | |
2199 | 1227 /* can't restore full backup into topmost fs, for now */ |
1228 if (strrchr(drrb->drr_toname, '/') == NULL) { | |
1229 ra.err = EINVAL; | |
1230 goto out; | |
1231 } | |
1232 | |
789 | 1233 cp = strchr(tosnap, '@'); |
1234 *cp = '\0'; | |
1544 | 1235 ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); |
789 | 1236 *cp = '@'; |
1544 | 1237 if (ra.err) |
789 | 1238 goto out; |
1239 if (tail == NULL) { | |
1240 ra.err = EEXIST; | |
1241 goto out; | |
1242 } | |
1243 | |
2199 | 1244 ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check, |
1245 replay_full_sync, dd, drrb, 5); | |
1246 dsl_dir_close(dd, FTAG); | |
789 | 1247 } |
1248 if (ra.err) | |
1249 goto out; | |
1250 | |
1251 /* | |
1252 * Open the objset we are modifying. | |
1253 */ | |
1254 | |
1255 cp = strchr(tosnap, '@'); | |
1256 *cp = '\0'; | |
1257 ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, | |
1731
1efa8b3d1296
6402598 'zfs destroy <fs>' can take a long time, stopping up the txg train
bonwick
parents:
1630
diff
changeset
|
1258 DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); |
789 | 1259 *cp = '@'; |
1260 ASSERT3U(ra.err, ==, 0); | |
1261 | |
1262 /* | |
1263 * Read records and process them. | |
1264 */ | |
1544 | 1265 pzc = ra.zc; |
789 | 1266 while (ra.err == 0 && |
1267 NULL != (drr = restore_read(&ra, sizeof (*drr)))) { | |
1544 | 1268 if (issig(JUSTLOOKING) && issig(FORREAL)) { |
789 | 1269 ra.err = EINTR; |
1270 goto out; | |
1271 } | |
1272 | |
1273 if (ra.byteswap) | |
1274 backup_byteswap(drr); | |
1275 | |
1276 switch (drr->drr_type) { | |
1277 case DRR_OBJECT: | |
1278 { | |
1279 /* | |
1280 * We need to make a copy of the record header, | |
1281 * because restore_{object,write} may need to | |
1282 * restore_read(), which will invalidate drr. | |
1283 */ | |
1284 struct drr_object drro = drr->drr_u.drr_object; | |
1285 ra.err = restore_object(&ra, os, &drro); | |
1286 break; | |
1287 } | |
1288 case DRR_FREEOBJECTS: | |
1289 { | |
1290 struct drr_freeobjects drrfo = | |
1291 drr->drr_u.drr_freeobjects; | |
1292 ra.err = restore_freeobjects(&ra, os, &drrfo); | |
1293 break; | |
1294 } | |
1295 case DRR_WRITE: | |
1296 { | |
1297 struct drr_write drrw = drr->drr_u.drr_write; | |
1298 ra.err = restore_write(&ra, os, &drrw); | |
1299 break; | |
1300 } | |
1301 case DRR_FREE: | |
1302 { | |
1303 struct drr_free drrf = drr->drr_u.drr_free; | |
1304 ra.err = restore_free(&ra, os, &drrf); | |
1305 break; | |
1306 } | |
1307 case DRR_END: | |
1544 | 1308 { |
1309 struct drr_end drre = drr->drr_u.drr_end; | |
1310 /* | |
1311 * We compare against the *previous* checksum | |
1312 * value, because the stored checksum is of | |
1313 * everything before the DRR_END record. | |
1314 */ | |
1315 if (drre.drr_checksum.zc_word[0] != 0 && | |
1316 ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) | | |
1317 (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) | | |
1318 (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) | | |
1319 (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) { | |
1320 ra.err = ECKSUM; | |
1321 goto out; | |
1322 } | |
1323 | |
2199 | 1324 ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> |
1325 ds_dir->dd_pool, replay_end_check, replay_end_sync, | |
1326 os, drrb, 3); | |
789 | 1327 goto out; |
1544 | 1328 } |
789 | 1329 default: |
1330 ra.err = EINVAL; | |
1331 goto out; | |
1332 } | |
1544 | 1333 pzc = ra.zc; |
789 | 1334 } |
1335 | |
1336 out: | |
1337 if (os) | |
1338 dmu_objset_close(os); | |
1339 | |
1340 /* | |
1341 * Make sure we don't rollback/destroy unless we actually | |
1342 * processed the begin properly. 'os' will only be set if this | |
1343 * is the case. | |
1344 */ | |
2199 | 1345 if (ra.err && os && tosnap && strchr(tosnap, '@')) { |
789 | 1346 /* |
1347 * rollback or destroy what we created, so we don't | |
1348 * leave it in the restoring state. | |
1349 */ | |
2199 | 1350 dsl_dataset_t *ds; |
1351 int err; | |
1352 | |
1353 cp = strchr(tosnap, '@'); | |
1354 *cp = '\0'; | |
1355 err = dsl_dataset_open(tosnap, | |
1356 DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, | |
1357 FTAG, &ds); | |
1358 if (err == 0) { | |
1359 txg_wait_synced(ds->ds_dir->dd_pool, 0); | |
1360 if (drrb->drr_fromguid) { | |
1361 /* incremental: rollback to most recent snap */ | |
1362 (void) dsl_dataset_rollback(ds); | |
1363 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); | |
1364 } else { | |
1365 /* full: destroy whole fs */ | |
1366 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); | |
1367 (void) dsl_dataset_destroy(tosnap); | |
789 | 1368 } |
1369 } | |
2199 | 1370 *cp = '@'; |
789 | 1371 } |
1372 | |
1373 kmem_free(ra.buf, ra.bufsize); | |
1374 if (sizep) | |
1375 *sizep = ra.voff; | |
1376 return (ra.err); | |
1377 } | |
1378 | |
2237 | 1379 typedef struct { |
1380 uint64_t txg; | |
1381 dmu_buf_impl_t *db; | |
1382 dmu_sync_cb_t *done; | |
1383 void *arg; | |
1384 } dmu_sync_cbin_t; | |
1385 | |
1386 typedef union { | |
1387 dmu_sync_cbin_t data; | |
1388 blkptr_t blk; | |
1389 } dmu_sync_cbarg_t; | |
1390 | |
1391 /* ARGSUSED */ | |
1392 static void | |
1393 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) | |
1394 { | |
1395 dmu_sync_cbin_t *in = (dmu_sync_cbin_t *)varg; | |
1396 dmu_buf_impl_t *db = in->db; | |
1397 uint64_t txg = in->txg; | |
1398 dmu_sync_cb_t *done = in->done; | |
1399 void *arg = in->arg; | |
1400 blkptr_t *blk = (blkptr_t *)varg; | |
1401 | |
1402 if (!BP_IS_HOLE(zio->io_bp)) { | |
1403 zio->io_bp->blk_fill = 1; | |
1404 BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); | |
1405 BP_SET_LEVEL(zio->io_bp, 0); | |
1406 } | |
1407 | |
1408 *blk = *zio->io_bp; /* structure assignment */ | |
1409 | |
1410 mutex_enter(&db->db_mtx); | |
1411 ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC); | |
1412 db->db_d.db_overridden_by[txg&TXG_MASK] = blk; | |
1413 cv_broadcast(&db->db_changed); | |
1414 mutex_exit(&db->db_mtx); | |
1415 | |
1416 if (done) | |
1417 done(&(db->db), arg); | |
1418 } | |
1419 | |
789 | 1420 /* |
2237 | 1421 * Intent log support: sync the block associated with db to disk. |
1422 * N.B. and XXX: the caller is responsible for making sure that the | |
1423 * data isn't changing while dmu_sync() is writing it. | |
789 | 1424 * |
1425 * Return values: | |
1426 * | |
2237 | 1427 * EEXIST: this txg has already been synced, so there's nothing to to. |
789 | 1428 * The caller should not log the write. |
1429 * | |
1430 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. | |
1431 * The caller should not log the write. | |
1432 * | |
2237 | 1433 * EALREADY: this block is already in the process of being synced. |
1434 * The caller should track its progress (somehow). | |
789 | 1435 * |
2237 | 1436 * EINPROGRESS: the IO has been initiated. |
1437 * The caller should log this blkptr in the callback. | |
789 | 1438 * |
2237 | 1439 * 0: completed. Sets *bp to the blkptr just written. |
1440 * The caller should log this blkptr immediately. | |
789 | 1441 */ |
1442 int | |
2237 | 1443 dmu_sync(zio_t *pio, dmu_buf_t *db_fake, |
1444 blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) | |
789 | 1445 { |
2237 | 1446 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; |
1447 objset_impl_t *os = db->db_objset; | |
1448 dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; | |
789 | 1449 tx_state_t *tx = &dp->dp_tx; |
2237 | 1450 dmu_sync_cbin_t *in; |
789 | 1451 blkptr_t *blk; |
2237 | 1452 zbookmark_t zb; |
1453 uint32_t arc_flag; | |
789 | 1454 int err; |
1455 | |
1456 ASSERT(BP_IS_HOLE(bp)); | |
1457 ASSERT(txg != 0); | |
1458 | |
2237 | 1459 |
789 | 1460 dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", |
1461 txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); | |
1462 | |
1463 /* | |
2237 | 1464 * XXX - would be nice if we could do this without suspending... |
1544 | 1465 */ |
2237 | 1466 txg_suspend(dp); |
1544 | 1467 |
1468 /* | |
789 | 1469 * If this txg already synced, there's nothing to do. |
1470 */ | |
1471 if (txg <= tx->tx_synced_txg) { | |
2237 | 1472 txg_resume(dp); |
789 | 1473 /* |
1474 * If we're running ziltest, we need the blkptr regardless. | |
1475 */ | |
1476 if (txg > spa_freeze_txg(dp->dp_spa)) { | |
1477 /* if db_blkptr == NULL, this was an empty write */ | |
1478 if (db->db_blkptr) | |
1479 *bp = *db->db_blkptr; /* structure assignment */ | |
1480 return (0); | |
1481 } | |
2237 | 1482 return (EEXIST); |
789 | 1483 } |
1484 | |
1485 mutex_enter(&db->db_mtx); | |
1486 | |
2237 | 1487 blk = db->db_d.db_overridden_by[txg&TXG_MASK]; |
1488 if (blk == IN_DMU_SYNC) { | |
1489 /* | |
1490 * We have already issued a sync write for this buffer. | |
1491 */ | |
1492 mutex_exit(&db->db_mtx); | |
1493 txg_resume(dp); | |
1494 return (EALREADY); | |
1495 } else if (blk != NULL) { | |
1496 /* | |
1497 * This buffer had already been synced. It could not | |
1498 * have been dirtied since, or we would have cleared blk. | |
1499 */ | |
1500 *bp = *blk; /* structure assignment */ | |
789 | 1501 mutex_exit(&db->db_mtx); |
2237 | 1502 txg_resume(dp); |
1503 return (0); | |
1504 } | |
1505 | |
1506 if (txg == tx->tx_syncing_txg) { | |
1507 while (db->db_data_pending) { | |
1508 /* | |
1509 * IO is in-progress. Wait for it to finish. | |
1510 * XXX - would be nice to be able to somehow "attach" | |
1511 * this zio to the parent zio passed in. | |
1512 */ | |
1513 cv_wait(&db->db_changed, &db->db_mtx); | |
1514 ASSERT(db->db_data_pending || | |
1515 (db->db_blkptr && db->db_blkptr->blk_birth == txg)); | |
1516 } | |
1517 | |
1518 if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { | |
1519 /* | |
1520 * IO is already completed. | |
1521 */ | |
1522 *bp = *db->db_blkptr; /* structure assignment */ | |
1523 mutex_exit(&db->db_mtx); | |
1524 txg_resume(dp); | |
1525 return (0); | |
1526 } | |
1527 } | |
1528 | |
1529 if (db->db_d.db_data_old[txg&TXG_MASK] == NULL) { | |
1530 /* | |
1531 * This dbuf isn't dirty, must have been free_range'd. | |
1532 * There's no need to log writes to freed blocks, so we're done. | |
1533 */ | |
1534 mutex_exit(&db->db_mtx); | |
1535 txg_resume(dp); | |
789 | 1536 return (ENOENT); |
1537 } | |
1538 | |
2237 | 1539 ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL); |
1540 db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC; | |
789 | 1541 /* |
2237 | 1542 * XXX - a little ugly to stash the blkptr in the callback |
1543 * buffer. We always need to make sure the following is true: | |
1544 * ASSERT(sizeof(blkptr_t) >= sizeof(dmu_sync_cbin_t)); | |
789 | 1545 */ |
2237 | 1546 in = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); |
1547 in->db = db; | |
1548 in->txg = txg; | |
1549 in->done = done; | |
1550 in->arg = arg; | |
1551 mutex_exit(&db->db_mtx); | |
1552 txg_resume(dp); | |
789 | 1553 |
2237 | 1554 arc_flag = pio == NULL ? ARC_WAIT : ARC_NOWAIT; |
1555 zb.zb_objset = os->os_dsl_dataset->ds_object; | |
1544 | 1556 zb.zb_object = db->db.db_object; |
1557 zb.zb_level = db->db_level; | |
1558 zb.zb_blkid = db->db_blkid; | |
2237 | 1559 err = arc_write(pio, os->os_spa, |
1560 zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), | |
1561 zio_compress_select(db->db_dnode->dn_compress, os->os_compress), | |
1562 dmu_get_replication_level(os->os_spa, &zb, db->db_dnode->dn_type), | |
1563 txg, bp, db->db_d.db_data_old[txg&TXG_MASK], dmu_sync_done, in, | |
1564 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, arc_flag, &zb); | |
789 | 1565 ASSERT(err == 0); |
1566 | |
2237 | 1567 return (arc_flag == ARC_NOWAIT ? EINPROGRESS : 0); |
789 | 1568 } |
1569 | |
1570 uint64_t | |
1571 dmu_object_max_nonzero_offset(objset_t *os, uint64_t object) | |
1572 { | |
1544 | 1573 dnode_t *dn; |
1574 | |
1575 /* XXX assumes dnode_hold will not get an i/o error */ | |
1576 (void) dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1577 uint64_t rv = dnode_max_nonzero_offset(dn); |
1578 dnode_rele(dn, FTAG); | |
1579 return (rv); | |
1580 } | |
1581 | |
1582 int | |
1583 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, | |
1584 dmu_tx_t *tx) | |
1585 { | |
1544 | 1586 dnode_t *dn; |
1587 int err; | |
1588 | |
1589 err = dnode_hold(os->os, object, FTAG, &dn); | |
1590 if (err) | |
1591 return (err); | |
1592 err = dnode_set_blksz(dn, size, ibs, tx); | |
789 | 1593 dnode_rele(dn, FTAG); |
1594 return (err); | |
1595 } | |
1596 | |
1597 void | |
1598 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, | |
1599 dmu_tx_t *tx) | |
1600 { | |
1544 | 1601 dnode_t *dn; |
1602 | |
1603 /* XXX assumes dnode_hold will not get an i/o error */ | |
1604 (void) dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1605 ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); |
1606 dn->dn_checksum = checksum; | |
1607 dnode_setdirty(dn, tx); | |
1608 dnode_rele(dn, FTAG); | |
1609 } | |
1610 | |
1611 void | |
1612 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, | |
1613 dmu_tx_t *tx) | |
1614 { | |
1544 | 1615 dnode_t *dn; |
1616 | |
1617 /* XXX assumes dnode_hold will not get an i/o error */ | |
1618 (void) dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1619 ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); |
1620 dn->dn_compress = compress; | |
1621 dnode_setdirty(dn, tx); | |
1622 dnode_rele(dn, FTAG); | |
1623 } | |
1624 | |
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1625 /* |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1626 * XXX - eventually, this should take into account per-dataset (or |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1627 * even per-object?) user requests for higher levels of replication. |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1628 */ |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1629 int |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1630 dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1631 { |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1632 int ncopies = 1; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1633 |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1634 if (dmu_ot[ot].ot_metadata) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1635 ncopies++; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1636 if (zb->zb_level != 0) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1637 ncopies++; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1638 if (zb->zb_objset == 0 && zb->zb_object == 0) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1639 ncopies++; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1640 return (MIN(ncopies, spa_max_replication(spa))); |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1641 } |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1642 |
789 | 1643 int |
1644 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) | |
1645 { | |
1646 dnode_t *dn; | |
1647 int i, err; | |
1648 | |
1544 | 1649 err = dnode_hold(os->os, object, FTAG, &dn); |
1650 if (err) | |
1651 return (err); | |
789 | 1652 /* |
1653 * Sync any current changes before | |
1654 * we go trundling through the block pointers. | |
1655 */ | |
1656 for (i = 0; i < TXG_SIZE; i++) { | |
1596
2e2377ccbf85
6395371 ASSERT in dmu_tx_count_free: blkid + i < dn->dn_phys->dn_nblkptr
ahrens
parents:
1544
diff
changeset
|
1657 if (list_link_active(&dn->dn_dirty_link[i])) |
789 | 1658 break; |
1659 } | |
1660 if (i != TXG_SIZE) { | |
1661 dnode_rele(dn, FTAG); | |
1662 txg_wait_synced(dmu_objset_pool(os), 0); | |
1544 | 1663 err = dnode_hold(os->os, object, FTAG, &dn); |
1664 if (err) | |
1665 return (err); | |
789 | 1666 } |
1667 | |
1668 err = dnode_next_offset(dn, hole, off, 1, 1); | |
1669 dnode_rele(dn, FTAG); | |
1670 | |
1671 return (err); | |
1672 } | |
1673 | |
1674 void | |
1675 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) | |
1676 { | |
1677 rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
1678 mutex_enter(&dn->dn_mtx); | |
1679 | |
1680 doi->doi_data_block_size = dn->dn_datablksz; | |
1681 doi->doi_metadata_block_size = dn->dn_indblkshift ? | |
1682 1ULL << dn->dn_indblkshift : 0; | |
1683 doi->doi_indirection = dn->dn_nlevels; | |
1684 doi->doi_checksum = dn->dn_checksum; | |
1685 doi->doi_compress = dn->dn_compress; | |
2082 | 1686 doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + |
1687 SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; | |
789 | 1688 doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; |
1689 doi->doi_type = dn->dn_type; | |
1690 doi->doi_bonus_size = dn->dn_bonuslen; | |
1691 doi->doi_bonus_type = dn->dn_bonustype; | |
1692 | |
1693 mutex_exit(&dn->dn_mtx); | |
1694 rw_exit(&dn->dn_struct_rwlock); | |
1695 } | |
1696 | |
1697 /* | |
1698 * Get information on a DMU object. | |
1699 * If doi is NULL, just indicates whether the object exists. | |
1700 */ | |
1701 int | |
1702 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) | |
1703 { | |
1544 | 1704 dnode_t *dn; |
1705 int err = dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1706 |
1544 | 1707 if (err) |
1708 return (err); | |
789 | 1709 |
1710 if (doi != NULL) | |
1711 dmu_object_info_from_dnode(dn, doi); | |
1712 | |
1713 dnode_rele(dn, FTAG); | |
1714 return (0); | |
1715 } | |
1716 | |
1717 /* | |
1718 * As above, but faster; can be used when you have a held dbuf in hand. | |
1719 */ | |
1720 void | |
1721 dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) | |
1722 { | |
1723 dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); | |
1724 } | |
1725 | |
1726 /* | |
1727 * Faster still when you only care about the size. | |
1728 * This is specifically optimized for zfs_getattr(). | |
1729 */ | |
1730 void | |
1731 dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) | |
1732 { | |
1733 dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; | |
1734 | |
1735 *blksize = dn->dn_datablksz; | |
2082 | 1736 /* add 1 for dnode space */ |
1737 *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> | |
1738 SPA_MINBLOCKSHIFT) + 1; | |
789 | 1739 } |
1740 | |
1544 | 1741 /* |
1742 * Given a bookmark, return the name of the dataset, object, and range in | |
1743 * human-readable format. | |
1744 */ | |
1745 int | |
1746 spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen, | |
1747 char *objname, size_t objlen, char *range, size_t rangelen) | |
1748 { | |
1749 dsl_pool_t *dp; | |
1750 dsl_dataset_t *ds = NULL; | |
1751 objset_t *os = NULL; | |
1752 dnode_t *dn = NULL; | |
1753 int err, shift; | |
1754 | |
1755 if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64) | |
1756 return (ENOSPC); | |
1757 | |
1758 dp = spa_get_dsl(spa); | |
1759 if (zb->zb_objset != 0) { | |
1760 rw_enter(&dp->dp_config_rwlock, RW_READER); | |
1761 err = dsl_dataset_open_obj(dp, zb->zb_objset, | |
1762 NULL, DS_MODE_NONE, FTAG, &ds); | |
1763 if (err) { | |
1764 rw_exit(&dp->dp_config_rwlock); | |
1765 return (err); | |
1766 } | |
1767 dsl_dataset_name(ds, dsname); | |
1768 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); | |
1769 rw_exit(&dp->dp_config_rwlock); | |
1770 | |
1771 err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os); | |
1772 if (err) | |
1773 goto out; | |
1774 | |
1775 } else { | |
1776 dsl_dataset_name(NULL, dsname); | |
1777 os = dp->dp_meta_objset; | |
1778 } | |
1779 | |
1780 | |
1781 if (zb->zb_object == DMU_META_DNODE_OBJECT) { | |
1782 (void) strncpy(objname, "mdn", objlen); | |
1783 } else { | |
1784 (void) snprintf(objname, objlen, "%lld", | |
1785 (longlong_t)zb->zb_object); | |
1786 } | |
1787 | |
1788 err = dnode_hold(os->os, zb->zb_object, FTAG, &dn); | |
1789 if (err) | |
1790 goto out; | |
1791 | |
1792 shift = (dn->dn_datablkshift?dn->dn_datablkshift:SPA_MAXBLOCKSHIFT) + | |
1793 zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); | |
1794 (void) snprintf(range, rangelen, "%llu-%llu", | |
1795 (u_longlong_t)(zb->zb_blkid << shift), | |
1796 (u_longlong_t)((zb->zb_blkid+1) << shift)); | |
1797 | |
1798 out: | |
1799 if (dn) | |
1800 dnode_rele(dn, FTAG); | |
1801 if (os && os != dp->dp_meta_objset) | |
1802 dmu_objset_close(os); | |
1803 return (err); | |
1804 } | |
1805 | |
/*
 * Byteswap, in place, an array of 64-bit words.  'size' is the length
 * of the buffer in bytes and must be a multiple of 8.
 */
void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* same type as count: no signed/unsigned compare */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
1818 | |
/*
 * Byteswap, in place, an array of 32-bit words.  'size' is the length
 * of the buffer in bytes and must be a multiple of 4.
 */
void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	size_t i;	/* same type as count: no signed/unsigned compare */

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}
1831 | |
/*
 * Byteswap, in place, an array of 16-bit words.  'size' is the length
 * of the buffer in bytes and must be a multiple of 2.
 */
void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	size_t i;	/* same type as count: no signed/unsigned compare */

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}
1844 | |
/*
 * Byteswap routine for byte arrays: single bytes have no byte order,
 * so the buffer is left untouched.
 */
/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
	/* nothing to do */
}
1850 | |
/*
 * Initialize the subsystems the DMU is built on: dbuf, then dnode,
 * then the ARC.  dmu_fini() undoes this in the opposite order.
 */
void
dmu_init(void)
{
	dbuf_init();
	dnode_init();
	arc_init();
}
1858 | |
/*
 * Tear down the subsystems set up by dmu_init(), in reverse order:
 * the ARC first, then dnode, then dbuf.
 */
void
dmu_fini(void)
{
	arc_fini();
	dnode_fini();
	dbuf_fini();
}