Mercurial > illumos > illumos-gate
annotate usr/src/uts/common/fs/zfs/dmu.c @ 2199:712a788c2dfd
PSARC 2006/388 snapshot -r
6373978 want to take lots of snapshots quickly ('zfs snapshot -r')
author | ahrens |
---|---|
date | Wed, 14 Jun 2006 23:16:39 -0700 |
parents | 76b439ec3ac1 |
children | 45affe88ed99 |
rev | line source |
---|---|
789 | 1 /* |
2 * CDDL HEADER START | |
3 * | |
4 * The contents of this file are subject to the terms of the | |
1544 | 5 * Common Development and Distribution License (the "License"). |
6 * You may not use this file except in compliance with the License. | |
789 | 7 * |
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 * or http://www.opensolaris.org/os/licensing. | |
10 * See the License for the specific language governing permissions | |
11 * and limitations under the License. | |
12 * | |
13 * When distributing Covered Code, include this CDDL HEADER in each | |
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 * If applicable, add the following below this CDDL HEADER, with the | |
16 * fields enclosed by brackets "[]" replaced with your own identifying | |
17 * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 * | |
19 * CDDL HEADER END | |
20 */ | |
21 /* | |
1544 | 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. |
789 | 23 * Use is subject to license terms. |
24 */ | |
25 | |
26 #pragma ident "%Z%%M% %I% %E% SMI" | |
27 | |
28 #include <sys/dmu.h> | |
29 #include <sys/dmu_impl.h> | |
30 #include <sys/dmu_tx.h> | |
31 #include <sys/dbuf.h> | |
32 #include <sys/dnode.h> | |
33 #include <sys/zfs_context.h> | |
34 #include <sys/dmu_objset.h> | |
35 #include <sys/dmu_traverse.h> | |
36 #include <sys/dsl_dataset.h> | |
37 #include <sys/dsl_dir.h> | |
38 #include <sys/dsl_pool.h> | |
2199 | 39 #include <sys/dsl_synctask.h> |
789 | 40 #include <sys/dmu_zfetch.h> |
41 #include <sys/zfs_ioctl.h> | |
42 #include <sys/zap.h> | |
1544 | 43 #include <sys/zio_checksum.h> |
789 | 44 |
/*
 * Per-type information for every DMU object type: the byteswap routine
 * applied when reading a foreign-endian pool, whether the contents are
 * metadata, and a human-readable name.  Indexed by dmu_object_type_t,
 * so entry order must match that enum exactly.
 */
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{ byteswap_uint8_array,	TRUE,	"unallocated"		},
	{ zap_byteswap,		TRUE,	"object directory"	},
	{ byteswap_uint64_array, TRUE,	"object array"		},
	{ byteswap_uint8_array,	TRUE,	"packed nvlist"		},
	{ byteswap_uint64_array, TRUE,	"packed nvlist size"	},
	{ byteswap_uint64_array, TRUE,	"bplist"		},
	{ byteswap_uint64_array, TRUE,	"bplist header"		},
	{ byteswap_uint64_array, TRUE,	"SPA space map header"	},
	{ byteswap_uint64_array, TRUE,	"SPA space map"		},
	{ byteswap_uint64_array, TRUE,	"ZIL intent log"	},
	{ dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
	{ dmu_objset_byteswap,	TRUE,	"DMU objset"		},
	{ byteswap_uint64_array, TRUE,	"DSL directory"		},
	{ zap_byteswap,		TRUE,	"DSL directory child map"},
	{ zap_byteswap,		TRUE,	"DSL dataset snap map"	},
	{ zap_byteswap,		TRUE,	"DSL props"		},
	{ byteswap_uint64_array, TRUE,	"DSL dataset"		},
	{ zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
	{ zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
	{ byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
	{ zap_byteswap,		TRUE,	"ZFS directory"		},
	{ zap_byteswap,		TRUE,	"ZFS master node"	},
	{ zap_byteswap,		TRUE,	"ZFS delete queue"	},
	{ byteswap_uint8_array,	FALSE,	"zvol object"		},
	{ zap_byteswap,		TRUE,	"zvol prop"		},
	{ byteswap_uint8_array,	FALSE,	"other uint8[]"		},
	{ byteswap_uint64_array, FALSE,	"other uint64[]"	},
	{ zap_byteswap,		TRUE,	"other ZAP"		},
	{ zap_byteswap,		TRUE,	"persistent error log"	},
};
76 | |
77 int | |
1544 | 78 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, |
79 void *tag, dmu_buf_t **dbp) | |
789 | 80 { |
81 dnode_t *dn; | |
82 uint64_t blkid; | |
83 dmu_buf_impl_t *db; | |
1544 | 84 int err; |
789 | 85 |
1544 | 86 err = dnode_hold(os->os, object, FTAG, &dn); |
87 if (err) | |
88 return (err); | |
789 | 89 blkid = dbuf_whichblock(dn, offset); |
90 rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
1544 | 91 db = dbuf_hold(dn, blkid, tag); |
789 | 92 rw_exit(&dn->dn_struct_rwlock); |
1544 | 93 if (db == NULL) { |
94 err = EIO; | |
95 } else { | |
96 err = dbuf_read(db, NULL, DB_RF_CANFAIL); | |
97 if (err) { | |
98 dbuf_rele(db, tag); | |
99 db = NULL; | |
100 } | |
101 } | |
102 | |
789 | 103 dnode_rele(dn, FTAG); |
1544 | 104 *dbp = &db->db; |
105 return (err); | |
789 | 106 } |
107 | |
108 int | |
109 dmu_bonus_max(void) | |
110 { | |
111 return (DN_MAX_BONUSLEN); | |
112 } | |
113 | |
/*
 * Hold the bonus buffer of the given object, creating the bonus dbuf
 * on first use, and return it in *dbp.
 * returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	int err, count;
	dmu_buf_impl_t *db;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		/*
		 * Upgrade to a writer lock to create the bonus dbuf;
		 * re-check after reacquiring since another thread may
		 * have created it while we dropped the lock.
		 */
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dn->dn_bonus = dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;
	rw_exit(&dn->dn_struct_rwlock);
	mutex_enter(&db->db_mtx);
	count = refcount_add(&db->db_holds, tag);
	mutex_exit(&db->db_mtx);
	if (count == 1)
		dnode_add_ref(dn, db);	/* first hold also pins the dnode */
	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));

	*dbp = &db->db;
	return (0);
}
149 | |
/*
 * Hold the range [offset, offset+length) of the given object, returning
 * an array of held dbufs in *dbpp and its size in *numbufsp.  If `read'
 * is set, reads for uncached buffers are issued asynchronously under a
 * common root zio and then waited for, so the blocks are read in
 * parallel.  Returns 0 or an error (EIO, or whatever dnode_hold()/
 * zio_wait() report); on error no buffers remain held.
 */
int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	if (length == 0) {
		if (numbufsp)
			*numbufsp = 0;
		*dbpp = NULL;
		return (0);
	}

	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
	/* large sequential reads would blow out the prefetch stream */
	if (length > zfetch_array_rd_sz)
		flags |= DB_RF_NOPREFETCH;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
		/* single (possibly odd-sized) block object */
		ASSERT3U(offset + length, <=, dn->dn_datablksz);
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
	blkid = dbuf_whichblock(dn, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
		if (db == NULL) {
			/* release everything held so far */
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			dnode_rele(dn, FTAG);
			zio_nowait(zio);
			return (EIO);
		}
		/* initiate async i/o */
		if (read && db->db_state == DB_UNCACHED) {
			rw_exit(&dn->dn_struct_rwlock);
			(void) dbuf_read(db, zio, flags);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
		}
		dbp[i] = &db->db;
	}
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = EIO;
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}
240 | |
241 void | |
1544 | 242 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) |
789 | 243 { |
244 int i; | |
245 dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; | |
246 | |
247 if (numbufs == 0) | |
248 return; | |
249 | |
1544 | 250 for (i = 0; i < numbufs; i++) { |
251 if (dbp[i]) | |
252 dbuf_rele(dbp[i], tag); | |
253 } | |
789 | 254 |
255 kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); | |
256 } | |
257 | |
/*
 * Issue prefetch i/o for the given byte range of an object.  A length
 * of zero means prefetch the object's dnode (in anticipation of a bonus
 * buffer access).  Best-effort: errors are ignored.
 */
void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, i, err;

	if (len == 0) { /* they're interested in the bonus buffer */
		dn = os->os->os_meta_dnode;

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		/* prefetch the meta-dnode block holding this dnode */
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, blkid);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
	} else {
		/* single-block object: prefetch iff offset is in range */
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, offset);
		for (i = 0; i < nblks; i++)
			dbuf_prefetch(dn, blkid+i);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}
306 | |
1544 | 307 int |
789 | 308 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, |
309 uint64_t size, dmu_tx_t *tx) | |
310 { | |
1544 | 311 dnode_t *dn; |
312 int err = dnode_hold(os->os, object, FTAG, &dn); | |
313 if (err) | |
314 return (err); | |
789 | 315 ASSERT(offset < UINT64_MAX); |
316 ASSERT(size == -1ULL || size <= UINT64_MAX - offset); | |
317 dnode_free_range(dn, offset, size, tx); | |
318 dnode_rele(dn, FTAG); | |
1544 | 319 return (0); |
789 | 320 } |
321 | |
1544 | 322 int |
323 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
324 void *buf) | |
789 | 325 { |
326 dnode_t *dn; | |
327 dmu_buf_t **dbp; | |
1544 | 328 int numbufs, i, err; |
789 | 329 |
1544 | 330 /* |
331 * Deal with odd block sizes, where there can't be data past the | |
332 * first block. | |
333 */ | |
334 err = dnode_hold(os->os, object, FTAG, &dn); | |
335 if (err) | |
336 return (err); | |
789 | 337 if (dn->dn_datablkshift == 0) { |
338 int newsz = offset > dn->dn_datablksz ? 0 : | |
339 MIN(size, dn->dn_datablksz - offset); | |
340 bzero((char *)buf + newsz, size - newsz); | |
341 size = newsz; | |
342 } | |
343 dnode_rele(dn, FTAG); | |
344 | |
345 while (size > 0) { | |
346 uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); | |
347 int err; | |
348 | |
349 /* | |
350 * NB: we could do this block-at-a-time, but it's nice | |
351 * to be reading in parallel. | |
352 */ | |
1544 | 353 err = dmu_buf_hold_array(os, object, offset, mylen, |
354 TRUE, FTAG, &numbufs, &dbp); | |
355 if (err) | |
789 | 356 return (err); |
357 | |
358 for (i = 0; i < numbufs; i++) { | |
359 int tocpy; | |
360 int bufoff; | |
361 dmu_buf_t *db = dbp[i]; | |
362 | |
363 ASSERT(size > 0); | |
364 | |
365 bufoff = offset - db->db_offset; | |
366 tocpy = (int)MIN(db->db_size - bufoff, size); | |
367 | |
368 bcopy((char *)db->db_data + bufoff, buf, tocpy); | |
369 | |
370 offset += tocpy; | |
371 size -= tocpy; | |
372 buf = (char *)buf + tocpy; | |
373 } | |
1544 | 374 dmu_buf_rele_array(dbp, numbufs, FTAG); |
789 | 375 } |
376 return (0); | |
377 } | |
378 | |
379 void | |
380 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
381 const void *buf, dmu_tx_t *tx) | |
382 { | |
383 dmu_buf_t **dbp; | |
384 int numbufs, i; | |
385 | |
1544 | 386 VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, |
387 FALSE, FTAG, &numbufs, &dbp)); | |
789 | 388 |
389 for (i = 0; i < numbufs; i++) { | |
390 int tocpy; | |
391 int bufoff; | |
392 dmu_buf_t *db = dbp[i]; | |
393 | |
394 ASSERT(size > 0); | |
395 | |
396 bufoff = offset - db->db_offset; | |
397 tocpy = (int)MIN(db->db_size - bufoff, size); | |
398 | |
399 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); | |
400 | |
401 if (tocpy == db->db_size) | |
402 dmu_buf_will_fill(db, tx); | |
403 else | |
404 dmu_buf_will_dirty(db, tx); | |
405 | |
406 bcopy(buf, (char *)db->db_data + bufoff, tocpy); | |
407 | |
408 if (tocpy == db->db_size) | |
409 dmu_buf_fill_done(db, tx); | |
410 | |
411 offset += tocpy; | |
412 size -= tocpy; | |
413 buf = (char *)buf + tocpy; | |
414 } | |
1544 | 415 dmu_buf_rele_array(dbp, numbufs, FTAG); |
789 | 416 } |
417 | |
#ifdef _KERNEL
/*
 * Write `size' bytes from the uio into the given object at `offset',
 * as part of transaction `tx'.  Returns 0 or an error from
 * dmu_buf_hold_array()/uiomove().
 */
int
dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    uio_t *uio, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err = 0;

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		/* only the first and last buffers may be partial */
		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/*
		 * XXX uiomove could block forever (eg. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);

		/* always balance will_fill, even if uiomove failed */
		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
#endif
471 | |
2199 | 472 /* |
473 * XXX move send/recv stuff to its own new file! | |
474 */ | |
475 | |
/*
 * State threaded through the backup (send) stream generation callbacks.
 */
struct backuparg {
	dmu_replay_record_t *drr;	/* scratch record, reused per dump */
	vnode_t *vp;			/* stream output file */
	objset_t *os;			/* objset being sent */
	zio_cksum_t zc;			/* running fletcher4 of the stream */
	int err;			/* sticky error from vn_rdwr() */
};
483 | |
/*
 * Append `len' bytes to the backup stream, folding them into the
 * running stream checksum.  Returns (and records in ba->err) the
 * vn_rdwr() error, 0 on success.
 */
static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
	ssize_t resid; /* have to get resid to get detailed errno */

	/* stream records are 8-byte aligned */
	ASSERT3U(len % 8, ==, 0);

	fletcher_4_incremental_native(buf, len, &ba->zc);
	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
	return (ba->err);
}
496 | |
497 static int | |
498 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, | |
499 uint64_t length) | |
500 { | |
501 /* write a FREE record */ | |
502 bzero(ba->drr, sizeof (dmu_replay_record_t)); | |
503 ba->drr->drr_type = DRR_FREE; | |
504 ba->drr->drr_u.drr_free.drr_object = object; | |
505 ba->drr->drr_u.drr_free.drr_offset = offset; | |
506 ba->drr->drr_u.drr_free.drr_length = length; | |
507 | |
508 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) | |
509 return (EINTR); | |
510 return (0); | |
511 } | |
512 | |
513 static int | |
514 dump_data(struct backuparg *ba, dmu_object_type_t type, | |
515 uint64_t object, uint64_t offset, int blksz, void *data) | |
516 { | |
517 /* write a DATA record */ | |
518 bzero(ba->drr, sizeof (dmu_replay_record_t)); | |
519 ba->drr->drr_type = DRR_WRITE; | |
520 ba->drr->drr_u.drr_write.drr_object = object; | |
521 ba->drr->drr_u.drr_write.drr_type = type; | |
522 ba->drr->drr_u.drr_write.drr_offset = offset; | |
523 ba->drr->drr_u.drr_write.drr_length = blksz; | |
524 | |
525 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) | |
526 return (EINTR); | |
527 if (dump_bytes(ba, data, blksz)) | |
528 return (EINTR); | |
529 return (0); | |
530 } | |
531 | |
532 static int | |
533 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) | |
534 { | |
535 /* write a FREEOBJECTS record */ | |
536 bzero(ba->drr, sizeof (dmu_replay_record_t)); | |
537 ba->drr->drr_type = DRR_FREEOBJECTS; | |
538 ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; | |
539 ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; | |
540 | |
541 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) | |
542 return (EINTR); | |
543 return (0); | |
544 } | |
545 | |
/*
 * Emit an OBJECT record (plus its bonus data) describing the given
 * dnode; a freed/absent dnode is sent as a FREEOBJECTS record instead.
 * Returns EINTR on stream-write failure, else 0.
 */
static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(ba, object, 1));

	/* write an OBJECT record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_OBJECT;
	ba->drr->drr_u.drr_object.drr_object = object;
	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
	ba->drr->drr_u.drr_object.drr_blksz =
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);

	/* bonus data, padded to the stream's 8-byte alignment */
	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (ba->err)
		return (EINTR);
	return (0);
}
578 | |
/*
 * Number of bytes of the object covered by one block pointer at the
 * given indirection level.  Parameters are fully parenthesized so the
 * macro is safe with arbitrary argument expressions.
 */
#define	BP_SPAN(dnp, level) \
	(((uint64_t)(dnp)->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * ((dnp)->dn_indblkshift - SPA_BLKPTRSHIFT)))
582 | |
/*
 * traverse_dsl_dataset() callback that emits one stream record per
 * visited block: FREEOBJECTS for holes in the meta-dnode, FREE for
 * holes in other objects, OBJECT records for dnode blocks, and WRITE
 * records for level-0 data (read via the ARC if not already cached).
 * Returns 0, or EINTR on interruption/stream failure.
 */
static int
backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
	struct backuparg *ba = arg;
	uint64_t object = bc->bc_bookmark.zb_object;
	int level = bc->bc_bookmark.zb_level;
	uint64_t blkid = bc->bc_bookmark.zb_blkid;
	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	void *data = bc->bc_data;
	int err = 0;

	/* let the user interrupt a long-running send */
	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	ASSERT(data || bp == NULL);

	if (bp == NULL && object == 0) {
		/* hole in the meta-dnode: a run of free object numbers */
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		/* hole in a regular object */
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		err = dump_free(ba, object, blkid * span, span);
	} else if (data && level == 0 && type == DMU_OT_DNODE) {
		/* a block of dnodes: emit an OBJECT record for each */
		dnode_phys_t *blk = data;
		int i;
		int blksz = BP_GET_LSIZE(bp);

		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj =
			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(ba, dnobj, blk+i);
			if (err)
				break;
		}
	} else if (level == 0 &&
	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
		int blksz = BP_GET_LSIZE(bp);
		if (data == NULL) {
			/* not in cache: synchronously read via the ARC */
			arc_buf_t *abuf;
			zbookmark_t zb;

			zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
			zb.zb_object = object;
			zb.zb_level = level;
			zb.zb_blkid = blkid;
			(void) arc_read(NULL, spa, bp,
			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
			    ARC_WAIT, &zb);

			if (abuf) {
				err = dump_data(ba, type, object, blkid * blksz,
				    blksz, abuf->b_data);
				(void) arc_buf_remove_ref(abuf, &abuf);
			}
		} else {
			err = dump_data(ba, type, object, blkid * blksz,
			    blksz, data);
		}
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
649 | |
650 int | |
651 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp) | |
652 { | |
653 dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; | |
654 dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; | |
655 dmu_replay_record_t *drr; | |
656 struct backuparg ba; | |
657 int err; | |
658 | |
659 /* tosnap must be a snapshot */ | |
660 if (ds->ds_phys->ds_next_snap_obj == 0) | |
661 return (EINVAL); | |
662 | |
663 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ | |
664 if (fromds && (ds->ds_dir != fromds->ds_dir || | |
665 fromds->ds_phys->ds_creation_txg >= | |
666 ds->ds_phys->ds_creation_txg)) | |
667 return (EXDEV); | |
668 | |
669 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); | |
670 drr->drr_type = DRR_BEGIN; | |
671 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; | |
672 drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION; | |
673 drr->drr_u.drr_begin.drr_creation_time = | |
674 ds->ds_phys->ds_creation_time; | |
675 drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; | |
676 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; | |
677 if (fromds) | |
678 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; | |
679 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); | |
680 | |
681 ba.drr = drr; | |
682 ba.vp = vp; | |
683 ba.os = tosnap; | |
1544 | 684 ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); |
789 | 685 |
686 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { | |
687 kmem_free(drr, sizeof (dmu_replay_record_t)); | |
688 return (ba.err); | |
689 } | |
690 | |
691 err = traverse_dsl_dataset(ds, | |
692 fromds ? fromds->ds_phys->ds_creation_txg : 0, | |
693 ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, | |
694 backup_cb, &ba); | |
695 | |
696 if (err) { | |
697 if (err == EINTR && ba.err) | |
698 err = ba.err; | |
699 return (err); | |
700 } | |
701 | |
702 bzero(drr, sizeof (dmu_replay_record_t)); | |
703 drr->drr_type = DRR_END; | |
1544 | 704 drr->drr_u.drr_end.drr_checksum = ba.zc; |
789 | 705 |
706 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) | |
707 return (ba.err); | |
708 | |
709 kmem_free(drr, sizeof (dmu_replay_record_t)); | |
710 | |
711 return (0); | |
712 } | |
713 | |
/*
 * State for consuming a replay (receive) stream from a vnode through
 * an internal staging buffer.
 */
struct restorearg {
	int err;		/* sticky read/format error */
	int byteswap;		/* stream is foreign-endian */
	vnode_t *vp;		/* stream input file */
	char *buf;		/* staging buffer */
	uint64_t voff;		/* current offset in the vnode */
	int buflen; /* number of valid bytes in buf */
	int bufoff; /* next offset to read */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t zc;		/* running fletcher4 of consumed bytes */
};
725 | |
/*
 * Synctask check function for receiving an incremental stream into
 * dataset arg1 (struct drr_begin in arg2): the dataset's most recent
 * snapshot must match the stream's fromguid, there must be no changes
 * since that snapshot, and the target snapshot name must be free.
 * Returns 0, ENODEV, ETXTBSY, EEXIST, or a zap_lookup() error.
 */
/* ARGSUSED */
static int
replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct drr_begin *drrb = arg2;
	const char *snapname;
	int err;
	uint64_t val;

	/* must already be a snapshot of this fs */
	if (ds->ds_phys->ds_prev_snap_obj == 0)
		return (ENODEV);

	/* most recent snapshot must match fromguid */
	if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
		return (ENODEV);
	/* must not have any changes since most recent snapshot */
	if (ds->ds_phys->ds_bp.blk_birth >
	    ds->ds_prev->ds_phys->ds_creation_txg)
		return (ETXTBSY);

	/* new snapshot name must not exist */
	snapname = strrchr(drrb->drr_toname, '@');
	if (snapname == NULL)
		return (EEXIST);

	snapname++;
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}
789 | 763 |
2199 | 764 /* ARGSUSED */ |
765 static void | |
766 replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx) | |
767 { | |
768 dsl_dataset_t *ds = arg1; | |
789 | 769 dmu_buf_will_dirty(ds->ds_dbuf, tx); |
2082 | 770 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; |
789 | 771 } |
772 | |
/*
 * Synctask check function for receiving a full stream into a new
 * dataset under dir arg1 (struct drr_begin in arg2): the target fs
 * name must not already exist.  Note drr_toname is temporarily
 * NUL-terminated at the '@' during the lookup and then restored.
 * Returns 0, EEXIST, or a zap_lookup() error.
 */
/* ARGSUSED */
static int
replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct drr_begin *drrb = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	char *cp;
	uint64_t val;
	int err;

	cp = strchr(drrb->drr_toname, '@');
	*cp = '\0';
	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(drrb->drr_toname, '/') + 1,
	    sizeof (uint64_t), 1, &val);
	*cp = '@';

	if (err != ENOENT)
		return (err ? err : EEXIST);

	return (0);
}
789 | 796 |
/*
 * Synctask sync function for a full receive: create the new dataset
 * and objset, and flag the dataset inconsistent until the stream is
 * fully applied.  drr_toname is temporarily truncated at the '@'
 * while extracting the fs component.
 */
static void
replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct drr_begin *drrb = arg2;
	char *cp;
	dsl_dataset_t *ds;
	uint64_t dsobj;

	cp = strchr(drrb->drr_toname, '@');
	*cp = '\0';
	dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
	    NULL, tx);
	*cp = '@';

	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
	    DS_MODE_EXCLUSIVE, FTAG, &ds));

	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
	    ds, drrb->drr_type, tx);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
}
823 | |
824 static int | |
2199 | 825 replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx) |
789 | 826 { |
2199 | 827 objset_t *os = arg1; |
828 struct drr_begin *drrb = arg2; | |
789 | 829 char *snapname; |
830 | |
831 /* XXX verify that drr_toname is in dd */ | |
832 | |
833 snapname = strchr(drrb->drr_toname, '@'); | |
834 if (snapname == NULL) | |
835 return (EINVAL); | |
836 snapname++; | |
837 | |
2199 | 838 return (dsl_dataset_snapshot_check(os, snapname, tx)); |
839 } | |
840 | |
/*
 * Synctask sync function for completing a receive: take the target
 * snapshot, stamp it with the creation time and guid from the stream
 * header, and clear the inconsistent flag on both the new snapshot
 * and its head dataset.
 */
static void
replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	objset_t *os = arg1;
	struct drr_begin *drrb = arg2;
	char *snapname;
	dsl_dataset_t *ds, *hds;

	snapname = strchr(drrb->drr_toname, '@') + 1;

	dsl_dataset_snapshot_sync(os, snapname, tx);

	/* set snapshot's creation time and guid */
	hds = os->os->os_dsl_dataset;
	VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
	    hds->ds_phys->ds_prev_snap_obj, NULL,
	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
	    FTAG, &ds));

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
	ds->ds_phys->ds_guid = drrb->drr_toguid;
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);

	dmu_buf_will_dirty(hds->ds_dbuf, tx);
	hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}
870 | |
/*
 * Return a pointer to the next `len' bytes of the receive stream,
 * refilling the staging buffer from the vnode as needed and folding
 * the consumed bytes into the running checksum.  Returns NULL on
 * read error or truncated stream (ra->err is set).
 */
void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (ra->buflen - ra->bufoff < len) {
		ssize_t resid;
		int leftover = ra->buflen - ra->bufoff;

		/* slide the unconsumed tail down and top up the buffer */
		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		ra->voff += ra->bufsize - leftover - resid;
		ra->buflen = ra->bufsize - resid;
		ra->bufoff = 0;
		/* read returned nothing new: stream ended prematurely */
		if (resid == ra->bufsize - leftover)
			ra->err = EINVAL;
		if (ra->err)
			return (NULL);
		/* Could compute checksum here? */
	}

	ASSERT3U(ra->bufoff % 8, ==, 0);
	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
	rv = ra->buf + ra->bufoff;
	ra->bufoff += len;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->zc);
	else
		fletcher_4_incremental_native(rv, len, &ra->zc);
	return (rv);
}
909 | |
/*
 * Byteswap one replay record in place, used when receiving a stream
 * generated on a machine of the opposite endianness.  Only the fields
 * belonging to the record's type are swapped.
 */
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_version);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		break;
	}
#undef DO64
#undef DO32
}
958 | |
959 static int | |
960 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) | |
961 { | |
962 int err; | |
963 dmu_tx_t *tx; | |
964 | |
965 err = dmu_object_info(os, drro->drr_object, NULL); | |
966 | |
967 if (err != 0 && err != ENOENT) | |
968 return (EINVAL); | |
969 | |
970 if (drro->drr_type == DMU_OT_NONE || | |
971 drro->drr_type >= DMU_OT_NUMTYPES || | |
972 drro->drr_bonustype >= DMU_OT_NUMTYPES || | |
973 drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || | |
974 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || | |
975 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || | |
976 drro->drr_blksz < SPA_MINBLOCKSIZE || | |
977 drro->drr_blksz > SPA_MAXBLOCKSIZE || | |
978 drro->drr_bonuslen > DN_MAX_BONUSLEN) { | |
979 return (EINVAL); | |
980 } | |
981 | |
982 tx = dmu_tx_create(os); | |
983 | |
984 if (err == ENOENT) { | |
985 /* currently free, want to be allocated */ | |
986 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); | |
987 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); | |
988 err = dmu_tx_assign(tx, TXG_WAIT); | |
989 if (err) { | |
990 dmu_tx_abort(tx); | |
991 return (err); | |
992 } | |
993 err = dmu_object_claim(os, drro->drr_object, | |
994 drro->drr_type, drro->drr_blksz, | |
995 drro->drr_bonustype, drro->drr_bonuslen, tx); | |
996 } else { | |
997 /* currently allocated, want to be allocated */ | |
998 dmu_tx_hold_bonus(tx, drro->drr_object); | |
999 /* | |
1000 * We may change blocksize, so need to | |
1001 * hold_write | |
1002 */ | |
1003 dmu_tx_hold_write(tx, drro->drr_object, 0, 1); | |
1004 err = dmu_tx_assign(tx, TXG_WAIT); | |
1005 if (err) { | |
1006 dmu_tx_abort(tx); | |
1007 return (err); | |
1008 } | |
1009 | |
1010 err = dmu_object_reclaim(os, drro->drr_object, | |
1011 drro->drr_type, drro->drr_blksz, | |
1012 drro->drr_bonustype, drro->drr_bonuslen, tx); | |
1013 } | |
1014 if (err) { | |
1015 dmu_tx_commit(tx); | |
1016 return (EINVAL); | |
1017 } | |
1018 | |
1019 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); | |
1020 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); | |
1021 | |
1022 if (drro->drr_bonuslen) { | |
1023 dmu_buf_t *db; | |
1024 void *data; | |
1544 | 1025 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); |
789 | 1026 dmu_buf_will_dirty(db, tx); |
1027 | |
1028 ASSERT3U(db->db_size, ==, drro->drr_bonuslen); | |
1029 data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); | |
1030 if (data == NULL) { | |
1031 dmu_tx_commit(tx); | |
1032 return (ra->err); | |
1033 } | |
1034 bcopy(data, db->db_data, db->db_size); | |
1035 if (ra->byteswap) { | |
1036 dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, | |
1037 drro->drr_bonuslen); | |
1038 } | |
1544 | 1039 dmu_buf_rele(db, FTAG); |
789 | 1040 } |
1041 dmu_tx_commit(tx); | |
1042 return (0); | |
1043 } | |
1044 | |
1045 /* ARGSUSED */ | |
1046 static int | |
1047 restore_freeobjects(struct restorearg *ra, objset_t *os, | |
1048 struct drr_freeobjects *drrfo) | |
1049 { | |
1050 uint64_t obj; | |
1051 | |
1052 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) | |
1053 return (EINVAL); | |
1054 | |
1055 for (obj = drrfo->drr_firstobj; | |
1056 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) { | |
1057 dmu_tx_t *tx; | |
1058 int err; | |
1059 | |
1060 if (dmu_object_info(os, obj, NULL) != 0) | |
1061 continue; | |
1062 | |
1063 tx = dmu_tx_create(os); | |
1064 dmu_tx_hold_bonus(tx, obj); | |
1065 err = dmu_tx_assign(tx, TXG_WAIT); | |
1066 if (err) { | |
1067 dmu_tx_abort(tx); | |
1068 return (err); | |
1069 } | |
1070 err = dmu_object_free(os, obj, tx); | |
1071 dmu_tx_commit(tx); | |
1072 if (err && err != ENOENT) | |
1073 return (EINVAL); | |
1074 } | |
1075 return (0); | |
1076 } | |
1077 | |
1078 static int | |
1079 restore_write(struct restorearg *ra, objset_t *os, | |
1080 struct drr_write *drrw) | |
1081 { | |
1082 dmu_tx_t *tx; | |
1083 void *data; | |
1084 int err; | |
1085 | |
1086 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || | |
1087 drrw->drr_type >= DMU_OT_NUMTYPES) | |
1088 return (EINVAL); | |
1089 | |
1090 data = restore_read(ra, drrw->drr_length); | |
1091 if (data == NULL) | |
1092 return (ra->err); | |
1093 | |
1094 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) | |
1095 return (EINVAL); | |
1096 | |
1097 tx = dmu_tx_create(os); | |
1098 | |
1099 dmu_tx_hold_write(tx, drrw->drr_object, | |
1100 drrw->drr_offset, drrw->drr_length); | |
1101 err = dmu_tx_assign(tx, TXG_WAIT); | |
1102 if (err) { | |
1103 dmu_tx_abort(tx); | |
1104 return (err); | |
1105 } | |
1106 if (ra->byteswap) | |
1107 dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); | |
1108 dmu_write(os, drrw->drr_object, | |
1109 drrw->drr_offset, drrw->drr_length, data, tx); | |
1110 dmu_tx_commit(tx); | |
1111 return (0); | |
1112 } | |
1113 | |
1114 /* ARGSUSED */ | |
1115 static int | |
1116 restore_free(struct restorearg *ra, objset_t *os, | |
1117 struct drr_free *drrf) | |
1118 { | |
1119 dmu_tx_t *tx; | |
1120 int err; | |
1121 | |
1122 if (drrf->drr_length != -1ULL && | |
1123 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) | |
1124 return (EINVAL); | |
1125 | |
1126 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) | |
1127 return (EINVAL); | |
1128 | |
1129 tx = dmu_tx_create(os); | |
1130 | |
1131 dmu_tx_hold_free(tx, drrf->drr_object, | |
1132 drrf->drr_offset, drrf->drr_length); | |
1133 err = dmu_tx_assign(tx, TXG_WAIT); | |
1134 if (err) { | |
1135 dmu_tx_abort(tx); | |
1136 return (err); | |
1137 } | |
1544 | 1138 err = dmu_free_range(os, drrf->drr_object, |
789 | 1139 drrf->drr_offset, drrf->drr_length, tx); |
1140 dmu_tx_commit(tx); | |
1544 | 1141 return (err); |
789 | 1142 } |
1143 | |
1144 int | |
1544 | 1145 dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, |
789 | 1146 vnode_t *vp, uint64_t voffset) |
1147 { | |
1148 struct restorearg ra; | |
1149 dmu_replay_record_t *drr; | |
1544 | 1150 char *cp; |
789 | 1151 objset_t *os = NULL; |
1544 | 1152 zio_cksum_t pzc; |
789 | 1153 |
1154 bzero(&ra, sizeof (ra)); | |
1155 ra.vp = vp; | |
1156 ra.voff = voffset; | |
1157 ra.bufsize = 1<<20; | |
1158 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); | |
1159 | |
1160 if (drrb->drr_magic == DMU_BACKUP_MAGIC) { | |
1161 ra.byteswap = FALSE; | |
1162 } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { | |
1163 ra.byteswap = TRUE; | |
1164 } else { | |
1165 ra.err = EINVAL; | |
1166 goto out; | |
1167 } | |
1168 | |
1544 | 1169 /* |
1170 * NB: this assumes that struct drr_begin will be the largest in | |
1171 * dmu_replay_record_t's drr_u, and thus we don't need to pad it | |
1172 * with zeros to make it the same length as we wrote out. | |
1173 */ | |
1174 ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; | |
1175 ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; | |
1176 ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; | |
1177 if (ra.byteswap) { | |
1178 fletcher_4_incremental_byteswap(ra.buf, | |
1179 sizeof (dmu_replay_record_t), &ra.zc); | |
1180 } else { | |
1181 fletcher_4_incremental_native(ra.buf, | |
1182 sizeof (dmu_replay_record_t), &ra.zc); | |
1183 } | |
1184 (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ | |
1185 | |
789 | 1186 if (ra.byteswap) { |
1187 drrb->drr_magic = BSWAP_64(drrb->drr_magic); | |
1188 drrb->drr_version = BSWAP_64(drrb->drr_version); | |
1189 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); | |
1190 drrb->drr_type = BSWAP_32(drrb->drr_type); | |
1191 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); | |
1192 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); | |
1193 } | |
1194 | |
1195 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); | |
1196 | |
1197 if (drrb->drr_version != DMU_BACKUP_VERSION || | |
1198 drrb->drr_type >= DMU_OST_NUMTYPES || | |
1199 strchr(drrb->drr_toname, '@') == NULL) { | |
1200 ra.err = EINVAL; | |
1201 goto out; | |
1202 } | |
1203 | |
1204 /* | |
1205 * Process the begin in syncing context. | |
1206 */ | |
1207 if (drrb->drr_fromguid) { | |
1208 /* incremental backup */ | |
2199 | 1209 dsl_dataset_t *ds = NULL; |
789 | 1210 |
1211 cp = strchr(tosnap, '@'); | |
1212 *cp = '\0'; | |
2199 | 1213 ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds); |
789 | 1214 *cp = '@'; |
1544 | 1215 if (ra.err) |
789 | 1216 goto out; |
1217 | |
2199 | 1218 ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool, |
1219 replay_incremental_check, replay_incremental_sync, | |
1220 ds, drrb, 1); | |
1221 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); | |
789 | 1222 } else { |
1223 /* full backup */ | |
2199 | 1224 dsl_dir_t *dd = NULL; |
789 | 1225 const char *tail; |
1226 | |
2199 | 1227 /* can't restore full backup into topmost fs, for now */ |
1228 if (strrchr(drrb->drr_toname, '/') == NULL) { | |
1229 ra.err = EINVAL; | |
1230 goto out; | |
1231 } | |
1232 | |
789 | 1233 cp = strchr(tosnap, '@'); |
1234 *cp = '\0'; | |
1544 | 1235 ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); |
789 | 1236 *cp = '@'; |
1544 | 1237 if (ra.err) |
789 | 1238 goto out; |
1239 if (tail == NULL) { | |
1240 ra.err = EEXIST; | |
1241 goto out; | |
1242 } | |
1243 | |
2199 | 1244 ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check, |
1245 replay_full_sync, dd, drrb, 5); | |
1246 dsl_dir_close(dd, FTAG); | |
789 | 1247 } |
1248 if (ra.err) | |
1249 goto out; | |
1250 | |
1251 /* | |
1252 * Open the objset we are modifying. | |
1253 */ | |
1254 | |
1255 cp = strchr(tosnap, '@'); | |
1256 *cp = '\0'; | |
1257 ra.err = dmu_objset_open(tosnap, DMU_OST_ANY, | |
1731
1efa8b3d1296
6402598 'zfs destroy <fs>' can take a long time, stopping up the txg train
bonwick
parents:
1630
diff
changeset
|
1258 DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os); |
789 | 1259 *cp = '@'; |
1260 ASSERT3U(ra.err, ==, 0); | |
1261 | |
1262 /* | |
1263 * Read records and process them. | |
1264 */ | |
1544 | 1265 pzc = ra.zc; |
789 | 1266 while (ra.err == 0 && |
1267 NULL != (drr = restore_read(&ra, sizeof (*drr)))) { | |
1544 | 1268 if (issig(JUSTLOOKING) && issig(FORREAL)) { |
789 | 1269 ra.err = EINTR; |
1270 goto out; | |
1271 } | |
1272 | |
1273 if (ra.byteswap) | |
1274 backup_byteswap(drr); | |
1275 | |
1276 switch (drr->drr_type) { | |
1277 case DRR_OBJECT: | |
1278 { | |
1279 /* | |
1280 * We need to make a copy of the record header, | |
1281 * because restore_{object,write} may need to | |
1282 * restore_read(), which will invalidate drr. | |
1283 */ | |
1284 struct drr_object drro = drr->drr_u.drr_object; | |
1285 ra.err = restore_object(&ra, os, &drro); | |
1286 break; | |
1287 } | |
1288 case DRR_FREEOBJECTS: | |
1289 { | |
1290 struct drr_freeobjects drrfo = | |
1291 drr->drr_u.drr_freeobjects; | |
1292 ra.err = restore_freeobjects(&ra, os, &drrfo); | |
1293 break; | |
1294 } | |
1295 case DRR_WRITE: | |
1296 { | |
1297 struct drr_write drrw = drr->drr_u.drr_write; | |
1298 ra.err = restore_write(&ra, os, &drrw); | |
1299 break; | |
1300 } | |
1301 case DRR_FREE: | |
1302 { | |
1303 struct drr_free drrf = drr->drr_u.drr_free; | |
1304 ra.err = restore_free(&ra, os, &drrf); | |
1305 break; | |
1306 } | |
1307 case DRR_END: | |
1544 | 1308 { |
1309 struct drr_end drre = drr->drr_u.drr_end; | |
1310 /* | |
1311 * We compare against the *previous* checksum | |
1312 * value, because the stored checksum is of | |
1313 * everything before the DRR_END record. | |
1314 */ | |
1315 if (drre.drr_checksum.zc_word[0] != 0 && | |
1316 ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) | | |
1317 (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) | | |
1318 (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) | | |
1319 (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) { | |
1320 ra.err = ECKSUM; | |
1321 goto out; | |
1322 } | |
1323 | |
2199 | 1324 ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> |
1325 ds_dir->dd_pool, replay_end_check, replay_end_sync, | |
1326 os, drrb, 3); | |
789 | 1327 goto out; |
1544 | 1328 } |
789 | 1329 default: |
1330 ra.err = EINVAL; | |
1331 goto out; | |
1332 } | |
1544 | 1333 pzc = ra.zc; |
789 | 1334 } |
1335 | |
1336 out: | |
1337 if (os) | |
1338 dmu_objset_close(os); | |
1339 | |
1340 /* | |
1341 * Make sure we don't rollback/destroy unless we actually | |
1342 * processed the begin properly. 'os' will only be set if this | |
1343 * is the case. | |
1344 */ | |
2199 | 1345 if (ra.err && os && tosnap && strchr(tosnap, '@')) { |
789 | 1346 /* |
1347 * rollback or destroy what we created, so we don't | |
1348 * leave it in the restoring state. | |
1349 */ | |
2199 | 1350 dsl_dataset_t *ds; |
1351 int err; | |
1352 | |
1353 cp = strchr(tosnap, '@'); | |
1354 *cp = '\0'; | |
1355 err = dsl_dataset_open(tosnap, | |
1356 DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, | |
1357 FTAG, &ds); | |
1358 if (err == 0) { | |
1359 txg_wait_synced(ds->ds_dir->dd_pool, 0); | |
1360 if (drrb->drr_fromguid) { | |
1361 /* incremental: rollback to most recent snap */ | |
1362 (void) dsl_dataset_rollback(ds); | |
1363 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); | |
1364 } else { | |
1365 /* full: destroy whole fs */ | |
1366 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); | |
1367 (void) dsl_dataset_destroy(tosnap); | |
789 | 1368 } |
1369 } | |
2199 | 1370 *cp = '@'; |
789 | 1371 } |
1372 | |
1373 kmem_free(ra.buf, ra.bufsize); | |
1374 if (sizep) | |
1375 *sizep = ra.voff; | |
1376 return (ra.err); | |
1377 } | |
1378 | |
1379 /* | |
1380 * Intent log support: sync the block at <os, object, offset> to disk. | |
1381 * N.B. and XXX: the caller is responsible for serializing dmu_sync()s | |
1382 * of the same block, and for making sure that the data isn't changing | |
1383 * while dmu_sync() is writing it. | |
1384 * | |
1385 * Return values: | |
1386 * | |
1387 * EALREADY: this txg has already been synced, so there's nothing to to. | |
1388 * The caller should not log the write. | |
1389 * | |
1390 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. | |
1391 * The caller should not log the write. | |
1392 * | |
1393 * EINPROGRESS: the block is in the process of being synced by the | |
1394 * usual mechanism (spa_sync()), so we can't sync it here. | |
1395 * The caller should txg_wait_synced() and not log the write. | |
1396 * | |
1397 * EBUSY: another thread is trying to dmu_sync() the same dbuf. | |
1398 * (This case cannot arise under the current locking rules.) | |
1399 * The caller should txg_wait_synced() and not log the write. | |
1400 * | |
1401 * ESTALE: the block was dirtied or freed while we were writing it, | |
1402 * so the data is no longer valid. | |
1403 * The caller should txg_wait_synced() and not log the write. | |
1404 * | |
1405 * 0: success. Sets *bp to the blkptr just written, and sets | |
1406 * *blkoff to the data's offset within that block. | |
1407 * The caller should log this blkptr/blkoff in its lr_write_t. | |
1408 */ | |
1409 int | |
1410 dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, | |
1411 blkptr_t *bp, uint64_t txg) | |
1412 { | |
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1413 objset_impl_t *osi = os->os; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1414 dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool; |
789 | 1415 tx_state_t *tx = &dp->dp_tx; |
1416 dmu_buf_impl_t *db; | |
1417 blkptr_t *blk; | |
1418 int err; | |
1544 | 1419 zbookmark_t zb; |
789 | 1420 |
1421 ASSERT(RW_LOCK_HELD(&tx->tx_suspend)); | |
1422 ASSERT(BP_IS_HOLE(bp)); | |
1423 ASSERT(txg != 0); | |
1424 | |
1425 dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", | |
1426 txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); | |
1427 | |
1428 /* | |
1544 | 1429 * XXX why is this routine using dmu_buf_*() and casting between |
1430 * dmu_buf_impl_t and dmu_buf_t? | |
1431 */ | |
1432 | |
1433 /* | |
789 | 1434 * If this txg already synced, there's nothing to do. |
1435 */ | |
1436 if (txg <= tx->tx_synced_txg) { | |
1437 /* | |
1438 * If we're running ziltest, we need the blkptr regardless. | |
1439 */ | |
1440 if (txg > spa_freeze_txg(dp->dp_spa)) { | |
1544 | 1441 err = dmu_buf_hold(os, object, offset, |
1442 FTAG, (dmu_buf_t **)&db); | |
1443 if (err) | |
1444 return (err); | |
789 | 1445 /* if db_blkptr == NULL, this was an empty write */ |
1446 if (db->db_blkptr) | |
1447 *bp = *db->db_blkptr; /* structure assignment */ | |
1448 else | |
1449 bzero(bp, sizeof (blkptr_t)); | |
1450 *blkoff = offset - db->db.db_offset; | |
1451 ASSERT3U(*blkoff, <, db->db.db_size); | |
1544 | 1452 dmu_buf_rele((dmu_buf_t *)db, FTAG); |
789 | 1453 return (0); |
1454 } | |
1455 return (EALREADY); | |
1456 } | |
1457 | |
1458 /* | |
1459 * If this txg is in the middle of syncing, just wait for it. | |
1460 */ | |
1461 if (txg == tx->tx_syncing_txg) { | |
1462 ASSERT(txg != tx->tx_open_txg); | |
1463 return (EINPROGRESS); | |
1464 } | |
1465 | |
1544 | 1466 err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db); |
1467 if (err) | |
1468 return (err); | |
789 | 1469 |
1470 mutex_enter(&db->db_mtx); | |
1471 | |
1472 /* | |
1473 * If this dbuf isn't dirty, must have been free_range'd. | |
1474 * There's no need to log writes to freed blocks, so we're done. | |
1475 */ | |
1476 if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) { | |
1477 mutex_exit(&db->db_mtx); | |
1544 | 1478 dmu_buf_rele((dmu_buf_t *)db, FTAG); |
789 | 1479 return (ENOENT); |
1480 } | |
1481 | |
1482 blk = db->db_d.db_overridden_by[txg&TXG_MASK]; | |
1483 | |
1484 /* | |
1485 * If we already did a dmu_sync() of this dbuf in this txg, | |
1486 * free the old block before writing the new one. | |
1487 */ | |
1488 if (blk != NULL) { | |
1489 ASSERT(blk != IN_DMU_SYNC); | |
1490 if (blk == IN_DMU_SYNC) { | |
1491 mutex_exit(&db->db_mtx); | |
1544 | 1492 dmu_buf_rele((dmu_buf_t *)db, FTAG); |
789 | 1493 return (EBUSY); |
1494 } | |
1495 arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); | |
1496 if (!BP_IS_HOLE(blk)) { | |
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1497 (void) arc_free(NULL, osi->os_spa, txg, blk, |
789 | 1498 NULL, NULL, ARC_WAIT); |
1499 } | |
1500 kmem_free(blk, sizeof (blkptr_t)); | |
1501 } | |
1502 | |
1503 db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC; | |
1504 mutex_exit(&db->db_mtx); | |
1505 | |
1506 blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); | |
1507 blk->blk_birth = 0; /* mark as invalid */ | |
1508 | |
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1509 zb.zb_objset = osi->os_dsl_dataset->ds_object; |
1544 | 1510 zb.zb_object = db->db.db_object; |
1511 zb.zb_level = db->db_level; | |
1512 zb.zb_blkid = db->db_blkid; | |
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1513 err = arc_write(NULL, osi->os_spa, |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1514 zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum), |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1515 zio_compress_select(db->db_dnode->dn_compress, osi->os_compress), |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1516 dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type), |
789 | 1517 txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL, |
1544 | 1518 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); |
789 | 1519 ASSERT(err == 0); |
1520 | |
1521 if (!BP_IS_HOLE(blk)) { | |
1522 blk->blk_fill = 1; | |
1523 BP_SET_TYPE(blk, db->db_dnode->dn_type); | |
1524 BP_SET_LEVEL(blk, 0); | |
1525 } | |
1526 | |
1527 /* copy the block pointer back to caller */ | |
1528 *bp = *blk; /* structure assignment */ | |
1529 *blkoff = offset - db->db.db_offset; | |
1530 ASSERT3U(*blkoff, <, db->db.db_size); | |
1531 | |
1532 mutex_enter(&db->db_mtx); | |
1533 if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) { | |
1534 /* we were dirtied/freed during the sync */ | |
1535 ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL); | |
1536 arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); | |
1537 mutex_exit(&db->db_mtx); | |
1544 | 1538 dmu_buf_rele((dmu_buf_t *)db, FTAG); |
789 | 1539 /* Note that this block does not free on disk until txg syncs */ |
1540 | |
1541 /* | |
1542 * XXX can we use ARC_NOWAIT here? | |
1543 * XXX should we be ignoring the return code? | |
1544 */ | |
1545 if (!BP_IS_HOLE(blk)) { | |
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1546 (void) arc_free(NULL, osi->os_spa, txg, blk, |
789 | 1547 NULL, NULL, ARC_WAIT); |
1548 } | |
1549 kmem_free(blk, sizeof (blkptr_t)); | |
1550 return (ESTALE); | |
1551 } | |
1552 | |
1553 db->db_d.db_overridden_by[txg&TXG_MASK] = blk; | |
1554 mutex_exit(&db->db_mtx); | |
1544 | 1555 dmu_buf_rele((dmu_buf_t *)db, FTAG); |
789 | 1556 ASSERT3U(txg, >, tx->tx_syncing_txg); |
1557 return (0); | |
1558 } | |
1559 | |
1560 uint64_t | |
1561 dmu_object_max_nonzero_offset(objset_t *os, uint64_t object) | |
1562 { | |
1544 | 1563 dnode_t *dn; |
1564 | |
1565 /* XXX assumes dnode_hold will not get an i/o error */ | |
1566 (void) dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1567 uint64_t rv = dnode_max_nonzero_offset(dn); |
1568 dnode_rele(dn, FTAG); | |
1569 return (rv); | |
1570 } | |
1571 | |
1572 int | |
1573 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, | |
1574 dmu_tx_t *tx) | |
1575 { | |
1544 | 1576 dnode_t *dn; |
1577 int err; | |
1578 | |
1579 err = dnode_hold(os->os, object, FTAG, &dn); | |
1580 if (err) | |
1581 return (err); | |
1582 err = dnode_set_blksz(dn, size, ibs, tx); | |
789 | 1583 dnode_rele(dn, FTAG); |
1584 return (err); | |
1585 } | |
1586 | |
1587 void | |
1588 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, | |
1589 dmu_tx_t *tx) | |
1590 { | |
1544 | 1591 dnode_t *dn; |
1592 | |
1593 /* XXX assumes dnode_hold will not get an i/o error */ | |
1594 (void) dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1595 ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); |
1596 dn->dn_checksum = checksum; | |
1597 dnode_setdirty(dn, tx); | |
1598 dnode_rele(dn, FTAG); | |
1599 } | |
1600 | |
1601 void | |
1602 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, | |
1603 dmu_tx_t *tx) | |
1604 { | |
1544 | 1605 dnode_t *dn; |
1606 | |
1607 /* XXX assumes dnode_hold will not get an i/o error */ | |
1608 (void) dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1609 ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); |
1610 dn->dn_compress = compress; | |
1611 dnode_setdirty(dn, tx); | |
1612 dnode_rele(dn, FTAG); | |
1613 } | |
1614 | |
1775
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1615 /* |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1616 * XXX - eventually, this should take into account per-dataset (or |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1617 * even per-object?) user requests for higher levels of replication. |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1618 */ |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1619 int |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1620 dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1621 { |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1622 int ncopies = 1; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1623 |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1624 if (dmu_ot[ot].ot_metadata) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1625 ncopies++; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1626 if (zb->zb_level != 0) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1627 ncopies++; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1628 if (zb->zb_objset == 0 && zb->zb_object == 0) |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1629 ncopies++; |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1630 return (MIN(ncopies, spa_max_replication(spa))); |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1631 } |
e51e26b432c0
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
billm
parents:
1731
diff
changeset
|
1632 |
789 | 1633 int |
1634 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) | |
1635 { | |
1636 dnode_t *dn; | |
1637 int i, err; | |
1638 | |
1544 | 1639 err = dnode_hold(os->os, object, FTAG, &dn); |
1640 if (err) | |
1641 return (err); | |
789 | 1642 /* |
1643 * Sync any current changes before | |
1644 * we go trundling through the block pointers. | |
1645 */ | |
1646 for (i = 0; i < TXG_SIZE; i++) { | |
1596
2e2377ccbf85
6395371 ASSERT in dmu_tx_count_free: blkid + i < dn->dn_phys->dn_nblkptr
ahrens
parents:
1544
diff
changeset
|
1647 if (list_link_active(&dn->dn_dirty_link[i])) |
789 | 1648 break; |
1649 } | |
1650 if (i != TXG_SIZE) { | |
1651 dnode_rele(dn, FTAG); | |
1652 txg_wait_synced(dmu_objset_pool(os), 0); | |
1544 | 1653 err = dnode_hold(os->os, object, FTAG, &dn); |
1654 if (err) | |
1655 return (err); | |
789 | 1656 } |
1657 | |
1658 err = dnode_next_offset(dn, hole, off, 1, 1); | |
1659 dnode_rele(dn, FTAG); | |
1660 | |
1661 return (err); | |
1662 } | |
1663 | |
1664 void | |
1665 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) | |
1666 { | |
1667 rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
1668 mutex_enter(&dn->dn_mtx); | |
1669 | |
1670 doi->doi_data_block_size = dn->dn_datablksz; | |
1671 doi->doi_metadata_block_size = dn->dn_indblkshift ? | |
1672 1ULL << dn->dn_indblkshift : 0; | |
1673 doi->doi_indirection = dn->dn_nlevels; | |
1674 doi->doi_checksum = dn->dn_checksum; | |
1675 doi->doi_compress = dn->dn_compress; | |
2082 | 1676 doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + |
1677 SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; | |
789 | 1678 doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; |
1679 doi->doi_type = dn->dn_type; | |
1680 doi->doi_bonus_size = dn->dn_bonuslen; | |
1681 doi->doi_bonus_type = dn->dn_bonustype; | |
1682 | |
1683 mutex_exit(&dn->dn_mtx); | |
1684 rw_exit(&dn->dn_struct_rwlock); | |
1685 } | |
1686 | |
1687 /* | |
1688 * Get information on a DMU object. | |
1689 * If doi is NULL, just indicates whether the object exists. | |
1690 */ | |
1691 int | |
1692 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) | |
1693 { | |
1544 | 1694 dnode_t *dn; |
1695 int err = dnode_hold(os->os, object, FTAG, &dn); | |
789 | 1696 |
1544 | 1697 if (err) |
1698 return (err); | |
789 | 1699 |
1700 if (doi != NULL) | |
1701 dmu_object_info_from_dnode(dn, doi); | |
1702 | |
1703 dnode_rele(dn, FTAG); | |
1704 return (0); | |
1705 } | |
1706 | |
1707 /* | |
1708 * As above, but faster; can be used when you have a held dbuf in hand. | |
1709 */ | |
1710 void | |
1711 dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) | |
1712 { | |
1713 dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); | |
1714 } | |
1715 | |
1716 /* | |
1717 * Faster still when you only care about the size. | |
1718 * This is specifically optimized for zfs_getattr(). | |
1719 */ | |
1720 void | |
1721 dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) | |
1722 { | |
1723 dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; | |
1724 | |
1725 *blksize = dn->dn_datablksz; | |
2082 | 1726 /* add 1 for dnode space */ |
1727 *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> | |
1728 SPA_MINBLOCKSHIFT) + 1; | |
789 | 1729 } |
1730 | |
1544 | 1731 /* |
1732 * Given a bookmark, return the name of the dataset, object, and range in | |
1733 * human-readable format. | |
1734 */ | |
1735 int | |
1736 spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen, | |
1737 char *objname, size_t objlen, char *range, size_t rangelen) | |
1738 { | |
1739 dsl_pool_t *dp; | |
1740 dsl_dataset_t *ds = NULL; | |
1741 objset_t *os = NULL; | |
1742 dnode_t *dn = NULL; | |
1743 int err, shift; | |
1744 | |
1745 if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64) | |
1746 return (ENOSPC); | |
1747 | |
1748 dp = spa_get_dsl(spa); | |
1749 if (zb->zb_objset != 0) { | |
1750 rw_enter(&dp->dp_config_rwlock, RW_READER); | |
1751 err = dsl_dataset_open_obj(dp, zb->zb_objset, | |
1752 NULL, DS_MODE_NONE, FTAG, &ds); | |
1753 if (err) { | |
1754 rw_exit(&dp->dp_config_rwlock); | |
1755 return (err); | |
1756 } | |
1757 dsl_dataset_name(ds, dsname); | |
1758 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); | |
1759 rw_exit(&dp->dp_config_rwlock); | |
1760 | |
1761 err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os); | |
1762 if (err) | |
1763 goto out; | |
1764 | |
1765 } else { | |
1766 dsl_dataset_name(NULL, dsname); | |
1767 os = dp->dp_meta_objset; | |
1768 } | |
1769 | |
1770 | |
1771 if (zb->zb_object == DMU_META_DNODE_OBJECT) { | |
1772 (void) strncpy(objname, "mdn", objlen); | |
1773 } else { | |
1774 (void) snprintf(objname, objlen, "%lld", | |
1775 (longlong_t)zb->zb_object); | |
1776 } | |
1777 | |
1778 err = dnode_hold(os->os, zb->zb_object, FTAG, &dn); | |
1779 if (err) | |
1780 goto out; | |
1781 | |
1782 shift = (dn->dn_datablkshift?dn->dn_datablkshift:SPA_MAXBLOCKSHIFT) + | |
1783 zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); | |
1784 (void) snprintf(range, rangelen, "%llu-%llu", | |
1785 (u_longlong_t)(zb->zb_blkid << shift), | |
1786 (u_longlong_t)((zb->zb_blkid+1) << shift)); | |
1787 | |
1788 out: | |
1789 if (dn) | |
1790 dnode_rele(dn, FTAG); | |
1791 if (os && os != dp->dp_meta_objset) | |
1792 dmu_objset_close(os); | |
1793 return (err); | |
1794 } | |
1795 | |
789 | 1796 void |
1797 byteswap_uint64_array(void *vbuf, size_t size) | |
1798 { | |
1799 uint64_t *buf = vbuf; | |
1800 size_t count = size >> 3; | |
1801 int i; | |
1802 | |
1803 ASSERT((size & 7) == 0); | |
1804 | |
1805 for (i = 0; i < count; i++) | |
1806 buf[i] = BSWAP_64(buf[i]); | |
1807 } | |
1808 | |
1809 void | |
1810 byteswap_uint32_array(void *vbuf, size_t size) | |
1811 { | |
1812 uint32_t *buf = vbuf; | |
1813 size_t count = size >> 2; | |
1814 int i; | |
1815 | |
1816 ASSERT((size & 3) == 0); | |
1817 | |
1818 for (i = 0; i < count; i++) | |
1819 buf[i] = BSWAP_32(buf[i]); | |
1820 } | |
1821 | |
1822 void | |
1823 byteswap_uint16_array(void *vbuf, size_t size) | |
1824 { | |
1825 uint16_t *buf = vbuf; | |
1826 size_t count = size >> 1; | |
1827 int i; | |
1828 | |
1829 ASSERT((size & 1) == 0); | |
1830 | |
1831 for (i = 0; i < count; i++) | |
1832 buf[i] = BSWAP_16(buf[i]); | |
1833 } | |
1834 | |
1835 /* ARGSUSED */ | |
1836 void | |
1837 byteswap_uint8_array(void *vbuf, size_t size) | |
1838 { | |
1839 } | |
1840 | |
/*
 * Bring up the DMU subsystems bottom-up: dbuf, dnode, then the ARC.
 */
void
dmu_init(void)
{
	dbuf_init();
	dnode_init();
	arc_init();
}
1848 | |
/*
 * Tear down the DMU subsystems in the reverse order of dmu_init().
 */
void
dmu_fini(void)
{
	arc_fini();
	dnode_fini();
	dbuf_fini();
}