Mercurial > illumos > illumos-gate

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * Copyright (c) 1980, 1986, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that: (1) source distributions retain this entire copyright
 * notice and comment, and (2) distributions including binaries display
 * the following acknowledgement:  ``This product includes software
 * developed by the University of California, Berkeley and its contributors''
 * in the documentation or other materials provided with the distribution
 * and in all advertising materials mentioning features or use of this
 * software. Neither the name of the University nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdarg.h>
#include <libadm.h>
#include <note.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/mntent.h>
#include <sys/filio.h>
#include <sys/fs/ufs_fs.h>
#include <sys/vnode.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_log.h>
#define	_KERNEL
#include <sys/fs/ufs_fsdir.h>
#undef _KERNEL
#include <sys/mnttab.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <ctype.h>
#include <sys/vfstab.h>
#include <sys/lockfs.h>
#include <errno.h>
#include <sys/cmn_err.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/efi_partition.h>
#include <fslib.h>
#include <inttypes.h>
#include "fsck.h"

/*
 * Mount point of the file system being checked.  Stays NULL until
 * mounted() finds a matching mnttab entry and saves a copy of its
 * mount point path here.
 */
caddr_t mount_point = NULL;

static int64_t diskreads, totalreads;	/* Disk cache statistics */

/*
 * Prototypes for the routines that are private to this file.
 */
static int log_checksum(int32_t *, int32_t *, int);
static void vdirerror(fsck_ino_t, caddr_t, va_list);
static struct mnttab *search_mnttab(caddr_t, caddr_t, caddr_t, size_t);
static struct vfstab *search_vfstab(caddr_t, caddr_t, caddr_t, size_t);
static void vpwarn(caddr_t, va_list);
static int getline(FILE *, caddr_t, int);
static struct bufarea *alloc_bufarea(void);
static void rwerror(caddr_t, diskaddr_t, int rval);
static void debugclean(void);
static void report_io_prob(caddr_t, diskaddr_t, size_t, ssize_t);
static void freelogblk(daddr32_t);
static void verrexit(caddr_t, va_list);
static void vpfatal(caddr_t, va_list);
static diskaddr_t get_device_size(int, caddr_t);
static diskaddr_t brute_force_get_device_size(int);
static void cg_constants(int, daddr32_t *, daddr32_t *, daddr32_t *,
	    daddr32_t *, daddr32_t *, daddr32_t *);

/*
 * Report whether the mode of the given on-disk inode names a file
 * type that fsck recognizes.  Returns 1 when it does and 0 when it
 * does not; in the latter case the offending mode is printed if
 * debugging output is enabled.
 */
int
ftypeok(struct dinode *dp)
{
	int known;

	switch (dp->di_mode & IFMT) {
	case IFDIR:
	case IFREG:
	case IFBLK:
	case IFCHR:
	case IFLNK:
	case IFSOCK:
	case IFIFO:
	case IFSHAD:
	case IFATTRDIR:
		known = 1;
		break;

	default:
		known = 0;
		break;
	}

	if (!known && debug)
		(void) printf("bad file type 0%o\n", dp->di_mode);

	return (known);
}

/*
 * Report whether the inode's file type passes CHECK_ACL_ALLOWED().
 * Returns 1 if so, otherwise 0 (after noting the shadow inode number
 * and mode when debugging).
 */
int
acltypeok(struct dinode *dp)
{
	int allowed = 0;

	if (CHECK_ACL_ALLOWED(dp->di_mode & IFMT))
		allowed = 1;

	if (!allowed && debug) {
		(void) printf("bad file type for acl I=%d: 0%o\n",
		    dp->di_shadow, dp->di_mode);
	}

	return (allowed);
}

/*
 * Pose a yes/no question and return the answer: 1 for yes, 0 for no.
 *
 * The question is printed as a newline, the formatted text, then "? ".
 * It is answered "no" automatically when nflag is set or there is no
 * writable descriptor, and "yes" automatically when yflag is set.
 * Otherwise one line is read from stdin and anything starting with
 * 'y' or 'Y' counts as yes.  End-of-file on stdin goes to errexit().
 *
 * Being asked a question is treated as an error when preening
 * (reported via pfatal()) or when mflag is set (exits EXERRFATAL).
 */
NOTE(PRINTFLIKE(1))
int
reply(caddr_t fmt, ...)
{
	va_list args;
	char answer[80];

	if (preen)
		pfatal("INTERNAL ERROR: GOT TO reply() in preen mode");

	if (mflag) {
		/*
		 * The situation is unclear, so bail out directly rather
		 * than risk errexit() writing anything to the disk.
		 */
		(void) printf(
		    "\n%s: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.\n",
		    devname);
		exit(EXERRFATAL);
	}

	(void) putchar('\n');
	va_start(args, fmt);
	(void) vprintf(fmt, args);
	va_end(args);
	(void) putchar('?');
	(void) putchar(' ');

	if (nflag || fswritefd < 0) {
		(void) printf(" no\n\n");
		return (0);
	}
	if (yflag) {
		(void) printf(" yes\n\n");
		return (1);
	}

	(void) fflush(stdout);
	if (getline(stdin, answer, sizeof (answer)) == EOF)
		errexit("\n");
	(void) printf("\n");

	return (answer[0] == 'y' || answer[0] == 'Y');
}

/*
 * Read one line from FP into LOC, dropping every whitespace character
 * and storing at most MAXLEN - 1 characters; anything beyond that is
 * read and discarded.  The result is always NUL-terminated when a
 * newline is found.
 *
 * Returns the number of characters stored, or EOF if end-of-file is
 * reached before a newline (in which case LOC is not terminated).
 */
int
getline(FILE *fp, caddr_t loc, int maxlen)
{
	caddr_t out = loc;
	caddr_t limit = &loc[maxlen - 1];
	int ch;

	for (;;) {
		ch = getc(fp);
		if (ch == '\n')
			break;
		if (ch == EOF)
			return (EOF);
		if (isspace(ch))
			continue;
		if (out < limit)
			*out++ = (char)ch;
	}
	*out = '\0';
	/* LINTED pointer difference won't overflow */
	return (out - loc);
}

/*
 * Malloc buffers and set up cache.
 */
void
bufinit(void)
{
	struct bufarea *bp;
	int bufcnt, i;
	caddr_t bufp;

	bufp = malloc((size_t)sblock.fs_bsize);
	if (bufp == NULL)
		goto nomem;
	initbarea(&cgblk);
	cgblk.b_un.b_buf = bufp;
	bufhead.b_next = bufhead.b_prev = &bufhead;
	bufcnt = MAXBUFSPACE / sblock.fs_bsize;
	if (bufcnt < MINBUFS)
		bufcnt = MINBUFS;
	for (i = 0; i < bufcnt; i++) {
		bp = (struct bufarea *)malloc(sizeof (struct bufarea));
		if (bp == NULL) {
			if (i >= MINBUFS)
				goto noalloc;
			goto nomem;
		}

		bufp = malloc((size_t)sblock.fs_bsize);
		if (bufp == NULL) {
			free((void *)bp);
			if (i >= MINBUFS)
				goto noalloc;
			goto nomem;
		}
		initbarea(bp);
		bp->b_un.b_buf = bufp;
		bp->b_prev = &bufhead;
		bp->b_next = bufhead.b_next;
		bufhead.b_next->b_prev = bp;
		bufhead.b_next = bp;
	}
noalloc:
	bufhead.b_size = i;	/* save number of buffers */
	pbp = pdirbp = NULL;
	return;

nomem:
	errexit("cannot allocate buffer pool\n");
	/* NOTREACHED */
}

/*
 * Tear down what bufinit() built: walking from the tail of the ring,
 * flush each cache buffer, unlink it, and release its memory.  It is
 * a fatal error if the number of buffers released does not match the
 * count saved in bufhead.b_size.
 */
void
unbufinit(void)
{
	struct bufarea *cur, *older;
	int released = 0;

	cur = bufhead.b_prev;
	while (cur != NULL && cur != &bufhead) {
		released++;
		flush(fswritefd, cur);
		older = cur->b_prev;
		/*
		 * Unlinking isn't strictly required since the whole
		 * chain is going away, but it is harmless and keeps
		 * lint from worrying that freed memory might still be
		 * reachable.
		 */
		older->b_next = cur->b_next;
		cur->b_next->b_prev = older;
		free((void *)cur->b_un.b_buf);
		free((void *)cur);
		cur = older;
	}

	if (bufhead.b_size != released)
		errexit("Panic: cache lost %d buffers\n",
			bufhead.b_size - released);
}

/*
 * Manage a cache of directory blocks.
 *
 * Return a cache buffer holding file system block BLKNO.  A buffer
 * that already maps the block is reused; otherwise the buffer nearest
 * the tail of the ring that is not in use (or, failing that, a newly
 * allocated one) is loaded via getblk().  Either way the buffer's
 * reference count is bumped, it is moved to the head of the ring and
 * marked B_INUSE.  Callers drop their reference with brelse().
 */
struct bufarea *
getdatablk(daddr32_t blkno, size_t size)
{
	struct bufarea *bp;
	int cached = 0;

	for (bp = bufhead.b_next; bp != &bufhead; bp = bp->b_next) {
		if (bp->b_bno == fsbtodb(&sblock, blkno)) {
			cached = 1;
			break;
		}
	}

	if (!cached) {
		/* Hunt backwards from the tail for an idle buffer. */
		bp = bufhead.b_prev;
		while (bp != &bufhead && (bp->b_flags & B_INUSE) != 0)
			bp = bp->b_prev;

		if (bp == &bufhead) {
			bp = alloc_bufarea();
			if (bp == NULL) {
				errexit("deadlocked buffer pool\n");
				/* NOTREACHED */
			}
		}
		/*
		 * This routine sits at the same logical level as
		 * getblk(), so any errors are left for our caller.
		 */
		diskreads++;
		(void) getblk(bp, blkno, size);
	}

	totalreads++;
	bp->b_cnt++;

	/*
	 * Promote the buffer to the front of the ring unless it is
	 * there already.
	 */
	if (bufhead.b_next != bp) {
		bp->b_prev->b_next = bp->b_next;
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = &bufhead;
		bp->b_next = bufhead.b_next;
		bufhead.b_next->b_prev = bp;
		bufhead.b_next = bp;
	}
	bp->b_flags |= B_INUSE;
	return (bp);
}

/*
 * Drop one reference to a cache buffer.  When the count reaches zero
 * the buffer is no longer marked B_INUSE.
 */
void
brelse(struct bufarea *bp)
{
	if (--bp->b_cnt == 0)
		bp->b_flags &= ~B_INUSE;
}

/*
 * Make BP hold file system block BLK.  If it already maps that block
 * nothing is done; otherwise any pending contents are flushed and SIZE
 * bytes are read in, with the read's error count saved in b_errs.
 * Always returns BP.
 */
struct bufarea *
getblk(struct bufarea *bp, daddr32_t blk, size_t size)
{
	diskaddr_t target;

	target = fsbtodb(&sblock, blk);
	if (bp->b_bno != target) {
		flush(fswritefd, bp);
		bp->b_errs = fsck_bread(fsreadfd, bp->b_un.b_buf, target,
		    size);
		bp->b_bno = target;
		bp->b_size = size;
	}
	return (bp);
}

/*
 * Write a dirty buffer back to FD and mark it clean; a buffer that is
 * not dirty is left alone.  A buffer that was read with errors is
 * reported through pfatal() before being written.
 *
 * When the buffer is the superblock (sblk), the cylinder summary
 * area is written out afterwards as well, one file system block at a
 * time, using fswritefd.
 */
void
flush(int fd, struct bufarea *bp)
{
	caddr_t csp;
	long chunk;
	int off, blkidx;

	if (!bp->b_dirty)
		return;

	/*
	 * The buffer belongs to someone else, so leave it to whoever
	 * acquired it to cope with the underlying problem.
	 */
	if (bp->b_errs != 0)
		pfatal("WRITING ZERO'ED BLOCK %lld TO DISK\n", bp->b_bno);
	bp->b_dirty = 0;
	bp->b_errs = 0;
	bwrite(fd, bp->b_un.b_buf, bp->b_bno, (long)bp->b_size);
	if (bp != &sblk)
		return;

	/*
	 * The superblock just went out, so push its ancillary
	 * summary information out along with it.
	 */
	csp = (caddr_t)sblock.fs_u.fs_csp;
	off = 0;
	blkidx = 0;
	while (off < sblock.fs_cssize) {
		if (sblock.fs_cssize - off < sblock.fs_bsize)
			chunk = sblock.fs_cssize - off;
		else
			chunk = sblock.fs_bsize;
		bwrite(fswritefd, csp,
		    fsbtodb(&sblock, sblock.fs_csaddr + blkidx * sblock.fs_frag),
		    chunk);
		csp += chunk;
		off += sblock.fs_bsize;
		blkidx++;
	}
}

/*
 * Report a failed disk operation (MESG names it, e.g. "READ") on disk
 * block BLK.  When RVAL is -1 the errno in effect on entry is included
 * in the message.  The user is then asked whether to continue; a "no"
 * sets the exit status to EXERRFATAL and terminates via errexit().
 */
static void
rwerror(caddr_t mesg, diskaddr_t blk, int rval)
{
	int saved_errno = errno;

	if (!preen)
		(void) printf("\n");

	if (rval != -1) {
		pfatal("CANNOT %s: DISK BLOCK %lld", mesg, blk);
	} else {
		pfatal("CANNOT %s: DISK BLOCK %lld: %s",
		    mesg, blk, strerror(saved_errno));
	}

	if (reply("CONTINUE") != 0)
		return;

	exitstat = EXERRFATAL;
	errexit("Program terminated\n");
}

/*
 * Final cleanup: flush the superblock and cylinder group buffers,
 * optionally copy a backup superblock over the standard one, release
 * the buffer cache, print cache statistics when debugging, and close
 * both file system descriptors.  Does nothing if there is no
 * writable descriptor.
 */
void
ckfini(void)
{
	int64_t missrate;

	if (fswritefd < 0)
		return;

	flush(fswritefd, &sblk);

	/*
	 * If a backup superblock was in use, offer to write it to
	 * the standard location (done without asking when preening).
	 */
	if (havesb && sblk.b_bno != SBOFF / dev_bsize &&
	    (preen || reply("UPDATE STANDARD SUPERBLOCK") == 1)) {
		sblk.b_bno = SBOFF / dev_bsize;
		sbdirty();
		flush(fswritefd, &sblk);
	}

	flush(fswritefd, &cgblk);
	if (cgblk.b_un.b_buf != NULL) {
		free((void *)cgblk.b_un.b_buf);
		cgblk.b_un.b_buf = NULL;
	}
	unbufinit();
	pbp = NULL;
	pdirbp = NULL;

	if (debug) {
		/*
		 * Only reads that went through the cache are counted.
		 * Direct users of fsck_bread() or getblk() bypass the
		 * cache on purpose and so are not reflected here.
		 */
		missrate = (totalreads != 0) ?
		    diskreads * 100 / totalreads : 0;

		(void) printf("cache missed %lld of %lld reads (%lld%%)\n",
		    (longlong_t)diskreads, (longlong_t)totalreads,
		    (longlong_t)missrate);
	}

	(void) close(fsreadfd);
	(void) close(fswritefd);
	fsreadfd = -1;
	fswritefd = -1;
}

/*
 * Read SIZE bytes starting at disk block BLK from FD into BUF.
 *
 * Returns 0 when the whole request is read in one go.  A request for
 * a block below SBLOCK is not read at all: BUF is zeroed and 1 is
 * returned.  If the full-size read fails, the problem is reported,
 * BUF is zeroed, and the range is retried piecemeal; the return value
 * is then the number of pieces that could not be read, each of which
 * is listed and causes iscorrupt to be set.
 */
int
fsck_bread(int fd, caddr_t buf, diskaddr_t blk, size_t size)
{
	offset_t start = ldbtob(blk);
	offset_t secoff;
	caddr_t secbuf;
	int got;
	int sec;
	int failed;

	/*
	 * As far as we're concerned nothing lives in front of the
	 * superblock, so that space always reads as zeros.  bwrite()
	 * does the complementary thing and drops writes aimed there.
	 */
	if (blk < SBLOCK) {
		if (debug)
			(void) printf(
			    "WARNING: fsck_bread() passed blkno < %d (%lld)\n",
			    SBLOCK, (longlong_t)blk);
		(void) memset(buf, 0, (size_t)size);
		return (1);
	}

	if (llseek(fd, start, SEEK_SET) < 0) {
		rwerror("SEEK", blk, -1);
	}

	got = read(fd, buf, size);
	if (got == size) {
		return (0);
	}

	rwerror("READ", blk, got);
	if (llseek(fd, start, SEEK_SET) < 0) {
		rwerror("SEEK", blk, -1);
	}

	failed = 0;
	(void) memset(buf, 0, (size_t)size);
	pwarn("THE FOLLOWING SECTORS COULD NOT BE READ:");

	secbuf = buf;
	sec = 0;
	while (sec < btodb(size)) {
		secoff = ldbtob(blk + sec);
		if (llseek(fd, secoff, SEEK_SET) < 0 ||
		    read(fd, secbuf, (int)secsize) < 0) {
			iscorrupt = 1;
			(void) printf(" %llu", blk + (u_longlong_t)sec);
			failed++;
		}
		sec++;
		secbuf += DEV_BSIZE;
	}
	(void) printf("\n");
	return (failed);
}

/*
 * Write SIZE bytes from BUF to FD starting at disk block BLK.
 *
 * Nothing happens if FD is negative, and writes aimed below SBLOCK
 * are dropped (with a warning when debugging).  A successful write
 * sets fsmodified.  If the full-size write fails, the problem is
 * reported and the range is retried DEV_BSIZE bytes at a time; each
 * piece that fails is listed and sets iscorrupt, while each piece
 * that writes any data sets fsmodified.
 */
void
bwrite(int fd, caddr_t buf, diskaddr_t blk, int64_t size)
{
	offset_t start = ldbtob(blk);
	offset_t secoff;
	caddr_t secbuf;
	int wrote;
	int sec;
	int n;

	if (fd < 0)
		return;

	if (blk < SBLOCK) {
		if (debug)
			(void) printf(
		    "WARNING: Attempt to write illegal blkno %lld on %s\n",
			    (longlong_t)blk, devname);
		return;
	}

	if (llseek(fd, start, SEEK_SET) < 0) {
		rwerror("SEEK", blk, -1);
	}

	wrote = write(fd, buf, (int)size);
	if (wrote == size) {
		fsmodified = 1;
		return;
	}

	rwerror("WRITE", blk, wrote);
	if (llseek(fd, start, SEEK_SET) < 0) {
		rwerror("SEEK", blk, -1);
	}

	pwarn("THE FOLLOWING SECTORS COULD NOT BE WRITTEN:");
	secbuf = buf;
	for (sec = 0; sec < btodb(size); sec++) {
		n = 0;
		secoff = ldbtob(blk + sec);
		if (llseek(fd, secoff, SEEK_SET) < 0 ||
		    (n = write(fd, secbuf, DEV_BSIZE)) < 0) {
			iscorrupt = 1;
			(void) printf(" %llu", blk + (u_longlong_t)sec);
		} else if (n > 0) {
			fsmodified = 1;
		}
		secbuf += DEV_BSIZE;
	}
	(void) printf("\n");
}

/*
 * Allocates the specified number of contiguous fragments.
 */
daddr32_t
allocblk(int wantedfrags)
{
	int block, leadfrag, tailfrag;
	daddr32_t selected;
	size_t size;
	struct bufarea *bp;

	/*
	 * It's arguable whether we should just fail, or instead
	 * error out here.  Since we should only ever be asked for
	 * a single fragment or an entire block (i.e., sblock.fs_frag),
	 * we'll fail out because anything else means somebody
	 * changed code without considering all of the ramifications.
	 */
	if (wantedfrags <= 0 || wantedfrags > sblock.fs_frag) {
		exitstat = EXERRFATAL;
		errexit("allocblk() asked for %d frags.  "
			"Legal range is 1 to %d",
			wantedfrags, sblock.fs_frag);
	}

	/*
	 * For each filesystem block, look at every possible starting
	 * offset within the block such that we can get the number of
	 * contiguous fragments that we need.  This is a drastically
	 * simplified version of the kernel's mapsearch() and alloc*().
	 * It's also correspondingly slower.
	 */
	for (block = 0; block < maxfsblock - sblock.fs_frag;
	    block += sblock.fs_frag) {
		for (leadfrag = 0; leadfrag <= sblock.fs_frag - wantedfrags;
		    leadfrag++) {
			/*
			 * Is first fragment of candidate run available?
			 */
			if (testbmap(block + leadfrag))
				continue;
			/*
			 * Are the rest of them available?
			 */
			for (tailfrag = 1; tailfrag < wantedfrags; tailfrag++)
				if (testbmap(block + leadfrag + tailfrag))
					break;
			if (tailfrag < wantedfrags) {
				/*
				 * No, skip the known-unusable run.
				 */
				leadfrag += tailfrag;
				continue;
			}
			/*
			 * Found what we need, so claim them.
			 */
			for (tailfrag = 0; tailfrag < wantedfrags; tailfrag++)
				setbmap(block + leadfrag + tailfrag);
			n_blks += wantedfrags;
			size = wantedfrags * sblock.fs_fsize;
			selected = block + leadfrag;
			bp = getdatablk(selected, size);
			(void) memset((void *)bp->b_un.b_buf, 0, size);
			dirty(bp);
			brelse(bp);
			if (debug)
				(void) printf(
		    "allocblk: selected %d (in block %d), frags %d, size %d\n",
				    selected, selected % sblock.fs_bsize,
				    wantedfrags, (int)size);
			return (selected);
		}
	}
	return (0);
}

/*
 * Free a previously allocated block
 *
 * Builds an inode descriptor for FRAGS fragments starting at BLKNO,
 * owned by inode INO, and hands it to pass4check() to do the work.
 */
void
freeblk(fsck_ino_t ino, daddr32_t blkno, int frags)
{
	struct inodesc release;

	if (debug) {
		(void) printf("debug: freeing %d fragments starting at %d\n",
		    frags, blkno);
	}

	init_inodesc(&release);
	release.id_truncto = -1;
	release.id_numfrags = frags;
	release.id_blkno = blkno;
	release.id_number = ino;

	/*
	 * pass4check()'s return status means nothing for the way
	 * it's being used here, so it is deliberately discarded.
	 */
	(void) pass4check(&release);
}

/*
 * Fill NAMEBUF with a path starting in CURDIR for INO.  Assumes
 * that the given buffer is at least MAXPATHLEN + 1 characters.
 *
 * The path is assembled right-to-left at the tail of NAMEBUF, one
 * component per trip through the loop, and is slid down to the start
 * of the buffer just before returning.  The front of NAMEBUF doubles
 * as the scratch area into which findname deposits each component.
 *
 * Special results:
 *	"?"	CURDIR is 0 or fails INO_IS_DVALID()
 *	"/"	both CURDIR and INO are UFSROOTINO
 * A path that could not be traced all the way back to the root is
 * prefixed with `?'.
 */
void
getpathname(caddr_t namebuf, fsck_ino_t curdir, fsck_ino_t ino)
{
	int len;
	caddr_t cp;
	struct dinode *dp;
	struct inodesc idesc;
	struct inoinfo *inp;

	if (debug)
		(void) printf("debug: getpathname(curdir %d, ino %d)\n",
		    curdir, ino);

	if ((curdir == 0) || (!INO_IS_DVALID(curdir))) {
		(void) strcpy(namebuf, "?");
		return;
	}

	if ((curdir == UFSROOTINO) && (ino == UFSROOTINO)) {
		(void) strcpy(namebuf, "/");
		return;
	}

	init_inodesc(&idesc);
	idesc.id_type = DATA;
	/* cp marks the start of the path built so far (initially empty). */
	cp = &namebuf[MAXPATHLEN - 1];
	*cp = '\0';

	/*
	 * In the case of extended attributes, our
	 * parent won't necessarily be a directory, so just
	 * return what we've found with a prefix indicating
	 * that it's an XATTR.  Presumably our caller will
	 * know what's going on and do something useful, like
	 * work out the path of the parent and then combine
	 * the two names.
	 *
	 * Can't use strcpy(), etc, because we've probably
	 * already got some name information in the buffer and
	 * the usual trailing \0 would lose it.
	 */
	dp = ginode(curdir);
	if ((dp->di_mode & IFMT) == IFATTRDIR) {
		idesc.id_number = curdir;
		idesc.id_parent = ino;
		idesc.id_func = findname;
		idesc.id_name = namebuf;
		idesc.id_fix = NOFIX;
		if ((ckinode(dp, &idesc, CKI_TRAVERSE) & FOUND) == 0) {
			*cp-- = '?';
		}

		/* sizeof counts the literal's trailing \0; don't copy it. */
		len = sizeof (XATTR_DIR_NAME) - 1;
		cp -= len;
		(void) memmove(cp, XATTR_DIR_NAME, len);
		goto attrname;
	}

	/*
	 * If curdir == ino, need to get a handle on .. so we
	 * can search it for ino's name.  Otherwise, just search
	 * the given directory for ino.  Repeat until out of space
	 * or a full path has been built.
	 */
	if (curdir != ino) {
		idesc.id_parent = curdir;
		/* Enters the loop body below, skipping the `..' lookup. */
		goto namelookup;
	}
	while (ino != UFSROOTINO && ino != 0) {
		idesc.id_number = ino;
		idesc.id_func = findino;
		idesc.id_name = "..";
		idesc.id_fix = NOFIX;
		if ((ckinode(ginode(ino), &idesc, CKI_TRAVERSE) & FOUND) == 0) {
			/*
			 * No `..' entry was found in the directory, so
			 * fall back on the parent recorded in the
			 * inoinfo entry, if there is one.
			 */
			inp = getinoinfo(ino);
			if ((inp == NULL) || (inp->i_parent == 0)) {
				break;
			}
			idesc.id_parent = inp->i_parent;
		}

		/*
		 * To get this far, id_parent must have the inode
		 * number for `..' in it.  By definition, that's got
		 * to be a directory, so search it for the inode of
		 * interest.
		 */
namelookup:
		idesc.id_number = idesc.id_parent;
		idesc.id_parent = ino;
		idesc.id_func = findname;
		idesc.id_name = namebuf;
		idesc.id_fix = NOFIX;
		if ((ckinode(ginode(idesc.id_number),
		    &idesc, CKI_TRAVERSE) & FOUND) == 0) {
			break;
		}
		/*
		 * Prepend to what we've accumulated so far.  If
		 * there's not enough room for even one more path element
		 * (of the worst-case length), then bail out.
		 */
		len = strlen(namebuf);
		cp -= len;
		if (cp < &namebuf[MAXNAMLEN])
			break;
		(void) memmove(cp, namebuf, len);
		*--cp = '/';

		/*
		 * Corner case for a looped-to-itself directory.
		 */
		if (ino == idesc.id_number)
			break;

		/*
		 * Climb one level of the hierarchy.  In other words,
		 * the current .. becomes the inode to search for and
		 * its parent becomes the directory to search in.
		 */
		ino = idesc.id_number;
	}

	/*
	 * If we hit a discontinuity in the hierarchy, indicate it by
	 * prefixing the path so far with `?'.  Otherwise, the first
	 * character will be `/' as a side-effect of the *--cp above.
	 *
	 * The special case is to handle the situation where we're
	 * trying to look something up in UFSROOTINO, but didn't find
	 * it.
	 */
	if (ino != UFSROOTINO || cp == &namebuf[MAXPATHLEN - 1]) {
		if (cp > namebuf)
			cp--;
		*cp = '?';
	}

	/*
	 * The invariants being used for buffer integrity are:
	 * - namebuf[] is terminated with \0 before anything else
	 * - cp is always <= the last element of namebuf[]
	 * - the new path element is always stored at the
	 *   beginning of namebuf[], and is no more than MAXNAMLEN-1
	 *   characters
	 * - cp is decremented by the number of characters in
	 *   the new path element
	 * - if, after the above accounting for the new element's
	 *   size, there is no longer enough room at the beginning of
	 *   namebuf[] for a full-sized path element and a slash,
	 *   terminate the loop.  cp is in the range
	 *   &namebuf[0]..&namebuf[MAXNAMLEN - 1]
	 */
attrname:
	/* Slide the finished path down to the front of the buffer. */
	/* LINTED per the above discussion */
	(void) memmove(namebuf, cp, &namebuf[MAXPATHLEN] - cp);
}

/*
 * Clean up through ckfini() and exit with status EXSIGNAL.  The
 * argument is ignored.  The (int) signature suggests this is installed
 * as a signal handler; the registration is not visible in this part
 * of the file.
 */
/* ARGSUSED */
void
catch(int dummy)
{
	ckfini();
	exit(EXSIGNAL);
}

/*
 * When preening, allow a single quit to signal
 * a special exit after filesystem checks complete
 * so that reboot sequence may be interrupted.
 *
 * Announces what will happen, sets the `interrupted' flag, and
 * restores the default SIGQUIT disposition so that a second quit is
 * no longer caught here.  The argument is ignored.
 */
/* ARGSUSED */
void
catchquit(int dummy)
{
	(void) printf("returning to single-user after filesystem check\n");
	interrupted = 1;
	(void) signal(SIGQUIT, SIG_DFL);
}


/*
 * determine whether an inode should be fixed.
 *
 * The decision is cached in idesc->id_fix.  While it is still
 * DONTKNOW the message is printed (as a directory error for DATA
 * descriptors, otherwise as a warning) and the answer is taken to be
 * yes when preening, or else whatever the user replies to "SALVAGE".
 * Returns ALTERED when the fix should be made and 0 when it should
 * not.  An unrecognized id_fix value is fatal.
 */
NOTE(PRINTFLIKE(2))
int
dofix(struct inodesc *idesc, caddr_t msg, ...)
{
	va_list args;
	int result = 0;

	va_start(args, msg);

	if (idesc->id_fix == FIX) {
		result = ALTERED;
	} else if (idesc->id_fix == DONTKNOW) {
		if (idesc->id_type == DATA)
			vdirerror(idesc->id_number, msg, args);
		else
			vpwarn(msg, args);

		if (preen || reply("SALVAGE") != 0) {
			idesc->id_fix = FIX;
			result = ALTERED;
		} else {
			idesc->id_fix = NOFIX;
		}
	} else if (idesc->id_fix != NOFIX) {
		errexit("UNKNOWN INODESC FIX MODE %d\n", (int)idesc->id_fix);
	}

	va_end(args);
	return (result);
}

/*
 * Print a printf-style message and terminate the program.  This is
 * the variadic front end for verrexit(), which does the actual work
 * and ends by calling exit(); hence there is no va_end() here.
 */
NOTE(PRINTFLIKE(1))
void
errexit(caddr_t fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	verrexit(fmt, ap);
	/* NOTREACHED */
}

/*
 * Common exit path behind errexit() and cmn_err(CE_PANIC): clean up,
 * print the message, and exit.  Never returns.
 *
 * Unless we are already inside this routine (the cleanup code can
 * itself call errexit()), the superblock is marked FSBAD and written
 * out if the file system is error-locked or known to be corrupt, and
 * ckfini() is run.  The formatted message is then printed, followed
 * by a newline if it did not supply its own.  The exit status is
 * exitstat when that is non-zero, otherwise EXERRFATAL.
 */
NOTE(PRINTFLIKE(1))
static void
verrexit(caddr_t fmt, va_list ap)
{
	static int recursing = 0;
	size_t fmtlen;

	if (!recursing) {
		recursing = 1;
		if (errorlocked || iscorrupt) {
			if (havesb) {
				sblock.fs_clean = FSBAD;
				sblock.fs_state = FSOKAY - (long)sblock.fs_time;
				sblock.fs_state = -sblock.fs_state;
				sbdirty();
				write_altsb(fswritefd);
				flush(fswritefd, &sblk);
			}
		}
		ckfini();
		recursing = 0;
	}
	(void) vprintf(fmt, ap);
	/*
	 * Check the length before looking at the last character, so
	 * that an empty format doesn't index in front of the string.
	 */
	fmtlen = strlen(fmt);
	if (fmtlen == 0 || fmt[fmtlen - 1] != '\n')
		(void) putchar('\n');
	exit((exitstat != 0) ? exitstat : EXERRFATAL);
}

/*
 * An unexpected inconsistency occurred.
 * Die if preening, otherwise just print message and continue.
 *
 * This is the variadic front end; vpfatal() does the actual work.
 */
NOTE(PRINTFLIKE(1))
void
pfatal(caddr_t fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vpfatal(fmt, ap);
	va_end(ap);
}

/*
 * va_list worker for pfatal().
 *
 * When not preening, the message (if non-empty) is printed and control
 * returns to the caller.  When preening, the message is printed with
 * the device name in front, the user is told to run fsck manually,
 * the superblock (if we have one) is marked FSBAD and flushed, and
 * the program exits with exitstat, which defaults to EXFNDERRS.
 */
NOTE(PRINTFLIKE(1))
static void
vpfatal(caddr_t fmt, va_list ap)
{
	if (!preen) {
		if (*fmt != '\0')
			(void) vprintf(fmt, ap);
		return;
	}

	if (*fmt != '\0') {
		(void) printf("%s: ", devname);
		(void) vprintf(fmt, ap);
		(void) printf("\n");
	}
	(void) printf(
	    "%s: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.\n",
	    devname);

	if (havesb) {
		sblock.fs_clean = FSBAD;
		sblock.fs_state = -(FSOKAY - (long)sblock.fs_time);
		sbdirty();
		flush(fswritefd, &sblk);
	}

	/*
	 * Since the process is going away, it is of no consequence
	 * that the caller never reaches its va_end().
	 */
	if (exitstat == 0)
		exitstat = EXFNDERRS;
	exit(exitstat);
}

/*
 * Pwarn just prints a message when not preening,
 * or a warning (preceded by filename) when preening.
 *
 * This is the variadic front end; vpwarn() does the actual work.
 */
NOTE(PRINTFLIKE(1))
void
pwarn(caddr_t fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vpwarn(fmt, ap);
	va_end(ap);
}

/*
 * va_list worker for pwarn().  An empty format prints nothing at all;
 * otherwise the message is printed, preceded by "<devname>: " when
 * preening.
 */
NOTE(PRINTFLIKE(1))
static void
vpwarn(caddr_t fmt, va_list ap)
{
	if (*fmt == '\0')
		return;

	if (preen)
		(void) printf("%s: ", devname);
	(void) vprintf(fmt, ap);
}

/*
 * Like sprintf(), except the buffer is dynamically allocated
 * and returned, instead of being passed in.  A pointer to the
 * buffer is stored in *RET, and FMT is the usual format string.
 * The number of characters in *RET (excluding the trailing \0,
 * to be consistent with the other *printf() routines) is returned.
 *
 * The caller owns the returned buffer and releases it with free().
 * Failure to format the string or to allocate the buffer is fatal
 * (reported through errexit()), so this never returns an error.
 *
 * Solaris doesn't have asprintf(3C) yet, unfortunately.
 */
NOTE(PRINTFLIKE(2))
int
fsck_asprintf(caddr_t *ret, caddr_t fmt, ...)
{
	int len;
	caddr_t buffer;
	va_list ap;

	/* First pass: measure how much room the result needs. */
	va_start(ap, fmt);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);

	/*
	 * vsnprintf() reports failure with a negative value, which
	 * must not be allowed to flow into the size computation below.
	 */
	if (len < 0) {
		errexit("Formatting failure in asprintf\n");
		/* NOTREACHED */
	}

	buffer = malloc((size_t)len + 1);
	if (buffer == NULL) {
		errexit("Out of memory in asprintf\n");
		/* NOTREACHED */
	}

	/* Second pass: produce the text for real. */
	va_start(ap, fmt);
	(void) vsnprintf(buffer, (size_t)len + 1, fmt, ap);
	va_end(ap);

	*ret = buffer;
	return (len);
}

/*
 * So we can take advantage of kernel routines in ufs_subr.c.
 *
 * User-level stand-in for the kernel's cmn_err(): CE_PANIC prints an
 * "INTERNAL INCONSISTENCY:" banner and exits through verrexit(); any
 * other level just prints the formatted message.
 */
/* PRINTFLIKE2 */
void
cmn_err(int level, caddr_t fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (level != CE_PANIC) {
		(void) vprintf(fmt, args);
	} else {
		(void) printf("INTERNAL INCONSISTENCY:");
		verrexit(fmt, args);
	}
	va_end(args);
}

/*
 * Check to see if unraw version of name is already mounted.
 * Updates devstr with the device name if devstr is not NULL
 * and str_size is positive.
 *
 * Returns M_NOMNT when no mnttab entry matches, M_RO when the entry
 * carries the read-only option, and M_RW otherwise.  The first time a
 * match is found, a copy of its mount point is saved in mount_point;
 * devstr is only filled in from the entry on that occasion.
 */
int
mounted(caddr_t name, caddr_t devstr, size_t str_size)
{
	struct mnttab *entry;
	int state;

	entry = search_mnttab(NULL, unrawname(name), devstr, str_size);
	if (entry == NULL)
		return (M_NOMNT);

	/*
	 * Mounted, so work out whether that's read-only or read/write.
	 */
	state = (hasmntopt(entry, MNTOPT_RO) != 0) ? M_RO : M_RW;

	if (mount_point != NULL)
		return (state);

	mount_point = strdup(entry->mnt_mountp);
	if (mount_point == NULL) {
		errexit("fsck: memory allocation failure: %s",
			strerror(errno));
		/* NOTREACHED */
	}

	if (devstr != NULL && str_size > 0)
		(void) strlcpy(devstr, entry->mnt_special, str_size);

	return (state);
}

/*
 * Check to see if name corresponds to an entry in vfstab, and that the entry
 * does not have option ro.
 *
 * Returns 0 only when a UFS vfstab entry for the unraw form of NAME
 * exists and has the read-only option.  Returns 1 in every other case,
 * including when vfstab cannot be opened (which is also reported).
 */
int
writable(caddr_t name)
{
	struct vfstab match, key;
	FILE *fp;
	int rw;

	fp = fopen(VFSTAB, "r");
	if (fp == NULL) {
		(void) printf("can't open %s\n", VFSTAB);
		return (1);
	}

	/* Build a search key naming just the device and the fs type. */
	(void) memset((void *)&key, 0, sizeof (key));
	vfsnull(&key);
	key.vfs_special = unrawname(name);
	key.vfs_fstype = MNTTYPE_UFS;

	rw = 1;
	if (getvfsany(fp, &match, &key) == 0) {
		if (hasvfsopt(&match, MNTOPT_RO))
			rw = 0;
	}

	(void) fclose(fp);
	return (rw);
}

/*
 * debugclean
 */
static void
debugclean(void)
{
	if (!debug)
		return;

	if ((iscorrupt == 0) && (isdirty == 0))
		return;

	if ((sblock.fs_clean == FSSTABLE) || (sblock.fs_clean == FSCLEAN) ||
	    (sblock.fs_clean == FSLOG && islog && islogok) ||
	    ((FSOKAY == (sblock.fs_state + sblock.fs_time)) && !errorlocked))
		return;

	(void) printf("WARNING: inconsistencies detected on %s filesystem %s\n",
	    sblock.fs_clean == FSSTABLE ? "stable" :
	    sblock.fs_clean == FSLOG ? "logging" :
	    sblock.fs_clean == FSFIX ? "being fixed" : "clean",
	    devname);
}

/*
 * updateclean
 *	Carefully and transparently update the clean flag.
 *
 * `iscorrupt' has to be in its final state before this is called.
 *
 * Works out what the superblock's fs_clean, fs_reclaim, fs_flags and
 * fs_logbno ought to be given what the check found.  If they differ
 * from what is there now (or the file system was modified), the
 * in-core superblock is updated and those fields alone are patched
 * into a freshly read copy of the on-disk superblock, which is then
 * written back.
 *
 * The return value is non-zero if examinelog() was invoked with the
 * freelogblk callback, which happens when the log block number recorded
 * in the superblock had to be changed.
 */
int
updateclean(void)
{
	int freedlog = 0;
	struct bufarea cleanbuf;
	size_t size;
	ssize_t io_res;
	diskaddr_t bno;
	char fsclean;
	int fsreclaim;
	char fsflags;
	int flags_ok = 1;
	daddr32_t fslogbno;
	offset_t sblkoff;
	time_t t;

	/*
	 * debug stuff
	 */
	debugclean();

	/*
	 * set fsclean to its appropriate value
	 */
	fslogbno = sblock.fs_logbno;
	fsclean = sblock.fs_clean;
	fsreclaim = sblock.fs_reclaim;
	fsflags = sblock.fs_flags;
	/* fs_state + fs_time must equal FSOKAY for fs_clean to be believed */
	if (FSOKAY != (sblock.fs_state + sblock.fs_time) && !errorlocked) {
		fsclean = FSACTIVE;
	}
	/*
	 * If ufs log is not okay, note that we need to clear it.
	 */
	examinelog(sblock.fs_logbno, NULL);
	if (fslogbno && !(islog && islogok)) {
		fsclean = FSACTIVE;
		fslogbno = 0;
	}

	/*
	 * if necessary, update fs_clean and fs_state
	 */
	switch (fsclean) {

	case FSACTIVE:
		if (!iscorrupt) {
			fsclean = FSSTABLE;
			fsreclaim = 0;
		}
		break;

	case FSCLEAN:
	case FSSTABLE:
		if (iscorrupt) {
			fsclean = FSACTIVE;
		} else {
			fsreclaim = 0;
		}
		break;

	case FSLOG:
		if (iscorrupt) {
			fsclean = FSACTIVE;
		} else if (!islog || fslogbno == 0) {
			fsclean = FSSTABLE;
			fsreclaim = 0;
		} else if (fflag) {
			fsreclaim = 0;
		}
		break;

	case FSFIX:
		fsclean = FSBAD;
		if (errorlocked && !iscorrupt) {
			fsclean = islog ? FSLOG : FSCLEAN;
		}
		break;

	default:
		if (iscorrupt) {
			fsclean = FSACTIVE;
		} else {
			fsclean = FSSTABLE;
			fsreclaim = 0;
		}
	}

	if (largefile_count > 0)
		fsflags |= FSLARGEFILES;
	else
		fsflags &= ~FSLARGEFILES;

	/*
	 * There can be two discrepancies here.  A) The superblock
	 * shows no largefiles but we found some while scanning.
	 * B) The superblock indicates the presence of largefiles,
	 * but none are present.  Note that if preening, the superblock
	 * is silently corrected.
	 */
	if ((fsflags == FSLARGEFILES && sblock.fs_flags != FSLARGEFILES) ||
	    (fsflags != FSLARGEFILES && sblock.fs_flags == FSLARGEFILES))
		flags_ok = 0;

	if (debug)
		(void) printf(
		    "** largefile count=%d, fs.fs_flags=%x, flags_ok %d\n",
		    largefile_count, sblock.fs_flags, flags_ok);

	/*
	 * If fs is unchanged, do nothing.
	 */
	if ((!isdirty) && (flags_ok) &&
	    (fslogbno == sblock.fs_logbno) &&
	    (sblock.fs_clean == fsclean) &&
	    (sblock.fs_reclaim == fsreclaim) &&
	    (FSOKAY == (sblock.fs_state + sblock.fs_time))) {
		if (errorlocked) {
			if (!do_errorlock(LOCKFS_ULOCK))
				pwarn(
		    "updateclean(unchanged): unlock(LOCKFS_ULOCK) failed\n");
		}
		return (freedlog);
	}

	/*
	 * if user allows, update superblock state
	 */
	if (debug) {
		(void) printf(
	    "superblock: flags 0x%x logbno %d clean %d reclaim %d state 0x%x\n",
		    sblock.fs_flags, sblock.fs_logbno,
		    sblock.fs_clean, sblock.fs_reclaim,
		    sblock.fs_state + sblock.fs_time);
		(void) printf(
	    "calculated: flags 0x%x logbno %d clean %d reclaim %d state 0x%x\n",
		    fsflags, fslogbno, fsclean, fsreclaim, FSOKAY);
	}
	/* Only ask when nothing else was changed and we're interactive. */
	if (!isdirty && !preen && !rerun &&
	    (reply("FILE SYSTEM STATE IN SUPERBLOCK IS WRONG; FIX") == 0))
		return (freedlog);

	(void) time(&t);
	sblock.fs_time = (time32_t)t;
	if (debug)
		printclean();

	/*
	 * The log block number is changing, so walk the old log again,
	 * this time handing each of its blocks to freelogblk().
	 */
	if (sblock.fs_logbno != fslogbno) {
		examinelog(sblock.fs_logbno, &freelogblk);
		freedlog++;
	}

	sblock.fs_logbno = fslogbno;
	sblock.fs_clean = fsclean;
	/* keep the fs_state + fs_time == FSOKAY relationship intact */
	sblock.fs_state = FSOKAY - (long)sblock.fs_time;
	sblock.fs_reclaim = fsreclaim;
	sblock.fs_flags = fsflags;

	/*
	 * if superblock can't be written, return
	 */
	if (fswritefd < 0)
		return (freedlog);

	/*
	 * Read private copy of superblock, update clean flag, and write it.
	 */
	bno  = sblk.b_bno;
	size = sblk.b_size;

	sblkoff = ldbtob(bno);

	if ((cleanbuf.b_un.b_buf = malloc(size)) == NULL)
		errexit("out of memory");
	if (llseek(fsreadfd, sblkoff, SEEK_SET) == -1) {
		(void) printf("COULD NOT SEEK TO SUPERBLOCK AT %lld: %s\n",
		    (longlong_t)bno, strerror(errno));
		goto out;
	}

	if ((io_res = read(fsreadfd, cleanbuf.b_un.b_buf, size)) != size) {
		report_io_prob("READ FROM", bno, size, io_res);
		goto out;
	}

	/* Patch only the state-related fields into the private copy. */
	cleanbuf.b_un.b_fs->fs_logbno  = sblock.fs_logbno;
	cleanbuf.b_un.b_fs->fs_clean   = sblock.fs_clean;
	cleanbuf.b_un.b_fs->fs_state   = sblock.fs_state;
	cleanbuf.b_un.b_fs->fs_time    = sblock.fs_time;
	cleanbuf.b_un.b_fs->fs_reclaim = sblock.fs_reclaim;
	cleanbuf.b_un.b_fs->fs_flags   = sblock.fs_flags;

	if (llseek(fswritefd, sblkoff, SEEK_SET) == -1) {
		(void) printf("COULD NOT SEEK TO SUPERBLOCK AT %lld: %s\n",
		    (longlong_t)bno, strerror(errno));
		goto out;
	}

	if ((io_res = write(fswritefd, cleanbuf.b_un.b_buf, size)) != size) {
		report_io_prob("WRITE TO", bno, size, io_res);
		goto out;
	}

	/*
	 * 1208040
	 * If we had to use -b to grab an alternate superblock, then we
	 * likely had to do so because of unacceptable differences between
	 * the main and alternate superblocks.  So, we had better update
	 * the alternate superblock as well, or we'll just fail again
	 * the next time we attempt to run fsck!
	 */
	if (bflag != 0) {
		write_altsb(fswritefd);
	}

	if (errorlocked) {
		if (!do_errorlock(LOCKFS_ULOCK))
			pwarn(
		    "updateclean(changed): unlock(LOCKFS_ULOCK) failed\n");
	}

	/*
	 * Common exit once the private buffer exists.  Note that the
	 * I/O failure paths arrive here without the alternate superblock
	 * update or the error-lock release above having been done.
	 */
out:
	if (cleanbuf.b_un.b_buf != NULL) {
		free((void *)cleanbuf.b_un.b_buf);
	}

	return (freedlog);
}

/*
 * Report a failed or short superblock transfer on stdout.
 *
 * what is the operation text spliced into the message (the callers
 * in this file pass "READ FROM" and "WRITE TO"), bno is the disk
 * address that was being accessed, expected is the number of bytes
 * that were requested, and failure is the result of the read(2) or
 * write(2) call: negative means the call failed (errno is decoded),
 * zero is reported as EOF, and anything else is a short transfer.
 *
 * The address is printed with %lld, as the other superblock messages
 * in this file do, rather than being narrowed to an int first.
 */
static void
report_io_prob(caddr_t what, diskaddr_t bno, size_t expected, ssize_t failure)
{
	if (failure < 0)
		(void) printf("COULD NOT %s SUPERBLOCK AT %lld: %s\n",
		    what, (longlong_t)bno, strerror(errno));
	else if (failure == 0)
		(void) printf("COULD NOT %s SUPERBLOCK AT %lld: EOF\n",
		    what, (longlong_t)bno);
	else
		(void) printf(
		    "SHORT %s SUPERBLOCK AT %lld: %u out of %u bytes\n",
		    what, (longlong_t)bno, (unsigned)failure,
		    (unsigned)expected);
}

/*
 * Print the state recorded in the superblock's clean flag.
 *
 * The flag is only decoded when fs_state + fs_time adds up to FSOKAY
 * or the file system is error-locked; otherwise, and for any value
 * that isn't recognized, the state is reported as "unknown".  When
 * preening the text goes through pwarn(), otherwise it is printed
 * on stdout together with the device name.
 */
void
printclean(void)
{
	caddr_t state_name = "unknown";

	if ((FSOKAY == (sblock.fs_state + sblock.fs_time)) || errorlocked) {
		switch (sblock.fs_clean) {
		case FSACTIVE:
			state_name = "active";
			break;
		case FSCLEAN:
			state_name = "clean";
			break;
		case FSSTABLE:
			state_name = "stable";
			break;
		case FSLOG:
			state_name = "logging";
			break;
		case FSBAD:
			state_name = "is bad";
			break;
		case FSFIX:
			state_name = "being fixed";
			break;
		default:
			/* leave it as "unknown" */
			break;
		}
	}

	if (!preen) {
		(void) printf("** %s is %s.\n", devname, state_name);
		return;
	}
	pwarn("is %s.\n", state_name);
}

/*
 * Determine whether the file system named by fs is error-locked.
 *
 * fs may be a directory, in which case it is used as the mount point
 * itself, or a block or character device, in which case its mount
 * point is looked up with search_mnttab().  A NULL fs, a name that
 * can't be stat'ed, any other file type, or a device that isn't found
 * in the mnttab all yield 0.
 *
 * As side effects the file-level lock bookkeeping is refreshed:
 * elock_combuf is allocated (or resized) and zeroed, elock_mountp is
 * replaced by a copy of the mount point, mountfd is opened on the
 * mount point if it is still negative, and lfp is allocated if needed
 * and filled in by the _FIOLFSS ioctl.  None of these are released
 * here; do_errorlock() uses them later.
 *
 * Returns LOCKFS_IS_ELOCK() of the retrieved status, or 0 if any step
 * failed.
 */
int
is_errorlocked(caddr_t fs)
{
	int		retval;
	struct stat64	statb;
	caddr_t		mountp;
	caddr_t		newbuf;
	caddr_t		newmountp;
	struct mnttab	*mntent;

	retval = 0;

	if (!fs)
		return (0);

	if (stat64(fs, &statb) < 0)
		return (0);

	if (S_ISDIR(statb.st_mode)) {
		mountp = fs;
	} else if (S_ISBLK(statb.st_mode) || S_ISCHR(statb.st_mode)) {
		mntent = search_mnttab(NULL, fs, NULL, 0);
		if (mntent == NULL)
			return (0);
		mountp = mntent->mnt_mountp;
		if (mountp == NULL) /* theoretically a can't-happen */
			return (0);
	} else {
		return (0);
	}

	/*
	 * Everything set up from here on is kept in file-level
	 * variables for later use, so the failure paths just leave
	 * through `out' with retval still zero.
	 */

	if (elock_combuf == NULL) {
		elock_combuf =
		    (caddr_t)calloc(LOCKFS_MAXCOMMENTLEN, sizeof (char));
	} else {
		/*
		 * A failed realloc() leaves the old block allocated, so
		 * it has to be released explicitly before the pointer
		 * is overwritten with NULL.
		 */
		newbuf = (caddr_t)realloc(elock_combuf, LOCKFS_MAXCOMMENTLEN);
		if (newbuf == NULL)
			free(elock_combuf);
		elock_combuf = newbuf;
	}

	if (elock_combuf == NULL)
		goto out;

	(void) memset((void *)elock_combuf, 0, LOCKFS_MAXCOMMENTLEN);

	/*
	 * Copy the new mount point before dropping the old one, so
	 * that a caller handing us elock_mountp itself stays safe.
	 */
	newmountp = strdup(mountp);
	free(elock_mountp);
	elock_mountp = newmountp;
	if (elock_mountp == NULL)
		goto out;

	if (mountfd < 0) {
		if ((mountfd = open64(mountp, O_RDONLY)) == -1)
			goto out;
	}

	if (lfp == NULL) {
		lfp = (struct lockfs *)malloc(sizeof (struct lockfs));
		if (lfp == NULL)
			goto out;
		(void) memset((void *)lfp, 0, sizeof (struct lockfs));
	}

	lfp->lf_comlen = LOCKFS_MAXCOMMENTLEN;
	lfp->lf_comment = elock_combuf;

	if (ioctl(mountfd, _FIOLFSS, lfp) == -1)
		goto out;

	/*
	 * lint believes that the ioctl() (or any other function
	 * taking lfp as an arg) could free lfp.  This is not the
	 * case, however.
	 */
	retval = LOCKFS_IS_ELOCK(lfp);

out:
	return (retval);
}

/*
 * Check whether name, which the caller knows to be a directory, is
 * listed as a mount point in the vfstab.  On a hit the entry's
 * special device field is handed back through devstr (at most
 * str_size bytes).  Returns non-zero when an entry was found and
 * zero otherwise.
 */
int
check_vfstab(caddr_t name, caddr_t devstr, size_t str_size)
{
	struct vfstab *entry;

	entry = search_vfstab(name, NULL, devstr, str_size);
	return (entry != NULL);
}

/*
 * Check whether name, which the caller knows to be a directory, is
 * listed as a mount point in the mnttab.  On a hit the entry's
 * special device field is handed back through devstr (at most
 * str_size bytes).  Returns non-zero when an entry was found and
 * zero otherwise.
 */
int
check_mnttab(caddr_t name, caddr_t devstr, size_t str_size)
{
	struct mnttab *entry;

	entry = search_mnttab(name, NULL, devstr, str_size);
	return (entry != NULL);
}

/*
 * Search for mount point and/or special device in the given file.
 * The first matching entry is returned.
 *
 * If an entry is found and str_size is greater than zero, then
 * up to str_size bytes of the special device name from the entry
 * are copied to devstr.
 *
 * SEARCH_TAB_BODY expands to the entire body of such a search
 * function.  It relies on the enclosing function having parameters
 * named mountp, special, devstr and str_size.  Its arguments are:
 *
 *	st_type		struct tag of the table entry
 *	st_file		path of the table file to open
 *	st_mount	name of the entry's mount point member
 *	st_special	name of the entry's special device member
 *	st_nuller	routine called to initialize the search key
 *	st_init		extra expression evaluated once the key is set up
 *	st_searcher	lookup routine; 0 from it is treated as a match
 *
 * The entry handed back lives in a static buffer inside the generated
 * function, so a later call to the same function overwrites it.  NULL
 * is returned if the file can't be opened or nothing matched.
 */

#define	SEARCH_TAB_BODY(st_type, st_file, st_mount, st_special, \
			st_nuller, st_init, st_searcher) \
	{ \
		FILE *fp; \
		struct st_type *retval = NULL; \
		struct st_type key; \
		static struct st_type buffer; \
		\
		/* LINTED ``assigned value never used'' */ \
		st_nuller(&key); \
		key.st_mount = mountp; \
		key.st_special = special; \
		st_init; \
		\
		if ((fp = fopen(st_file, "r")) == NULL) \
			return (NULL); \
		\
		if (st_searcher(fp, &buffer, &key) == 0) { \
			retval = &buffer; \
			if (devstr != NULL && str_size > 0 && \
			    buffer.st_special != NULL) { \
				(void) strlcpy(devstr, buffer.st_special, \
				    str_size); \
			} \
		} \
		(void) fclose(fp); \
		return (retval); \
	}

/*
 * Look up the first entry of the VFSTAB file that getvfsany() matches
 * against a key built from mountp and special; callers in this file
 * pass NULL for the field they are not searching on.  The body comes
 * from SEARCH_TAB_BODY; its st_init slot is filled with a no-op here.
 * The result points at a static buffer, or is NULL if nothing matched.
 */
static struct vfstab *
search_vfstab(caddr_t mountp, caddr_t special, caddr_t devstr, size_t str_size)
SEARCH_TAB_BODY(vfstab, VFSTAB, vfs_mountp, vfs_special, vfsnull,
		(retval = retval), getvfsany)

/*
 * Look up the first entry of the MNTTAB file that getmntany() matches
 * against a key built from mountp and special; callers in this file
 * pass NULL for the field they are not searching on.  The body comes
 * from SEARCH_TAB_BODY; the key's mnt_fstype is additionally set to
 * MNTTYPE_UFS.  The result points at a static buffer, or is NULL if
 * nothing matched.
 */
static struct mnttab *
search_mnttab(caddr_t mountp, caddr_t special, caddr_t devstr, size_t str_size)
SEARCH_TAB_BODY(mnttab, MNTTAB, mnt_mountp, mnt_special, mntnull,
		(key.mnt_fstype = MNTTYPE_UFS), getmntany)

/*
 * Issue a lockfs request of the given type against the file system
 * that is_errorlocked() was last pointed at, using the file-level
 * elock_combuf, elock_mountp, mountfd and lfp.
 *
 * For LOCKFS_ELOCK the existing lock comment gets " [pid:... fsck
 * start:<timestamp>" appended (or just the pid if the time can't be
 * obtained); for LOCKFS_ULOCK it gets ", done:<timestamp>]" (or just
 * "]") appended and the current lock status is re-read with _FIOLFSS
 * first.  Any other lock_type leaves the comment unchanged.  The new
 * comment is copied back into elock_combuf and submitted with the
 * _FIOLFS ioctl.
 *
 * Exits via errexit() if elock_combuf or lfp have not been set up or
 * the scratch buffer can't be allocated.  Returns non-zero if the
 * last ioctl issued succeeded, zero if it failed.
 */
int
do_errorlock(int lock_type)
{
	caddr_t	   buf;
	time_t	   now;
	struct tm *local;
	int	   rc;

	if (elock_combuf == NULL)
		errexit("do_errorlock(%s, %d): unallocated elock_combuf\n",
		    elock_mountp ? elock_mountp : "<null>",
		    lock_type);

	if ((buf = (caddr_t)calloc(LOCKFS_MAXCOMMENTLEN, sizeof (char))) ==
	    NULL) {
		errexit("Couldn't alloc memory for temp. lock status buffer\n");
	}
	if (lfp == NULL) {
		/* elock_mountp may be unset too; never hand NULL to %s */
		errexit("do_errorlock(%s, %d): lockfs status unallocated\n",
		    elock_mountp ? elock_mountp : "<null>",
		    lock_type);
	}

	(void) memmove((void *)buf, (void *)elock_combuf,
	    LOCKFS_MAXCOMMENTLEN-1);

	switch (lock_type) {
	case LOCKFS_ELOCK:
		/*
		 * Note that if it is error-locked, we won't get an
		 * error back if we try to error-lock it again.
		 */
		if (time(&now) != (time_t)-1) {
			if ((local = localtime(&now)) != NULL)
				(void) snprintf(buf, LOCKFS_MAXCOMMENTLEN,
		    "%s [pid:%d fsck start:%02d/%02d/%02d %02d:%02d:%02d",
				    elock_combuf, (int)pid,
				    local->tm_mon + 1, local->tm_mday,
				    (local->tm_year % 100), local->tm_hour,
				    local->tm_min, local->tm_sec);
			else
				(void) snprintf(buf, LOCKFS_MAXCOMMENTLEN,
				    "%s [fsck pid %d", elock_combuf,
				    (int)pid);

		} else {
			(void) snprintf(buf, LOCKFS_MAXCOMMENTLEN,
			    "%s [fsck pid %d", elock_combuf, (int)pid);
		}
		break;

	case LOCKFS_ULOCK:
		if (time(&now) != (time_t)-1) {
			if ((local = localtime(&now)) != NULL) {
				(void) snprintf(buf, LOCKFS_MAXCOMMENTLEN,
				    "%s, done:%02d/%02d/%02d %02d:%02d:%02d]",
				    elock_combuf,
				    local->tm_mon + 1, local->tm_mday,
				    (local->tm_year % 100), local->tm_hour,
				    local->tm_min, local->tm_sec);
			} else {
				(void) snprintf(buf, LOCKFS_MAXCOMMENTLEN,
				    "%s]", elock_combuf);
			}
		} else {
			(void) snprintf(buf, LOCKFS_MAXCOMMENTLEN,
			    "%s]", elock_combuf);
		}
		/* refresh the lock status before asking for the unlock */
		if ((rc = ioctl(mountfd, _FIOLFSS, lfp)) == -1) {
			pwarn("do_errorlock: unlock failed: %s\n",
			    strerror(errno));
			goto out;
		}
		break;

	default:
		break;
	}

	(void) memmove((void *)elock_combuf, (void *)buf,
	    LOCKFS_MAXCOMMENTLEN - 1);

	lfp->lf_lock = lock_type;
	lfp->lf_comlen = LOCKFS_MAXCOMMENTLEN;
	lfp->lf_comment = elock_combuf;
	lfp->lf_flags = 0;
	errno = 0;

	if ((rc = ioctl(mountfd, _FIOLFS, lfp)) == -1) {
		if (errno == EINVAL) {
			pwarn("Another fsck active?\n");
			iscorrupt = 0;	/* don't go away mad, just go away */
		} else {
			pwarn("do_errorlock(lock_type:%d, %s) failed: %s\n",
			    lock_type, elock_combuf, strerror(errno));
		}
	}
out:
	free((void *)buf);

	return (rc != -1);
}

/*
 * Shadow inode support.  To register a shadow with a client is to note
 * that an inode (the client) refers to the shadow.
 */

/*
 * Allocate an empty chunk of client slots, with room for
 * maxshadowclients inode numbers, and chain it in front of prev.
 * Exits via errexit() if either allocation fails.
 */
static struct shadowclients *
newshadowclient(struct shadowclients *prev)
{
	struct shadowclients *chunk;

	chunk = (struct shadowclients *)malloc(sizeof (struct shadowclients));
	if (chunk == NULL)
		errexit("newshadowclient: cannot malloc shadow client");
	chunk->nclients = 0;
	chunk->next = prev;

	chunk->client = (fsck_ino_t *)malloc(maxshadowclients *
	    sizeof (fsck_ino_t));
	if (chunk->client == NULL)
		errexit("newshadowclient: cannot malloc client array");

	return (chunk);
}

/*
 * Record that inode client refers to the shadow inode shadow in the
 * list headed by *info.  A record for the shadow is created at the
 * head of the list if none exists yet, and a fresh chunk of client
 * slots is pushed whenever the current one is full.
 */
void
registershadowclient(fsck_ino_t shadow, fsck_ino_t client,
	struct shadowclientinfo **info)
{
	struct shadowclientinfo *rec;
	struct shadowclients *chunk;

	/* look for an existing record for this shadow */
	rec = *info;
	while (rec != NULL && rec->shadow != shadow)
		rec = rec->next;

	if (rec == NULL) {
		/* first reference to this shadow: start a new record */
		rec = (struct shadowclientinfo *)
		    malloc(sizeof (struct shadowclientinfo));
		if (rec == NULL)
			errexit("registershadowclient: cannot malloc");
		rec->next = *info;
		*info = rec;
		rec->shadow = shadow;
		rec->totalClients = 0;
		rec->clients = newshadowclient(NULL);
	}

	rec->totalClients++;

	chunk = rec->clients;
	if (chunk->nclients >= maxshadowclients) {
		/* the newest chunk is full, so chain another in front */
		chunk = newshadowclient(rec->clients);
		rec->clients = chunk;
	}

	chunk->client[chunk->nclients] = client;
	chunk->nclients++;
}

/*
 * Find the record for shadow in the list headed by *info, unlink it,
 * and dispose of it with deshadow(), which invokes clearattrref on
 * every registered client.  Nothing happens if the shadow has no
 * record.
 */
void
clearshadow(fsck_ino_t shadow, struct shadowclientinfo **info)
{
	struct shadowclientinfo **linkp;
	struct shadowclientinfo *victim;

	for (linkp = info; *linkp != NULL; linkp = &(*linkp)->next) {
		if ((*linkp)->shadow != shadow)
			continue;

		/*
		 * Take it off the list before tearing it down, as
		 * nothing should be referring to this one any more.
		 */
		victim = *linkp;
		*linkp = victim->next;
		deshadow(victim, clearattrref);
		return;
	}
}

/*
 * Release a shadow's record together with all of its client chunks.
 * If cb is not NULL it is called once for each client inode number
 * before the chunk holding it is freed.
 */
void
deshadow(struct shadowclientinfo *sci, void (*cb)(fsck_ino_t))
{
	struct shadowclients *chunk, *following;
	int slot;

	for (chunk = sci->clients; chunk != NULL; chunk = following) {
		following = chunk->next;

		if (chunk->client != NULL) {
			if (cb != NULL) {
				slot = 0;
				while (slot < chunk->nclients) {
					(*cb)(chunk->client[slot]);
					slot++;
				}
			}
			free((void *)chunk->client);
		}
		free((void *)chunk);
	}

	free((void *)sci);
}

/*
 * Grow the buffer pool by a single buffer of sblock.fs_bsize bytes.
 * Buffers are added one at a time, on demand, so that running short
 * of them need not make fsck give up.
 *
 * The new bufarea is initialized, linked in directly after bufhead,
 * and counted in bufhead.b_size.  Returns the new bufarea, or NULL if
 * memory could not be obtained (nothing is leaked in that case).
 */
static struct bufarea *
alloc_bufarea(void)
{
	caddr_t data;
	struct bufarea *bp;

	if ((data = malloc((unsigned int)sblock.fs_bsize)) == NULL)
		return (NULL);

	if ((bp = (struct bufarea *)malloc(sizeof (struct bufarea))) == NULL) {
		free((void *)data);
		return (NULL);
	}

	initbarea(bp);
	bp->b_un.b_buf = data;

	/* splice it in between bufhead and the old first buffer */
	bp->b_next = bufhead.b_next;
	bp->b_prev = &bufhead;
	bufhead.b_next->b_prev = bp;
	bufhead.b_next = bp;
	bufhead.b_size++;

	return (bp);
}

/*
 * Both unrawname() and rawname() copy their result with a length
 * limit, so that neither our own arrays nor those of callers who
 * trust us can be overrun.
 */

/*
 * Map name to its block device name using getfullblkname().  The
 * result is kept in a static buffer that the next call overwrites;
 * an empty string is returned if no name could be produced.
 */
caddr_t
unrawname(caddr_t name)
{
	static char fullname[MAXPATHLEN + 1];
	caddr_t blkname;

	blkname = getfullblkname(name);
	if (blkname == NULL)
		return ("");

	(void) strlcpy(fullname, blkname, sizeof (fullname));

	/*
	 * getfullblkname() doesn't report its allocation under debug,
	 * so the matching free isn't reported either; that keeps the
	 * alloc/free reports balanced.
	 */
	free(blkname);

	return (fullname);
}

/*
 * Map name to its raw (character) device name using getfullrawname().
 * The result is kept in a static buffer that the next call
 * overwrites; an empty string is returned if no name could be
 * produced.
 */
caddr_t
rawname(caddr_t name)
{
	static char fullname[MAXPATHLEN + 1];
	caddr_t chrname;

	chrname = getfullrawname(name);
	if (chrname == NULL)
		return ("");

	(void) strlcpy(fullname, chrname, sizeof (fullname));

	/*
	 * getfullrawname() doesn't report its allocation under debug,
	 * so the matching free isn't reported either; that keeps the
	 * alloc/free reports balanced.
	 */
	free(chrname);

	return (fullname);
}

/*
 * Make sure that a cg header looks at least moderately reasonable.
 * We want to be able to trust the contents enough to be able to use
 * the standard accessor macros.  So, besides looking at the obvious
 * such as the magic number, we verify that the offset field values
 * are properly aligned and not too big or small.
 *
 * Returns a NULL pointer if the cg is sane enough for our needs, else
 * a dynamically-allocated string describing all of its faults.
 */
/*
 * Append_Error(full, full_len, addition, addition_len)
 *
 * Tack the heap-allocated string addition (addition_len characters,
 * not counting the terminating NUL) onto the accumulated string full
 * (full_len characters).  If full is still NULL, addition simply
 * becomes the accumulated string; otherwise full is grown, addition
 * is copied onto its end and then freed.  full and full_len must be
 * modifiable lvalues.  Exits via errexit() if memory runs out.
 *
 * The body is wrapped in do { } while (0) so that an invocation
 * followed by a semicolon behaves as exactly one statement, even as
 * the unbraced arm of an if/else.
 */
#define	Append_Error(full, full_len, addition, addition_len) \
	do { \
		if ((full) == NULL) { \
			(full) = (addition); \
			(full_len) = (addition_len); \
		} else { \
			/* lint doesn't think realloc() understands NULLs */ \
			(full) = realloc((full), \
			    (full_len) + (addition_len) + 1); \
			if ((full) == NULL) { \
				errexit("Out of memory in cg_sanity"); \
				/* NOTREACHED */ \
			} \
			(void) strcpy((full) + (full_len), (addition)); \
			(full_len) += (addition_len); \
			free(addition); \
		} \
		/* CONSTCOND */ \
	} while (0)

caddr_t
cg_sanity(struct cg *cgp, int cgno)
{
	caddr_t full_err;
	caddr_t this_err = NULL;
	int full_len, this_len;
	daddr32_t ndblk;
	daddr32_t exp_btotoff, exp_boff, exp_iusedoff;
	daddr32_t exp_freeoff, exp_nextfreeoff;

	cg_constants(cgno, &exp_btotoff, &exp_boff, &exp_iusedoff,
	    &exp_freeoff, &exp_nextfreeoff, &ndblk);

	full_err = NULL;
	full_len = 0;

	if (!cg_chkmagic(cgp)) {
		this_len = fsck_asprintf(&this_err,
		    "BAD CG MAGIC NUMBER (0x%x should be 0x%x)\n",
		    cgp->cg_magic, CG_MAGIC);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_cgx != cgno) {
		this_len = fsck_asprintf(&this_err,
		    "WRONG CG NUMBER (%d should be %d)\n",
		    cgp->cg_cgx, cgno);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if ((cgp->cg_btotoff & 3) != 0) {
		this_len = fsck_asprintf(&this_err,
		    "BLOCK TOTALS OFFSET %d NOT FOUR-BYTE ALIGNED\n",
		    cgp->cg_btotoff);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if ((cgp->cg_boff & 1) != 0) {
		this_len = fsck_asprintf(&this_err,
	    "FREE BLOCK POSITIONS TABLE OFFSET %d NOT TWO-BYTE ALIGNED\n",
		    cgp->cg_boff);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if ((cgp->cg_ncyl < 1) || (cgp->cg_ncyl > sblock.fs_cpg)) {
		if (cgp->cg_ncyl < 1) {
			this_len = fsck_asprintf(&this_err,
	    "IMPOSSIBLE NUMBER OF CYLINDERS IN GROUP (%d is less than 1)\n",
			    cgp->cg_ncyl);
		} else {
			this_len = fsck_asprintf(&this_err,
	    "IMPOSSIBLE NUMBER OF CYLINDERS IN GROUP (%d is greater than %d)\n",
			    cgp->cg_ncyl, sblock.fs_cpg);
		}
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_niblk != sblock.fs_ipg) {
		this_len = fsck_asprintf(&this_err,
		    "INCORRECT NUMBER OF INODES IN GROUP (%d should be %d)\n",
		    cgp->cg_niblk, sblock.fs_ipg);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_ndblk != ndblk) {
		this_len = fsck_asprintf(&this_err,
	    "INCORRECT NUMBER OF DATA BLOCKS IN GROUP (%d should be %d)\n",
		    cgp->cg_ndblk, ndblk);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if ((cgp->cg_rotor < 0) || (cgp->cg_rotor >= ndblk)) {
		this_len = fsck_asprintf(&this_err,
		    "IMPOSSIBLE BLOCK ALLOCATION ROTOR POSITION "
		    "(%d should be at least 0 and less than %d)\n",
		    cgp->cg_rotor, ndblk);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if ((cgp->cg_frotor < 0) || (cgp->cg_frotor >= ndblk)) {
		this_len = fsck_asprintf(&this_err,
		    "IMPOSSIBLE FRAGMENT ALLOCATION ROTOR POSITION "
		    "(%d should be at least 0 and less than %d)\n",
		    cgp->cg_frotor, ndblk);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if ((cgp->cg_irotor < 0) || (cgp->cg_irotor >= sblock.fs_ipg)) {
		this_len = fsck_asprintf(&this_err,
		    "IMPOSSIBLE INODE ALLOCATION ROTOR POSITION "
		    "(%d should be at least 0 and less than %d)\n",
		    cgp->cg_irotor, sblock.fs_ipg);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_btotoff != exp_btotoff) {
		this_len = fsck_asprintf(&this_err,
		    "INCORRECT BLOCK TOTALS OFFSET (%d should be %d)\n",
		    cgp->cg_btotoff, exp_btotoff);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_boff != exp_boff) {
		this_len = fsck_asprintf(&this_err,
		    "BAD FREE BLOCK POSITIONS TABLE OFFSET (%d should %d)\n",
		    cgp->cg_boff, exp_boff);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_iusedoff != exp_iusedoff) {
		this_len = fsck_asprintf(&this_err,
		    "INCORRECT USED INODE MAP OFFSET (%d should be %d)\n",
		    cgp->cg_iusedoff, exp_iusedoff);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_freeoff != exp_freeoff) {
		this_len = fsck_asprintf(&this_err,
		    "INCORRECT FREE FRAGMENT MAP OFFSET (%d should be %d)\n",
		    cgp->cg_freeoff, exp_freeoff);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	if (cgp->cg_nextfreeoff != exp_nextfreeoff) {
		this_len = fsck_asprintf(&this_err,
		    "END OF HEADER POSITION INCORRECT (%d should be %d)\n",
		    cgp->cg_nextfreeoff, exp_nextfreeoff);
		Append_Error(full_err, full_len, this_err, this_len);
	}

	return (full_err);
}

#undef	Append_Error

/*
 * Recompute the values mkfs originally gave to a cylinder group
 * header.  As they follow from the superblock alone, working them
 * out again should reproduce what is on disk.
 *
 * Cylinder group cgno is read into cgblk as a side effect.  The
 * results come back through the pointer arguments: the offsets of
 * the block totals, the free block positions table, the used inode
 * map, the free fragment map and the end of the header, and the
 * number of data blocks in the group (the group is cut short at
 * fs_size).
 */
static void
cg_constants(int cgno, daddr32_t *btotoff, daddr32_t *boff,
	daddr32_t *iusedoff, daddr32_t *freeoff, daddr32_t *nextfreeoff,
	daddr32_t *ndblk)
{
	struct cg *hdr;
	daddr32_t first, limit;

	(void) getblk(&cgblk, (diskaddr_t)cgtod(&sblock, cgno),
	    (size_t)sblock.fs_cgsize);
	hdr = cgblk.b_un.b_cg;

	/* extent of the group, clipped to the end of the file system */
	first = cgbase(&sblock, cgno);
	limit = first + sblock.fs_fpg;
	if (limit > sblock.fs_size)
		limit = sblock.fs_size;
	*ndblk = limit - first;

	/* each table starts where the one before it ends */
	/* LINTED pointer difference won't overflow */
	*btotoff = &hdr->cg_space[0] - (uchar_t *)(&hdr->cg_link);
	*boff = *btotoff + sblock.fs_cpg * sizeof (daddr32_t);
	*iusedoff = *boff + sblock.fs_cpg * sblock.fs_nrpos * sizeof (int16_t);
	*freeoff = *iusedoff + howmany(sblock.fs_ipg, NBBY);
	*nextfreeoff = *freeoff +
	    howmany(sblock.fs_cpg * sblock.fs_spc / NSPF(&sblock), NBBY);
}

/*
 * Repair every field of cylinder group cgno's header that can be
 * rebuilt from redundant information, then mark the cg buffer dirty.
 */
void
fix_cg(struct cg *cgp, int cgno)
{
	daddr32_t want_btotoff, want_boff, want_iusedoff;
	daddr32_t want_freeoff, want_nextfreeoff;
	daddr32_t want_ndblk;

	cg_constants(cgno, &want_btotoff, &want_boff, &want_iusedoff,
	    &want_freeoff, &want_nextfreeoff, &want_ndblk);

	cgp->cg_cgx = cgno;

	/*
	 * An out-of-range cylinder count becomes a full group's worth,
	 * except in the last group, which gets whatever is left over.
	 */
	if ((cgp->cg_ncyl < 1) || (cgp->cg_ncyl > sblock.fs_cpg)) {
		if (cgno != (sblock.fs_ncg - 1)) {
			cgp->cg_ncyl = sblock.fs_cpg;
		} else {
			cgp->cg_ncyl = sblock.fs_ncyl -
			    (sblock.fs_cpg * cgno);
		}
	}

	/*
	 * The kernel doesn't use cg_niblk, so a wrong value is pretty
	 * harmless, but put it right anyway.
	 */
	cgp->cg_niblk = sblock.fs_ipg;

	cgp->cg_ndblk = want_ndblk;

	/*
	 * Any position is valid for a rotor, so an out-of-range one is
	 * reset to the one position that always exists.
	 */
	if ((cgp->cg_rotor < 0) || (cgp->cg_rotor >= cgp->cg_ndblk))
		cgp->cg_rotor = 0;

	if ((cgp->cg_frotor < 0) || (cgp->cg_frotor >= cgp->cg_ndblk))
		cgp->cg_frotor = 0;

	if ((cgp->cg_irotor < 0) || (cgp->cg_irotor >= sblock.fs_ipg))
		cgp->cg_irotor = 0;

	/*
	 * Misaligned btotoff and boff values can't equal the expected
	 * ones, so forcing the expected values in covers that case as
	 * well.  Of course, if any of these were off, the tables
	 * probably aren't where we calculate they should be anyway.
	 */
	cgp->cg_btotoff = want_btotoff;
	cgp->cg_boff = want_boff;
	cgp->cg_iusedoff = want_iusedoff;
	cgp->cg_freeoff = want_freeoff;
	cgp->cg_nextfreeoff = want_nextfreeoff;

	/*
	 * The cg has been recreated and is about to be written out, so
	 * give it a good magic number and a fresh timestamp.
	 */
	cgp->cg_magic = CG_MAGIC;
	cgp->cg_time = time(NULL);

	/*
	 * We are only called when there was something to correct, so
	 * the buffer is marked dirty once here rather than at each
	 * assignment above.
	 */
	cgdirty();
}

/*
 * Walk the log's allocation (extent) block and visit every fragment
 * that belongs to the log.
 *
 * start is only used to decide whether there is anything to look at:
 * nothing is done if it is below SBLOCK.  The extent block itself is
 * located through sblock.fs_logbno.  For each fragment of each extent
 * cb, if not NULL, is called first, and islogok is then cleared if
 * the fragment is not marked in the block map.  Finally cb is called
 * for the fs_frag fragments of the extent block itself.
 *
 * If the block fails its checksum, is not of type LUFS_EXTENTS, or
 * lists no extents, it is released and nothing else is done.
 */
void
examinelog(daddr32_t start, void (*cb)(daddr32_t))
{
	struct bufarea *bp;
	extent_block_t *ebp;
	extent_t *ep;
	daddr32_t nfno, fno;
	int i;
	int j;

	if (start < SBLOCK)
		return;

	/*
	 * Read errors will return zeros, which will cause us
	 * to do nothing harmful, so don't need to handle it.
	 */
	bp = getdatablk(logbtofrag(&sblock, sblock.fs_logbno),
	    (size_t)sblock.fs_bsize);
	ebp = (void *)bp->b_un.b_buf;

	/*
	 * Does it look like a log allocation table?  If not, hand the
	 * buffer back just as the normal path below does, instead of
	 * leaving it held.
	 */
	/* LINTED pointer cast is aligned */
	if (!log_checksum(&ebp->chksum, (int32_t *)bp->b_un.b_buf,
	    sblock.fs_bsize)) {
		brelse(bp);
		return;
	}
	if (ebp->type != LUFS_EXTENTS || ebp->nextents == 0) {
		brelse(bp);
		return;
	}

	ep = &ebp->extents[0];
	for (i = 0; i < ebp->nextents; ++i, ++ep) {
		fno = logbtofrag(&sblock, ep->pbno);
		nfno = dbtofsb(&sblock, ep->nbno);
		for (j = 0; j < nfno; ++j, ++fno) {
			/*
			 * Invoke the callback first, so that pass1 can
			 * mark the log blocks in-use.  Then, if any
			 * subsequent pass over the log shows us that a
			 * block got freed (say, it was also claimed by
			 * an inode that we cleared), we can safely declare
			 * the log bad.
			 */
			if (cb != NULL)
				(*cb)(fno);
			if (!testbmap(fno))
				islogok = 0;
		}
	}
	brelse(bp);

	/* the extent block's own fragments belong to the log too */
	if (cb != NULL) {
		fno = logbtofrag(&sblock, sblock.fs_logbno);
		for (j = 0; j < sblock.fs_frag; ++j, ++fno)
			(*cb)(fno);
	}
}

/*
 * Free the single fragment frag through freeblk(), passing
 * sblock.fs_logbno as freeblk()'s first argument.
 * NOTE(review): the signature matches the cb parameter of
 * examinelog(), so this is presumably meant to be used as that
 * callback -- confirm against the callers.
 */
static void
freelogblk(daddr32_t frag)
{
	freeblk(sblock.fs_logbno, frag, 1);
}

/*
 * Produce a short label for an inode, for use in messages.  The
 * lost+found directory (lfdir) is labelled with lfname; everything
 * else is labelled by file type, taken from mode: "DIR", "ATTR DIR",
 * "ACL" for a shadow inode, or "FILE" for anything else.  Apart from
 * lfname, the label lives in a static buffer that is overwritten by
 * the next call.
 */
caddr_t
file_id(fsck_ino_t inum, mode_t mode)
{
	static char name[MAXPATHLEN + 1];
	const char *label;

	if (inum == lfdir)
		return (lfname);

	if ((mode & IFMT) == IFDIR)
		label = "DIR";
	else if ((mode & IFMT) == IFATTRDIR)
		label = "ATTR DIR";
	else if ((mode & IFMT) == IFSHAD)
		label = "ACL";
	else
		label = "FILE";

	(void) strcpy(name, label);
	return (name);
}

/*
 * Put an inodesc into its default state, so that code which only
 * cares about a few fields need not know the proper defaults for the
 * rest.  Everything is zeroed except id_fix, which becomes DONTKNOW,
 * and id_lbn, id_truncto and id_firsthole, which become -1.
 */
void
init_inodesc(struct inodesc *idesc)
{
	(void) memset((void *)idesc, 0, sizeof (*idesc));

	/* the handful of fields whose default isn't zero */
	idesc->id_firsthole = -1;
	idesc->id_truncto = -1;
	idesc->id_lbn = -1;
	idesc->id_fix = DONTKNOW;
}

/*
 * Compare routine for tsearch(3C) and friends to use on inode
 * numbers.  The keys are not pointers to inode numbers: the callers
 * in this file pass the inode number itself, cast to a pointer, so
 * the arguments are converted straight back.
 *
 * Returns a negative value, zero, or a positive value as left is
 * less than, equal to, or greater than right.  The values are
 * compared rather than subtracted, because a difference that doesn't
 * fit in an int would come out with the wrong sign.
 */
int
ino_t_cmp(const void *left, const void *right)
{
	const fsck_ino_t lino = (const fsck_ino_t)left;
	const fsck_ino_t rino = (const fsck_ino_t)right;

	if (lino < rino)
		return (-1);
	return (lino > rino);
}

/*
 * Return the dirty flag of the cylinder group buffer, cgblk.
 */
int
cgisdirty(void)
{
	return (cgblk.b_dirty);
}

/*
 * Pass the cylinder group buffer, cgblk, to flush() along with the
 * write file descriptor.
 */
void
cgflush(void)
{
	flush(fswritefd, &cgblk);
}

/*
 * Mark buffer bp as modified and note, in isdirty, that something
 * has been.  With no write file descriptor (fswritefd negative) this
 * is reported through pfatal() and no flag is touched.
 */
void
dirty(struct bufarea *bp)
{
	if (fswritefd < 0) {
		pfatal("SETTING DIRTY FLAG IN READ_ONLY MODE\n");
		return;
	}

	bp->b_dirty = 1;
	isdirty = 1;
}

/*
 * Reset the bookkeeping fields of buffer bp: it is clean, has no
 * flags, references or errors, and its block number is the invalid
 * value -1.  The data pointer is left alone.
 */
void
initbarea(struct bufarea *bp)
{
	bp->b_bno = (diskaddr_t)-1LL;
	bp->b_dirty = 0;
	bp->b_flags = 0;
	bp->b_errs = 0;
	bp->b_cnt = 0;
}

/*
 * Partition-sizing routines adapted from ../newfs/newfs.c.
 * Needed because calcsb() needs to use mkfs to work out what the
 * superblock should be, and mkfs insists on being told how many
 * sectors to use.
 *
 * Error handling assumes we're never called while preening.
 *
 * XXX This should be extracted into a ../ufslib.{c,h},
 *     in the same spirit as ../../fslib.{c,h}.  Once that is
 *     done, both fsck and newfs should be modified to link
 *     against it.
 */

/*
 * Kind of label found on the device; set by get_device_size() and
 * consulted by getdisksize().
 */
static int label_type;

/* values for label_type */
#define	LABEL_TYPE_VTOC		1
#define	LABEL_TYPE_EFI		2
#define	LABEL_TYPE_OTHER	3

/* one megabyte, in bytes */
#define	MB			(1024 * 1024)
/* sectors in a terabyte; assumes 512-byte sectors */
#define	SECTORS_PER_TERABYTE	(1LL << 31)
/*
 * Bound on the sector offsets probed by
 * brute_force_get_device_size().
 */
#define	FS_SIZE_UPPER_LIMIT	0x100000000000LL

/*
 * Work out the size of the device open on fd (named disk, for
 * messages) and fill in any geometry-related superblock fields that
 * are still unset.
 *
 * With a VTOC label, fs_nsect, fs_ntrak and fs_rps are taken from
 * the disk geometry if they are zero; a non-positive rpm is treated
 * as 3600.  fs_bsize defaults to MAXBSIZE.  If fs_maxcontig is -1 and
 * the controller information is available, it is derived from the
 * device's maximum transfer size, falling back on the smaller of
 * maxphys and a megabyte (or just a megabyte) should that come out
 * negative.
 *
 * Returns the size reported by get_device_size(), or 0 if the
 * geometry of a VTOC-labelled disk could not be read.
 */
diskaddr_t
getdisksize(caddr_t disk, int fd)
{
	struct dk_cinfo ctlr;
	struct dk_geom geom;
	diskaddr_t nsectors;
	int maxphys;
	int rpm;

	/*
	 * Besides the size, get_device_size() establishes which kind
	 * of label the device carries.
	 */
	nsectors = get_device_size(fd, disk);

	if (label_type == LABEL_TYPE_VTOC) {
		if (ioctl(fd, DKIOCGGEOM, &geom) != 0) {
			pwarn("%s: Unable to read Disk geometry", disk);
			return (0);
		}
		if (sblock.fs_nsect == 0)
			sblock.fs_nsect = geom.dkg_nsect;
		if (sblock.fs_ntrak == 0)
			sblock.fs_ntrak = geom.dkg_nhead;
		if (sblock.fs_rps == 0) {
			if ((int)geom.dkg_rpm > 0)
				rpm = geom.dkg_rpm;
			else
				rpm = 3600;
			sblock.fs_rps = rpm / 60;
		}
	}

	if (sblock.fs_bsize == 0)
		sblock.fs_bsize = MAXBSIZE;

	/*
	 * maxcontig is only adjusted when it hasn't been set and the
	 * device will tell us about its maximum transfer size.
	 */
	if (sblock.fs_maxcontig != -1 || ioctl(fd, DKIOCINFO, &ctlr) != 0)
		return (nsectors);

	sblock.fs_maxcontig = ctlr.dki_maxtransfer * DEV_BSIZE;
	if (sblock.fs_maxcontig < 0) {
		/*
		 * No usable transfer size: settle for the smaller of
		 * maxphys and a megabyte, or for a megabyte alone if
		 * maxphys can't be had either.
		 */
		if (fsgetmaxphys(&maxphys, NULL))
			sblock.fs_maxcontig = MIN(maxphys, MB);
		else
			sblock.fs_maxcontig = MB;
	}
	sblock.fs_maxcontig /= sblock.fs_bsize;

	return (nsectors);
}

/*
 * Determine the size of the partition open on fd (named name, for
 * messages) and record in label_type how it was found out: from a
 * VTOC label, from an EFI label, or, when neither can be read, by
 * probing the device with reads.  Returns the size, or 0 after a
 * warning if every method failed.
 */
static diskaddr_t
get_device_size(int fd, caddr_t name)
{
	struct dk_gpt *gpt;
	struct vtoc label;
	diskaddr_t nsect = 0;
	int slice;

	slice = read_vtoc(fd, &label);
	if (slice >= 0) {
		label_type = LABEL_TYPE_VTOC;
	} else if (slice == VT_ENOTSUP || slice == VT_ERROR) {
		/* no VTOC, but there may be an EFI label instead */
		slice = efi_alloc_and_read(fd, &gpt);
		if (slice >= 0)
			label_type = LABEL_TYPE_EFI;
	}

	if (slice < 0) {
		/*
		 * No label could be read, so resort to brute force
		 * and see how far out on the device reads succeed.
		 */
		nsect = brute_force_get_device_size(fd);
		if (nsect != 0) {
			label_type = LABEL_TYPE_OTHER;
		} else {
			switch (slice) {
			case VT_ERROR:
				pwarn("%s: %s\n", name, strerror(errno));
				break;
			case VT_EIO:
				pwarn("%s: I/O error accessing VTOC", name);
				break;
			case VT_EINVAL:
				pwarn("%s: Invalid field in VTOC", name);
				break;
			default:
				pwarn("%s: unknown error %d accessing VTOC",
				    name, slice);
				break;
			}
			return (0);
		}
	}

	switch (label_type) {
	case LABEL_TYPE_EFI:
		nsect = gpt->efi_parts[slice].p_size;
		efi_free(gpt);
		break;

	case LABEL_TYPE_VTOC:
		/*
		 * The vtoc's p_size is a signed 32-bit quantity, while
		 * the size we return is unsigned and 64 bits wide (as
		 * is p_size in the EFI flavor of the label).  Going
		 * through an unsigned 32-bit value keeps the widening
		 * from sign-extending it.
		 */
		nsect = (uint32_t)label.v_part[slice].p_size;
		break;

	default:
		/* the brute-force result is already in place */
		break;
	}

	return (nsect);
}

/*
 * Returns non-zero if a complete DEV_BSIZE-byte sector cannot be
 * read from fd at sector offset sector, either because the seek
 * failed or because the read came up short.  buf must have room for
 * DEV_BSIZE bytes.
 */
static int
sector_unreadable(int fd, diskaddr_t sector, char *buf)
{
	if (llseek(fd, (offset_t)(sector * DEV_BSIZE), SEEK_SET) == -1)
		return (1);
	return (read(fd, buf, DEV_BSIZE) != DEV_BSIZE);
}

/*
 * brute_force_get_device_size
 *
 * Find the size of the device by discovering how far out it can be
 * read.  An llseek( , , SEEK_END) would probably do in most cases,
 * but at least one third-party driver has been seen that doesn't
 * support SEEK_END correctly when the device is larger than a
 * terabyte.
 *
 * Returns the size in sectors, or 0 if it can't be determined.
 */

static diskaddr_t
brute_force_get_device_size(int fd)
{
	diskaddr_t	readable = 0;	/* highest offset known to work */
	diskaddr_t	failed = 0;	/* lowest offset known to fail */
	diskaddr_t	probe;
	char 		blk[DEV_BSIZE];

	/*
	 * Start by making sure the device can be read at all, to weed
	 * out errors that have nothing to do with its size.
	 */
	if (llseek(fd, (offset_t)0, SEEK_SET) == -1)
		return (0);	/* can't determine size */
	if (read(fd, blk, DEV_BSIZE) == -1)
		return (0);	/* can't determine size */

	/*
	 * Step through the multiples of 4TB until a read fails.  If
	 * the size really could be anything from 0 to 2**64 bytes this
	 * wouldn't be the most efficient search, but sizes are expected
	 * to stay under 16 TB for some time, so there is little point
	 * in starting with reads beyond that.  Larger devices are
	 * still handled; they just aren't what this is tuned for.
	 *
	 * XXX lint uses 32-bit arithmetic for doing flow analysis.
	 * The constants here are wider than that, so its analysis is
	 * wrong.  For the time being, ignore its complaints about the
	 * loop body, and about the test that follows the loop, being
	 * unreached.
	 */
	probe = SECTORS_PER_TERABYTE * 4;
	while ((failed == 0) && (probe < FS_SIZE_UPPER_LIMIT)) {
		if (sector_unreadable(fd, probe, blk))
			failed = probe;
		else
			readable = probe;
		probe += 4 * SECTORS_PER_TERABYTE;
	}

	if (failed == 0)
		return (0);

	/*
	 * The device ends somewhere after `readable' and at or before
	 * `failed'.  Keep halving that interval until the exact size
	 * in sectors is known.  The interval starts out as
	 * 4 * SECTORS_PER_TERABYTE, a power of two, which keeps the
	 * arithmetic simple.
	 */
	while (failed - readable > 1) {
		probe = readable + (failed - readable) / 2;
		if (sector_unreadable(fd, probe, blk))
			failed = probe;
		else
			readable = probe;
	}

	/* the size is the last successfully read sector offset plus one */
	return (readable + 1);
}

/*
 * Common back end for the file and directory error reporters.  The
 * caller's message is printed with vpwarn(), followed by a
 * description of inode ino from pinode().  The path of ino relative
 * to cwd is then reported through pfatal(), tagged with the kind of
 * file if the inode number is in range and its type is acceptable,
 * and with plain "NAME" otherwise.
 */
static void
vfileerror(fsck_ino_t cwd, fsck_ino_t ino, caddr_t fmt, va_list ap)
{
	char path[MAXPATHLEN + 1];
	struct dinode *inop;

	vpwarn(fmt, ap);
	(void) putchar(' ');
	pinode(ino);
	(void) printf("\n");
	getpathname(path, cwd, ino);

	/* the inode is only fetched if its number is believable */
	if (ino >= UFSROOTINO && ino <= maxino) {
		inop = ginode(ino);
		if (ftypeok(inop)) {
			pfatal("%s=%s\n", file_id(ino, inop->di_mode), path);
			return;
		}
	}

	pfatal("NAME=%s\n", path);
}

/*
 * Report a problem with directory ino.  fmt and the arguments after
 * it form a printf-style message; the directory serves as both the
 * working directory and the inode of interest for vfileerror().
 */
void
direrror(fsck_ino_t ino, caddr_t fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfileerror(ino, ino, fmt, args);
	va_end(args);
}

/*
 * va_list flavor of direrror(): reports a problem with directory ino
 * by handing it to vfileerror() as both the working directory and
 * the inode of interest.
 */
static void
vdirerror(fsck_ino_t ino, caddr_t fmt, va_list ap)
{
	vfileerror(ino, ino, fmt, ap);
}

/*
 * Report a problem with inode ino, whose path is worked out relative
 * to directory cwd.  fmt and the arguments after it form a
 * printf-style message, which is passed on to vfileerror().
 */
void
fileerror(fsck_ino_t cwd, fsck_ino_t ino, caddr_t fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfileerror(cwd, ino, fmt, args);
	va_end(args);
}

/*
 * Enter inode ino in limbo_dirs, the tree of orphaned directories.
 * The caller is expected to have set INCLEAR in the inode's
 * statemap[] entry already.
 *
 * An inode marked INCLEAR is skipped by passes 2 and 3, which makes
 * it an orphan in effect.  Recording it here is what lets pass 4
 * remember it.
 *
 * Exits via errexit() if the tree can't be extended.
 */

void
add_orphan_dir(fsck_ino_t ino)
{
	void *node;

	/* the inode number itself, not a pointer to it, is the key */
	node = tsearch((void *)ino, &limbo_dirs, ino_t_cmp);
	if (node == NULL)
		errexit("add_orphan_dir: out of memory");
}

/*
 * Take inode ino back out of limbo_dirs, the tree of orphaned
 * directories, presumably because it has been cleared.  Whether it
 * was actually in the tree is not reported.
 */
void
remove_orphan_dir(fsck_ino_t ino)
{
	void *key = (void *)ino;

	(void) tdelete(key, &limbo_dirs, ino_t_cmp);
}

/*
 * log_setsum() and log_checksum() are equivalent to lufs.c:setsum()
 * and lufs.c:checksum().
 */

/*
 * Store in *sp the sum of the 32-bit words in the nb bytes starting
 * at lp.  *sp is zeroed before the summing starts, so it may be one
 * of the words being summed.  Bytes beyond the last whole word are
 * ignored, and an nb of zero or less sums nothing, leaving *sp zero.
 *
 * The sum wraps around at 32 bits.  It is accumulated unsigned, as
 * overflowing a signed sum would be undefined behavior.
 */
static void
log_setsum(int32_t *sp, int32_t *lp, int nb)
{
	uint32_t csum = 0;
	size_t nwords;

	*sp = 0;
	nwords = (nb > 0) ? (size_t)nb / sizeof (int32_t) : 0;
	while (nwords-- != 0)
		csum += (uint32_t)*lp++;
	*sp = (int32_t)csum;
}

/*
 * Verify the checksum stored at *sp against the one log_setsum()
 * computes over the nb bytes at lp.  Returns 1 if they agree and 0
 * if they don't; either way *sp holds the stored value on return.
 */
static int
log_checksum(int32_t *sp, int32_t *lp, int nb)
{
	const int32_t stored = *sp;

	log_setsum(sp, lp, nb);
	if (*sp == stored)
		return (1);

	/* put the on-disk value back before reporting the mismatch */
	*sp = stored;
	return (0);
}
author	casper
date	Mon, 02 Apr 2007 02:03:22 -0700
parents	e626dd5cb7ec
children	a454cf8a2c90