view usr/src/cmd/fs.d/nfs/nfslog/dbtab.c @ 4:1a15d5aaf794

synchronized with onnv_86 (6202) in onnv-gate
author Koji Uno <koji.uno@sun.com>
date Mon, 31 Aug 2009 14:38:03 +0900
parents c9caec207d52
children
line wrap: on
line source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Code to maintain the runtime and on-disk filehandle mapping table for
 * nfslog.
 */

#include <assert.h>
#include <errno.h>
#include <ctype.h>
#include <nfs/nfs.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <strings.h>
#include <syslog.h>
#include <unistd.h>
#include <dirent.h>
#include <ndbm.h>
#include <time.h>
#include <libintl.h>
#include <sys/types.h>
#include <nfs/nfs.h>
#include <nfs/nfs_log.h>
#include "fhtab.h"
#include "nfslogd.h"

/* Round val up to the next multiple of 4 (32-bit alignment of keys/records) */
#define	ROUNDUP32(val)		(((val) + 3) & ~3)

/*
 * Version marker stored in every database by db_get_db(): the key is
 * DB_VERSION_STRING and the data is DB_VERSION.
 * It is important that the length of this string not match the
 * file handle key length NFS_FHMAXDATA
 */
#define	DB_VERSION_STRING	"NFSLOG_DB_VERSION"
#define	DB_VERSION		"1"

/*
 * NOTE(review): not referenced in this part of the file; presumably a cap
 * on the number of records handled in one pruning pass - confirm.
 */
#define	MAX_PRUNE_REC_CNT	100000

/*
 * The public file handle. Code in this file recognizes it by address
 * (dfh == &public_fh) as well as by content (memcmp).
 */
fhandle_t	public_fh = { 0 };

/* One open per-filesystem database; entries are chained on db_fs_list */
struct db_list {
	fsid_t		fsid;		/* filesystem fsid */
	char		*path;		/* dbm filepair path */
	DBM		*db;		/* open dbm database */
	bool_t		getall;		/* TRUE if all dbm for prefix open */
	struct db_list	*next;		/* next db */
};

/* Head of the list of open databases; db_get_db() pushes new ones in front */
static struct db_list *db_fs_list = NULL;
/* Fallback error text used when errno offers no better description */
static	char	err_str[] = "DB I/O error has occurred";
/* Node in the list of link keys already visited by get_next_link() */
struct link_keys {
	fh_secondary_key	lnkey;		/* key of a visited link */
	int			lnsize;		/* valid bytes in lnkey */
	struct link_keys	*next;		/* previously visited key */
};
/* Variables defined outside this file */
extern int debug;			/* debug verbosity level */
extern time_t mapping_update_interval;	/* min seconds between atime stores */
extern time_t prune_timeout;

/*
 * Prototypes for the file-local (static) helpers.
 */
static int fill_link_key(char *linkkey, fhandle_t *dfh, char *name);
static struct db_list *db_get_db(char *fhpath, fsid_t *fsid, int *errorp,
	int create_flag);
static struct db_list *db_get_all_databases(char *fhpath, bool_t getall);
static void debug_print_fhlist(FILE *fp, fhlist_ent *fhrecp);
static void debug_print_linkinfo(FILE *fp, linkinfo_ent *fhrecp);
static void debug_print_key(FILE *fp, char *str1, char *str2, char *key,
	int ksize);
static void debug_print_key_and_data(FILE *fp, char *str1, char *str2,
	char *key, int ksize, char *data, int dsize);
static int store_record(struct db_list *dbp, void *keyaddr, int keysize,
	void *dataaddr, int datasize, char *str);
static void *fetch_record(struct db_list *dbp, void *keyaddr, int keysize,
	void *dataaddr, int *errorp, char *str);
static int delete_record(struct db_list *dbp, void *keyaddr, int keysize,
	char *str);
static int db_update_fhrec(struct db_list *dbp, void *keyaddr, int keysize,
	fhlist_ent *fhrecp, char *str);
static int db_update_linkinfo(struct db_list *dbp, void *keyaddr, int keysize,
	linkinfo_ent *linkp, char *str);
static fhlist_ent *create_primary_struct(struct db_list *dbp, fhandle_t *dfh,
	char *name, fhandle_t *fh, uint_t flags, fhlist_ent *fhrecp,
	int *errorp);
static fhlist_ent *db_add_primary(struct db_list *dbp, fhandle_t *dfh,
	char *name, fhandle_t *fh, uint_t flags, fhlist_ent *fhrecp,
	int *errorp);
static linkinfo_ent *get_next_link(struct db_list *dbp, char *linkkey,
	int *linksizep, linkinfo_ent *linkp, void **cookiep,
	int *errorp, char *msg);
static void free_link_cookies(void *cookie);
static void add_mc_path(struct db_list *dbp, fhandle_t *dfh, char *name,
	fhlist_ent *fhrecp, linkinfo_ent *linkp, int *errorp);
static linkinfo_ent *create_link_struct(struct db_list *dbp, fhandle_t *dfh,
	char *name, fhlist_ent *fhrecp, int *errorp);
static int db_add_secondary(struct db_list *dbp, fhandle_t *dfh, char *name,
	fhandle_t *fh, fhlist_ent *fhrecp);
static linkinfo_ent *update_next_link(struct db_list *dbp, char *nextkey,
	int nextsize, char *prevkey, int prevsize, int *errorp);
static int update_prev_link(struct db_list *dbp, char *nextkey, int nextsize,
	char *prevkey, int prevsize);
static linkinfo_ent *update_linked_list(struct db_list *dbp, char *nextkey,
	int nextsize, char *prevkey, int prevsize, int *errorp);
static int db_update_primary_new_head(struct db_list *dbp,
	linkinfo_ent *dellinkp, linkinfo_ent *nextlinkp, fhlist_ent *fhrecp);
static int delete_link_by_key(struct db_list *dbp, char *linkkey,
	int *linksizep, int *errorp, char *errstr);
static int delete_link(struct db_list *dbp, fhandle_t *dfh, char *name,
	char *nextlinkkey, int *nextlinksizep, int *errorp, char *errstr);

/*
 * The following functions do the actual database I/O. Currently use DBM.
 */

/*
 * The "db_*" functions are functions that access the database using
 * database-specific calls. Currently the only database supported is
 * dbm. Because of the limitations of this database, in particular when
 * it comes to manipulating records with the same key, or using multiple keys,
 * the following design decisions have been made:
 *
 *	Each file system has a separate dbm file, which are kept open as
 *		accessed, listed in a linked list.
 *	Two possible access modes are available for each file - either by
 *		file handle, or by directory file handle and name. Since
 *		dbm does not allow multiple keys, we will have a primary
 *		and secondary key for each file/link.
 *	The primary key is the pair (inode,gen) which can be obtained
 *		from the file handle. This points to a record with
 *		the full file handle and the secondary key (dfh-key,name)
 *		for one of the links.
 *	The secondary key is the pair (dfh-key,name) where dfh-key is
 *		the primary key for the directory and the name is the
 *		link name. It points to a record that contains the primary
 *		key for the file and to the previous and next hard link
 *		found for this file (if they exist).
 *
 * Summary of operations:
 *	Adding a new file: Create the primary record and secondary (link)
 *		record and add both to the database. The link record
 *		would have prev and next links set to NULL.
 *
 *	Adding a link to a file in the database: Add the link record,
 *		to the head of the links list (i.e. prev = NULL, next =
 *		secondary key recorded in the primary record). Update
 *		the primary record to point to the new link, and the
 *		secondary record for the old head of list to point to new.
 *
 *	Deleting a file: Delete the link record. If it is the last link
 *		then mark the primary record as deleted but don't delete
 *		that one from the database (in case some clients still
 *		hold the file handle). If there are other links, and the
 *		deleted link is the head of the list (in the primary
 *		record), update the primary record with the new head.
 *
 *	Renaming a file: Add the new link and then delete the old one.
 *
 *	Lookup by file handle (read, write, lookup, etc.) - fetch primary rec.
 *	Lookup by dir info (delete, link, rename) - fetch secondary rec.
 *
 *	XXX NOTE: The code is written single-threaded. To make it multi-
 *	threaded, the following considerations must be made:
 *	1. Changes/access to the db list must be atomic.
 *	2. Changes/access for a specific file handle must be atomic
 *	   (example: deleting a link may affect up to 4 separate database
 *	   entries: the deleted link, the prev and next links if exist,
 *	   and the filehandle entry, if it points to the deleted link -
 *	   these changes must be atomic).
 */

/*
 * fill_link_key - build a link (secondary) key in linkkey from the
 * directory file handle data followed by the NUL-terminated name.
 * The key is zero-padded up to a multiple of 4 bytes.
 * The caller supplies a linkkey buffer large enough for the padded key.
 * Returns the padded key length.
 */
static int
fill_link_key(char *linkkey, fhandle_t *dfh, char *name)
{
	int	rawlen, paddedlen;
	char	*namep = linkkey + dfh->fh_len;

	/* directory handle data first, then the component name */
	(void) memcpy(linkkey, &dfh->fh_data, dfh->fh_len);
	(void) strcpy(namep, name);

	rawlen = dfh->fh_len + strlen(name) + 1;
	paddedlen = ROUNDUP32(rawlen);
	if (rawlen < paddedlen) {
		/* clear the alignment padding so equal keys compare equal */
		(void) memset(linkkey + rawlen, 0, paddedlen - rawlen);
	}
	return (paddedlen);
}

/*
 * db_get_db - return the open database entry for the filesystem 'fsid'.
 * If the database is already on db_fs_list, that entry is returned.
 * Otherwise the dbm file "<fhpath>.<fsid in hex>" is opened with
 * (create_flag | O_RDWR), stamped with the version record if it has none,
 * and pushed on the front of db_fs_list.
 * Returns the entry on success (*errorp is 0), or NULL with *errorp set to
 * the error code (-1 if dbm_open failed without setting errno).
 */
static struct db_list *
db_get_db(char *fhpath, fsid_t *fsid, int *errorp, int create_flag)
{
	struct db_list	*entry;
	char		fsidstr[30];
	datum		key, data;

	*errorp = 0;

	/* Is the database for this fsid already open? */
	entry = db_fs_list;
	while (entry != NULL) {
		if (memcmp(&entry->fsid, fsid, sizeof (*fsid)) == 0)
			return (entry);
		entry = entry->next;
	}

	/* Not open - build a new list entry */
	entry = calloc(1, sizeof (*entry));
	if (entry == NULL) {
		*errorp = errno;
		syslog(LOG_ERR, gettext(
			"db_get_db: malloc db failed: Error %s"),
			strerror(*errorp));
		return (NULL);
	}

	/* path is "<fhpath>.<fsidstr>": one byte for '.', one for NUL */
	(void) sprintf(fsidstr, "%08x%08x", fsid->val[0], fsid->val[1]);
	entry->path = malloc(strlen(fhpath) + 2 + strlen(fsidstr));
	if (entry->path == NULL) {
		*errorp = errno;
		syslog(LOG_ERR, gettext(
			"db_get_db: malloc dbpath failed: Error %s"),
			strerror(*errorp));
		free(entry);
		return (NULL);
	}
	(void) sprintf(entry->path, "%s.%s", fhpath, fsidstr);

	/*
	 * The open mode is masked by UMASK.
	 */
	entry->db = dbm_open(entry->path, create_flag | O_RDWR, 0666);
	if (entry->db == NULL) {
		*errorp = errno;
		syslog(LOG_ERR, gettext(
			"db_get_db: dbm_open db '%s' failed: Error %s"),
			entry->path, strerror(*errorp));
		if (*errorp == 0)	/* should not happen but may */
			*errorp = -1;
		free(entry->path);
		free(entry);
		return (NULL);
	}

	/*
	 * Add the version identifier, unless the database already
	 * existed and has one.
	 */
	key.dptr = DB_VERSION_STRING;
	key.dsize = strlen(DB_VERSION_STRING);
	data = dbm_fetch(entry->db, key);
	if (data.dptr == NULL) {
		data.dptr = DB_VERSION;
		data.dsize = strlen(DB_VERSION);
		(void) dbm_store(entry->db, key, data, DBM_INSERT);
	}

	/* Link the new database in at the head of the list */
	(void) memcpy(&entry->fsid, fsid, sizeof (*fsid));
	entry->next = db_fs_list;
	db_fs_list = entry;
	if (debug > 1) {
		(void) printf("db_get_db: db %s opened\n", entry->path);
	}
	return (entry);
}

/*
 * db_get_all_databases - gets the database for any filesystem. This is used
 * when any database will do - typically to retrieve the path for the
 * public filesystem. If a database whose path starts with fhpath is already
 * open - return the first one (unless getall is TRUE and not all of them
 * have been opened yet); otherwise search the directory part of fhpath for
 * files named "<basename of fhpath>.<16 hex digit fsid>..." and open them
 * via db_get_db() (without creating). If getall is TRUE, open all
 * matching databases, and mark them (to indicate that all such were opened).
 * Directory entries whose suffix does not parse as an fsid are skipped.
 * Return the pointer for a matching database if success, NULL otherwise.
 */
static struct db_list *
db_get_all_databases(char *fhpath, bool_t getall)
{
	char		*dirptr, *fhdir, *fhpathname;
	int		len, error;
	unsigned int	fsid_val0, fsid_val1;
	DIR		*dirp;
	struct dirent	*dp;
	fsid_t		fsid;
	struct db_list	*dbp, *ret_dbp;

	for (dbp = db_fs_list; dbp != NULL; dbp = dbp->next) {
		if (strncmp(fhpath, dbp->path, strlen(fhpath)) == 0)
			break;
	}
	if (dbp != NULL) {
		/*
		 * if one database for that prefix is open, and  either only
		 * one is needed, or already opened all such databases,
		 * return here without exhaustive search
		 */
		if (!getall || dbp->getall)
			return (dbp);
	}
	if ((fhdir = strdup(fhpath)) == NULL) {
		syslog(LOG_ERR, gettext(
			"db_get_all_databases: strdup '%s' Error '%s*'"),
			fhpath, strerror(errno));
		return (NULL);
	}
	fhpathname = NULL;
	ret_dbp = NULL;
	if ((dirptr = strrchr(fhdir, '/')) == NULL) {
		/* no directory */
		goto exit;
	}
	if ((fhpathname = strdup(&dirptr[1])) == NULL) {
		syslog(LOG_ERR, gettext(
			"db_get_all_databases: strdup '%s' Error '%s*'"),
			&dirptr[1], strerror(errno));
		goto exit;
	}
	/* Terminate fhdir string at last '/' */
	dirptr[1] = '\0';
	/* Search the directory */
	if (debug > 2) {
		(void) printf("db_get_all_databases: search '%s' for '%s*'\n",
			fhdir, fhpathname);
	}
	if ((dirp = opendir(fhdir)) == NULL) {
		syslog(LOG_ERR, gettext(
			"db_get_all_databases: opendir '%s' Error '%s*'"),
			fhdir, strerror(errno));
		goto exit;
	}
	len = strlen(fhpathname);
	while ((dp = readdir(dirp)) != NULL) {
		if (strncmp(fhpathname, dp->d_name, len) != 0)
			continue;
		/* the prefix must be followed by ".<fsid>" */
		if (dp->d_name[len] != '.')
			continue;
		/*
		 * Parse the two 8-digit hex words into temporaries of the
		 * type "%x" expects (db_get_db formats them with "%08x"),
		 * and skip names that do not carry a complete fsid rather
		 * than looking up an uninitialized one.
		 */
		if (sscanf(&dp->d_name[len + 1], "%8x%8x",
		    &fsid_val0, &fsid_val1) != 2)
			continue;
		fsid.val[0] = fsid_val0;
		fsid.val[1] = fsid_val1;
		dbp = db_get_db(fhpath, &fsid, &error, 0);
		if (dbp != NULL) {
			ret_dbp = dbp;
			if (!getall)
				break;
			dbp->getall = TRUE;
		}
	}
	(void) closedir(dirp);
exit:
	if (fhpathname != NULL)
		free(fhpathname);
	if (fhdir != NULL)
		free(fhdir);
	return (ret_dbp);
}

/*
 * debug_print_key - print a database key on one line of fp, prefixed by
 * str1 and str2. Keys of at least NFS_FHMAXDATA bytes are additionally
 * decoded as inode and generation fields; longer keys are treated as
 * link keys and the trailing name is printed as well.
 */
static void
debug_print_key(FILE *fp, char *str1, char *str2, char *key, int ksize)
{
	(void) fprintf(fp, "%s: %s key (%d) ", str1, str2, ksize);
	debug_opaque_print(fp, key, ksize);

	if (ksize < NFS_FHMAXDATA) {
		/* too short to hold inode,gen - raw dump only */
		(void) fprintf(fp, "\n");
		return;
	}

	/* may be inode,name - try to print the fields */
	(void) fprintf(fp, ": inode ");
	debug_opaque_print(fp, &key[2], sizeof (int));
	(void) fprintf(fp, ", gen ");
	debug_opaque_print(fp, &key[2 + sizeof (int)], sizeof (int));
	if (ksize != NFS_FHMAXDATA) {
		(void) fprintf(fp, ", name '%s'", &key[NFS_FHMAXDATA]);
	}
	(void) fprintf(fp, "\n");
}

/*
 * debug_print_linkinfo - dump a link (secondary) record to fp: directory
 * file handle, name, times, flags, record length, the offsets of the
 * variable fields, and the fhkey/next/prev keys. A NULL linkp prints nothing.
 */
static void
debug_print_linkinfo(FILE *fp, linkinfo_ent *linkp)
{
	if (linkp != NULL) {
		/* fixed part of the record */
		(void) fprintf(fp, "linkinfo:\ndfh: ");
		debug_opaque_print(fp, (void *)&linkp->dfh,
			sizeof (linkp->dfh));
		(void) fprintf(fp, "\nname: '%s'", LN_NAME(linkp));
		(void) fprintf(fp,
			"\nmtime 0x%x, atime 0x%x, flags 0x%x, reclen %d\n",
			linkp->mtime, linkp->atime, linkp->flags,
			linkp->reclen);
		(void) fprintf(fp,
			"offsets: fhkey %d, name %d, next %d, prev %d\n",
			linkp->fhkey_offset, linkp->name_offset,
			linkp->next_offset, linkp->prev_offset);

		/* variable part: the three keys held in the record */
		debug_print_key(fp, "fhkey", "", LN_FHKEY(linkp),
			LN_FHKEY_LEN(linkp));
		debug_print_key(fp, "next", "", LN_NEXT(linkp),
			LN_NEXT_LEN(linkp));
		debug_print_key(fp, "prev", "", LN_PREV(linkp),
			LN_PREV_LEN(linkp));
	}
}

/*
 * debug_print_fhlist - dump a primary (file handle) record to fp: the
 * file handle, name, directory file handle, times, flags and record length.
 * A NULL fhrecp prints nothing.
 */
static void
debug_print_fhlist(FILE *fp, fhlist_ent *fhrecp)
{
	if (fhrecp != NULL) {
		(void) fprintf(fp, "fhrec:\nfh: ");
		debug_opaque_print(fp, (void *)&fhrecp->fh,
			sizeof (fhrecp->fh));
		(void) fprintf(fp, "name '%s', dfh: ", fhrecp->name);
		debug_opaque_print(fp, (void *)&fhrecp->dfh,
			sizeof (fhrecp->dfh));
		(void) fprintf(fp,
			"\nmtime 0x%x, atime 0x%x, flags 0x%x, reclen %d\n",
			fhrecp->mtime, fhrecp->atime, fhrecp->flags,
			fhrecp->reclen);
	}
}

/*
 * debug_print_key_and_data - print a key followed by the record it maps to.
 * The record type is guessed from the key size: longer than NFS_FHMAXDATA
 * is taken as a link record, exactly NFS_FHMAXDATA as a primary record,
 * anything else is dumped as opaque bytes.
 * The record is copied into a zeroed, aligned local struct before being
 * printed; the copy is limited to both dsize and the size of that struct,
 * since stored records are normally shorter than the full structures.
 */
static void
debug_print_key_and_data(FILE *fp, char *str1, char *str2, char *key,
	int ksize, char *data, int dsize)
{
	size_t	copylen = (dsize > 0) ? (size_t)dsize : 0;

	debug_print_key(fp, str1, str2, key, ksize);
	(void) fprintf(fp, " ==> (%p,%d)\n", (void *)data, dsize);
	if (ksize > NFS_FHMAXDATA) {
		linkinfo_ent inf;
		/* probably a link struct */
		if (copylen > sizeof (inf))
			copylen = sizeof (inf);
		(void) memset(&inf, 0, sizeof (inf));
		if (copylen > 0)
			(void) memcpy(&inf, data, copylen);
		debug_print_linkinfo(fp, &inf);
	} else if (ksize == NFS_FHMAXDATA) {
		fhlist_ent inf;
		/* probably an fhlist struct */
		if (copylen > sizeof (inf))
			copylen = sizeof (inf);
		(void) memset(&inf, 0, sizeof (inf));
		if (copylen > 0)
			(void) memcpy(&inf, data, copylen);
		debug_print_fhlist(fp, &inf);
	} else {
		/* don't know... */
		debug_opaque_print(fp, data, dsize);
	}
}

/*
 * store_record - store the record in the database (replacing any record
 * already stored under the key) and return 0 for success or error code
 * otherwise. 'str' identifies the caller in debug output.
 * On failure the error code is the errno left by dbm_store, EIO if the
 * database flagged an error without setting errno, or -1 if dbm_store
 * failed without flagging an error. The same value is left in errno.
 */
static int
store_record(struct db_list *dbp, void *keyaddr, int keysize, void *dataaddr,
	int datasize, char *str)
{
	datum	key, data;
	int	dberr, ret;
	char	*errfmt = "store_record: dbm_store failed, Error: %s\n";
	char	*err;

	key.dptr = keyaddr;
	key.dsize = keysize;
	data.dptr = dataaddr;
	data.dsize = datasize;

	if (debug > 2) {
		debug_print_key_and_data(stdout, str, "dbm_store:\n    ",
			key.dptr, key.dsize, data.dptr, data.dsize);
	}

	/*
	 * Clear errno right before the call (the debug output above may
	 * have set it) so that a non-zero value really is from dbm_store.
	 */
	errno = 0;
	if (dbm_store(dbp->db, key, data, DBM_REPLACE) >= 0)
		return (0);

	/* Could not store - capture errno before anything can change it */
	ret = errno;
	dberr = dbm_error(dbp->db);
	dbm_clearerr(dbp->db);

	if (dberr) {
		if (ret != 0) {
			err = strerror(ret);
		} else {
			err = err_str;
			ret = EIO;
		}
	} else { /* should not happen but sometimes does */
		err = err_str;
		ret = -1;
	}
	if (debug) {
		debug_print_key(stderr, str, "store_record:"
			"dbm_store:\n", key.dptr, key.dsize);
		(void) fprintf(stderr, errfmt, err);
	} else
		syslog(LOG_ERR, gettext(errfmt), err);

	/* logging may have modified errno; report the saved code */
	errno = ret;
	return (ret);
}

/*
 * fetch_record - fetch the record stored under the given key.
 * Returns a pointer to a copy of the record, or NULL on failure.
 * *errorp is set to 0 on success, ENOENT if there is no such record,
 * EIO if the database reported an error, or the malloc errno.
 * dataaddr is an optional valid address for the result. If dataaddr
 * is non-null, then that memory is already alloc'd (and must be able to
 * hold the whole record) and is what gets returned. Else, the result is
 * malloc'd, and the caller must free the returned struct when done.
 * 'str' identifies the caller in debug and error output.
 */
static void *
fetch_record(struct db_list *dbp, void *keyaddr, int keysize, void *dataaddr,
	int *errorp, char *str)
{
	datum	key, data;
	char	*errfmt = "fetch_record: dbm_fetch failed, Error: %s\n";
	void	*result = dataaddr;

	errno = 0;
	*errorp = 0;
	key.dptr = keyaddr;
	key.dsize = keysize;

	data = dbm_fetch(dbp->db, key);
	if (data.dptr == NULL) {
		/* nothing returned: either a database error or no record */
		if (dbm_error(dbp->db)) {
			/* clear and report the database error */
			dbm_clearerr(dbp->db);
			*errorp = EIO;
			syslog(LOG_ERR, gettext(errfmt), strerror(*errorp));
		} else {
			/* record not in database */
			*errorp = ENOENT;
		}
		if (debug > 3) {
			char	*reason = strerror(*errorp);

			debug_print_key(stderr, str, "fetch_record:"
				"dbm_fetch:\n", key.dptr, key.dsize);
			(void) fprintf(stderr, errfmt, reason);
		}
		return (NULL);
	}

	/* copy to local struct because dbm may return non-aligned pointers */
	if (result == NULL) {
		result = malloc(data.dsize);
		if (result == NULL) {
			*errorp = errno;
			syslog(LOG_ERR, gettext(
				"%s: dbm_fetch - malloc %ld: Error %s"),
				str, data.dsize, strerror(*errorp));
			return (NULL);
		}
	}
	(void) memcpy(result, data.dptr, data.dsize);
	if (debug > 3) {
		debug_print_key_and_data(stdout, str, "fetch_record:"
			"dbm_fetch:\n", key.dptr, key.dsize,
			result, data.dsize);
	}
	*errorp = 0;
	return (result);
}

/*
 * delete_record - delete the record from the database and return 0 for success
 * or error code for failure. 'str' identifies the caller in debug output.
 * On failure the error code is the errno left by dbm_delete, EIO if the
 * database flagged an error without setting errno, or -1 if dbm_delete
 * failed without flagging an error. The same value is left in errno.
 */
static int
delete_record(struct db_list *dbp, void *keyaddr, int keysize, char *str)
{
	datum	key;
	int	dberr, ret;
	char	*errfmt = "delete_record: dbm_delete failed, Error: %s\n";
	char	*err;

	key.dptr = keyaddr;
	key.dsize = keysize;

	if (debug > 2) {
		debug_print_key(stdout, str, "delete_record:"
			"dbm_delete:\n", key.dptr, key.dsize);
	}

	/*
	 * Clear errno right before the call (the debug output above may
	 * have set it), and report success explicitly: errno may be left
	 * non-zero even when dbm_delete succeeds.
	 */
	errno = 0;
	if (dbm_delete(dbp->db, key) >= 0)
		return (0);

	/* capture errno before anything can change it */
	ret = errno;
	dberr = dbm_error(dbp->db);
	dbm_clearerr(dbp->db);

	if (dberr) {
		if (ret != 0) {
			err = strerror(ret);
		} else {
			err = err_str;
			ret = EIO;
		}
	} else { /* should not happen but sometimes does */
		err = err_str;
		ret = -1;
	}
	if (debug) {
		debug_print_key(stderr, str, "delete_record:"
			"dbm_delete:\n", key.dptr, key.dsize);
		(void) fprintf(stderr, errfmt, err);
	} else
		syslog(LOG_ERR, gettext(errfmt), err);

	/* logging may have modified errno; report the saved code */
	errno = ret;
	return (ret);
}

/*
 * db_update_fhrec - refresh the access time of a primary record.
 * If at least mapping_update_interval seconds have passed since the
 * atime recorded in fhrecp, set atime to the current time and store the
 * record under the given key; otherwise leave everything untouched.
 * Return 0 if success (or nothing to do), error otherwise.
 */
static int
db_update_fhrec(struct db_list *dbp, void *keyaddr, int keysize,
	fhlist_ent *fhrecp, char *str)
{
	time_t	now = time(0);

	/* updated recently enough - skip the database write */
	if (difftime(now, fhrecp->atime) < mapping_update_interval)
		return (0);

	fhrecp->atime = now;
	return (store_record(dbp, keyaddr, keysize, fhrecp, fhrecp->reclen,
		str));
}

/*
 * db_update_linkinfo - refresh the access time of a link record.
 * If at least mapping_update_interval seconds have passed since the
 * atime recorded in linkp, set atime to the current time and store the
 * record under the given key; otherwise leave everything untouched.
 * Return 0 if success (or nothing to do), error otherwise.
 */
static int
db_update_linkinfo(struct db_list *dbp, void *keyaddr, int keysize,
	linkinfo_ent *linkp, char *str)
{
	time_t	now = time(0);

	/* updated recently enough - skip the database write */
	if (difftime(now, linkp->atime) < mapping_update_interval)
		return (0);

	linkp->atime = now;
	return (store_record(dbp, keyaddr, keysize, linkp, linkp->reclen,
		str));
}

/*
 * create_primary_struct - build a primary record for file handle fh, whose
 * recorded link is (dfh, name), and store it in the database keyed by the
 * file handle data (replacing any record already stored under that key).
 * The PUBLIC_PATH flag is forced on when dfh is &public_fh and off
 * otherwise; mtime and atime are set to the current time.
 * Database must be open when this function is called.
 * If success, return the stored record. fhrecp may be used to
 * provide an existing memory area, else the record is malloc'd and the
 * caller must free it. If failed, *errorp contains the error code and
 * NULL is returned.
 */
static fhlist_ent *
create_primary_struct(struct db_list *dbp, fhandle_t *dfh, char *name,
	fhandle_t *fh, uint_t flags, fhlist_ent *fhrecp, int *errorp)
{
	fhlist_ent	*rec = fhrecp;
	int		usedlen, paddedlen;

	/* record is the fixed header plus the name, padded to 4 bytes */
	usedlen = offsetof(fhlist_ent, name) + strlen(name) + 1;
	paddedlen = ROUNDUP32(usedlen);

	if (rec == NULL) {
		/* caller did not supply memory - allocate just enough */
		rec = malloc(paddedlen);
		if (rec == NULL) {
			*errorp = errno;
			syslog(LOG_ERR, gettext(
				"create_primary_struct: malloc %d Error %s"),
				paddedlen, strerror(*errorp));
			return (NULL);
		}
	}

	/* Fill in the fields */
	(void) memcpy(&rec->fh, fh, sizeof (*fh));
	(void) memcpy(&rec->dfh, dfh, sizeof (*dfh));
	rec->flags = (dfh == &public_fh) ?
		(flags | PUBLIC_PATH) : (flags & ~PUBLIC_PATH);
	rec->mtime = time(0);
	rec->atime = rec->mtime;
	(void) strcpy(rec->name, name);
	if (usedlen < paddedlen) {
		/* zero the alignment padding after the name */
		(void) memset((char *)rec + usedlen, 0, paddedlen - usedlen);
	}
	rec->reclen = paddedlen;

	*errorp = store_record(dbp, &fh->fh_data, fh->fh_len, rec,
			rec->reclen, "create_primary_struct");
	if (*errorp == 0)
		return (rec);

	/* Could not store - release the record only if we allocated it */
	if (fhrecp == NULL)
		free(rec);
	return (NULL);
}

/*
 * db_add_primary - make sure a primary record for fh is in the database.
 * If a record is already stored under the file handle key, return it
 * (even if it is for a different link) after refreshing its atime through
 * db_update_fhrec(); *errorp then holds the result of that update.
 * If no record could be fetched, create and store a new one for
 * (dfh, name) with the given flags.
 * Database must be open when this function is called.
 * If success, return the database entry. fhrecp may be used to
 * provide an existing memory area, else the entry is malloc'd and the
 * caller must free it. If failed, *errorp contains the error code and
 * NULL is returned.
 */
static fhlist_ent *
db_add_primary(struct db_list *dbp, fhandle_t *dfh, char *name, fhandle_t *fh,
	uint_t flags, fhlist_ent *fhrecp, int *errorp)
{
	fhlist_ent	*rec;
	fh_primary_key	fhkey;

	if (debug > 2)
		(void) printf("db_add_primary entered: name '%s'\n", name);

	(void) memcpy(fhkey, &fh->fh_data, fh->fh_len);
	rec = fetch_record(dbp, fhkey, fh->fh_len, (void *)fhrecp,
			errorp, "db_add_primary");
	if (rec == NULL) {
		/* primary record not in database - create it */
		rec = create_primary_struct(dbp, dfh, name, fh, flags,
				fhrecp, errorp);
		if (debug > 2) {
			if (rec == NULL) {
				/* Could not store */
				(void) printf(
				"db_add_primary exits(1): name '%s' Error %s\n",
					name, ((*errorp >= 0) ?
					strerror(*errorp) : "Unknown"));
			} else {
				(void) printf(
					"db_add_primary exits(0): name '%s'\n",
					name);
			}
		}
		return (rec);
	}

	/* primary record is in the database - update atime if needed */
	*errorp = db_update_fhrec(dbp, fhkey, fh->fh_len, rec,
			"db_add_primary put fhrec");
	if (debug > 2)
		(void) printf("db_add_primary exits(2): name '%s'\n", name);
	return (rec);
}

/*
 * get_next_link - get and check the next link in the chain.
 * On entry linkkey/*linksizep hold the key of the link to fetch; a size
 * of 0 means there is nothing to fetch and NULL is returned.
 * The record is fetched into linkp if that is non-null (re-using the
 * space), else into malloc'd memory that the caller must free.
 * On return linkkey and *linksizep describe the following link
 * (*linksizep set to 0 if this was the last link, or on failure).
 * *cookiep must be NULL on the first call of a walk; it accumulates the
 * keys visited so far and is used to detect a corrupted chain whose "next"
 * key points back to an already visited link - such a record is treated
 * as the last link. Free the list with free_link_cookies() when done.
 * Return the link pointer or NULL if none. On a fetch or allocation
 * failure *errorp holds the error code.
 */
static linkinfo_ent *
get_next_link(struct db_list *dbp, char *linkkey, int *linksizep,
	linkinfo_ent *linkp, void **cookiep, int *errorp, char *msg)
{
	int	linksize, nextsize;
	char	*nextkey;
	linkinfo_ent *new_linkp;
	struct link_keys *lnp;

	linksize = *linksizep;
	if (linksize == 0)
		return (NULL);
	*linksizep = 0;
	new_linkp = fetch_record(dbp, linkkey, linksize, (void *)linkp,
			errorp, msg);
	if (new_linkp == NULL)
		return (NULL);

	/* Set linkkey to point to next record */
	nextsize = LN_NEXT_LEN(new_linkp);
	if (nextsize == 0)
		return (new_linkp);

	/* Add this key to the cookie list */
	if ((lnp = malloc(sizeof (struct link_keys))) == NULL) {
		/* report the failure so it is not mistaken for end of list */
		*errorp = errno;
		syslog(LOG_ERR, gettext("get_next_key: malloc error %s\n"),
			strerror(*errorp));
		if (linkp == NULL)	/* record was allocated by fetch */
			free(new_linkp);
		return (NULL);
	}
	(void) memcpy(lnp->lnkey, linkkey, linksize);
	lnp->lnsize = linksize;
	lnp->next = *(struct link_keys **)cookiep;
	*cookiep = (void *)lnp;

	/* Make sure record does not point to itself or other internal loops */
	nextkey = LN_NEXT(new_linkp);
	for (; lnp != NULL; lnp = lnp->next) {
		if ((nextsize == lnp->lnsize) && (memcmp(
			lnp->lnkey, nextkey, nextsize) == 0)) {

			/*
			 * XXX This entry's next pointer points to
			 * itself. This is only a work-around, remove
			 * this check once bug 4203186 is fixed.
			 */
			if (debug) {
				(void) fprintf(stderr,
				"%s: get_next_link: last record invalid.\n",
					msg);
				debug_print_key_and_data(stderr, msg,
					"invalid rec:\n ", linkkey, linksize,
					(char *)new_linkp, new_linkp->reclen);
			}
			/* Return as if this is the last link */
			return (new_linkp);
		}
	}
	(void) memcpy(linkkey, nextkey, nextsize);
	*linksizep = nextsize;
	return (new_linkp);
}

/*
 * free_link_cookies - release every node of the visited-key list built up
 * by get_next_link(). A NULL cookie is an empty list.
 */
static void
free_link_cookies(void *cookie)
{
	struct link_keys *cur, *following;

	for (cur = (struct link_keys *)cookie; cur != NULL; cur = following) {
		/* pick up the successor before the node goes away */
		following = cur->next;
		free(cur);
	}
}

/*
 * add_mc_path - add a mc link to a file that has other links. Add it at end
 * of linked list. Called when it's known there are other links.
 * fhrecp is the primary record of the file, (dfh, name) is the key of the
 * new link and linkp is its partially filled record; this function
 * completes the record's prev key and reclen, stores it, and then rewrites
 * the old last link so that its next key refers to the new link.
 * *errorp receives the result of the fetch/store operations. If the last
 * link of the chain cannot be obtained, nothing is stored.
 */
static void
add_mc_path(struct db_list *dbp, fhandle_t *dfh, char *name,
	fhlist_ent *fhrecp, linkinfo_ent *linkp, int *errorp)
{
	fh_secondary_key	linkkey;
	int			linksize, len;
	linkinfo_ent		lastlink, *lastlinkp;
	void			*cookie;

	/* Start at the link recorded in the primary record and walk to end */
	linksize = fill_link_key(linkkey, &fhrecp->dfh, fhrecp->name);
	cookie = NULL;
	do {
		/* each record is fetched into lastlink, re-using the space */
		lastlinkp = get_next_link(dbp, linkkey, &linksize, &lastlink,
				&cookie, errorp, "add_mc_path");
	} while (linksize > 0);
	free_link_cookies(cookie);
	/* reached end of list */
	if (lastlinkp == NULL) {
		/* nothing to do */
		if (debug > 1) {
			(void) fprintf(stderr, "add_mc_path link is null\n");
		}
		return;
	}
	/* Add new link after last link */
	/*
	 * next - link key for the next in the list - add at end so null.
	 * prev - link key for the previous link in the list.
	 */
	/* empty next key: prev starts right where next would have started */
	linkp->prev_offset = linkp->next_offset;	/* aligned */
	linksize = fill_link_key(LN_PREV(linkp), &lastlinkp->dfh,
				LN_NAME(lastlinkp));
	linkp->reclen = linkp->prev_offset + linksize;	/* aligned */

	/* Add the link information to the database */
	linksize = fill_link_key(linkkey, dfh, name);
	*errorp = store_record(dbp, linkkey, linksize,
			linkp, linkp->reclen, "add_mc_path");
	if (*errorp != 0)
		return;

	/* Now update previous last link to point forward to new link */
	/* Copy prev link out since it's going to be overwritten */
	linksize = LN_PREV_LEN(lastlinkp);
	(void) memcpy(linkkey, LN_PREV(lastlinkp), linksize);
	/* Update previous last link to point to new one */
	len = fill_link_key(LN_NEXT(lastlinkp), dfh, name);
	/* the prev key now follows the newly written next key */
	lastlinkp->prev_offset = lastlinkp->next_offset + len;	/* aligned */
	(void) memcpy(LN_PREV(lastlinkp), linkkey, linksize);
	lastlinkp->reclen = lastlinkp->prev_offset + linksize;
	/* Update the link information to the database */
	/* linkkey is re-used once more, now for the old last link's own key */
	linksize = fill_link_key(linkkey, &lastlinkp->dfh, LN_NAME(lastlinkp));
	*errorp = store_record(dbp, linkkey, linksize,
			lastlinkp, lastlinkp->reclen, "add_mc_path prev");
}

/*
 * create_link_struct - create the secondary (link) struct and insert it
 * into the database.
 * (dfh,name) is the secondary key and fhrecp is the primary record for
 * the file.
 * The record is allocated zero-filled, so the flags start out clear and no
 * uninitialized bytes end up in the stored record.
 * Return the malloc'd link record if success (the caller must free it).
 * If failed, return NULL; *errorp then holds the error code.
 */
static linkinfo_ent *
create_link_struct(struct db_list *dbp, fhandle_t *dfh, char *name,
	fhlist_ent *fhrecp, int *errorp)
{
	fh_secondary_key	linkkey;
	int			len, linksize;
	linkinfo_ent		*linkp;

	if ((linkp = calloc(1, sizeof (linkinfo_ent))) == NULL) {
		*errorp = errno;
		syslog(LOG_ERR, gettext(
			"create_link_struct: malloc failed: Error %s"),
			strerror(*errorp));
		return (NULL);
	}
	/* record starts zeroed, so PUBLIC_PATH is the only flag to decide */
	if (dfh == &public_fh)
		linkp->flags = PUBLIC_PATH;
	else
		linkp->flags = 0;
	(void) memcpy(&linkp->dfh, dfh, sizeof (*dfh));
	linkp->mtime = time(0);
	linkp->atime = linkp->mtime;
	/* Calculate offsets of variable fields */
	/* fhkey - primary key (inode/gen) */
	/* name - component name (in directory dfh) */
	linkp->fhkey_offset = ROUNDUP32(offsetof(linkinfo_ent, varbuf));
	len = fill_link_key(LN_FHKEY(linkp), &fhrecp->fh, name);
	linkp->name_offset = linkp->fhkey_offset + fhrecp->fh.fh_len;
	linkp->next_offset = linkp->fhkey_offset + len;	/* aligned */
	/*
	 * next - link key for the next link in the list - NULL if it's
	 * the first link. If this is the public fs, only one link allowed.
	 * Avoid setting a multi-component path as primary path,
	 * unless no choice.
	 */
	len = 0;
	if (memcmp(&fhrecp->dfh, dfh, sizeof (*dfh)) ||
	    strcmp(fhrecp->name, name)) {
		/* different link than the one that's in the record */
		if (dfh == &public_fh) {
			/* parent is public fh - either multi-comp or root */
			if (memcmp(&fhrecp->fh, &public_fh,
				sizeof (public_fh))) {
				/* multi-comp path */
				add_mc_path(dbp, dfh, name, fhrecp, linkp,
						errorp);
				if (*errorp != 0) {
					free(linkp);
					return (NULL);
				}
				return (linkp);
			}
		} else {
			/* new link to a file with a different one already */
			len = fill_link_key(LN_NEXT(linkp), &fhrecp->dfh,
				fhrecp->name);
		}
	}
	/*
	 * prev - link key for the previous link in the list - since we
	 * always insert at the front of the list, it's always initially NULL.
	 */
	linkp->prev_offset = linkp->next_offset + len;	/* aligned */
	linkp->reclen = linkp->prev_offset;

	/* Add the link information to the database */
	linksize = fill_link_key(linkkey, dfh, name);
	*errorp = store_record(dbp, linkkey, linksize, linkp, linkp->reclen,
			"create_link_struct");
	if (*errorp != 0) {
		free(linkp);
		return (NULL);
	}
	return (linkp);
}

/*
 * db_add_secondary - add secondary record to the database (for the directory
 * information).
 * Assumes this is a new link, not yet in the database, and that the primary
 * record is already in.
 * If fhrecp is non-null, then fhrecp is the primary record and it is never
 * freed here. If fhrecp is NULL, the primary record is fetched here (keyed
 * by the data of fh) and every record obtained on the caller's behalf is
 * freed before returning.
 * If the newly created link has a non-empty "next" key, the primary record
 * is replaced through create_primary_struct() and the record named by that
 * "next" key gets its "prev" key set to the key of (dfh, name).
 * Database must be open when this function is called.
 * Return 0 if success, error code otherwise.
 */
static int
db_add_secondary(struct db_list *dbp, fhandle_t *dfh, char *name,
	fhandle_t *fh, fhlist_ent *fhrecp)
{
	int			nextsize, len, error;
	linkinfo_ent		nextlink, *newlinkp, *nextlinkp;
	uint_t			fhflags;
	char			*nextaddr;
	fhlist_ent		*new_fhrecp = fhrecp;
	fh_primary_key		fhkey;

	if (debug > 2)
		(void) printf("db_add_secondary entered: name '%s'\n", name);

	/* The primary key is the raw file handle data */
	bcopy(&fh->fh_data, fhkey, fh->fh_len);
	if (fhrecp == NULL) {
		/* Fetch the primary record */
		new_fhrecp = fetch_record(dbp, fhkey, fh->fh_len, NULL,
				&error, "db_add_secondary primary");
		if (new_fhrecp == NULL) {
			return (error);
		}
	}
	/*
	 * Update fhrec atime if needed.
	 * NOTE(review): the value stored in error here is overwritten by
	 * create_link_struct() below, so a failure is not reported.
	 */
	error = db_update_fhrec(dbp, fhkey, fh->fh_len, new_fhrecp,
			"db_add_secondary primary");
	/* Remember the flags: the fetched record may be freed just below */
	fhflags = new_fhrecp->flags;
	/* now create and insert the secondary record */
	newlinkp = create_link_struct(dbp, dfh, name, new_fhrecp, &error);
	if (fhrecp == NULL) {
		/* Record was fetched locally, so it is ours to release */
		free(new_fhrecp);
		new_fhrecp = NULL;
	}
	if (newlinkp == NULL) {
		if (debug > 2)
			(void) printf("create_link_struct '%s' Error %s\n",
				name, ((error >= 0) ? strerror(error) :
					"Unknown"));
		return (error);
	}
	nextsize = LN_NEXT_LEN(newlinkp);
	if (nextsize == 0) {
		/* No next - can exit now */
		if (debug > 2)
			(void) printf("db_add_secondary: no next link\n");
		free(newlinkp);
		return (0);
	}

	/*
	 * Update the linked list to point to new head: replace head of
	 * list in the primary record, then update previous secondary record
	 * to point to new head
	 */
	new_fhrecp = create_primary_struct(dbp, dfh, name, fh, fhflags,
			new_fhrecp, &error);
	if (new_fhrecp == NULL) {
		if (debug > 2)
			(void) printf(
				"db_add_secondary: replace primary failed\n");
		free(newlinkp);
		return (error);
	} else if (fhrecp == NULL) {
		/*
		 * NULL was passed as the record buffer above, so the
		 * returned record is released here rather than by the caller.
		 */
		free(new_fhrecp);
	}

	/*
	 * newlink is the new head of the list, with its "next" pointing to
	 * the old head, and its "prev" pointing to NULL. We now need to
	 * modify the "next" entry to have its "prev" point to the new entry.
	 */
	nextaddr = LN_NEXT(newlinkp);
	if (debug > 2) {
		debug_print_key(stderr, "db_add_secondary", "next key\n    ",
			nextaddr, nextsize);
	}
	/* Get the next link entry from the database (into local nextlink) */
	nextlinkp = fetch_record(dbp, nextaddr, nextsize, (void *)&nextlink,
			&error, "db_add_secondary next link");
	if (nextlinkp == NULL) {
		if (debug > 2)
			(void) printf(
				"db_add_secondary: fetch next link failed\n");
		free(newlinkp);
		return (error);
	}

	/*
	 * since the "prev" field is the only field to be changed, and it's
	 * the last in the link record, we only need to modify it (and reclen).
	 * Re-use link to update the next record.
	 */
	len = fill_link_key(LN_PREV(nextlinkp), dfh, name);
	nextlinkp->reclen = nextlinkp->prev_offset + len;
	/* nextaddr still points into newlinkp, so store before freeing it */
	error = store_record(dbp, nextaddr, nextsize, nextlinkp,
			nextlinkp->reclen, "db_add_secondary");
	if (debug > 2)
		(void) printf(
			"db_add_secondary exits(%d): name '%s'\n", error, name);
	free(newlinkp);
	return (error);
}

/*
 * update_next_link - rewrite the "prev" key of the link record stored
 * under nextkey so that it holds prevkey (prevsize bytes; 0 clears it).
 * Returns a malloc'ed copy of the updated record, which the caller must
 * free; the caller may need it if the primary record has to be updated.
 * Returns NULL with *errorp == 0 when there is no record under nextkey,
 * and NULL with *errorp set to an error code when the allocation or the
 * store fails.
 */
static linkinfo_ent *
update_next_link(struct db_list *dbp, char *nextkey, int nextsize,
	char *prevkey, int prevsize, int *errorp)
{
	linkinfo_ent	*buf;
	linkinfo_ent	*rec;

	buf = malloc(sizeof (linkinfo_ent));
	if (buf == NULL) {
		*errorp = errno;
		syslog(LOG_ERR, gettext(
			"update_next_link: malloc next Error %s"),
			strerror(*errorp));
		return (NULL);
	}

	rec = fetch_record(dbp, nextkey, nextsize, buf,
			errorp, "update next");
	if (rec == NULL) {
		/* A missing next record is not treated as an error */
		*errorp = 0;
		free(buf);
		return (NULL);
	}

	/* "prev" is the last field, so only reclen and the key itself change */
	rec->reclen = ROUNDUP32(rec->reclen - LN_PREV_LEN(rec) + prevsize);
	if (prevsize > 0)
		(void) memcpy(LN_PREV(rec), prevkey, prevsize);

	*errorp = store_record(dbp, nextkey, nextsize, rec,
			rec->reclen, "update_next");
	if (*errorp == 0)
		return (rec);

	free(rec);
	return (NULL);
}

/*
 * update_prev_link - rewrite the "next" key of the link record stored
 * under prevkey so that it holds nextkey (nextsize bytes; 0 clears it).
 * If the size of the "next" key changes, the "prev" key that follows it in
 * the record is shifted accordingly and reclen is adjusted.
 * Return 0 for success (including the case where no record exists under
 * prevkey), error code from store_record otherwise.
 */
static int
update_prev_link(struct db_list *dbp, char *nextkey, int nextsize,
	char *prevkey, int prevsize)
{
	linkinfo_ent	prevlink, *prevlinkp;
	int		diff, error;

	/* Update its next to the given one */
	prevlinkp = fetch_record(dbp, prevkey, prevsize, &prevlink, &error,
			"update prev");
	/* if there is no prev record there is nothing to update - ok */
	if (prevlinkp == NULL) {
		return (0);
	}
	diff = nextsize - LN_NEXT_LEN(prevlinkp);
	prevlinkp->reclen = ROUNDUP32(prevlinkp->reclen + diff);
	/* Change the len and set next - may push prev */
	if (diff != 0) {
		char	*ptr = LN_PREV(prevlinkp);	/* old location */

		prevlinkp->prev_offset += diff;
		/*
		 * Source and destination live in the same record and overlap
		 * whenever |diff| is smaller than the key being moved, so
		 * memmove (not memcpy) is required here.
		 */
		(void) memmove(LN_PREV(prevlinkp), ptr,
			LN_PREV_LEN(prevlinkp));
	}
	if (nextsize > 0) {
		(void) memcpy(LN_NEXT(prevlinkp), nextkey, nextsize);
	}
	/* Store updated record */
	error = store_record(dbp, prevkey, prevsize, prevlinkp,
			prevlinkp->reclen, "update_prev");
	return (error);
}

/*
 * update_linked_list - splice a link out of a file's list of hard links:
 * the record under nextkey gets prevkey as its "prev", and the record under
 * prevkey gets nextkey as its "next". Either key may be absent (size <= 0),
 * in which case that side is skipped.
 * Returns the malloc'ed, updated "next" record as produced by
 * update_next_link() (caller frees), or NULL. *errorp is 0 on success and
 * an error code on failure; on failure the return value is always NULL.
 */
static linkinfo_ent *
update_linked_list(struct db_list *dbp, char *nextkey, int nextsize,
	char *prevkey, int prevsize, int *errorp)
{
	linkinfo_ent	*next_rec = NULL;

	*errorp = 0;

	if (nextsize > 0) {
		next_rec = update_next_link(dbp, nextkey, nextsize,
				prevkey, prevsize, errorp);
		/* NULL with no error just means there was no next link */
		if ((next_rec == NULL) && (*errorp != 0)) {
			if (debug > 1) {
				const char *why = (*errorp >= 0) ?
					strerror(*errorp) : "Unknown";

				(void) fprintf(stderr,
					"update_next_link Error %s\n", why);
			}
			return (NULL);
		}
	}

	if (prevsize <= 0)
		return (next_rec);

	*errorp = update_prev_link(dbp, nextkey, nextsize,
			prevkey, prevsize);
	if (*errorp == 0)
		return (next_rec);

	if (debug > 1) {
		const char *why = (*errorp >= 0) ?
			strerror(*errorp) : "Unknown";

		(void) fprintf(stderr, "update_prev_link Error %s\n", why);
	}
	/* Do not hand back a record when the list update failed */
	free(next_rec);
	return (NULL);
}

/*
 * db_update_primary_new_head - fix up the primary record fhrecp after the
 * link dellinkp has been deleted.
 * If the primary record does not name dellinkp (same dfh and name), nothing
 * is done. Otherwise, when nextlinkp is NULL the deleted link was the last
 * one and the primary record is deleted; when nextlinkp is non-NULL the
 * primary record is rewritten to name that link, its mtime and atime are
 * set to the current time, and it is stored back.
 * Return 0 for success, error code from store_record otherwise.
 */
static int
db_update_primary_new_head(struct db_list *dbp, linkinfo_ent *dellinkp,
	linkinfo_ent *nextlinkp, fhlist_ent *fhrecp)
{
	fhandle_t		*del_dfh = &dellinkp->dfh;
	char			*del_name = LN_NAME(dellinkp);
	char			*head_name;
	fh_primary_key		fhkey;
	int			is_head;

	is_head = (memcmp(&fhrecp->dfh, del_dfh, sizeof (*del_dfh)) == 0) &&
	    (strcmp(fhrecp->name, del_name) == 0);
	if (!is_head) {
		/* should never be here... */
		if (debug > 1) {
			(void) fprintf(stderr,
				"db_update_primary_new_head: primary is for [%s,",
				del_name);
			debug_opaque_print(stderr, (void *)del_dfh,
				sizeof (*del_dfh));
			(void) fprintf(stderr, "], not [%s,", fhrecp->name);
			debug_opaque_print(stderr, (void *)&fhrecp->dfh,
				sizeof (fhrecp->dfh));
			(void) fprintf(stderr, "]\n");
		}
		return (0);
	}

	/* Key of the primary record is the file handle data */
	bcopy(&fhrecp->fh.fh_data, fhkey, fhrecp->fh.fh_len);

	if (nextlinkp == NULL) {
		/* That was the last link: drop the primary record too */
		(void) delete_record(dbp,
			fhkey, fhrecp->fh.fh_len,
			"db_update_primary_new_head: fh delete");
		return (0);
	}

	/* Other links remain: make the next one the head of the list */
	head_name = LN_NAME(nextlinkp);
	fhrecp->reclen = ROUNDUP32(offsetof(fhlist_ent, name) +
				strlen(head_name) + 1);
	(void) memcpy(&fhrecp->dfh, &nextlinkp->dfh,
		sizeof (nextlinkp->dfh));
	(void) strcpy(fhrecp->name, head_name);

	fhrecp->atime = fhrecp->mtime = time(0);
	return (store_record(dbp,
			fhkey, fhrecp->fh.fh_len, fhrecp,
			fhrecp->reclen, "db_update_primary_new_head: fh"));
}

/*
 * Exported functions
 */

/*
 * db_add - add the file (dfh, name) -> fh to the database: first the
 * primary record (via db_add_primary), then the secondary/link record
 * (via db_add_secondary). If fh is NULL nothing is added and EINVAL is
 * returned. When fh is the public file handle, every database returned by
 * db_get_all_databases() is updated, and an existing public entry with a
 * different name is deleted and re-added rather than linked.
 * Return 0 for success, error code otherwise (the value reflects the last
 * operation performed).
 */
int
db_add(char *fhpath, fhandle_t *dfh, char *name, fhandle_t *fh, uint_t flags)
{
	struct db_list	*dbp;
	fhlist_ent	rec_buf, *primary;
	int		error = 0;
	int		is_public;

	if (fh == NULL) {
		/* nothing to add */
		return (EINVAL);
	}

	is_public = (fh == &public_fh);
	dbp = is_public ? db_get_all_databases(fhpath, FALSE) :
	    db_get_db(fhpath, &fh->fh_fsid, &error, O_CREAT);

	/* One pass for a regular fh; one pass per database for the public fh */
	for (; dbp != NULL; dbp = (is_public ? dbp->next : NULL)) {
		if (debug > 3) {
			(void) printf("db_add: name '%s', db '%s'\n",
				name, dbp->path);
		}

		primary = db_add_primary(dbp, dfh, name, fh, flags,
				&rec_buf, &error);
		if (primary == NULL)
			continue;

		if ((dfh == NULL) || (name == NULL)) {
			/* Can't add link information */
			syslog(LOG_ERR, gettext(
				"db_add: dfh %p, name %p - invalid"),
				(void *)dfh, (void *)name);
			error = EINVAL;
			continue;
		}

		if (is_public) {
			/* Replace the public fh rather than add link */
			while ((primary != NULL) &&
			    (strcmp(name, primary->name) != 0)) {
				error = db_delete_link(fhpath, dfh,
						primary->name);
				primary = db_add_primary(dbp, dfh, name, fh,
						flags, &rec_buf, &error);
			}
			if (primary == NULL)
				continue;
		}

		error = db_add_secondary(dbp, dfh, name, fh, primary);
		/* Only release the record if it is not our stack buffer */
		if (primary != &rec_buf)
			free(primary);
	}
	return (error);
}

/*
 * db_lookup - search the database for the file identified by fh.
 * fhrecp is handed to fetch_record as the record buffer. Returns the record
 * pointer from fetch_record if found, or NULL otherwise. *errorp is set to
 * EINVAL for NULL fhpath/fh, to the lookup error when the record or the
 * database is not available, and to the result of db_update_fhrec (the
 * atime refresh) when the record was found.
 */
fhlist_ent *
db_lookup(char *fhpath, fhandle_t *fh, fhlist_ent *fhrecp, int *errorp)
{
	struct db_list	*dbp;
	fh_primary_key	fhkey;
	fhlist_ent	*found;

	if (errorp == NULL)
		return (NULL);
	if ((fhpath == NULL) || (fh == NULL)) {
		*errorp = EINVAL;
		return (NULL);
	}

	*errorp = 0;
	dbp = (fh == &public_fh) ? db_get_all_databases(fhpath, FALSE) :
	    db_get_db(fhpath, &fh->fh_fsid, errorp, O_CREAT);
	if (dbp == NULL) {
		/* Could not get or create database */
		return (NULL);
	}

	/* Primary records are keyed by the file handle data */
	(void) memcpy(fhkey, &fh->fh_data, fh->fh_len);
	found = fetch_record(dbp, fhkey, fh->fh_len, fhrecp,
			errorp, "db_lookup");
	if (found == NULL)
		return (NULL);

	/* Update fhrec atime if needed */
	*errorp = db_update_fhrec(dbp, fhkey, fh->fh_len, found,
			"db_lookup");
	return (found);
}

/*
 * db_lookup_link - search the database for the file identified by (dfh,name).
 * If the link was found, use the file handle key stored in it to search for
 * the primary record; fhrecp is handed to fetch_record as the record buffer.
 * The access times of the primary record and of the link are refreshed via
 * db_update_fhrec / db_update_linkinfo.
 * Returns the primary record if found, NULL otherwise. *errorp is set to
 * EINVAL for NULL fhpath/dfh/name, and otherwise holds the first error
 * encountered (0 if none), so a NULL return caused by a failed primary
 * fetch is never reported with *errorp == 0.
 */
fhlist_ent *
db_lookup_link(char *fhpath, fhandle_t *dfh, char *name, fhlist_ent *fhrecp,
	int *errorp)
{
	struct db_list		*dbp;
	fh_secondary_key	linkkey;
	linkinfo_ent		*linkp;
	int			linksize, fhkeysize, linkerr;
	char			*fhkey;

	if ((fhpath == NULL) || (dfh == NULL) || (name == NULL) ||
		(errorp == NULL)) {
		if (errorp != NULL)
			*errorp = EINVAL;
		return (NULL);
	}
	*errorp = 0;
	if (dfh == &public_fh) {
		dbp = db_get_all_databases(fhpath, FALSE);
	} else {
		dbp = db_get_db(fhpath, &dfh->fh_fsid, errorp, O_CREAT);
	}
	if (dbp == NULL) {
		/* Could not get or create database */
		return (NULL);
	}
	/* Get the link record */
	linksize = fill_link_key(linkkey, dfh, name);
	linkp = fetch_record(dbp, linkkey, linksize, NULL, errorp,
			"db_lookup_link link");
	if (linkp == NULL) {
		/* No such link; *errorp is whatever fetch_record reported */
		return (NULL);
	}

	/* Now use link to search for fh entry */
	fhkeysize = LN_FHKEY_LEN(linkp);
	fhkey = LN_FHKEY(linkp);
	fhrecp = fetch_record(dbp, fhkey, fhkeysize,
			(void *)fhrecp, errorp, "db_lookup_link fh");
	/* Update fhrec atime if needed */
	if (fhrecp != NULL) {
		*errorp = db_update_fhrec(dbp, fhkey, fhkeysize, fhrecp,
			"db_lookup_link fhrec");
	}
	/*
	 * Update link atime if needed. Keep an earlier error (failed primary
	 * fetch or fhrec update) instead of overwriting it with this result.
	 */
	linkerr = db_update_linkinfo(dbp, linkkey, linksize, linkp,
		"db_lookup_link link");
	if (*errorp == 0)
		*errorp = linkerr;
	free(linkp);
	return (fhrecp);
}

/*
 * delete_link_by_key - delete the link stored under linkkey (*linksizep
 * bytes) from the database. If it's the last link in the database for that
 * file then remove the primary record as well.
 * On return *linksizep holds the size of the deleted link's "next" key and,
 * if that size is > 0, the "next" key itself has been copied into linkkey,
 * so the caller can keep walking the chain; linkkey must therefore be
 * writable. *linksizep is set to 0 when the link was not found or when
 * updating the neighbouring links failed.
 * *errorp contains the returned error code.
 * Return ENOENT if link not in database and 0 otherwise.
 */
static int
delete_link_by_key(struct db_list *dbp, char *linkkey, int *linksizep,
	int *errorp, char *errstr)
{
	int			nextsize, prevsize, fhkeysize, linksize;
	char			*nextkey, *prevkey, *fhkey;
	linkinfo_ent		*dellinkp, *nextlinkp;
	fhlist_ent		*fhrecp, fhrec;

	*errorp = 0;
	linksize = *linksizep;
	/* Get the link record */
	dellinkp = fetch_record(dbp, linkkey, linksize, NULL, errorp, errstr);
	if (dellinkp == NULL) {
		/*
		 * Link not in database.
		 */
		if (debug > 2) {
			debug_print_key(stderr, errstr,
				"link not in database\n",
				linkkey, linksize);
		}
		*linksizep = 0;
		return (ENOENT);
	}
	/*
	 * Possibilities:
	 * 1. Normal case - only one link to delete: the link next and
	 *    prev should be NULL, and fhrec's name/dfh are same
	 *    as the link. Remove the link and fhrec.
	 * 2. Multiple hard links, and the deleted link is the head of
	 *    the list. Remove the link and replace the link key in
	 *    the primary record to point to the new head.
	 * 3. Multiple hard links, and the deleted link is not the
	 *    head of the list (not the same as in fhrec) - just
	 *    delete the link and update the previous and next records
	 *    in the links linked list.
	 */

	/*
	 * Get next and prev keys for linked list updates. Both pointers
	 * refer to storage inside dellinkp, so dellinkp must stay allocated
	 * for as long as they are used.
	 */
	nextsize = LN_NEXT_LEN(dellinkp);
	nextkey = ((nextsize > 0) ? LN_NEXT(dellinkp) : NULL);
	prevsize = LN_PREV_LEN(dellinkp);
	prevkey = ((prevsize > 0) ? LN_PREV(dellinkp) : NULL);
	/* Update the linked list for the file */
	nextlinkp = update_linked_list(dbp, nextkey, nextsize,
			prevkey, prevsize, errorp);
	if ((nextlinkp == NULL) && (*errorp != 0)) {
		/* The failure is reported through *errorp; return stays 0 */
		free(dellinkp);
		*linksizep = 0;
		return (0);
	}
	/* Delete link record */
	*errorp = delete_record(dbp, linkkey, linksize, errstr);
	/*
	 * Get the primary key.
	 * NOTE(review): errorp is also passed to fetch_record below, so the
	 * delete_record result above may be overwritten - confirm against
	 * fetch_record.
	 */
	fhkeysize = LN_FHKEY_LEN(dellinkp);
	fhkey = LN_FHKEY(dellinkp);
	fhrecp = fetch_record(dbp, fhkey, fhkeysize,
		&fhrec, errorp, errstr);
	if (fhrecp == NULL) {
		/* Should never happen */
		if (debug > 1) {
			debug_print_key(stderr, errstr,
				"fetch primary for ", linkkey, linksize);
			(void) fprintf(stderr, " Error %s\n",
			((*errorp >= 0) ? strerror(*errorp) : "Unknown"));
		}
	} else if ((*errorp == 0) && (prevsize <= 0)) {
		/* This is the head of the list update primary record */
		*errorp = db_update_primary_new_head(dbp, dellinkp,
				nextlinkp, fhrecp);
	} else {
		/* Update fhrec atime if needed */
		*errorp = db_update_fhrec(dbp, fhkey, fhkeysize, fhrecp,
				errstr);
	}
	/* Hand the key of the next link back to the caller */
	*linksizep = nextsize;
	if (nextsize > 0)
		(void) memcpy(linkkey, nextkey, nextsize);
	if (nextlinkp != NULL)
		free(nextlinkp);
	/* nextkey/prevkey/fhkey are no longer needed past this point */
	free(dellinkp);
	return (0);
}

/*
 * delete_link - delete the link (dfh, name) from the database by building
 * its key and calling delete_link_by_key(), which also removes the primary
 * record when this was the last link. If both nextlinkkey and nextlinksizep
 * are non-null they are used as the key buffer, so on return they carry the
 * key and key size of the next link in the chain (this would save a
 * dbm_fetch op); otherwise a local scratch key is used.
 * Return ENOENT if link not in database and 0 otherwise, with *errorp
 * containing the returned error if any from the delete_link ops.
 */
static int
delete_link(struct db_list *dbp, fhandle_t *dfh, char *name,
	char *nextlinkkey, int *nextlinksizep, int *errorp, char *errstr)
{
	fh_secondary_key	scratch_key;
	int			scratch_size;
	char			*key = scratch_key;
	int			*sizep = &scratch_size;

	*errorp = 0;

	/* Use the caller's buffers only when both were supplied */
	if ((nextlinkkey != NULL) && (nextlinksizep != NULL)) {
		key = nextlinkkey;
		sizep = nextlinksizep;
	}

	*sizep = fill_link_key(key, dfh, name);
	return (delete_link_by_key(dbp, key, sizep, errorp, errstr));
}

/*
 * db_delete_link - delete the link (dfh, name) from the database of its
 * file system, or from every database returned by db_get_all_databases()
 * when dfh is the public file handle. The per-database work is done by
 * delete_link(): if this is the "primary" link the primary record is set
 * for the next link, and if it's the last one the primary record is deleted.
 * Return EINVAL for NULL arguments; otherwise the error left by the last
 * operation (0 for success).
 */
int
db_delete_link(char *fhpath, fhandle_t *dfh, char *name)
{
	struct db_list	*dbp;
	int		error = 0;
	int		all_dbs;

	if ((fhpath == NULL) || (dfh == NULL) || (name == NULL)) {
		return (EINVAL);
	}

	all_dbs = (dfh == &public_fh);
	dbp = all_dbs ? db_get_all_databases(fhpath, TRUE) :
	    db_get_db(fhpath, &dfh->fh_fsid, &error, O_CREAT);

	while (dbp != NULL) {
		/* ENOENT ("link not there") is deliberately not reported */
		(void) delete_link(dbp, dfh, name, NULL, NULL, &error,
			"db_delete_link link");
		dbp = all_dbs ? dbp->next : NULL;
	}
	return (error);
}

#ifdef DEBUG
/*
 * db_delete - Deletes the fhrec corresponding to the fh. Use only
 * for repairing the fhtable, not for normal handling.
 * For the public file handle the record is deleted from every database
 * returned by db_get_all_databases().
 * Return EINVAL for NULL arguments; otherwise the error left by db_get_db
 * (0 for success). The result of delete_record itself is not reported.
 */
int
db_delete(char *fhpath, fhandle_t *fh)
{
	struct db_list	*dbp;
	int		error = 0;
	int		all_dbs;

	if ((fhpath == NULL) || (fh == NULL)) {
		return (EINVAL);
	}

	all_dbs = (fh == &public_fh);
	dbp = all_dbs ? db_get_all_databases(fhpath, TRUE) :
	    db_get_db(fhpath, &fh->fh_fsid, &error, O_CREAT);

	while (dbp != NULL) {
		/* Remove the primary record keyed by the file handle data */
		(void) delete_record(dbp, &fh->fh_data, fh->fh_len,
			"db_delete: fh delete");
		dbp = all_dbs ? dbp->next : NULL;
	}
	return (error);
}
#endif  /* DEBUG */

/*
 * db_rename_link - rename the link (from_dfh, from_name) to
 * (to_dfh, to_name): look up the existing link to obtain its file handle
 * and flags, delete the old link, then add the new one with db_add().
 * Return EINVAL for NULL arguments, 0 for success (also when there is no
 * database to search), error code otherwise.
 */
int
db_rename_link(char *fhpath, fhandle_t *from_dfh, char *from_name,
	fhandle_t *to_dfh, char *to_name)
{
	/*
	 * Must start at 0: on the public fh path nothing sets it when
	 * db_get_all_databases() returns an empty list.
	 */
	int			error = 0;
	struct db_list		*dbp;
	fhlist_ent		fhrec, *fhrecp;

	if ((fhpath == NULL) || (from_dfh == NULL) || (from_name == NULL) ||
		(to_dfh == NULL) || (to_name == NULL)) {
		return (EINVAL);
	}
	if (from_dfh == &public_fh) {
		dbp = db_get_all_databases(fhpath, FALSE);
	} else {
		dbp = db_get_db(fhpath, &from_dfh->fh_fsid, &error, O_CREAT);
	}
	for (; dbp != NULL;
		dbp = ((from_dfh != &public_fh) ? NULL : dbp->next)) {
		/* find existing link (fhrec is the caller-side buffer) */
		fhrecp = db_lookup_link(fhpath, from_dfh, from_name, &fhrec,
				&error);
		if (fhrecp == NULL) {
			/* Could not find the link */
			continue;
		}
		/* Delete the old link (if last primary record not deleted) */
		error = db_delete_link(fhpath, from_dfh, from_name);
		if (error == 0) {
			/* Re-add under the new name with the same fh/flags */
			error = db_add(fhpath, to_dfh, to_name, &fhrecp->fh,
					fhrecp->flags);
		}
	}
	return (error);
}

/*
 * db_print_all_keys: prints all keys for a given filesystem. If fsidp is
 * NULL, print for all filesystems covered by fhpath (databases whose path
 * starts with fhpath). When the global debug level is 2 or more, the record
 * stored under each key is printed as well: keys of NFS_FHMAXDATA bytes are
 * printed as fhlist records, longer keys as link records.
 * Nothing is printed if fhpath is NULL, if fsidp is the public file
 * handle's fsid, or if no database can be obtained.
 */
void
db_print_all_keys(char *fhpath, fsid_t *fsidp, FILE *fp)
{
	struct db_list	*dbp;
	datum		key;
	int		error, len;
	char		strkey[NFS_FHMAXDATA + MAXNAMELEN];
	db_record	rec;
	void		*ptr;

	if ((fhpath == NULL) ||
	    ((fsidp != NULL) && (fsidp == &public_fh.fh_fsid)))
		return;
	if (fsidp == NULL) {
		(void) db_get_all_databases(fhpath, TRUE);
		dbp = db_fs_list;
	} else {
		dbp = db_get_db(fhpath, fsidp, &error, 0);
	}
	if (dbp == NULL) {
		/* Could not get or create database */
		return;
	}
	len = strlen(fhpath);
	for (; dbp != NULL; dbp = ((fsidp != NULL) ? NULL : dbp->next)) {
		/* Skip databases that do not live under fhpath */
		if (strncmp(fhpath, dbp->path, len))
			continue;
		(void) fprintf(fp,
			"\nStart print database for fsid 0x%x 0x%x\n",
			dbp->fsid.val[0], dbp->fsid.val[1]);
		(void) fprintf(fp, "=============================\n");
		for (key = dbm_firstkey(dbp->db); key.dptr != NULL;
			key = dbm_nextkey(dbp->db)) {
			size_t	klen = (size_t)key.dsize;

			/*
			 * The key size comes from the database file; never
			 * copy more than the local buffer can hold. An
			 * oversized key is printed truncated.
			 */
			if (klen > sizeof (strkey))
				klen = sizeof (strkey);
			(void) memcpy(strkey, key.dptr, klen);
			debug_print_key(fp, "", "", strkey, (int)klen);
			if (debug < 2)
				continue;
			ptr = fetch_record(dbp, key.dptr, key.dsize,
					(void *)&rec, &error, "db_prt_keys");
			if (ptr == NULL)
				continue;
			if (key.dsize == NFS_FHMAXDATA) {
				/* fhrec */
				debug_print_fhlist(fp, &rec.fhlist_rec);
			} else if (key.dsize > NFS_FHMAXDATA) {
				/* linkinfo */
				debug_print_linkinfo(fp, &rec.link_rec);
			}
			(void) fprintf(fp, "-----------------------------\n");
		}
		(void) fprintf(fp, "End print database for fsid 0x%x 0x%x\n",
			dbp->fsid.val[0], dbp->fsid.val[1]);
	}
}

/*
 * debug_opaque_print - format size bytes at buf with
 * nfslog_opaque_print_buf() and write the resulting text to fp.
 * Does nothing if buf is NULL or size is not positive.
 */
void
debug_opaque_print(FILE *fp, void *buf, int size)
{
	int		bufoffset = 0;
	char		debug_str[200];

	if ((buf == NULL) || (size <= 0))
		return;

	/* Guarantee a terminated string even if nothing gets formatted */
	debug_str[0] = '\0';
	nfslog_opaque_print_buf(buf, size, debug_str, &bufoffset,
		(int)sizeof (debug_str));
	/*
	 * The formatted text is data, not a format: emit it with fputs so
	 * that a '%' in it can never be interpreted as a conversion.
	 */
	(void) fputs(debug_str, fp);
}

/*
 * links_timedout() walks the links of the primary record pfe, starting with
 * the link (pfe->dfh, pfe->name) and following get_next_link(), and checks
 * whether each was accessed within 'prune_timeout' seconds of ts.
 * Returns TRUE if no link is that recent. As soon as one recent link is
 * found it returns FALSE; before doing so it re-fetches the primary record
 * and, if the link's atime is newer than the record's, copies the link's
 * atime into the record and stores it.
 */
static int
links_timedout(struct db_list *pdb, fhlist_ent *pfe, time_t ts)
{
	fh_secondary_key	linkkey;
	linkinfo_ent		link_st;
	linkinfo_ent		*linkp;
	fhlist_ent		*fresh;
	void			*cookie = NULL;
	int			all_old = TRUE;
	int			linksize;
	int			error;

	/* Start from the link named in the primary record */
	linksize = fill_link_key(linkkey, &pfe->dfh, pfe->name);

	for (;;) {
		linkp = get_next_link(pdb, linkkey, &linksize, &link_st,
				&cookie, &error, "links_timedout");
		if ((linkp != NULL) &&
		    (difftime(ts, linkp->atime) <= prune_timeout)) {
			/* A recently used link: the file must be kept */
			all_old = FALSE;

			/* update primary record to have an uptodate time */
			fresh = fetch_record(pdb, (void *)&pfe->fh.fh_data,
					pfe->fh.fh_len, NULL, &error,
					"links_timedout");
			if (fresh == NULL) {
				syslog(LOG_ERR, gettext(
				"links_timedout: fetch fhrec error %s\n"),
				strerror(error));
				break;
			}
			if (difftime(fresh->atime, linkp->atime) < 0) {
				/* update fhrec atime */
				fresh->atime = linkp->atime;
				(void) store_record(pdb,
					(void *)&fresh->fh.fh_data,
					fresh->fh.fh_len, fresh,
					fresh->reclen, "links_timedout");
			}
			free(fresh);
			break;
		}
		if (linksize <= 0)
			break;	/* end of the chain */
	}

	free_link_cookies(cookie);
	return (all_old);
}

/*
 * prune_dbs() will search all of the open databases looking for records
 * that have not been accessed in the last 'prune_timeout' seconds.
 * This search is done on the primary records and a list of potential
 * timeout candidates is built.  The reason for doing this is to not
 * disturb the underlying dbm_firstkey()/dbm_nextkey() sequence; we
 * want to search all of the records in the database.
 * Once we have our candidate list built, we examine each of those
 * item's links to check if the links have been accessed within the
 * 'prune_timeout' seconds.  If neither the primary nor any its links
 * have been accessed, then all of those records are removed/deleted
 * from the database.
 *
 * At most MAX_PRUNE_REC_CNT candidates are collected per scan; when that
 * limit stops a scan early, the batch is processed and the database is
 * scanned again from the first key. If a list node cannot be allocated,
 * the candidates collected so far are processed and scanning of that
 * database stops (instead of rescanning while memory is exhausted).
 *
 * If fhpath is non-NULL, db_get_all_databases() is called on it first.
 * Always returns 0.
 */
int
prune_dbs(char *fhpath)
{
	struct db_list		*pdb;
	datum			key;
	db_record		*ptr;
	struct fhlist_ent 	*pfe;
	int			error, linkerr, linksize;
	time_t			cur_time = time(0);
	fh_secondary_key	linkkey;
	struct thelist {
		struct thelist *next;
		db_record *ptr;
	} 			thelist, *ptl;
	int	cnt = 0;
	int	nomem;		/* set when a list node could not be allocated */

	if (fhpath != NULL)
		(void) db_get_all_databases(fhpath, TRUE);

	thelist.next = NULL;
	/*
	 * Search each of the open databases
	 */
	for (pdb = db_fs_list; pdb; pdb = pdb->next) {
		do {
			nomem = 0;
			/* Check each record in the database */
			for (key = dbm_firstkey(pdb->db); key.dptr != NULL;
			    key = dbm_nextkey(pdb->db)) {
				/* We're only interested in primary records */
				if (key.dsize != NFS_FHMAXDATA)
					continue; /* probably a link record */
				ptr = fetch_record(pdb, key.dptr, key.dsize,
						NULL, &error, "dump_db");
				if (ptr == NULL)
					continue;
				/*
				 * If this record is a primary record and it
				 * is not an export point or a public file
				 * handle path, check it for a ancient access
				 * time.
				 */
				if ((ptr->fhlist_rec.flags &
					    (EXPORT_POINT | PUBLIC_PATH)) ||
				    (difftime(cur_time,
					    ptr->fhlist_rec.atime) <=
						prune_timeout)) {
					/* Keep this record in the database */
					free(ptr);
					continue;
				}
				/* Found one?  Save off info about it */
				ptl = malloc(sizeof (struct thelist));
				if (ptl == NULL) {
					/* keep errno safe from gettext() */
					int	saved_errno = errno;

					syslog(LOG_ERR, gettext(
				"prune_dbs: malloc failed, error %s\n"),
						strerror(saved_errno));
					/* record is not on the list: free it */
					free(ptr);
					nomem = 1;
					break;
				}
				ptl->ptr = ptr;
				ptl->next = thelist.next;
				thelist.next = ptl;
				cnt++;	/* count how many records allocated */
				if (cnt > MAX_PRUNE_REC_CNT) {
					/* Limit number of records malloc'd */
					if (debug)
						(void) fprintf(stderr,
				"prune_dbs: halt search - too many records\n");
					break;
				}
			}

			/*
			 * Take the saved records and check their links to
			 * make sure that they have not been accessed as well.
			 */
			for (ptl = thelist.next; ptl; ptl = thelist.next) {
				thelist.next = ptl->next;
				/* Everything timed out? */
				pfe = &(ptl->ptr->fhlist_rec);
				if (links_timedout(pdb,	pfe, cur_time)) {

					/*
					 * Iterate until we run out of links.
					 * We have to do this since there can
					 * be multiple links to a primary
					 * record and we need to delete one at
					 * a time.
					 */
					/* Delete the link and get the next */
					linkerr = delete_link(pdb,
						&pfe->dfh, pfe->name, linkkey,
						&linksize, &error, "dump_db");
					while ((linksize > 0) &&
					    !(error || linkerr)) {
						/* Delete link, get the next */
						linkerr = delete_link_by_key(
							pdb, linkkey,
							&linksize,
							&error, "dump_db");
						if (error || linkerr) {
							break;
						}
					}
					if (linkerr) {
						/*
						 * link not in database,
						 * primary is.
						 * Should never happen
						 */
						if (debug > 1) {
							(void) fprintf(stderr,
					"prune_dbs: Error primary exists ");
							debug_opaque_print(
							    stderr,
							    (void *)&pfe->fh,
							    sizeof (pfe->fh));
							(void) fprintf(stderr,
							    "\n");
						}
						if (debug)
							syslog(LOG_ERR, gettext(
					"prune_dbs: Error primary exists\n"));
						(void) delete_record(pdb,
						    &pfe->fh.fh_data,
						    pfe->fh.fh_len,
						    "prune_dbs: fh delete");
					}
				}
				/* Free the pointers used in the list */
				free(ptl->ptr);
				free(ptl);
				cnt--;
			}
			thelist.next = NULL;
			/*
			 * key.dptr is non-NULL only if the scan above was
			 * cut short; rescan unless memory ran out.
			 */
		} while ((key.dptr != NULL) && !nomem);
	}
	return (0);
}