illumos-gate changeset 5312:55bbc18d74af
6588256 HSFS performance needs a boost
| author | mg147109 |
| --- | --- |
| date | Wed, 24 Oct 2007 07:14:17 -0700 |
| parents | 05e4ca164d13 |
| children | 870767513b2b |
| files | usr/src/uts/common/fs/hsfs/hsfs_node.c usr/src/uts/common/fs/hsfs/hsfs_subr.c usr/src/uts/common/fs/hsfs/hsfs_vfsops.c usr/src/uts/common/fs/hsfs/hsfs_vnops.c usr/src/uts/common/sys/fs/hsfs_node.h |
| diffstat | 5 files changed, 1347 insertions(+), 40 deletions(-) |
--- a/usr/src/uts/common/fs/hsfs/hsfs_node.c	Tue Oct 23 17:36:10 2007 -0700
+++ b/usr/src/uts/common/fs/hsfs/hsfs_node.c	Wed Oct 24 07:14:17 2007 -0700
@@ -609,6 +609,9 @@
 	hp->hs_dir_off = off;
 	hp->hs_nodeid = nodeid;
 	hp->hs_seq = 0;
+	hp->hs_prev_offset = 0;
+	hp->hs_num_contig = 0;
+	hp->hs_ra_bytes = 0;
 	hp->hs_flags = HREF;
 	if (off > HS_SECTOR_SIZE)
 		cmn_err(CE_WARN, "hs_makenode: bad offset");
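Editor's note: the hsfs_node.c hunk only zeroes three new per-vnode fields; the policy that drives them appears in the hsfs_getpage() changes further down. hs_prev_offset remembers where the previous read ended, hs_num_contig counts back-to-back sequential requests, and hs_ra_bytes is the readahead quantum that grows while the access pattern stays sequential and decays otherwise. A minimal userland sketch of that ramp-up/decay policy (the constants and the maximum quantum are illustrative stand-ins for the kernel tunables, not the kernel code itself):

```c
#include <stdio.h>
#include <sys/types.h>

#define PAGESIZE            8192          /* illustrative; the kernel uses the platform page size */
#define SEQ_CONTIG_REQUESTS 2             /* mirrors the seq_contig_requests tunable */
#define MAX_RA_BYTES        (PAGESIZE * 8)

#define ROUNDUP(x, y)   (((x) + ((y) - 1)) / (y) * (y))
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

/* Per-file readahead state, modeled on the new hsnode fields. */
struct ra_state {
	off_t prev_offset;   /* where the previous read ended (hs_prev_offset) */
	int   num_contig;    /* consecutive sequential requests (hs_num_contig) */
	int   ra_bytes;      /* current readahead quantum (hs_ra_bytes) */
};

/* Update the readahead state for a read of 'len' bytes at offset 'off'. */
static void
ra_update(struct ra_state *rs, off_t off, size_t len)
{
	if (rs->prev_offset == off ||
	    (off < rs->prev_offset &&
	    off + (off_t)MAX(len, PAGESIZE) >= rs->prev_offset)) {
		/* Sequential: ramp up once the pattern is established. */
		if (rs->num_contig < SEQ_CONTIG_REQUESTS - 1)
			rs->num_contig++;
		else if (rs->ra_bytes < MAX_RA_BYTES)
			rs->ra_bytes += PAGESIZE;
	} else {
		/* Random access: shrink the quantum before the counter. */
		if (rs->ra_bytes > 0)
			rs->ra_bytes -= PAGESIZE;
		if (rs->ra_bytes <= 0) {
			rs->ra_bytes = 0;
			if (rs->num_contig > 0)
				rs->num_contig--;
		}
	}
	rs->prev_offset = off + (off_t)ROUNDUP(len, PAGESIZE);
}

int
main(void)
{
	struct ra_state rs = { 0, 0, 0 };
	off_t off = 0;
	int i;

	/* Six sequential 8K reads: the quantum starts growing on the second. */
	for (i = 0; i < 6; i++) {
		ra_update(&rs, off, PAGESIZE);
		off += PAGESIZE;
		printf("req %d: ra_bytes=%d num_contig=%d\n",
		    i, rs.ra_bytes, rs.num_contig);
	}
	return (0);
}
```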
--- a/usr/src/uts/common/fs/hsfs/hsfs_subr.c Tue Oct 23 17:36:10 2007 -0700 +++ b/usr/src/uts/common/fs/hsfs/hsfs_subr.c Wed Oct 24 07:14:17 2007 -0700 @@ -57,6 +57,7 @@ #define THE_EPOCH 1970 #define END_OF_TIME 2099 +extern int hsfs_lostpage; #ifdef __STDC__ static time_t hs_date_to_gmtime(int year, int mon, int day, int gmtoff); @@ -127,7 +128,25 @@ 1, 0, }; +/* + * Local datatype for defining tables of (Offset, Name) pairs for + * kstats. + */ +typedef struct { + offset_t index; + char *name; +} hsfs_ksindex_t; +static const hsfs_ksindex_t hsfs_kstats[] = { + { 0, "mountpoint" }, + { 1, "pages_lost" }, + { 2, "physical_read_pages" }, + { 3, "cache_read_pages" }, + { 4, "readahead_pages" }, + { 5, "coalesced_pages" }, + { 6, "total_pages_requested" }, + {-1, NULL } +}; /* * hs_parse_dirdate @@ -320,3 +339,107 @@ fsp->hsfs_err_flags |= (1 << errtype); } + +/* + * Callback from kstat framework. Grab a snapshot of the current hsfs + * counters and populate the kstats. + */ +static int +hsfs_kstats_update(kstat_t *ksp, int flag) +{ + struct hsfs *fsp; + kstat_named_t *knp; + uint64_t pages_lost; + uint64_t physical_read_bytes; + uint64_t cache_read_pages; + uint64_t readahead_bytes; + uint64_t coalesced_bytes; + uint64_t total_pages_requested; + + if (flag != KSTAT_READ) + return (EACCES); + + fsp = ksp->ks_private; + knp = ksp->ks_data; + + mutex_enter(&(fsp->hqueue->strategy_lock)); + mutex_enter(&(fsp->hqueue->hsfs_queue_lock)); + + cache_read_pages = fsp->cache_read_pages; + pages_lost = hsfs_lostpage; + physical_read_bytes = fsp->physical_read_bytes; + readahead_bytes = fsp->readahead_bytes; + coalesced_bytes = fsp->coalesced_bytes; + total_pages_requested = fsp->total_pages_requested; + + mutex_exit(&(fsp->hqueue->strategy_lock)); + mutex_exit(&(fsp->hqueue->hsfs_queue_lock)); + + knp++; + (knp++)->value.ui64 = pages_lost; + (knp++)->value.ui64 = howmany(physical_read_bytes, PAGESIZE); + (knp++)->value.ui64 = cache_read_pages; + (knp++)->value.ui64 = howmany(readahead_bytes, PAGESIZE); + (knp++)->value.ui64 = howmany(coalesced_bytes, PAGESIZE); + (knp++)->value.ui64 = total_pages_requested; + + return (0); +} + +/* + * Initialize hsfs kstats, which are all name value pairs with + * values being various counters. 
+ */ +static kstat_t * +hsfs_setup_named_kstats(struct hsfs *fsp, int fsid, char *name, + const hsfs_ksindex_t *ksip, size_t size, int (*update)(kstat_t *, int)) +{ + kstat_t *ksp; + kstat_named_t *knp; + char *np; + char *mntpt = fsp->hsfs_fsmnt; + + size /= sizeof (hsfs_ksindex_t); + ksp = kstat_create("hsfs_fs", fsid, name, "hsfs", + KSTAT_TYPE_NAMED, size-1, KSTAT_FLAG_VIRTUAL); + if (ksp == NULL) + return (NULL); + + ksp->ks_data = kmem_alloc(sizeof (kstat_named_t) * size, KM_SLEEP); + ksp->ks_private = fsp; + ksp->ks_update = update; + ksp->ks_data_size += strlen(mntpt) + 1; + knp = ksp->ks_data; + kstat_named_init(knp, ksip->name, KSTAT_DATA_STRING); + kstat_named_setstr(knp, mntpt); + knp++; + ksip++; + + for (; (np = ksip->name) != NULL; ++knp, ++ksip) { + kstat_named_init(knp, np, KSTAT_DATA_UINT64); + } + kstat_install(ksp); + + return (ksp); +} + +void +hsfs_init_kstats(struct hsfs *fsp, int fsid) +{ + fsp->hsfs_kstats = hsfs_setup_named_kstats(fsp, fsid, "hsfs_read_stats", + hsfs_kstats, sizeof (hsfs_kstats), hsfs_kstats_update); +} + +void +hsfs_fini_kstats(struct hsfs *fsp) +{ + void *data; + + if (fsp->hsfs_kstats != NULL) { + data = fsp->hsfs_kstats->ks_data; + kstat_delete(fsp->hsfs_kstats); + kmem_free(data, (sizeof (hsfs_kstats)) / \ + (sizeof (hsfs_ksindex_t))); + } + fsp->hsfs_kstats = NULL; +}
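Editor's note: hsfs_setup_named_kstats() publishes one named kstat per mounted filesystem (module "hsfs_fs", instance fsid, name "hsfs_read_stats", class "hsfs"), with the mountpoint as a string statistic followed by the uint64 counters listed in the hsfs_kstats table above. A hedged libkstat consumer sketch that walks and prints every instance; the output format and error handling are illustrative, the statistic names come from the table:

```c
#include <kstat.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *knp;
	uint_t i;

	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}

	/* Walk the kstat chain looking for every mounted hsfs instance. */
	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		if (strcmp(ksp->ks_module, "hsfs_fs") != 0 ||
		    strcmp(ksp->ks_name, "hsfs_read_stats") != 0)
			continue;
		if (kstat_read(kc, ksp, NULL) == -1)
			continue;
		(void) printf("--- instance %d ---\n", ksp->ks_instance);
		knp = KSTAT_NAMED_PTR(ksp);
		for (i = 0; i < ksp->ks_ndata; i++, knp++) {
			if (knp->data_type == KSTAT_DATA_STRING)
				(void) printf("%-24s %s\n", knp->name,
				    KSTAT_NAMED_STR_PTR(knp));
			else
				(void) printf("%-24s %llu\n", knp->name,
				    (u_longlong_t)knp->value.ui64);
		}
	}
	(void) kstat_close(kc);
	return (0);
}
```

Compile with -lkstat. The same counters are also visible with the kstat(1M) command, e.g. `kstat -m hsfs_fs`.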
--- a/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c Tue Oct 23 17:36:10 2007 -0700 +++ b/usr/src/uts/common/fs/hsfs/hsfs_vfsops.c Wed Oct 24 07:14:17 2007 -0700 @@ -137,6 +137,12 @@ hsfs_options }; +/* + * Indicates whether to enable the I/O scheduling and readahead logic + * 1 - Enable, 0 - Do not Enable. + * Debugging purposes. + */ +int do_schedio = 1; static int hsfsfstype; static int hsfsinit(int, char *); @@ -158,6 +164,10 @@ char _depends_on[] = "fs/specfs"; +extern void hsched_init_caches(void); +extern void hsched_fini_caches(void); + + int _init(void) { @@ -185,6 +195,7 @@ vn_freevnodeops(hsfs_vnodeops); hs_fini_hsnode_cache(); + hsched_fini_caches(); return (0); } @@ -204,6 +215,12 @@ uid_t hsfs_default_uid = 0; gid_t hsfs_default_gid = 3; +extern void hsched_init(struct hsfs *fsp, int fsid, + struct modlinkage *modlinkage); +extern void hsched_fini(struct hsfs_queue *hqueue); +extern void hsfs_init_kstats(struct hsfs *fsp, int fsid); +extern void hsfs_fini_kstats(struct hsfs *fsp); + static int hsfs_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap, struct cred *cr); static int hsfs_unmount(struct vfs *vfsp, int, struct cred *cr); @@ -261,6 +278,7 @@ hsfsfstype = fstype; mutex_init(&hs_mounttab_lock, NULL, MUTEX_DEFAULT, NULL); hs_init_hsnode_cache(); + hsched_init_caches(); return (0); } @@ -388,6 +406,7 @@ mutex_exit(&hs_mounttab_lock); + hsfs_fini_kstats(fsp); (void) VOP_CLOSE(fsp->hsfs_devvp, FREAD, 1, (offset_t)0, cr); VN_RELE(fsp->hsfs_devvp); /* free path table space */ @@ -402,6 +421,9 @@ if (fsp->hsfs_fsmnt != NULL) kmem_free(fsp->hsfs_fsmnt, strlen(fsp->hsfs_fsmnt) + 1); + hsched_fini(fsp->hqueue); + kmem_free(fsp->hqueue, sizeof (struct hsfs_queue)); + mutex_destroy(&fsp->hsfs_free_lock); rw_destroy(&fsp->hsfs_hash_lock); @@ -888,6 +910,19 @@ */ fsp->hsfs_flags = mount_flags | (fsp->hsfs_flags & HSFSMNT_INODE); + /* + * Setup I/O Scheduling structures + */ + if (do_schedio) { + fsp->hqueue = kmem_alloc(sizeof (struct hsfs_queue), KM_SLEEP); + hsched_init(fsp, fsid, &modlinkage); + } + + /* + * Setup kstats + */ + hsfs_init_kstats(fsp, fsid); + DTRACE_PROBE1(mount__done, struct hsfs *, fsp); /*
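Editor's note: the vfsops changes wire everything up at mount time. When the do_schedio tunable is nonzero (the default), hsfs_mount() allocates an hsfs_queue, hsched_init() sizes it against the device, and hsfs_init_kstats() publishes the counters; hsfs_unmount() tears the same pieces down. hsched_init() (shown later in hsfs_vnops.c) learns the device limit with a DKIOCINFO ioctl issued through LDI, falling back to 16 KB if the query fails. The same query can be made from userland; a small illustrative sketch (the device path, O_NDELAY open, and privilege handling are assumptions, not part of the changeset):

```c
#include <sys/types.h>
#include <sys/param.h>
#include <sys/dkio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stropts.h>
#include <unistd.h>

/*
 * Print the maximum transfer size a driver reports for a raw device,
 * the same DKIOCINFO value hsched_init() uses to cap coalesced reads.
 * Needs privileges to open the raw device.
 */
int
main(int argc, char **argv)
{
	struct dk_cinfo info;
	int fd;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s /dev/rdsk/...\n", argv[0]);
		return (1);
	}
	/* O_NDELAY lets the open succeed on removable media drives. */
	if ((fd = open(argv[1], O_RDONLY | O_NDELAY)) == -1) {
		perror("open");
		return (1);
	}
	if (ioctl(fd, DKIOCINFO, &info) == -1) {
		perror("DKIOCINFO");	/* hsched_init() would fall back to 16K */
		(void) close(fd);
		return (1);
	}
	/* dki_maxtransfer is in DEV_BSIZE (512-byte) units. */
	(void) printf("max transfer: %u blocks (%u bytes)\n",
	    info.dki_maxtransfer, info.dki_maxtransfer * DEV_BSIZE);
	(void) close(fd);
	return (0);
}
```

Setting do_schedio to 0 (for example with an /etc/system entry, `set hsfs:do_schedio = 0`) disables the scheduling and readahead paths for subsequent mounts, which the comment above describes as a debugging aid.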
--- a/usr/src/uts/common/fs/hsfs/hsfs_vnops.c Tue Oct 23 17:36:10 2007 -0700 +++ b/usr/src/uts/common/fs/hsfs/hsfs_vnops.c Wed Oct 24 07:14:17 2007 -0700 @@ -62,6 +62,9 @@ #include <sys/fbuf.h> #include <sys/dirent.h> #include <sys/errno.h> +#include <sys/dkio.h> +#include <sys/cmn_err.h> +#include <sys/atomic.h> #include <vm/hat.h> #include <vm/page.h> @@ -74,6 +77,16 @@ #include <vm/rm.h> #include <vm/page.h> #include <sys/swap.h> +#include <sys/avl.h> +#include <sys/sunldi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sdt.h> + +/* + * For struct modlinkage + */ +#include <sys/modctl.h> #include <sys/fs/hsfs_spec.h> #include <sys/fs/hsfs_node.h> @@ -83,12 +96,64 @@ #include <fs/fs_subr.h> +/* # of contiguous requests to detect sequential access pattern */ +static int seq_contig_requests = 2; + +/* + * This is the max number os taskq threads that will be created + * if required. Since we are using a Dynamic TaskQ by default only + * one thread is created initially. + * + * NOTE: In the usual hsfs use case this per fs instance number + * of taskq threads should not place any undue load on a system. + * Even on an unusual system with say 100 CDROM drives, 800 threads + * will not be created unless all the drives are loaded and all + * of them are saturated with I/O at the same time! If there is at + * all a complaint of system load due to such an unusual case it + * should be easy enough to change to one per-machine Dynamic TaskQ + * for all hsfs mounts with a nthreads of say 32. + */ +static int hsfs_taskq_nthreads = 8; /* # of taskq threads per fs */ + +/* Min count of adjacent bufs that will avoid buf coalescing */ +static int hsched_coalesce_min = 2; + +/* + * Kmem caches for heavily used small allocations. Using these kmem + * caches provides a factor of 3 reduction in system time and greatly + * aids overall throughput esp. on SPARC. + */ +struct kmem_cache *hio_cache; +struct kmem_cache *hio_info_cache; + /* * This tunable allows us to ignore inode numbers from rrip-1.12. * In this case, we fall back to our default inode algorithm. */ extern int use_rrip_inodes; +/* + * Free behind logic from UFS to tame our thirst for + * the page cache. + * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more + * explanation. 
+ */ +static int freebehind = 1; +static int smallfile = 0; +static int cache_read_ahead = 0; +static u_offset_t smallfile64 = 32 * 1024; +#define SMALLFILE1_D 1000 +#define SMALLFILE2_D 10 +static u_offset_t smallfile1 = 32 * 1024; +static u_offset_t smallfile2 = 32 * 1024; +static clock_t smallfile_update = 0; /* when to recompute */ +static uint_t smallfile1_d = SMALLFILE1_D; +static uint_t smallfile2_d = SMALLFILE2_D; + +static int hsched_deadline_compare(const void *x1, const void *x2); +static int hsched_offset_compare(const void *x1, const void *x2); +static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra); +int hsched_invoke_strategy(struct hsfs *fsp); /* ARGSUSED */ static int @@ -108,6 +173,7 @@ int error; struct hsnode *hp; uint_t filesize; + int dofree; hp = VTOH(vp); /* @@ -174,6 +240,28 @@ return (0); } + /* + * Freebehind computation taken from: + * usr/src/uts/common/fs/ufs/ufs_vnops.c + */ + if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) { + uint64_t percpufreeb; + if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D; + if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D; + percpufreeb = ptob((uint64_t)freemem) / ncpus_online; + smallfile1 = percpufreeb / smallfile1_d; + smallfile2 = percpufreeb / smallfile2_d; + smallfile1 = MAX(smallfile1, smallfile); + smallfile1 = MAX(smallfile1, smallfile64); + smallfile2 = MAX(smallfile1, smallfile2); + smallfile_update = drv_hztousec(ddi_get_lbolt()) + + 1000000; + } + + dofree = freebehind && + hp->hs_prev_offset == uiop->uio_loffset && + hp->hs_ra_bytes > 0; + base = segmap_getmapflt(segkmap, vp, (u_offset_t)uiop->uio_loffset, n, 1, S_READ); @@ -189,6 +277,14 @@ flags = SM_DONTNEED; else flags = 0; + + if (dofree) { + flags = SM_FREE | SM_ASYNC; + if ((cache_read_ahead == 0) && + uiop->uio_loffset > smallfile2) + flags |= SM_DONTNEED; + } + error = segmap_release(segkmap, base, flags); } else (void) segmap_release(segkmap, base, 0); @@ -617,6 +713,323 @@ } /* + * The taskq thread that invokes the scheduling function to ensure + * that all readaheads are complete and cleans up the associated + * memory and releases the page lock. + */ +void +hsfs_ra_task(void *arg) +{ + struct hio_info *info = arg; + uint_t count; + struct buf *wbuf; + + ASSERT(info->pp != NULL); + + for (count = 0; count < info->bufsused; count++) { + wbuf = &(info->bufs[count]); + + DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf); + while (sema_tryp(&(info->sema[count])) == 0) { + if (hsched_invoke_strategy(info->fsp)) { + sema_p(&(info->sema[count])); + break; + } + } + sema_destroy(&(info->sema[count])); + DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf); + biofini(&(info->bufs[count])); + } + for (count = 0; count < info->bufsused; count++) { + if (info->vas[count] != NULL) { + ppmapout(info->vas[count]); + } + } + kmem_free(info->vas, info->bufcnt * sizeof (caddr_t)); + kmem_free(info->bufs, info->bufcnt * sizeof (struct buf)); + kmem_free(info->sema, info->bufcnt * sizeof (ksema_t)); + + pvn_read_done(info->pp, 0); + kmem_cache_free(hio_info_cache, info); +} + +/* + * Submit asynchronous readahead requests to the I/O scheduler + * depending on the number of pages to read ahead. These requests + * are asynchronous to the calling thread but I/O requests issued + * subsequently by other threads with higher LBNs must wait for + * these readaheads to complete since we have a single ordered + * I/O pipeline. Thus these readaheads are semi-asynchronous. + * A TaskQ handles waiting for the readaheads to complete. 
+ * + * This function is mostly a copy of hsfs_getapage but somewhat + * simpler. A readahead request is aborted if page allocation + * fails. + */ +/*ARGSUSED*/ +static int +hsfs_getpage_ra( + struct vnode *vp, + u_offset_t off, + struct seg *seg, + caddr_t addr, + struct hsnode *hp, + struct hsfs *fsp, + int xarsiz, + offset_t bof, + int chunk_lbn_count, + int chunk_data_bytes) +{ + struct buf *bufs; + caddr_t *vas; + caddr_t va; + struct page *pp, *searchp, *lastp; + struct vnode *devvp; + ulong_t byte_offset; + size_t io_len_tmp; + uint_t io_off, io_len; + uint_t xlen; + uint_t filsiz; + uint_t secsize; + uint_t bufcnt; + uint_t bufsused; + uint_t count; + uint_t io_end; + uint_t which_chunk_lbn; + uint_t offset_lbn; + uint_t offset_extra; + offset_t offset_bytes; + uint_t remaining_bytes; + uint_t extension; + int remainder; /* must be signed */ + diskaddr_t driver_block; + u_offset_t io_off_tmp; + ksema_t *fio_done; + struct hio_info *info; + size_t len; + + ASSERT(fsp->hqueue != NULL); + + if (addr >= seg->s_base + seg->s_size) { + return (-1); + } + + devvp = fsp->hsfs_devvp; + secsize = fsp->hsfs_vol.lbn_size; /* bytes per logical block */ + + /* file data size */ + filsiz = hp->hs_dirent.ext_size; + + if (off >= filsiz) + return (0); + + extension = 0; + pp = NULL; + + extension += hp->hs_ra_bytes; + + /* + * Some cd writers don't write sectors that aren't used. Also, + * there's no point in reading sectors we'll never look at. So, + * if we're asked to go beyond the end of a file, truncate to the + * length of that file. + * + * Additionally, this behaviour is required by section 6.4.5 of + * ISO 9660:1988(E). + */ + len = MIN(extension ? extension : PAGESIZE, filsiz - off); + + /* A little paranoia */ + if (len <= 0) + return (-1); + + /* + * After all that, make sure we're asking for things in units + * that bdev_strategy() will understand (see bug 4202551). + */ + len = roundup(len, DEV_BSIZE); + + pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp, + &io_len_tmp, off, len, 1); + + if (pp == NULL) { + hp->hs_num_contig = 0; + hp->hs_ra_bytes = 0; + hp->hs_prev_offset = 0; + return (-1); + } + + io_off = (uint_t)io_off_tmp; + io_len = (uint_t)io_len_tmp; + + /* check for truncation */ + /* + * xxx Clean up and return EIO instead? + * xxx Ought to go to u_offset_t for everything, but we + * xxx call lots of things that want uint_t arguments. + */ + ASSERT(io_off == io_off_tmp); + + /* + * get enough buffers for worst-case scenario + * (i.e., no coalescing possible). + */ + bufcnt = (len + secsize - 1) / secsize; + bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP); + vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP); + + /* + * Allocate a array of semaphores since we are doing I/O + * scheduling. + */ + fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP); + + /* + * If our filesize is not an integer multiple of PAGESIZE, + * we zero that part of the last page that's between EOF and + * the PAGESIZE boundary. 
+ */ + xlen = io_len & PAGEOFFSET; + if (xlen != 0) + pagezero(pp->p_prev, xlen, PAGESIZE - xlen); + + DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len); + + va = NULL; + lastp = NULL; + searchp = pp; + io_end = io_off + io_len; + for (count = 0, byte_offset = io_off; + byte_offset < io_end; + count++) { + ASSERT(count < bufcnt); + + bioinit(&bufs[count]); + bufs[count].b_edev = devvp->v_rdev; + bufs[count].b_dev = cmpdev(devvp->v_rdev); + bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ; + bufs[count].b_iodone = hsfs_iodone; + bufs[count].b_vp = vp; + bufs[count].b_file = vp; + + /* Compute disk address for interleaving. */ + + /* considered without skips */ + which_chunk_lbn = byte_offset / chunk_data_bytes; + + /* factor in skips */ + offset_lbn = which_chunk_lbn * chunk_lbn_count; + + /* convert to physical byte offset for lbn */ + offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp); + + /* don't forget offset into lbn */ + offset_extra = byte_offset % chunk_data_bytes; + + /* get virtual block number for driver */ + driver_block = lbtodb(bof + xarsiz + + offset_bytes + offset_extra); + + if (lastp != searchp) { + /* this branch taken first time through loop */ + va = vas[count] = ppmapin(searchp, PROT_WRITE, + (caddr_t)-1); + /* ppmapin() guarantees not to return NULL */ + } else { + vas[count] = NULL; + } + + bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE; + bufs[count].b_offset = + (offset_t)(byte_offset - io_off + off); + + /* + * We specifically use the b_lblkno member here + * as even in the 32 bit world driver_block can + * get very large in line with the ISO9660 spec. + */ + + bufs[count].b_lblkno = driver_block; + + remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes) + - byte_offset; + + /* + * remaining_bytes can't be zero, as we derived + * which_chunk_lbn directly from byte_offset. + */ + if ((remaining_bytes + byte_offset) < (off + len)) { + /* coalesce-read the rest of the chunk */ + bufs[count].b_bcount = remaining_bytes; + } else { + /* get the final bits */ + bufs[count].b_bcount = off + len - byte_offset; + } + + remainder = PAGESIZE - (byte_offset % PAGESIZE); + if (bufs[count].b_bcount > remainder) { + bufs[count].b_bcount = remainder; + } + + bufs[count].b_bufsize = bufs[count].b_bcount; + if (((offset_t)byte_offset + bufs[count].b_bcount) > + HS_MAXFILEOFF) { + break; + } + byte_offset += bufs[count].b_bcount; + + /* + * We are scheduling I/O so we need to enqueue + * requests rather than calling bdev_strategy + * here. A later invocation of the scheduling + * function will take care of doing the actual + * I/O as it selects requests from the queue as + * per the scheduling logic. 
+ */ + struct hio *hsio = kmem_cache_alloc(hio_cache, + KM_SLEEP); + + sema_init(&fio_done[count], 0, NULL, + SEMA_DEFAULT, NULL); + hsio->bp = &bufs[count]; + hsio->sema = &fio_done[count]; + hsio->io_lblkno = bufs[count].b_lblkno; + hsio->nblocks = howmany(hsio->bp->b_bcount, + DEV_BSIZE); + + /* used for deadline */ + hsio->io_timestamp = drv_hztousec(ddi_get_lbolt()); + + /* for I/O coalescing */ + hsio->contig_chain = NULL; + hsched_enqueue_io(fsp, hsio, 1); + + lwp_stat_update(LWP_STAT_INBLK, 1); + lastp = searchp; + if ((remainder - bufs[count].b_bcount) < 1) { + searchp = searchp->p_next; + } + } + + bufsused = count; + info = kmem_cache_alloc(hio_info_cache, KM_SLEEP); + info->bufs = bufs; + info->vas = vas; + info->sema = fio_done; + info->bufsused = bufsused; + info->bufcnt = bufcnt; + info->fsp = fsp; + info->pp = pp; + + (void) taskq_dispatch(fsp->hqueue->ra_task, + hsfs_ra_task, info, KM_SLEEP); + /* + * The I/O locked pages are unlocked in our taskq thread. + */ + return (0); +} + +/* * Each file may have a different interleaving on disk. This makes * things somewhat interesting. The gist is that there are some * number of contiguous data sectors, followed by some other number @@ -631,6 +1044,11 @@ * space for the sectors, though, we just point at the appropriate * spot in the relevant page for each of them. This saves us a bunch * of copying. + * + * NOTICE: The code below in hsfs_getapage is mostly same as the code + * in hsfs_getpage_ra above (with some omissions). If you are + * making any change to this function, please also look at + * hsfs_getpage_ra. */ /*ARGSUSED*/ static int @@ -678,6 +1096,8 @@ int xarsiz; diskaddr_t driver_block; u_offset_t io_off_tmp; + ksema_t *fio_done; + int calcdone; /* * We don't support asynchronous operation at the moment, so @@ -715,6 +1135,15 @@ */ if (hp->hs_dirent.intlf_sz == 0) { chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp); + /* + * Optimization: If our pagesize is a multiple of LBN + * bytes, we can avoid breaking up a page into individual + * lbn-sized requests. + */ + if (PAGESIZE % chunk_data_bytes == 0) { + chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp); + chunk_data_bytes = PAGESIZE; + } } else { chunk_data_bytes = LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp); @@ -723,6 +1152,7 @@ reread: err = 0; pagefound = 0; + calcdone = 0; /* * Do some read-ahead. This mostly saves us a bit of @@ -735,35 +1165,15 @@ * the end of the chunk, minus whatever's at the end that * won't exactly fill a page. */ - which_chunk_lbn = (off + len) / chunk_data_bytes; - extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off; - extension -= (extension % PAGESIZE); - if (extension != 0 && extension < filsiz - off) { - len = extension; + if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) { + which_chunk_lbn = (off + len) / chunk_data_bytes; + extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off; + extension -= (extension % PAGESIZE); } else { - len = PAGESIZE; - } - /* - * Some cd writers don't write sectors that aren't used. Also, - * there's no point in reading sectors we'll never look at. So, - * if we're asked to go beyond the end of a file, truncate to the - * length of that file. - * - * Additionally, this behaviour is required by section 6.4.5 of - * ISO 9660:1988(E). - */ - if (len > (filsiz - off)) { - len = filsiz - off; + extension = roundup(len, PAGESIZE); } - /* A little paranoia. 
*/ - ASSERT(len > 0); - - /* - * After all that, make sure we're asking for things in units - * that bdev_strategy() will understand (see bug 4202551). - */ - len = roundup(len, DEV_BSIZE); + atomic_inc_64(&fsp->total_pages_requested); pp = NULL; again: @@ -772,11 +1182,46 @@ /* * Need to really do disk IO to get the page. */ + if (!calcdone) { + extension += hp->hs_ra_bytes; + + /* + * Some cd writers don't write sectors that aren't + * used. Also, there's no point in reading sectors + * we'll never look at. So, if we're asked to go + * beyond the end of a file, truncate to the length + * of that file. + * + * Additionally, this behaviour is required by section + * 6.4.5 of ISO 9660:1988(E). + */ + len = MIN(extension ? extension : PAGESIZE, + filsiz - off); + + /* A little paranoia. */ + ASSERT(len > 0); + + /* + * After all that, make sure we're asking for things + * in units that bdev_strategy() will understand + * (see bug 4202551). + */ + len = roundup(len, DEV_BSIZE); + calcdone = 1; + } + pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp, &io_len_tmp, off, len, 0); - if (pp == NULL) + if (pp == NULL) { + /* + * Pressure on memory, roll back readahead + */ + hp->hs_num_contig = 0; + hp->hs_ra_bytes = 0; + hp->hs_prev_offset = 0; goto again; + } io_off = (uint_t)io_off_tmp; io_len = (uint_t)io_len_tmp; @@ -796,17 +1241,22 @@ bufcnt = (len + secsize - 1) / secsize; bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP); vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP); + + /* + * Allocate a array of semaphores if we are doing I/O + * scheduling. + */ + if (fsp->hqueue != NULL) + fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), + KM_SLEEP); for (count = 0; count < bufcnt; count++) { + bioinit(&bufs[count]); bufs[count].b_edev = devvp->v_rdev; bufs[count].b_dev = cmpdev(devvp->v_rdev); bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ; bufs[count].b_iodone = hsfs_iodone; bufs[count].b_vp = vp; bufs[count].b_file = vp; - sema_init(&bufs[count].b_io, 0, NULL, - SEMA_DEFAULT, NULL); - sema_init(&bufs[count].b_sem, 0, NULL, - SEMA_DEFAULT, NULL); } /* @@ -893,6 +1343,10 @@ * assumes that a page is stored in N * contiguous blocks on the device. * Interleaving violates that assumption. + * + * Update: This is now not so big a problem + * because of the I/O scheduler sitting below + * that can re-order and coalesce I/O requests. */ remainder = PAGESIZE - (byte_offset % PAGESIZE); @@ -907,7 +1361,37 @@ } byte_offset += bufs[count].b_bcount; - (void) bdev_strategy(&bufs[count]); + if (fsp->hqueue == NULL) { + (void) bdev_strategy(&bufs[count]); + + } else { + /* + * We are scheduling I/O so we need to enqueue + * requests rather than calling bdev_strategy + * here. A later invocation of the scheduling + * function will take care of doing the actual + * I/O as it selects requests from the queue as + * per the scheduling logic. 
+ */ + struct hio *hsio = kmem_cache_alloc(hio_cache, + KM_SLEEP); + + sema_init(&fio_done[count], 0, NULL, + SEMA_DEFAULT, NULL); + hsio->bp = &bufs[count]; + hsio->sema = &fio_done[count]; + hsio->io_lblkno = bufs[count].b_lblkno; + hsio->nblocks = howmany(hsio->bp->b_bcount, + DEV_BSIZE); + + /* used for deadline */ + hsio->io_timestamp = + drv_hztousec(ddi_get_lbolt()); + + /* for I/O coalescing */ + hsio->contig_chain = NULL; + hsched_enqueue_io(fsp, hsio, 0); + } lwp_stat_update(LWP_STAT_INBLK, 1); lastp = searchp; @@ -918,17 +1402,51 @@ bufsused = count; /* Now wait for everything to come in */ - for (count = 0; count < bufsused; count++) { - if (err == 0) { - err = biowait(&bufs[count]); - } else - (void) biowait(&bufs[count]); + if (fsp->hqueue == NULL) { + for (count = 0; count < bufsused; count++) { + if (err == 0) { + err = biowait(&bufs[count]); + } else + (void) biowait(&bufs[count]); + } + } else { + for (count = 0; count < bufsused; count++) { + struct buf *wbuf; + + /* + * Invoke scheduling function till our buf + * is processed. In doing this it might + * process bufs enqueued by other threads + * which is good. + */ + wbuf = &bufs[count]; + DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf); + while (sema_tryp(&fio_done[count]) == 0) { + /* + * hsched_invoke_strategy will return 1 + * if the I/O queue is empty. This means + * that there is another thread who has + * issued our buf and is waiting. So we + * just block instead of spinning. + */ + if (hsched_invoke_strategy(fsp)) { + sema_p(&fio_done[count]); + break; + } + } + sema_destroy(&fio_done[count]); + DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf); + + if (err == 0) { + err = geterror(wbuf); + } + } + kmem_free(fio_done, bufcnt * sizeof (ksema_t)); } /* Don't leak resources */ for (count = 0; count < bufcnt; count++) { - sema_destroy(&bufs[count].b_io); - sema_destroy(&bufs[count].b_sem); + biofini(&bufs[count]); if (count < bufsused && vas[count] != NULL) { ppmapout(vas[count]); } @@ -963,6 +1481,7 @@ pl[0] = pp; index = 1; + atomic_inc_64(&fsp->cache_read_pages); /* * Try to lock the next page, if it exists, without @@ -979,6 +1498,29 @@ pl[index++] = pp; } pl[index] = NULL; + + /* + * Schedule a semi-asynchronous readahead if we are + * accessing the last cached page for the current + * file. + * + * Doing this here means that readaheads will be + * issued only if cache-hits occur. This is an advantage + * since cache-hits would mean that readahead is giving + * the desired benefit. If cache-hits do not occur there + * is no point in reading ahead of time - the system + * is loaded anyway. + */ + if (fsp->hqueue != NULL && + hp->hs_prev_offset - off == PAGESIZE && + hp->hs_prev_offset < filsiz && + hp->hs_ra_bytes > 0 && + !page_exists(vp, hp->hs_prev_offset)) { + (void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg, + addr + PAGESIZE, hp, fsp, xarsiz, bof, + chunk_lbn_count, chunk_data_bytes); + } + return (0); } @@ -1004,7 +1546,11 @@ { int err; uint_t filsiz; - struct hsnode *hp = VTOH(vp); + struct hsfs *fsp; + struct hsnode *hp; + + fsp = VFS_TO_HSFS(vp->v_vfsp); + hp = VTOH(vp); /* does not support write */ if (rw == S_WRITE) { @@ -1025,6 +1571,55 @@ if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap) return (EFAULT); /* beyond EOF */ + /* + * Async Read-ahead computation. + * This attempts to detect sequential access pattern and + * enables reading extra pages ahead of time. 
+ */ + if (fsp->hqueue != NULL) { + /* + * This check for sequential access also takes into + * account segmap weirdness when reading in chunks + * less than the segmap size of 8K. + */ + if (hp->hs_prev_offset == off || (off < + hp->hs_prev_offset && off + MAX(len, PAGESIZE) + >= hp->hs_prev_offset)) { + if (hp->hs_num_contig < + (seq_contig_requests - 1)) { + hp->hs_num_contig++; + + } else { + /* + * We increase readahead quantum till + * a predefined max. max_readahead_bytes + * is a multiple of PAGESIZE. + */ + if (hp->hs_ra_bytes < + fsp->hqueue->max_ra_bytes) { + hp->hs_ra_bytes += PAGESIZE; + } + } + } else { + /* + * Not contiguous so reduce read ahead counters. + */ + if (hp->hs_ra_bytes > 0) + hp->hs_ra_bytes -= PAGESIZE; + + if (hp->hs_ra_bytes <= 0) { + hp->hs_ra_bytes = 0; + if (hp->hs_num_contig > 0) + hp->hs_num_contig--; + } + } + /* + * Length must be rounded up to page boundary. + * since we read in units of pages. + */ + hp->hs_prev_offset = off + roundup(len, PAGESIZE); + DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp); + } if (protp != NULL) *protp = PROT_ALL; @@ -1127,6 +1722,7 @@ if (pp == NULL) continue; + /* * Normally pvn_getdirty() should return 0, which * impies that it has done the job for us. @@ -1299,6 +1895,469 @@ return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); } +static int +hsched_deadline_compare(const void *x1, const void *x2) +{ + const struct hio *h1 = x1; + const struct hio *h2 = x2; + + if (h1->io_timestamp < h2->io_timestamp) + return (-1); + if (h1->io_timestamp > h2->io_timestamp) + return (1); + + if (h1->io_lblkno < h2->io_lblkno) + return (-1); + if (h1->io_lblkno > h2->io_lblkno) + return (1); + + if (h1 < h2) + return (-1); + if (h1 > h2) + return (1); + + return (0); +} + +static int +hsched_offset_compare(const void *x1, const void *x2) +{ + const struct hio *h1 = x1; + const struct hio *h2 = x2; + + if (h1->io_lblkno < h2->io_lblkno) + return (-1); + if (h1->io_lblkno > h2->io_lblkno) + return (1); + + if (h1 < h2) + return (-1); + if (h1 > h2) + return (1); + + return (0); +} + +void +hsched_init_caches(void) +{ + hio_cache = kmem_cache_create("hsfs_hio_cache", + sizeof (struct hio), 0, NULL, + NULL, NULL, NULL, NULL, 0); + + hio_info_cache = kmem_cache_create("hsfs_hio_info_cache", + sizeof (struct hio_info), 0, NULL, + NULL, NULL, NULL, NULL, 0); +} + +void +hsched_fini_caches(void) +{ + kmem_cache_destroy(hio_cache); + kmem_cache_destroy(hio_info_cache); +} + +/* + * Initialize I/O scheduling structures. This is called via hsfs_mount + */ +void +hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage) +{ + struct hsfs_queue *hqueue = fsp->hqueue; + struct vnode *vp = fsp->hsfs_devvp; + + /* TaskQ name of the form: hsched_task_ + stringof(int) */ + char namebuf[23]; + int error, err; + struct dk_cinfo info; + ldi_handle_t lh; + ldi_ident_t li; + + /* + * Default maxtransfer = 16k chunk + */ + hqueue->dev_maxtransfer = 16384; + + /* + * Try to fetch the maximum device transfer size. This is used to + * ensure that a coalesced block does not exceed the maxtransfer. 
+ */ + err = ldi_ident_from_mod(modlinkage, &li); + if (err) { + cmn_err(CE_NOTE, "hsched_init: Querying device failed"); + cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n", + err); + goto set_ra; + } + + err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li); + ldi_ident_release(li); + if (err) { + cmn_err(CE_NOTE, "hsched_init: Querying device failed"); + cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err); + goto set_ra; + } + + error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL, + CRED(), &err); + err = ldi_close(lh, FREAD, CRED()); + if (err) { + cmn_err(CE_NOTE, "hsched_init: Querying device failed"); + cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err); + } + + if (error == 0) { + hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer); + } + +set_ra: + /* + * Max size of data to read ahead for sequential access pattern. + * Conservative to avoid letting the underlying CD drive to spin + * down, in case the application is reading slowly. + * We read ahead upto a max of 4 pages. + */ + hqueue->max_ra_bytes = PAGESIZE * 8; + + mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL); + mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL); + avl_create(&(hqueue->read_tree), hsched_offset_compare, + sizeof (struct hio), offsetof(struct hio, io_offset_node)); + avl_create(&(hqueue->deadline_tree), hsched_deadline_compare, + sizeof (struct hio), offsetof(struct hio, io_deadline_node)); + + (void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid); + hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads, + minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC); + + hqueue->next = NULL; + hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP); +} + +void +hsched_fini(struct hsfs_queue *hqueue) +{ + if (hqueue != NULL) { + avl_destroy(&(hqueue->read_tree)); + avl_destroy(&(hqueue->deadline_tree)); + mutex_destroy(&(hqueue->hsfs_queue_lock)); + mutex_destroy(&(hqueue->strategy_lock)); + + /* + * If there are any existing readahead threads running + * taskq_destroy will wait for them to finish. + */ + taskq_destroy(hqueue->ra_task); + if (hqueue->next != NULL) { + kmem_cache_free(hio_cache, hqueue->next); + } + kmem_free(hqueue->nbuf, sizeof (struct buf)); + } +} + +/* + * Determine if two I/O requests are adjacent to each other so + * that they can coalesced. + */ +#define IS_ADJACENT(io, nio) \ + (((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \ + (io)->bp->b_edev == (nio)->bp->b_edev) + +/* + * This performs the actual I/O scheduling logic. We use the Circular + * Look algorithm here. Sort the I/O requests in ascending order of + * logical block number and process them starting with the lowest + * numbered block and progressing towards higher block numbers in the + * queue. Once there are no more higher numbered blocks, start again + * with the lowest one. This is good for CD/DVD as you keep moving + * the head in one direction along the outward spiral track and avoid + * too many seeks as much as possible. The re-ordering also allows + * us to coalesce adjacent requests into one larger request. + * This is thus essentially a 1-way Elevator with front merging. + * + * In addition each read request here has a deadline and will be + * processed out of turn if the deadline (500ms) expires. + * + * This function is necessarily serialized via hqueue->strategy_lock. + * This function sits just below hsfs_getapage and processes all read + * requests orginating from that function. 
+ */ +int +hsched_invoke_strategy(struct hsfs *fsp) +{ + struct hsfs_queue *hqueue; + struct buf *nbuf; + struct hio *fio, *nio, *tio, *prev, *last; + size_t bsize, soffset, offset, data; + int bioret, bufcount; + struct vnode *fvp; + ksema_t *io_done; + caddr_t iodata; + + hqueue = fsp->hqueue; + mutex_enter(&hqueue->strategy_lock); + mutex_enter(&hqueue->hsfs_queue_lock); + + /* + * Check for Deadline expiration first + */ + fio = avl_first(&hqueue->deadline_tree); + + /* + * Paranoid check for empty I/O queue. Both deadline + * and read trees contain same data sorted in different + * ways. So empty deadline tree = empty read tree. + */ + if (fio == NULL) { + /* + * Remove the sentinel if there was one. + */ + if (hqueue->next != NULL) { + avl_remove(&hqueue->read_tree, hqueue->next); + kmem_cache_free(hio_cache, hqueue->next); + hqueue->next = NULL; + } + mutex_exit(&hqueue->hsfs_queue_lock); + mutex_exit(&hqueue->strategy_lock); + return (1); + } + + if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp + < HSFS_READ_DEADLINE) { + /* + * Apply standard scheduling logic. This uses the + * C-LOOK approach. Process I/O requests in ascending + * order of logical block address till no subsequent + * higher numbered block request remains. Then start + * again from the lowest numbered block in the queue. + * + * We do this cheaply here by means of a sentinel. + * The last processed I/O structure from the previous + * invocation of this func, is left dangling in the + * read_tree so that we can easily scan to the next + * higher numbered request and remove the sentinel. + */ + fio = NULL; + if (hqueue->next != NULL) { + fio = AVL_NEXT(&hqueue->read_tree, hqueue->next); + avl_remove(&hqueue->read_tree, hqueue->next); + kmem_cache_free(hio_cache, hqueue->next); + hqueue->next = NULL; + } + if (fio == NULL) { + fio = avl_first(&hqueue->read_tree); + } + } else if (hqueue->next != NULL) { + DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio); + + avl_remove(&hqueue->read_tree, hqueue->next); + kmem_cache_free(hio_cache, hqueue->next); + hqueue->next = NULL; + } + + /* + * In addition we try to coalesce contiguous + * requests into one bigger request. + */ + bufcount = 1; + bsize = ldbtob(fio->nblocks); + fvp = fio->bp->b_file; + nio = AVL_NEXT(&hqueue->read_tree, fio); + tio = fio; + while (nio != NULL && IS_ADJACENT(tio, nio) && + bsize < hqueue->dev_maxtransfer) { + avl_remove(&hqueue->deadline_tree, tio); + avl_remove(&hqueue->read_tree, tio); + tio->contig_chain = nio; + bsize += ldbtob(nio->nblocks); + prev = tio; + tio = nio; + + /* + * This check is required to detect the case where + * we are merging adjacent buffers belonging to + * different files. fvp is used to set the b_file + * parameter in the coalesced buf. b_file is used + * by DTrace so we do not want DTrace to accrue + * requests to two different files to any one file. + */ + if (fvp && tio->bp->b_file != fvp) { + fvp = NULL; + } + + nio = AVL_NEXT(&hqueue->read_tree, nio); + bufcount++; + } + + /* + * tio is not removed from the read_tree as it serves as a sentinel + * to cheaply allow us to scan to the next higher numbered I/O + * request. + */ + hqueue->next = tio; + avl_remove(&hqueue->deadline_tree, tio); + mutex_exit(&hqueue->hsfs_queue_lock); + DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount, + size_t, bsize); + + /* + * The benefit of coalescing occurs if the the savings in I/O outweighs + * the cost of doing the additional work below. 
+ * It was observed that coalescing 2 buffers results in diminishing + * returns, so we do coalescing if we have >2 adjacent bufs. + */ + if (bufcount > hsched_coalesce_min) { + /* + * We have coalesced blocks. First allocate mem and buf for + * the entire coalesced chunk. + * Since we are guaranteed single-threaded here we pre-allocate + * one buf at mount time and that is re-used every time. This + * is a synthesized buf structure that uses kmem_alloced chunk. + * Not quite a normal buf attached to pages. + */ + fsp->coalesced_bytes += bsize; + nbuf = hqueue->nbuf; + bioinit(nbuf); + nbuf->b_edev = fio->bp->b_edev; + nbuf->b_dev = fio->bp->b_dev; + nbuf->b_flags = fio->bp->b_flags; + nbuf->b_iodone = fio->bp->b_iodone; + iodata = kmem_alloc(bsize, KM_SLEEP); + nbuf->b_un.b_addr = iodata; + nbuf->b_lblkno = fio->bp->b_lblkno; + nbuf->b_vp = fvp; + nbuf->b_file = fvp; + nbuf->b_bcount = bsize; + nbuf->b_bufsize = bsize; + nbuf->b_resid = bsize; + + DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int, + bufcount, size_t, bsize); + + /* + * Perform I/O for the coalesced block. + */ + (void) bdev_strategy(nbuf); + + /* + * Duplicate the last IO node to leave the sentinel alone. + * The sentinel is freed in the next invocation of this + * function. + */ + prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP); + prev->contig_chain->bp = tio->bp; + prev->contig_chain->sema = tio->sema; + tio = prev->contig_chain; + tio->contig_chain = NULL; + soffset = ldbtob(fio->bp->b_lblkno); + nio = fio; + + bioret = biowait(nbuf); + data = bsize - nbuf->b_resid; + biofini(nbuf); + mutex_exit(&hqueue->strategy_lock); + + /* + * We use the b_resid parameter to detect how much + * data was succesfully transferred. We will signal + * a success to all the fully retrieved actual bufs + * before coalescing, rest is signaled as error, + * if any. + */ + tio = nio; + DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio, + int, bioret, size_t, data); + + /* + * Copy data and signal success to all the bufs + * which can be fully satisfied from b_resid. + */ + while (nio != NULL && data >= nio->bp->b_bcount) { + offset = ldbtob(nio->bp->b_lblkno) - soffset; + bcopy(iodata + offset, nio->bp->b_un.b_addr, + nio->bp->b_bcount); + data -= nio->bp->b_bcount; + bioerror(nio->bp, 0); + biodone(nio->bp); + sema_v(nio->sema); + tio = nio; + nio = nio->contig_chain; + kmem_cache_free(hio_cache, tio); + } + + /* + * Signal error to all the leftover bufs (if any) + * after b_resid data is exhausted. + */ + while (nio != NULL) { + nio->bp->b_resid = nio->bp->b_bcount - data; + bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid); + bioerror(nio->bp, bioret); + biodone(nio->bp); + sema_v(nio->sema); + tio = nio; + nio = nio->contig_chain; + kmem_cache_free(hio_cache, tio); + data = 0; + } + kmem_free(iodata, bsize); + } else { + + nbuf = tio->bp; + io_done = tio->sema; + nio = fio; + last = tio; + + while (nio != NULL) { + (void) bdev_strategy(nio->bp); + nio = nio->contig_chain; + } + nio = fio; + mutex_exit(&hqueue->strategy_lock); + + while (nio != NULL) { + if (nio == last) { + (void) biowait(nbuf); + sema_v(io_done); + break; + /* sentinel last not freed. See above. */ + } else { + (void) biowait(nio->bp); + sema_v(nio->sema); + } + tio = nio; + nio = nio->contig_chain; + kmem_cache_free(hio_cache, tio); + } + } + return (0); +} + +/* + * Insert an I/O request in the I/O scheduler's pipeline + * Using AVL tree makes it easy to reorder the I/O request + * based on logical block number. 
+ */ +static void +hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra) +{ + struct hsfs_queue *hqueue = fsp->hqueue; + + mutex_enter(&hqueue->hsfs_queue_lock); + + fsp->physical_read_bytes += hsio->bp->b_bcount; + if (ra) + fsp->readahead_bytes += hsio->bp->b_bcount; + + avl_add(&hqueue->deadline_tree, hsio); + avl_add(&hqueue->read_tree, hsio); + + DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio, + struct hsfs_queue *, hqueue, int, ra); + + mutex_exit(&hqueue->hsfs_queue_lock); +} + /* ARGSUSED */ static int hsfs_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr)
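Editor's note: hsched_invoke_strategy() above implements a C-LOOK elevator with front merging and a deadline override. Requests live in two AVL trees (one keyed by logical block, one by enqueue timestamp); the scheduler normally resumes scanning upward from a sentinel left at the last dispatched block, wraps to the lowest pending block when the scan runs out, coalesces adjacent requests up to the device's maximum transfer, and jumps straight to the oldest request once its deadline expires. A compact userland simulation of just the dispatch-order logic, using a plain array instead of AVL trees and illustrative timestamps:

```c
#include <stdio.h>

#define READ_DEADLINE	5000	/* usec, mirrors HSFS_READ_DEADLINE */

struct req {
	long long lblkno;	/* starting logical block */
	long long stamp;	/* enqueue time, usec */
	int done;
};

/*
 * Pick the next request C-LOOK style: the lowest block at or above the
 * last dispatched block, wrapping to the lowest pending block; an
 * expired deadline on the oldest request overrides the scan order.
 */
static struct req *
next_req(struct req *q, int n, long long last_blk, long long now)
{
	struct req *oldest = NULL, *next = NULL, *wrap = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (q[i].done)
			continue;
		if (oldest == NULL || q[i].stamp < oldest->stamp)
			oldest = &q[i];
		if (q[i].lblkno >= last_blk &&
		    (next == NULL || q[i].lblkno < next->lblkno))
			next = &q[i];
		if (wrap == NULL || q[i].lblkno < wrap->lblkno)
			wrap = &q[i];
	}
	if (oldest != NULL && now - oldest->stamp >= READ_DEADLINE)
		return (oldest);		/* deadline expired */
	return (next != NULL ? next : wrap);	/* upward scan, else wrap */
}

int
main(void)
{
	/* Expected order: 200 (deadline expired), 400, 900 (scan), 100 (wrap). */
	struct req q[] = {
		{ 400, 6000, 0 }, { 100, 6010, 0 },
		{ 900, 6020, 0 }, { 200, 1000, 0 },
	};
	int n = sizeof (q) / sizeof (q[0]), i;
	long long last_blk = 300, now = 7000;
	struct req *r;

	for (i = 0; i < n; i++, now += 50) {
		r = next_req(q, n, last_blk, now);
		printf("dispatch lblkno %lld\n", r->lblkno);
		r->done = 1;
		last_blk = r->lblkno;
	}
	return (0);
}
```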
--- a/usr/src/uts/common/sys/fs/hsfs_node.h Tue Oct 23 17:36:10 2007 -0700 +++ b/usr/src/uts/common/sys/fs/hsfs_node.h Wed Oct 24 07:14:17 2007 -0700 @@ -35,6 +35,8 @@ extern "C" { #endif +#include <sys/taskq.h> + struct hs_direntry { uint_t ext_lbn; /* LBN of start of extent */ uint_t ext_size; /* no. of data bytes in extent */ @@ -99,6 +101,9 @@ long hs_mapcnt; /* mappings to file pages */ uint_t hs_seq; /* sequence number */ uint_t hs_flags; /* (see below) */ + u_offset_t hs_prev_offset; /* Last read end offset (readahead) */ + int hs_num_contig; /* Count of contiguous reads */ + int hs_ra_bytes; /* Bytes to readahead */ kmutex_t hs_contents_lock; /* protects hsnode contents */ /* except hs_offset */ }; @@ -165,6 +170,76 @@ #define HS_DUMMY_INO 16 /* dummy inode number for empty files */ /* + * Hsfs I/O Scheduling parameters and data structures. + * Deadline for reads is set at 5000 usec. + */ +#define HSFS_READ_DEADLINE 5000 +#define HSFS_NORMAL 0x0 + +/* + * This structure holds information for a read request that is enqueued + * for processing by the scheduling function. An AVL tree is used to + * access the read requests in a sorted manner. + */ +struct hio { + struct buf *bp; /* The buf for this read */ + struct hio *contig_chain; /* Next adjacent read if any */ + offset_t io_lblkno; /* Starting disk block of io */ + u_offset_t nblocks; /* # disk blocks */ + uint64_t io_timestamp; /* usec timestamp for deadline */ + ksema_t *sema; /* Completion flag */ + avl_node_t io_offset_node; /* Avl tree requirements */ + avl_node_t io_deadline_node; +}; + +/* + * This structure holds information about all the read requests issued + * during a read-ahead invocation. This is then enqueued on a task-queue + * for processing by a background thread that takes this read-ahead to + * completion and cleans up. + */ +struct hio_info { + struct buf *bufs; /* array of bufs issued for this R/A */ + caddr_t *vas; /* The kmem_alloced chunk for the bufs */ + ksema_t *sema; /* Semaphores used in the bufs */ + uint_t bufsused; /* # of bufs actually used */ + uint_t bufcnt; /* Tot bufs allocated. */ + struct page *pp; /* The list of I/O locked pages */ + struct hsfs *fsp; /* The filesystem structure */ +}; + +/* + * This is per-filesystem structure that stores toplevel data structures for + * the I/O scheduler. + */ +struct hsfs_queue { + /* + * A dummy hio holding the LBN of the last read processed. Easy + * to use in AVL_NEXT for Circular Look behavior. + */ + struct hio *next; + + /* + * A pre-allocated buf for issuing coalesced reads. The scheduling + * function is mostly single threaded by necessity. + */ + struct buf *nbuf; + kmutex_t hsfs_queue_lock; /* Protects the AVL trees */ + + /* + * Makes most of the scheduling function Single-threaded. + */ + kmutex_t strategy_lock; + avl_tree_t read_tree; /* Reads ordered by LBN */ + avl_tree_t deadline_tree; /* Reads ordered by timestamp */ + taskq_t *ra_task; /* Read-ahead Q */ + int max_ra_bytes; /* Max read-ahead quantum */ + + /* Device Max Transfer size in DEV_BSIZE */ + uint_t dev_maxtransfer; +}; + +/* * High Sierra filesystem structure. * There is one of these for each mounted High Sierra filesystem. */ @@ -198,6 +273,18 @@ kmutex_t hsfs_free_lock; /* protects free list */ struct hsnode *hsfs_free_f; /* first entry of free list */ struct hsnode *hsfs_free_b; /* last entry of free list */ + + /* + * Counters exported through kstats. 
+ */ + uint64_t physical_read_bytes; + uint64_t cache_read_pages; + uint64_t readahead_bytes; + uint64_t coalesced_bytes; + uint64_t total_pages_requested; + kstat_t *hsfs_kstats; + + struct hsfs_queue *hqueue; /* I/O Scheduling parameters */ }; /*
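Editor's note: the header defines the pieces the coalescing logic in hsched_invoke_strategy() works with. Each struct hio carries its starting logical block (io_lblkno) and its length in DEV_BSIZE blocks (nblocks, computed as howmany(b_bcount, DEV_BSIZE) when the request is enqueued, so partial blocks round up), and hsfs_queue caps a merged transfer at dev_maxtransfer. A small sketch of the adjacency test and the size cap; DEV_BSIZE is the usual 512 bytes, the 1 MB device limit is assumed, and the real IS_ADJACENT() macro additionally requires both bufs to target the same device:

```c
#include <stdio.h>
#include <stddef.h>

#define DEV_BSIZE	512
#define LDBTOB(nb)	((size_t)(nb) * DEV_BSIZE)	/* stand-in for ldbtob() */

/* The fields IS_ADJACENT() in hsfs_vnops.c looks at. */
struct hio {
	long long io_lblkno;	/* starting disk block */
	size_t	  nblocks;	/* length in DEV_BSIZE blocks */
};

/* Two reads can be merged when the second starts where the first ends. */
static int
is_adjacent(const struct hio *io, const struct hio *nio)
{
	return (io->io_lblkno + (long long)io->nblocks == nio->io_lblkno);
}

int
main(void)
{
	/* Three 8K reads: the first two back to back, the third elsewhere. */
	struct hio q[] = { { 1000, 16 }, { 1016, 16 }, { 4000, 16 } };
	size_t dev_maxtransfer = 1024 * 1024;	/* assumed device limit */
	size_t bsize = LDBTOB(q[0].nblocks);	/* running coalesced size */
	int i;

	for (i = 1; i < 3; i++) {
		if (is_adjacent(&q[i - 1], &q[i]) && bsize < dev_maxtransfer) {
			bsize += LDBTOB(q[i].nblocks);
			printf("merge block %lld, coalesced %zu bytes\n",
			    q[i].io_lblkno, bsize);
		} else {
			printf("block %lld starts a new transfer\n",
			    q[i].io_lblkno);
			bsize = LDBTOB(q[i].nblocks);
		}
	}
	return (0);
}
```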