/* $Id: physical.c,v 1.18.2.1 2005/01/16 23:13:30 adam Exp $
   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
   Index Data Aps

This file is part of the Zebra server.

Zebra is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with Zebra; see the file LICENSE.zebra.  If not, write to the
Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.
*/



/*
 * This module handles the representation of tables in the bfiles.
 */

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <yaz/log.h>
#include <isam.h>

static int is_freestore_alloc(ISAM is, int type)
{
    int tmp;

    if (is->types[type].freelist >= 0)
    {
    	tmp = is->types[type].freelist;
    	if (bf_read(is->types[type].bf, tmp, 0, sizeof(tmp),
	    &is->types[type].freelist) <=0)
	{
	    logf (LOG_FATAL, "Failed to allocate block");
	    exit(1);
	}
    }
    else
    	tmp = is->types[type].top++;

    logf (LOG_DEBUG, "Allocating block #%d", tmp);
    return tmp;
}

static void is_freestore_free(ISAM is, int type, int block)
{
    int tmp;

    logf (LOG_DEBUG, "Releasing block #%d", block);
    tmp = is->types[type].freelist;
    is->types[type].freelist = block;
    if (bf_write(is->types[type].bf, block, 0, sizeof(tmp), &tmp) < 0)
    {
    	logf (LOG_FATAL, "Failed to deallocate block.");
    	exit(1);
    }
}

/* this code must be modified to handle an index */
int is_p_read_partial(is_mtable *tab, is_mblock *block)
{
    int toread;
    is_mbuf *buf;

    assert(block->state == IS_MBSTATE_UNREAD);
    block->data = buf = xmalloc_mbuf(IS_MBUF_TYPE_LARGE);
    toread = tab->is->types[tab->pos_type].blocksize;
    if (toread > is_mbuf_size[buf->type])
    {
    	toread = is_mbuf_size[buf->type];
    	block->state = IS_MBSTATE_PARTIAL;
    }
    else
    	block->state = IS_MBSTATE_CLEAN;
    if (bf_read(tab->is->types[tab->pos_type].bf, block->diskpos, 0, toread,
	buf->data) < 0)
    {
    	logf (LOG_FATAL, "bfread failed.");
    	return -1;
    }
    /* extract header info */
    buf->offset = 0;
    memcpy(&block->num_records, buf->data, sizeof(block->num_records));
    assert(block->num_records > 0);
    buf->offset += sizeof(block->num_records);
    memcpy(&block->nextpos, buf->data + buf->offset,
	sizeof(block->nextpos));
    buf->offset += sizeof(block->nextpos);
    if (block == tab->data) /* first block */
    {
        memcpy(&tab->num_records, buf->data + buf->offset,
	    sizeof(tab->num_records));
	buf->offset +=sizeof(tab->num_records);
    }
    logf(LOG_DEBUG, "R: Block #%d: num %d nextpos %d total %d",
        block->diskpos, block->num_records, block->nextpos,
	block == tab->data ? tab->num_records : -1);
    buf->num = (toread - buf->offset) / is_keysize(tab->is);
    if (buf->num >= block->num_records)
    {
    	buf->num = block->num_records;
    	block->state = IS_MBSTATE_CLEAN;
    }
    else
    	block->bread = buf->offset + buf->num * is_keysize(tab->is);
    return 0;
}

int is_p_read_full(is_mtable *tab, is_mblock *block)
{
    is_mbuf *buf;
    int dread, toread;

    if (block->state == IS_MBSTATE_UNREAD && is_p_read_partial(tab, block) < 0)
    {
    	logf (LOG_FATAL, "partial read failed.");
    	return -1;
    }
    if (block->state == IS_MBSTATE_PARTIAL)
    {
    	buf = block->data;
    	dread = block->data->num;
    	while (dread < block->num_records)
    	{
	    buf->next = xmalloc_mbuf(IS_MBUF_TYPE_LARGE);
	    buf = buf->next;

	    toread = is_mbuf_size[buf->type] / is_keysize(tab->is);
	    if (toread > block->num_records - dread)
	    	toread = block->num_records - dread;

	    if (bf_read(tab->is->types[tab->pos_type].bf, block->diskpos, block->bread, toread *
		is_keysize(tab->is), buf->data) < 0)
	    {
	    	logf (LOG_FATAL, "bfread failed.");
	    	return -1;
	    }
	    buf->offset = 0;
	    buf->num = toread;
	    dread += toread;
	    block->bread += toread * is_keysize(tab->is);
    	}
	block->state = IS_MBSTATE_CLEAN;
    }
    logf (LOG_DEBUG, "R: Block #%d contains %d records.", block->diskpos, block->num_records);
    return 0;
}

/*
 * write dirty blocks to bfile.
 * Allocate blocks as necessary.
 */
void is_p_sync(is_mtable *tab)
{
    is_mblock *p;
    is_mbuf *b;
    int sum, v;
    isam_blocktype *type;

    type = &tab->is->types[tab->pos_type];
    for (p = tab->data; p; p = p->next)
    {
    	if (p->state < IS_MBSTATE_DIRTY)
	    continue;
	/* make sure that blocks are allocated. */
    	if (p->diskpos < 0)
	    p->diskpos = is_freestore_alloc(tab->is, tab->pos_type);
	if (p->next)
	{
	    if (p->next->diskpos < 0)
		p->nextpos = p->next->diskpos = is_freestore_alloc(tab->is,
		    tab->pos_type);
	    else
	    	p->nextpos = p->next->diskpos;
	}
	else
	    p->nextpos = 0;
	sum = 0;
	memcpy(type->dbuf, &p->num_records, sizeof(p->num_records));
	sum += sizeof(p->num_records);
	memcpy(type->dbuf + sum, &p->nextpos, sizeof(p->nextpos));
	sum += sizeof(p->nextpos);
	if (p == tab->data)  /* first block */
	{
	    memcpy(type->dbuf + sum, &tab->num_records,
		sizeof(tab->num_records));
	    sum += sizeof(tab->num_records);
	}
	logf (LOG_DEBUG, "W: Block #%d contains %d records.", p->diskpos,
	    p->num_records);
	assert(p->num_records > 0);
	for (b = p->data; b; b = b->next)
	{
            logf(LOG_DEBUG, "   buf: offset %d, keys %d, type %d, ref %d",
	    	b->offset, b->num, b->type, b->refcount);
	    if ((v = b->num * is_keysize(tab->is)) > 0)
		memcpy(type->dbuf + sum, b->data + b->offset, v);

	    sum += v;
	    assert(sum <= type->blocksize);
	}
	if (bf_write(type->bf, p->diskpos, 0, sum, type->dbuf) < 0)
	{
	    logf (LOG_FATAL, "Failed to write block.");
	    exit(1);
	}
    }
}

/*
 * Free all disk blocks associated with table.
 */
void is_p_unmap(is_mtable *tab)
{
    is_mblock *p;

    for (p = tab->data; p; p = p->next)
    {
    	if (p->diskpos >= 0)
    	{
	    is_freestore_free(tab->is, tab->pos_type, p->diskpos);
	    p->diskpos = -1;
	}
    }
}

static is_mbuf *mbuf_takehead(is_mbuf **mb, int *num, int keysize)
{
    is_mbuf *p = 0, **pp = &p, *inew;
    int toget = *num;

    if (!toget)
    	return 0;
    while (*mb && toget >= (*mb)->num)
    {
	toget -= (*mb)->num;
	*pp = *mb;
	*mb = (*mb)->next;
	(*pp)->next = 0;
	pp = &(*pp)->next;
    }
    if (toget > 0 && *mb)
    {
	inew = xmalloc_mbuf(IS_MBUF_TYPE_SMALL);
	inew->next = (*mb)->next;
	(*mb)->next = inew;
	inew->data = (*mb)->data;
	(*mb)->refcount++;
	inew->offset = (*mb)->offset + toget * keysize;
	inew->num = (*mb)->num - toget;
	(*mb)->num = toget;
	*pp = *mb;
	*mb = (*mb)->next;
	(*pp)->next = 0;
	toget = 0;
    }
    *num -= toget;
    return p;
}

/*
 * Split up individual blocks which have grown too large.
 * is_p_align and is_p_remap are alternative functions which trade off
 * speed in updating versus optimum usage of disk blocks.
 */
void is_p_align(is_mtable *tab)
{
    is_mblock *mblock, *inew, *last = 0, *next;
    is_mbuf *mbufs, *mbp;
    int blocks, recsblock;

    logf (LOG_DEBUG, "Realigning table.");
    for (mblock = tab->data; mblock; mblock = next)
    {
        next = mblock->next;
        if (mblock->state == IS_MBSTATE_DIRTY && mblock->num_records == 0)
        {
	    if (last)
	    {
	    	last->next = mblock->next;
	    	last->state = IS_MBSTATE_DIRTY;
	    	next = mblock->next;
	    }
	    else
	    {
	    	next = tab->data->next;
		if (next)
		{
		    if (next->state < IS_MBSTATE_CLEAN)
		    {
			if (is_p_read_full(tab, next) < 0)
			{
			    logf(LOG_FATAL, "Error during re-alignment");
			    abort();
			}
			if (next->nextpos && !next->next)
			{
			    next->next = xmalloc_mblock();
			    next->next->diskpos = next->nextpos;
			    next->next->state = IS_MBSTATE_UNREAD;
			    next->next->data = 0;
			}
		    }
		    next->state = IS_MBSTATE_DIRTY; /* force re-process */
		    tab->data = next;
		}
	    }
	    if (mblock->diskpos >= 0)
		is_freestore_free(tab->is, tab->pos_type, mblock->diskpos);
	    xrelease_mblock(mblock);
	}
    	else if (mblock->state == IS_MBSTATE_DIRTY && mblock->num_records >
	    (mblock == tab->data ?
	    tab->is->types[tab->pos_type].max_keys_block0 :
	    tab->is->types[tab->pos_type].max_keys_block))
	{
	    blocks = tab->num_records /
	    tab->is->types[tab->pos_type].nice_keys_block;
	    if (tab->num_records %
		tab->is->types[tab->pos_type].nice_keys_block)
		blocks++;
	    recsblock = tab->num_records / blocks;
	    if (recsblock < 1)
		recsblock = 1;
	    mbufs = mblock->data;
	    while ((mbp = mbuf_takehead(&mbufs, &recsblock,
		is_keysize(tab->is))) && recsblock)
	    {
	    	if (mbufs)
	    	{
		    inew = xmalloc_mblock();
		    inew->diskpos = -1;
		    inew->state = IS_MBSTATE_DIRTY;
		    inew->next = mblock->next;
		    mblock->next = inew;
		}
	    	mblock->data = mbp;
	    	mblock->num_records = recsblock;
	    	last = mblock;
	    	mblock = mblock->next;
	    }
	    next = mblock; 
	}
	else
	    last = mblock;
    }
}

/*
 * Reorganize data in blocks for minimum block usage and quick access.
 * Free surplus blocks.
 * is_p_align and is_p_remap are alternative functions which trade off
 * speed in updating versus optimum usage of disk blocks.
 */
void is_p_remap(is_mtable *tab)
{
    is_mbuf *mbufs, **bufpp, *mbp;
    is_mblock *blockp, **blockpp;
    int recsblock, blocks;

    logf (LOG_DEBUG, "Remapping table.");
    /* collect all data */
    bufpp = &mbufs;
    for (blockp = tab->data; blockp; blockp = blockp->next)
    {
    	if (blockp->state < IS_MBSTATE_CLEAN && is_m_read_full(tab, blockp) < 0)
	{
	    logf (LOG_FATAL, "Read-full failed in remap.");
	    exit(1);
	}
    	*bufpp = blockp->data;
    	while (*bufpp)
	    bufpp = &(*bufpp)->next;
        blockp->data = 0;
    }
    blocks = tab->num_records / tab->is->types[tab->pos_type].nice_keys_block;
    if (tab->num_records % tab->is->types[tab->pos_type].nice_keys_block)
    	blocks++;
    if (blocks == 0)
    	blocks = 1;
    recsblock = tab->num_records / blocks + 1;
    if (recsblock > tab->is->types[tab->pos_type].nice_keys_block)
    	recsblock--;
    blockpp = &tab->data;
    while ((mbp = mbuf_takehead(&mbufs, &recsblock, is_keysize(tab->is))) &&
    	recsblock)
    {
    	if (!*blockpp)
    	{
	    *blockpp = xmalloc_mblock();
	    (*blockpp)->diskpos = -1;
	}
    	(*blockpp)->data = mbp;
    	(*blockpp)->num_records = recsblock;
    	(*blockpp)->state = IS_MBSTATE_DIRTY;
    	blockpp = &(*blockpp)->next;
    }
    if (mbp)
    	xfree_mbufs(mbp);
    if (*blockpp)
    {
    	for (blockp = *blockpp; blockp; blockp = blockp->next)
	    if (blockp->diskpos >= 0)
		is_freestore_free(tab->is, tab->pos_type, blockp->diskpos);
    	xfree_mblocks(*blockpp);
    	*blockpp = 0;
    }
}


syntax highlighted by Code2HTML, v. 0.9.1