/* $Id: marcread.c,v 1.24.2.2 2005/01/16 23:11:04 adam Exp $
   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
   Index Data Aps

This file is part of the Zebra server.

Zebra is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with Zebra; see the file LICENSE.zebra.  If not, write to the
Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.
*/

#include <stdio.h>
#include <ctype.h>
#include <assert.h>

#include <yaz/log.h>
#include <yaz/yaz-util.h>
#include <yaz/marcdisp.h>
#include "grsread.h"
#include "marcomp.h"
#include "inline.h"

#define MARC_DEBUG 0
#define MARCOMP_DEBUG 0

static data1_node *grs_read_iso2709 (struct grs_read_info *p, int marc_xml)
{
    char buf[100000];
    int entry_p;
    int record_length;
    int indicator_length;
    int identifier_length;
    int base_address;
    int length_data_entry;
    int length_starting;
    int length_implementation;
    int read_bytes;
#if MARC_DEBUG
    FILE *outf = stdout;
#endif
    data1_node *res_root, *res_top;
    char *absynName;
    data1_marctab *marctab;

    if ((*p->readf)(p->fh, buf, 5) != 5)
        return NULL;
    record_length = atoi_n (buf, 5);
    if (record_length < 25)
    {
        logf (LOG_WARN, "MARC record length < 25, is %d", record_length);
        return NULL;
    }
    /* read remaining part - attempt to read one byte furhter... */
    read_bytes = (*p->readf)(p->fh, buf+5, record_length-4);
    if (read_bytes < record_length-5)
    {
        logf (LOG_WARN, "Couldn't read whole MARC record");
        return NULL;
    }
    if (read_bytes == record_length - 4)
    {
        off_t cur_offset = (*p->tellf)(p->fh);
	if (cur_offset <= 27)
	    return NULL;
	if (p->endf)
	    (*p->endf)(p->fh, cur_offset - 1);
    }
    absynName = p->type;
    res_root = data1_mk_root (p->dh, p->mem, absynName);
    if (!res_root)
    {
        yaz_log (LOG_WARN, "cannot read MARC without an abstract syntax");
        return 0;
    }
    if (marc_xml)
    {
	data1_node *lead;
	const char *attr[] = { "xmlns", "http://www.loc.gov/MARC21/slim", 0};
			 
	res_top = data1_mk_tag (p->dh, p->mem, "record", attr, res_root);

	lead = data1_mk_tag(p->dh, p->mem, "leader", 0, res_top);
	data1_mk_text_n(p->dh, p->mem, buf, 24, lead);
    }
    else
	res_top = data1_mk_tag (p->dh, p->mem, absynName, 0, res_root);

    if ((marctab = res_root->u.root.absyn->marc))
    {
	memcpy(marctab->leader, buf, 24);
        memcpy(marctab->implementation_codes, buf+6, 4);
        marctab->implementation_codes[4] = '\0';
        memcpy(marctab->user_systems, buf+17, 3);
        marctab->user_systems[3] = '\0';
    }

    if (marctab && marctab->force_indicator_length >= 0)
	indicator_length = marctab->force_indicator_length;
    else
	indicator_length = atoi_n (buf+10, 1);
    if (marctab && marctab->force_identifier_length >= 0)
	identifier_length = marctab->force_identifier_length;
    else
	identifier_length = atoi_n (buf+11, 1);
    base_address = atoi_n (buf+12, 5);

    length_data_entry = atoi_n (buf+20, 1);
    length_starting = atoi_n (buf+21, 1);
    length_implementation = atoi_n (buf+22, 1);

    for (entry_p = 24; buf[entry_p] != ISO2709_FS; )
        entry_p += 3+length_data_entry+length_starting;
    base_address = entry_p+1;
    for (entry_p = 24; buf[entry_p] != ISO2709_FS; )
    {
        int data_length;
        int data_offset;
        int end_offset;
        int i, i0;
        char tag[4];
        data1_node *res;
        data1_node *parent = res_top;

        memcpy (tag, buf+entry_p, 3);
        entry_p += 3;
        tag[3] = '\0';

	if (marc_xml)
	    res = parent;
	else
	    res = data1_mk_tag_n (p->dh, p->mem, tag, 3, 0 /* attr */, parent);
	
#if MARC_DEBUG
        fprintf (outf, "%s ", tag);
#endif
        data_length = atoi_n (buf+entry_p, length_data_entry);
        entry_p += length_data_entry;
        data_offset = atoi_n (buf+entry_p, length_starting);
        entry_p += length_starting;
        i = data_offset + base_address;
        end_offset = i+data_length-1;

        if (memcmp (tag, "00", 2) && indicator_length)
        {
            /* generate indicator node */
	    if (marc_xml)
	    {
		const char *attr[10];
		int j;

		attr[0] = "tag";
		attr[1] = tag;
		attr[2] = 0;

		res = data1_mk_tag(p->dh, p->mem, "datafield", attr, res);

		for (j = 0; j<indicator_length; j++)
		{
		    char str1[18], str2[2];
		    sprintf (str1, "ind%d", j+1);
		    str2[0] = buf[i+j];
		    str2[1] = '\0';

		    attr[0] = str1;
		    attr[1] = str2;
		    
		    data1_tag_add_attr (p->dh, p->mem, res, attr);
		}
	    }
	    else
	    {
#if MARC_DEBUG
		int j;
#endif
		res = data1_mk_tag_n (p->dh, p->mem, 
				      buf+i, indicator_length, 0 /* attr */, res);
#if MARC_DEBUG
		for (j = 0; j<indicator_length; j++)
		    fprintf (outf, "%c", buf[j+i]);
#endif
	    }
	    i += indicator_length;
        } 
	else
	{
	    if (marc_xml)
	    {
		const char *attr[10];
		
		attr[0] = "tag";
		attr[1] = tag;
		attr[2] = 0;
		
		res = data1_mk_tag(p->dh, p->mem, "controlfield", attr, res);
	    }
	}
        parent = res;
        /* traverse sub fields */
        i0 = i;
        while (buf[i] != ISO2709_RS && buf[i] != ISO2709_FS && i < end_offset)
        {
	    if (memcmp (tag, "00", 2) && identifier_length)
            {
		data1_node *res;
		if (marc_xml)
		{
		    int j;
		    const char *attr[3];
		    char code[10];
		    
		    for (j = 1; j<identifier_length && j < 9; j++)
			code[j-1] = buf[i+j];
		    code[j-1] = 0;
		    attr[0] = "code";
		    attr[1] = code;
		    attr[2] = 0;
		    res = data1_mk_tag(p->dh, p->mem, "subfield",
				       attr, parent);
		}
		else
		{
		    res = data1_mk_tag_n (p->dh, p->mem,
					   buf+i+1, identifier_length-1, 
					   0 /* attr */, parent);
		}
#if MARC_DEBUG
                fprintf (outf, " $"); 
                for (j = 1; j<identifier_length; j++)
                    fprintf (outf, "%c", buf[j+i]);
                fprintf (outf, " ");
#endif
                i += identifier_length;
                i0 = i;
            	while (buf[i] != ISO2709_RS && buf[i] != ISO2709_IDFS &&
		     buf[i] != ISO2709_FS && i < end_offset)
            	{
#if MARC_DEBUG
            	    fprintf (outf, "%c", buf[i]);
#endif
		    i++;
            	}
            	data1_mk_text_n (p->dh, p->mem, buf + i0, i - i0, res);
		i0 = i;
            }
            else
            {
#if MARC_DEBUG
                fprintf (outf, "%c", buf[i]);
#endif
                i++;
            }
        }
        if (i > i0)
	{
            data1_mk_text_n (p->dh, p->mem, buf + i0, i - i0, parent);
	}
#if MARC_DEBUG
        fprintf (outf, "\n");
        if (i < end_offset)
            fprintf (outf, "-- separator but not at end of field\n");
        if (buf[i] != ISO2709_RS && buf[i] != ISO2709_FS)
            fprintf (outf, "-- no separator at end of field\n");
#endif
    }
    return res_root;
}

/*
 * Locate some data under this node. This routine should handle variants
 * prettily.
 */
static char *get_data(data1_node *n, int *len)
{
    char *r;

    while (n)
    {
        if (n->which == DATA1N_data)
        {
            *len = n->u.data.len;
	    
	    /** Fixme: not delete leader/final whitespaces
	     ** in MARC field/subfield. It fixed in
	     ** data1/d1_marc.c too.
	     
            for (i = 0; i<*len; i++)
                if (!d1_isspace(n->u.data.data[i]))
                    break;
            while (*len && d1_isspace(n->u.data.data[*len - 1]))
                (*len)--;
            *len = *len - i;
            if (*len > 0)
                return n->u.data.data + i;
	    **/
	    if (*len > 0)
		return n->u.data.data;
        }
        if (n->which == DATA1N_tag)
            n = n->child;
	else if (n->which == DATA1N_data)
            n = n->next;
	else
            break;	
    }
    r = "";
    *len = strlen(r);
    return r;
}

static data1_node *lookup_subfield(data1_node *node, const char *name)
{
    data1_node *p;
    
    for (p=node; p; p=p->next)
    {
	if (!yaz_matchstr(p->u.tag.tag, name))
	    return p;
    }
    return 0;
}

static inline_subfield *lookup_inline_subfield(inline_subfield *pisf,
					       const char *name)
{
    inline_subfield *p;
    
    for (p=pisf; p; p=p->next)
    {
	if (!yaz_matchstr(p->name, name))
	    return p;
    }
    return 0;
}

static inline_subfield *cat_inline_subfield(mc_subfield *psf, WRBUF buf,
					    inline_subfield *pisf)
{
    mc_subfield *p;
    
    for (p = psf; p && pisf; p = p->next)
    {
	if (p->which == MC_SF)
	{
	    inline_subfield *found = lookup_inline_subfield(pisf, p->name);
	    
	    if (found)
	    {
	    	if (strcmp(p->prefix, "_"))
		{
		    wrbuf_puts(buf, " ");
		    wrbuf_puts(buf, p->prefix);
		}
		if (p->interval.start == -1)
		{
		    wrbuf_puts(buf, found->data);
		}
		else
		{
		    wrbuf_write(buf, found->data+p->interval.start,
				p->interval.end-p->interval.start+1);
		    wrbuf_puts(buf, "");
		}
	    	if (strcmp(p->suffix, "_"))
		{
		    wrbuf_puts(buf, p->suffix);
		    wrbuf_puts(buf, " ");
		}
#if MARCOMP_DEBUG
		logf(LOG_LOG, "cat_inline_subfield(): add subfield $%s", found->name);
#endif		
		pisf = found->next;
	    }
	}
	else if (p->which == MC_SFVARIANT)
	{
	    inline_subfield *next;
	    
	    do {
		next = cat_inline_subfield(p->u.child, buf, pisf);
		if (next == pisf)
		    break;
		pisf = next;
	    } while (pisf);
	}
	else if (p->which == MC_SFGROUP)
	{
	    mc_subfield *pp;
	    int found;
	    
	    for (pp = p->u.child, found = 0; pp; pp = pp->next)
	    {
		if (!yaz_matchstr(pisf->name, p->name))
		{
		    found = 1;
		    break;
		}
	    }
	    if (found)
	    {
		wrbuf_puts(buf, " (");
		pisf = cat_inline_subfield(p->u.child, buf, pisf);
		wrbuf_puts(buf, ") ");
	    }
	}
    }
    return pisf; 
}

static void cat_inline_field(mc_field *pf, WRBUF buf, data1_node *subfield)
{    
    if (!pf || !subfield)
	return;

    for (;subfield;)
    {
	int len;
	inline_field *pif=NULL;
	data1_node *psubf;
	
	if (yaz_matchstr(subfield->u.tag.tag, "1"))
	{
	    subfield = subfield->next;
	    continue;
	}
	
	psubf = subfield;
	pif = inline_mk_field();
	do
	{
	    int i;
	    if ((i=inline_parse(pif, psubf->u.tag.tag, get_data(psubf, &len)))<0)
	    {
		logf(LOG_WARN, "inline subfield ($%s): parse error",
		    psubf->u.tag.tag);
		inline_destroy_field(pif);
		return;	
	    }
	    psubf = psubf->next;
	} while (psubf && yaz_matchstr(psubf->u.tag.tag, "1"));
	
	subfield = psubf;
	
	if (pif && !yaz_matchstr(pif->name, pf->name))
	{
	    if (!pf->list && pif->list)
	    {
		wrbuf_puts(buf, pif->list->data);
	    }
	    else
	    {
		int ind1, ind2;

	        /*
		    check indicators
		*/

		ind1 = (pif->ind1[0] == ' ') ? '_':pif->ind1[0];
		ind2 = (pif->ind2[0] == ' ') ? '_':pif->ind2[0];
    
		if (((pf->ind1[0] == '.') || (ind1 == pf->ind1[0])) &&
		    ((pf->ind2[0] == '.') || (ind2 == pf->ind2[0])))
		{
		    cat_inline_subfield(pf->list, buf, pif->list);
		    
		    /*
		    	add separator for inline fields
		    */
		    if (wrbuf_len(buf))
		    {
			wrbuf_puts(buf, "\n");
		    }
		}
		else
		{
		    logf(LOG_WARN, "In-line field %s missed -- indicators do not match", pif->name);
		}
	    }
	}
	inline_destroy_field(pif);
    }
#if MARCOMP_DEBUG    
    logf(LOG_LOG, "cat_inline_field(): got buffer {%s}", buf->buf);
#endif
}

static data1_node *cat_subfield(mc_subfield *psf, WRBUF buf,
				data1_node *subfield)
{
    mc_subfield *p;
    
    for (p = psf; p && subfield; p = p->next)
    {
	if (p->which == MC_SF)
	{
	    data1_node *found = lookup_subfield(subfield, p->name);
	    
	    if (found)
	    {
		int len;
		
		if (strcmp(p->prefix, "_"))
		{
		    wrbuf_puts(buf, " ");
		    wrbuf_puts(buf, p->prefix);
		}
		
		if (p->u.in_line)
		{
		    cat_inline_field(p->u.in_line, buf, found);
		}
		else if (p->interval.start == -1)
		{
		    wrbuf_puts(buf, get_data(found, &len));
		}
		else
		{
		    wrbuf_write(buf, get_data(found, &len)+p->interval.start,
			p->interval.end-p->interval.start+1);
		    wrbuf_puts(buf, "");
		}
		if (strcmp(p->suffix, "_"))
		{
		    wrbuf_puts(buf, p->suffix);
		    wrbuf_puts(buf, " ");
		}
#if MARCOMP_DEBUG		
		logf(LOG_LOG, "cat_subfield(): add subfield $%s", found->u.tag.tag);
#endif		
		subfield = found->next;
	    }
	}
	else if (p->which == MC_SFVARIANT)
	{
	    data1_node *next;
	    do {
		next = cat_subfield(p->u.child, buf, subfield);
		if (next == subfield)
		    break;
		subfield = next;
	    } while (subfield);
	}
	else if (p->which == MC_SFGROUP)
	{
	    mc_subfield *pp;
	    int found;
	    
	    for (pp = p->u.child, found = 0; pp; pp = pp->next)
	    {
		if (!yaz_matchstr(subfield->u.tag.tag, pp->name))
		{
		    found = 1;
		    break;
		}
	    }
	    if (found)
	    {
		wrbuf_puts(buf, " (");
		subfield = cat_subfield(p->u.child, buf, subfield);
		wrbuf_puts(buf, ") ");
	    }
	}
    }
    return subfield;
}

static data1_node *cat_field(struct grs_read_info *p, mc_field *pf,
			     WRBUF buf, data1_node *field)
{
    data1_node *subfield;
    int ind1, ind2;
    
    if (!pf || !field)
	return 0;

    
    if (yaz_matchstr(field->u.tag.tag, pf->name))
	return field->next;

    subfield = field->child;
    
    if (!subfield)
	return field->next;

    /*
	check subfield without indicators
    */
    
    if (!pf->list && subfield->which == DATA1N_data)
    {
	int len;
	
	if (pf->interval.start == -1)
	{
	    wrbuf_puts(buf, get_data(field, &len));
	}
	else
	{
	    wrbuf_write(buf, get_data(field, &len)+pf->interval.start,
			pf->interval.end-pf->interval.start+1);
	    wrbuf_puts(buf, "");
	}
#if MARCOMP_DEBUG
        logf(LOG_LOG, "cat_field(): got buffer {%s}", buf->buf);
#endif
	return field->next;
    }
    
    /*
	check indicators
    */

    ind1 = (subfield->u.tag.tag[0] == ' ') ? '_':subfield->u.tag.tag[0];
    ind2 = (subfield->u.tag.tag[1] == ' ') ? '_':subfield->u.tag.tag[1];
    
    if (!(
	((pf->ind1[0] == '.') || (ind1 == pf->ind1[0])) &&
	((pf->ind2[0] == '.') || (ind2 == pf->ind2[0]))
	))
    {
#if MARCOMP_DEBUG
	logf(LOG_WARN, "Field %s missed -- does not match indicators", field->u.tag.tag);
#endif
	return field->next;
    }
    
    subfield = subfield->child;
    
    if (!subfield)
	return field->next;

    cat_subfield(pf->list, buf, subfield);

#if MARCOMP_DEBUG    
    logf(LOG_LOG, "cat_field(): got buffer {%s}", buf->buf);
#endif
    
    return field->next;    
}

static int is_empty(char *s)
{
    char *p = s;
    
    for (p = s; *p; p++)
    {
	if (!isspace(*p))
	    return 0;
    }
    return 1;
}

static void parse_data1_tree(struct grs_read_info *p, const char *mc_stmnt,
			     data1_node *root)
{
    data1_marctab *marctab = root->u.root.absyn->marc;
    data1_node *top = root->child;
    data1_node *field;
    mc_context *c;
    mc_field *pf;
    WRBUF buf;
    
    c = mc_mk_context(mc_stmnt+3);
    
    if (!c)
	return;
	
    pf = mc_getfield(c);
    
    if (!pf)
    {
	mc_destroy_context(c);
	return;
    }
    buf = wrbuf_alloc();
#if MARCOMP_DEBUG    
    logf(LOG_LOG, "parse_data1_tree(): statement -{%s}", mc_stmnt);
#endif
    if (!yaz_matchstr(pf->name, "ldr"))
    {
	data1_node *new;
#if MARCOMP_DEBUG
	logf(LOG_LOG,"parse_data1_tree(): try LEADER from {%d} to {%d} positions",
	    pf->interval.start, pf->interval.end);
#endif	
	new = data1_mk_tag_n(p->dh, p->mem, mc_stmnt, strlen(mc_stmnt), 0, top);
	data1_mk_text_n(p->dh, p->mem, marctab->leader+pf->interval.start,
	    pf->interval.end-pf->interval.start+1, new);
    }
    else
    {
	field=top->child;
	
	while(field)
	{
	    if (!yaz_matchstr(field->u.tag.tag, pf->name))
	    {
		data1_node *new;
		char *pb;
#if MARCOMP_DEBUG		
		logf(LOG_LOG, "parse_data1_tree(): try field {%s}", field->u.tag.tag);
#endif		
		wrbuf_rewind(buf);
		wrbuf_puts(buf, "");

		field = cat_field(p, pf, buf, field);
		
		pb = wrbuf_buf(buf);
		for (pb = strtok(pb, "\n"); pb; pb = strtok(NULL, "\n"))
		{
			if (!is_empty(pb))
			{
		    		new = data1_mk_tag_n(p->dh, p->mem, mc_stmnt, strlen(mc_stmnt), 0, top);
		    		data1_mk_text_n(p->dh, p->mem, pb, strlen(pb), new);
			}
		}
	    }
	    else
	    {
		field = field->next;
	    }
	}
    }
    mc_destroy_field(pf);
    mc_destroy_context(c);
    wrbuf_free(buf, 1);
}

data1_node *grs_read_marcxml(struct grs_read_info *p)
{
    data1_node *root = grs_read_iso2709(p, 1);
    data1_element *e;

    if (!root)
	return 0;
	
    for (e=root->u.root.absyn->main_elements; e; e=e->next)
    {
	data1_tag *tag = e->tag;
	
	if (tag && tag->which == DATA1T_string &&
	    !yaz_matchstr(tag->value.string, "mc?"))
		parse_data1_tree(p, tag->value.string, root);
    }
    return root;
}

data1_node *grs_read_marc(struct grs_read_info *p)
{
    data1_node *root = grs_read_iso2709(p, 0);
    data1_element *e;

    if (!root)
	return 0;
	
    for (e=root->u.root.absyn->main_elements; e; e=e->next)
    {
	data1_tag *tag = e->tag;
	
	if (tag && tag->which == DATA1T_string &&
	    !yaz_matchstr(tag->value.string, "mc?"))
		parse_data1_tree(p, tag->value.string, root);
    }
    return root;
}

static void *grs_init_marc(void)
{
    return 0;
}

static void grs_destroy_marc(void *clientData)
{
}

static struct recTypeGrs marc_type = {
    "marc",
    grs_init_marc,
    grs_destroy_marc,
    grs_read_marc
};

RecTypeGrs recTypeGrs_marc = &marc_type;

static struct recTypeGrs marcxml_type = {
    "marcxml",
    grs_init_marc,
    grs_destroy_marc,
    grs_read_marcxml
};

RecTypeGrs recTypeGrs_marcxml = &marcxml_type;


syntax highlighted by Code2HTML, v. 0.9.1