/* $Id: xmlread.c,v 1.12.2.1 2004/08/11 13:26:29 adam Exp $
   Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
   Index Data Aps

This file is part of the Zebra server.

Zebra is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with Zebra; see the file LICENSE.zebra.  If not, write to the
Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.
*/

#if HAVE_EXPAT_H

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#if HAVE_ICONV_H
#include <errno.h>
#include <iconv.h>
#endif

#include <yaz/log.h>

#include "grsread.h"

#include <yaz/xmalloc.h>
#include <yaz/log.h>
#include <data1.h>

#include <expat.h>

#define XML_CHUNK 1024

struct user_info {
    data1_node *d1_stack[256];
    int level;
    data1_handle dh;
    NMEM nmem;
    int loglevel;
};

static void cb_start (void *user, const char *el, const char **attr)
{
    struct user_info *ui = (struct user_info*) user;
    if (ui->level == 1)
        data1_set_root (ui->dh, ui->d1_stack[0], ui->nmem, el);
    ui->d1_stack[ui->level] = data1_mk_tag (ui->dh, ui->nmem, el, attr,
                                                ui->d1_stack[ui->level-1]);
    ui->level++;
    yaz_log (ui->loglevel, "cb_start %s", el);
}

static void cb_end (void *user, const char *el)
{
    struct user_info *ui = (struct user_info*) user;

    ui->level--;
    yaz_log (ui->loglevel, "cb_end %s", el);
}

static void cb_chardata (void *user, const char *s, int len)
{
    struct user_info *ui = (struct user_info*) user;
#if 0
    yaz_log (ui->loglevel, "cb_chardata %.*s", len, s);
#endif
    ui->d1_stack[ui->level] = data1_mk_text_n (ui->dh, ui->nmem, s, len,
                                                   ui->d1_stack[ui->level -1]);
}

static void cb_decl (void *user, const char *version, const char *encoding,
                     int standalone)
{
    struct user_info *ui = (struct user_info*) user;
    const char *attr_list[7];

    attr_list[0] = "version";
    attr_list[1] = version;

    attr_list[2] = "encoding";
    attr_list[3] = "UTF-8"; /* internally it's always UTF-8 */

    attr_list[4] = "standalone";
    attr_list[5] = standalone  ? "yes" : "no";

    attr_list[6] = 0;
    
    data1_mk_preprocess (ui->dh, ui->nmem, "xml", attr_list,
                             ui->d1_stack[ui->level-1]);
#if 0
    yaz_log (LOG_LOG, "decl version=%s encoding=%s",
             version ? version : "null",
             encoding ? encoding : "null");
#endif
}
    
static void cb_processing (void *user, const char *target,
                           const char *data)
{
    struct user_info *ui = (struct user_info*) user;
    data1_node *res =
        data1_mk_preprocess (ui->dh, ui->nmem, target, 0,
                             ui->d1_stack[ui->level-1]);
    data1_mk_text_nf (ui->dh, ui->nmem, data, strlen(data), res);
    
    yaz_log (ui->loglevel, "decl processing target=%s data=%s",
             target ? target : "null",
             data ? data : "null");
}

static void cb_comment (void *user, const char *data)
{
    struct user_info *ui = (struct user_info*) user;
    yaz_log (ui->loglevel, "decl comment data=%s", data ? data : "null");
    data1_mk_comment (ui->dh, ui->nmem, data, ui->d1_stack[ui->level-1]);
}

static void cb_doctype_start (void *userData, const char *doctypeName,
                              const char *sysid, const char *pubid,
                              int has_internal_subset)
{
    struct user_info *ui = (struct user_info*) userData;
    yaz_log (ui->loglevel, "doctype start doctype=%s sysid=%s pubid=%s",
             doctypeName, sysid, pubid);
}

static void cb_doctype_end (void *userData)
{
    struct user_info *ui = (struct user_info*) userData;
    yaz_log (ui->loglevel, "doctype end");
}


static void cb_entity_decl (void *userData, const char *entityName,
                            int is_parameter_entity,
                            const char *value, int value_length,
                            const char *base, const char *systemId,
                            const char *publicId, const char *notationName)
{
    struct user_info *ui = (struct user_info*) userData;
    yaz_log (ui->loglevel,
             "entity decl %s is_para_entry=%d value=%.*s base=%s systemId=%s"
             " publicId=%s notationName=%s",
             entityName, is_parameter_entity, value_length, value,
             base, systemId, publicId, notationName);
    
}

static int cb_external_entity (XML_Parser pparser,
                               const char *context,
                               const char *base,
                               const char *systemId,
                               const char *publicId)
{
    struct user_info *ui = (struct user_info*) XML_GetUserData(pparser);
    FILE *inf;
    int done = 0;
    XML_Parser parser;

    yaz_log (ui->loglevel,
             "external entity context=%s base=%s systemid=%s publicid=%s",
             context, base, systemId, publicId);
    if (!systemId)
        return 1;

    if (!(inf = fopen (systemId, "rb")))
    {
        yaz_log (LOG_WARN|LOG_ERRNO, "fopen %s", systemId);
        return 0;
    }

    parser = XML_ExternalEntityParserCreate (pparser, "", 0);
    while (!done)
    {
        int r;
        void *buf = XML_GetBuffer (parser, XML_CHUNK);
        if (!buf)
        {
            yaz_log (LOG_WARN, "XML_GetBuffer fail");
            break;
        }
        r = fread (buf, 1, XML_CHUNK, inf);
        if (r == 0)
        {
            if (ferror(inf))
            {
                yaz_log (LOG_WARN|LOG_ERRNO, "fread %s", systemId);
                break;
            }
            done = 1;
        }
        if (!XML_ParseBuffer (parser, r, done))
        {
	    done = 1;
	    yaz_log (LOG_WARN, "%s:%d:%d:XML error: %s",
		     systemId,
		     XML_GetCurrentLineNumber(parser),
		     XML_GetCurrentColumnNumber(parser),
		     XML_ErrorString(XML_GetErrorCode(parser)));
	}
    }
    fclose (inf);
    XML_ParserFree (parser);
    return done;
}


#if HAVE_ICONV_H
static int cb_encoding_convert (void *data, const char *s)
{
    iconv_t t = (iconv_t) data;
    size_t ret;
    size_t outleft = 2;
    char outbuf_[2], *outbuf = outbuf_;
    size_t inleft = 4;
    char *inbuf = (char *) s;
    unsigned short code;

#if 1
    yaz_log(LOG_LOG, "------------------------- cb_encoding_convert --- ");
#endif
    ret = iconv (t, &inbuf, &inleft, &outbuf, &outleft);
    if (ret == (size_t) (-1) && errno != E2BIG)
    {
        iconv (t, 0, 0, 0, 0);
        return -1;
    }
    if (outleft != 0)
        return -1;
    memcpy (&code, outbuf_, sizeof(short));
    return code;
}

static void cb_encoding_release (void *data)
{
    iconv_t t = (iconv_t) data;
    iconv_close (t);
}

static int cb_encoding_handler (void *userData, const char *name,
                                XML_Encoding *info)
{
    int i = 0;
    int no_ok = 0;
    struct user_info *ui = (struct user_info*) userData;

    iconv_t t = iconv_open ("UNICODE", name);
    if (t == (iconv_t) (-1))
        return 0;
   
    info->data = 0;  /* signal that multibyte is not in use */
    yaz_log (ui->loglevel, "Encoding handler of %s", name);
    for (i = 0; i<256; i++)
    {
        size_t ret;
        char outbuf_[5];
        char inbuf_[5];
        char *inbuf = inbuf_;
        char *outbuf = outbuf_;
        size_t inleft = 1;
        size_t outleft = 2;
        inbuf_[0] = i;

        iconv (t, 0, 0, 0, 0);  /* reset iconv */

        ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
        if (ret == (size_t) (-1))
        {
            if (errno == EILSEQ)
            {
                yaz_log (ui->loglevel, "Encoding %d: invalid sequence", i);
                info->map[i] = -1;  /* invalid sequence */
            }
            if (errno == EINVAL)
            {                       /* multi byte input */
                int len = 2;
                int j = 0;
                info->map[i] = -1;
                
                while (len <= 4)
                {
                    char sbuf[80];
                    int k;
                    inbuf = inbuf_;
                    inleft = len;
                    outbuf = outbuf_;
                    outleft = 2;

                    inbuf_[len-1] = j;
                    iconv (t, 0,0,0,0);

                    assert (i >= 0 && i<255);

                    *sbuf = 0;
                    for (k = 0; k<len; k++)
                    {
                        sprintf (sbuf+strlen(sbuf), "%d ", inbuf_[k]&255);
                    }
                    ret = iconv (t, &inbuf, &inleft, &outbuf, &outleft);
                    if (ret == (size_t) (-1))
                    {
                        if (errno == EILSEQ || errno == E2BIG)
                        {
                            j++;
                            if (j > 255)
                                break;
                        }
                        else if (errno == EINVAL)
                        {
                            len++;
                            j = 7;
                        }
                    }
                    else if (outleft == 0)
                    {
                        info->map[i] = -len;
                        info->data = t;  /* signal that multibyte is in use */
                        break;
                    }
                    else
                    {
                        break;
                    }
                }
                if (info->map[i] < -1)
                    yaz_log (ui->loglevel, "Encoding %d: multibyte input %d",
                             i, -info->map[i]);
                else
                    yaz_log (ui->loglevel, "Encoding %d: multibyte input failed",
                             i);
            }
            if (errno == E2BIG)
            {
                info->map[i] = -1;  /* no room for output */
                if (i != 0)
                    yaz_log (LOG_WARN, "Encoding %d: no room for output",
                             i);
            }
        }
        else if (outleft == 0)
        {
            unsigned short code;
            memcpy (&code, outbuf_, sizeof(short));
            info->map[i] = code;
            no_ok++;
        }
        else
        {   /* should never happen */
            info->map[i] = -1;
            yaz_log (LOG_DEBUG, "Encoding %d: bad state", i);
        }
    }
    if (info->data)
    {   /* at least one multi byte */
        info->convert = cb_encoding_convert;
        info->release = cb_encoding_release;
    }
    else
    {
        /* no multi byte - we no longer need iconv handler */
        iconv_close(t);
        info->convert = 0;
        info->release = 0;
    }
    if (!no_ok)
        return 0;
    return 1;
}
/* HAVE_ICONV_H */
#endif

static void cb_ns_start(void *userData, const char *prefix, const char *uri)
{
    struct user_info *ui = (struct user_info*) userData;
    if (prefix && uri)
	yaz_log(ui->loglevel, "cb_ns_start %s %s", prefix, uri);
}

static void cb_ns_end(void *userData, const char *prefix)
{
    struct user_info *ui = (struct user_info*) userData;
    if (prefix)
	yaz_log(ui->loglevel, "cb_ns_end %s", prefix);
}
data1_node *zebra_read_xml (data1_handle dh,
                            int (*rf)(void *, char *, size_t), void *fh,
                            NMEM m)
{
    XML_Parser parser;
    struct user_info uinfo;
    int done = 0;
    data1_node *first_node;

    uinfo.loglevel = LOG_DEBUG;
    uinfo.level = 1;
    uinfo.dh = dh;
    uinfo.nmem = m;
    uinfo.d1_stack[0] = data1_mk_node2 (dh, m, DATA1N_root, 0);
    uinfo.d1_stack[1] = 0; /* indicate no children (see end of routine) */
    
    parser = XML_ParserCreate (0 /* encoding */);
    
    XML_SetElementHandler (parser, cb_start, cb_end);
    XML_SetCharacterDataHandler (parser, cb_chardata);
    XML_SetXmlDeclHandler (parser, cb_decl);
    XML_SetProcessingInstructionHandler (parser, cb_processing);
    XML_SetUserData (parser, &uinfo);
    XML_SetCommentHandler (parser, cb_comment);
    XML_SetDoctypeDeclHandler (parser, cb_doctype_start, cb_doctype_end);
    XML_SetEntityDeclHandler (parser, cb_entity_decl);
    XML_SetExternalEntityRefHandler (parser, cb_external_entity);
    XML_SetNamespaceDeclHandler(parser, cb_ns_start, cb_ns_end);
#if HAVE_ICONV_H
    XML_SetUnknownEncodingHandler (parser, cb_encoding_handler, &uinfo);
#endif
    while (!done)
    {
        int r;
        void *buf = XML_GetBuffer (parser, XML_CHUNK);
        if (!buf)
        {
            /* error */
            yaz_log (LOG_WARN, "XML_GetBuffer fail");
            break;
        }
        r = (*rf)(fh, buf, XML_CHUNK);
        if (r < 0)
        {
            /* error */
            yaz_log (LOG_WARN, "XML read fail");
            break;
        }
        else if (r == 0)
            done = 1;
        if (!XML_ParseBuffer (parser, r, done))
        {
	    done = 1;
	    yaz_log (LOG_WARN, "%d:%d:XML error: %s",
		     XML_GetCurrentLineNumber(parser),
		     XML_GetCurrentColumnNumber(parser),
		     XML_ErrorString(XML_GetErrorCode(parser)));
	}
    }
    XML_ParserFree (parser);
    if (!uinfo.d1_stack[1] || !done)
        return 0;
    /* insert XML header if not present .. */
    first_node = uinfo.d1_stack[0]->child;
    if (first_node->which != DATA1N_preprocess ||
	strcmp(first_node->u.preprocess.target, "xml"))
    {
        const char *attr_list[5];

        attr_list[0] = "version";
        attr_list[1] = "1.0";

        attr_list[2] = "encoding";
        attr_list[3] = "UTF-8"; /* encoding */

        attr_list[4] = 0;
    
        data1_insert_preprocess (uinfo.dh, uinfo.nmem, "xml", attr_list,
				 uinfo.d1_stack[0]);
    }
    return uinfo.d1_stack[0];
}

struct xml_info {
    XML_Expat_Version expat_version;
};

static void *grs_init_xml(void)
{
    struct xml_info *p = (struct xml_info *) xmalloc (sizeof(*p));

    p->expat_version = XML_ExpatVersionInfo();

    return p;
}

static data1_node *grs_read_xml (struct grs_read_info *p)
{
    return zebra_read_xml (p->dh, p->readf, p->fh, p->mem);
}

static void grs_destroy_xml(void *clientData)
{
    struct xml_info *p = (struct xml_info *) clientData;

    xfree (p);
}

static struct recTypeGrs xml_type = {
    "xml",
    grs_init_xml,
    grs_destroy_xml,
    grs_read_xml
};

RecTypeGrs recTypeGrs_xml = &xml_type;

/* HAVE_EXPAT_H */
#endif



syntax highlighted by Code2HTML, v. 0.9.1