/*--[litembiggen.c]------------------------------------------------------------ | Copyright (C) 2002 Dan A. Jackson | | This file is part of the "openclit" library for processing .LIT files. | | "Openclit" is free software; you can redistribute it and/or modify | it under the terms of the GNU General Public License as published by | the Free Software Foundation; either version 2 of the License, or | (at your option) any later version. | | This program is distributed in the hope that it will be useful, | but WITHOUT ANY WARRANTY; without even the implied warranty of | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | GNU General Public License for more details. | | You should have received a copy of the GNU General Public License | along with this program; if not, write to the Free Software | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | | The GNU General Public License may also be available at the following | URL: http://www.gnu.org/licenses/gpl.html */ /* This file concerns the transformation of "evaporated" html files into | close to their original format. */ #include #include #include #include "litlib.h" #include "littags.h" /* | Warning - Fixed sized buffers here-in. Don't change U32's to U64 without | being very sure what you are doing. */ char *lit_lookup_mapping(manifest_type * pmanifest,U8 * s,int size); static int read_utf8_char(U8 * pdata, int nBytes, U32 * pvalue); static int write_entity(U32 c, HTMLWRITEFCN htmlwrite, void * write_data); typedef struct entity_map { char * name; unsigned long int id; } entity_map; entity_map entities[] = {{"amp",38}, {"lt",60}, {"gt",62}}; #define STATE_TEXT 0 #define STATE_GET_FLAGS 1 #define STATE_GET_TAG 2 #define STATE_GET_ATTR 3 #define STATE_GET_VALUE 4 #define STATE_GET_VALUE_LENGTH 5 #define STATE_GET_CUSTOM 6 #define STATE_GET_CUSTOM_LENGTH 7 #define STATE_GET_CUSTOM_ATTR 8 #define STATE_GET_ATTR_LENGTH 9 #define STATE_GET_HREF_LENGTH 10 #define STATE_GET_HREF 11 #define WRITE_CHAR(c) { \ ch = c;\ status = htmlwrite(write_data,&ch,1);\ if (status < 0) goto write_error;\ } #define WRITE_STRING(s) { \ status = htmlwrite(write_data,s,strlen(s));\ if (status < 0) goto write_error;\ } /** EVAPORATED HTML ********************************************************* Assumed to start in text, although all documents should start with a tag. Basically each name and element are 0-deliminated. so, roughly: Text<0>name<0>attr-name<0>attr-value<0><0> where flag: 1xxx - This is inside (and inclusive) of the element> x1xx - "Block" tags. Varies between verious of the DLL though. xx1x - Closing Tag xxx1 - Opening tag xx11 - Empty Tag (should be 1xxxx - Use a name from "atoms" array There's more for custom tag names and attributes. ***************************************************************************/ #define FLAG_OPENING 1 #define FLAG_CLOSING 2 #define FLAG_BLOCK 4 #define FLAG_HEAD 8 #define FLAG_ATOM 16 /*--[lit_reconstitute_html]---------------------------------------------------- | | I read UTF8 from the buffer, and expand the internal compressed element | representation used by the IHtmlElement interface. | | I am recursive to handle nested tags, and return the number of bytes | consumed. | | | Whitespace | This has been the source for bugs in every previous version of CLIT, | so I am rewriting how this is handled. | | Key principle: Don't throw away whitespace from the file, since that | leads to irritation. Any "missing" whitespace can be added back by Tidy | anyway. | | Rules for XML/XHTML whitespace | A. Whitespace is SPACE, TAB, CR, LF | B. Sequences of one or more whitespace characters are turned into one | C. white space immediately after a tag should be ignored | D. white space immediately before a end tag should be ignored | (E) MSXML converts all whitespace into ' ' | | Rule D doesn't really seem to work as I expect though. | | Rule B doesn't apply cases where I am inside xml:space="preserve" tags. | Since I don't want to scan for attributes, this means I cannot arbitrarily | add spaces except at the beginning and end! | | Anything | should always be convertable to: | | \t\tAnything | since all initial whitespace is ignored! | | To avoid doing this for , type tags, I'll use blocking. This is | a "convenience" choice -- is still rendered as text ! | text | Complications: | /meta (html_type = 1) doesn't use the upper 2 bits - force everything | into "block" mode. | | The "RIGHT" solution would use HTMLTidy - but that wouldn't be in following | with the "unzip" style approach | */ static int depth = 0; static int pending_indent = 0; static int was_in_text = 0; /* had non-whitespace text in the last block */ /* Kludge due to wierd recursion model. I'm not going to rewrite that to fix * spacing. */ static int lingering_space = 0; int lit_reconstitute_html(U8 * pHtml, int base, int nBytes, int html_type, manifest_type * pManifest, lit_atom_list * pAtoms, HTMLWRITEFCN htmlwrite, void * write_data ) { int elsize, state, index; int i, status, tag_index, char_count, href_base; U32 c, flags, tag; int is_goingdown = 0; attrmap * current_map = NULL, * tmp_map; char * tag_name = NULL, ch, numbuf[20]; int dynamic_tag = 0; U32 nTagRefs; char ** mapTag2Name; attrmap ** mapTag2Attrs; attrmap * mapAttrs; int errors = 0; int in_censorship = 0; int space_enabled = 1, saved_space_enabled = 0; /* Was this tag indented? If so, newline/indent the end */ int was_indented = 0; status = char_count = tag_index = flags = href_base = 0; /* Reinitialize stateful variables only the first time */ if (!depth) { was_in_text = pending_indent = lingering_space = 0; } switch (html_type) { case 1: nTagRefs = (U32)meta_tagcount; mapTag2Name = (char **)meta_tagtoname; mapTag2Attrs= (attrmap **)meta_tagtoattr; mapAttrs = (attrmap *)meta_attr; break; default: nTagRefs = (U32)tagcount; mapTag2Name = (char **)tagtoname; mapTag2Attrs= (attrmap **)tagtoattr; mapAttrs = (attrmap *)tagmap0; break; } state = STATE_TEXT; index = base; while (index < (int)nBytes) { elsize = read_utf8_char(pHtml+index,(int)nBytes-index,&c); if (elsize < 1) { lit_error(ERR_R, "Invalid UTF8 character at " "position %d, %d bytes remain.", index, (int)nBytes - index); return E_LIT_BAD_UTF8; } /** printf("\nS: %02d Char: %ld, %lx, %c (Depth = %d)\n", state, c, c, c, depth); **/ switch (state) { case STATE_TEXT: if (!c) { state = STATE_GET_FLAGS; break; } /* Treat as xml:space="preserve" for the moment, * so the first white space ends expansion */ if ((!was_in_text) || (space_enabled)) { space_enabled = 0; if ((c == ' ') || (c == '\t') || (c == '\n') || (c == '\r')) space_enabled++; else was_in_text = 1; } /* Special case, CTRL-K becomes a literal linefeed, * only in
 blocks */
            if (c == '\v') c = '\n'; 

            /* Anything, even whitespace, negates this */
            pending_indent = 0;
               
            status = write_entity(c, htmlwrite, write_data);
            if (status < 0) goto write_error;
            break;
        case STATE_GET_FLAGS:
            if (!c) { state = STATE_TEXT; break; }
            flags = c;
#if 0
/* This is a don't care -- worse that happens is the output will be 
 * wierd. */
            if ((c > 15) || ((c & 0x03) == 0)) {
                lit_error(ERR_R,
"Invalid \"flags\" byte (0x%lx) at position %ld.\n"
"\tA flag byte ranges from 0x01 to 0x0f, except 0x04,0x08,0x0C.\n",
                c, index);
            } 
#endif
#if 0
            {
                char buf[32];
                sprintf(buf,"", 
                    (flags>>7)&1,(flags>>6)&1,
                    (flags>>5)&1,(flags>>4)&1,
                    (flags>>3)&1,(flags>>2)&1,
                    (flags>>1)&1,flags&1, in_censorship); 
                WRITE_STRING(buf);
            }
#endif
            state = STATE_GET_TAG;
            break;
        case STATE_GET_TAG:
            if (!c) { state = STATE_TEXT; }
            else state = STATE_GET_ATTR;
 
            if (flags & FLAG_OPENING) {

                /* Can do pending indents if spaces are enabled... */
                if (space_enabled)
                {
                   if ((!was_in_text) || (flags & (FLAG_BLOCK|FLAG_HEAD)))
                   {
                      pending_indent++;
                   } 
                }
                /* 
                 | 1.Pending indents..
                 | 2. /meta Always generates an 
                 |    extra newline at the beginning -- good thing there
                 |    is a DTD there! 
                 */
                if ((pending_indent) || (html_type == 1))
                {
                    was_indented++;
                    WRITE_CHAR('\n');
                   
#if 1 
                    for (i = 0; i < depth; i++)
                        WRITE_CHAR(' ');  
#endif
                    pending_indent = 0;
                }
                /* This sets up an indent for the next tag. 
                 * This MUST be cancelled as soon as any text happens.
                 * See rule C above */
                if ((flags & FLAG_HEAD) || (flags & FLAG_BLOCK)||
                    (html_type == 1) || (depth == 0))  {
                        pending_indent = 1; 
                }

                tag = c;

                WRITE_CHAR('<'); 

                if (!(flags & FLAG_CLOSING)) is_goingdown = 1; 
                if (tag == 0x8000) {
                    state = STATE_GET_CUSTOM_LENGTH;

                    break; 
                }
                /* Tag is a 1-based index into the atom names list */
                if (flags & FLAG_ATOM) {
                    if (!pAtoms || !tag || (tag > pAtoms->num_atoms)) { 
                        lit_error(ERR_R,
"Error - Custom tag %d at %ld isn't in Atom List (%08lx:%d)\n",
                            tag,index,pAtoms,pAtoms?pAtoms->num_atoms:0);	
                        return -1;
                    }
                    tag_name = pAtoms->atom_names[tag-1];
                    current_map = pAtoms->attrmap;
                }
                else if ((tag < nTagRefs) && (mapTag2Name[tag])) {
                    tag_name = mapTag2Name[tag]; 
                    current_map = mapTag2Attrs[tag];
                } else {
                    tag_name = malloc(20);
                    if (!tag_name) goto malloc_error;
                    dynamic_tag++;
                    sprintf(tag_name, "?%ld?", tag);

                    /* FUTUREFIX - This should be a "warning"?  */
                    lit_error(ERR_R,
"Unknown or unrecognized tag %08lx at position %ld.\n"
"\tCurrent depth is %d. ", tag, index, depth);
                    errors++;
                       
                    current_map = mapTag2Attrs[tag];
                } 
                WRITE_STRING(tag_name);
            } else if (flags & FLAG_CLOSING) {
                if (!depth) {
                    lit_error(ERR_R,
"Unbalanced HTML - extra ending tag at position %ld.",index);
                    return -1;
                }
                lingering_space = space_enabled;
                return ((index + elsize) - base);
            }
            break;
        case STATE_GET_ATTR:
            in_censorship = 0;

            if (!c) {
                if (!is_goingdown)  {
                    /* no need for tag_name anymore */
                    if (dynamic_tag && tag_name) free(tag_name); 
                    tag_name = NULL; 
                    dynamic_tag = 0;
                    WRITE_CHAR(' '); 
                    WRITE_CHAR('/');
                    WRITE_CHAR('>');
                } 
                else  /* is_going_down */
                {
                    WRITE_CHAR('>'); 
 
#if 0
                    WRITE_CHAR('[');
                    WRITE_CHAR('0'+space_enabled);
                    WRITE_CHAR(']');
#endif

                    if ((html_type == 0) && (flags & (FLAG_BLOCK|FLAG_HEAD))) 
                        pending_indent++;

                    if (depth > 1000) {
                        lit_error(ERR_R,
"Attempted to recurse too deeply at position %ld.", index);
                        return -2;
                    }
                    depth++; 
                    status = lit_reconstitute_html(pHtml,index+elsize,
                        (int)nBytes-elsize, html_type, pManifest, pAtoms,
			htmlwrite, write_data);
                      
                    depth--;
                    if (status < 0) return status;
                    index += status;
                    is_goingdown = 0;
                    if (!tag_name) {
                        lit_error(ERR_R,
"Unbalanced HTML - tag ends before it begins at position %ld.", index);
                        return -1;
                    }

                    /* Borrow the space enabled from the recursed routine
                     * until the tag is finished.  At that point, use the
                     * one that was saved... */
                    saved_space_enabled = space_enabled;
                    space_enabled = lingering_space; 

#if 0
                    WRITE_CHAR('(');
                    WRITE_CHAR('0'+space_enabled);
                    WRITE_CHAR('0'+saved_space_enabled);
                    WRITE_CHAR(')');
#endif
                    /* Can safely insert newlines here. 
                     * (Rule D) 
                     * Should only be for block tags though, and
                     * Conveince - don't do this if we were in text. 
                     *  (looks better...) 
                     * ** OOPS ** Seems like I _can't_. 
                     *  (unless rule B applies...) 
                     *  (and 
 be damned!)
                     */
                    if (space_enabled && was_indented  && (!was_in_text))
                    {
                        WRITE_CHAR('\n');
#if 1
                        for (i = 0; i < depth; i++) 
                            WRITE_CHAR(' ');
#endif
                    }
                    WRITE_CHAR('<');
                    WRITE_CHAR('/');
                    WRITE_STRING(tag_name);
                    WRITE_CHAR('>');

                    /* New lines here as well... But only with spaces !*/
                    if ( space_enabled &&
                        ((html_type == 1)||(flags & (FLAG_BLOCK|FLAG_HEAD))))
                    { 
                        pending_indent++;
                    }
                    if (dynamic_tag) free(tag_name); 
                    dynamic_tag = 0;
                    tag_name = NULL;

                    space_enabled = saved_space_enabled;
                }
                was_in_text = 0;
                state = STATE_TEXT;
                break;
            } else {
                i = 0;
                tmp_map = current_map;

                if (c == 0x8000) {
                    state = STATE_GET_ATTR_LENGTH;
                    break;
                }

                while (tmp_map && tmp_map->id)
                {
                    if (tmp_map->id == c)
                        break; 
                    tmp_map++;
                }
                if ((!tmp_map) || (!tmp_map->id))
                {
                    tmp_map = &mapAttrs[0]; 
                    while (tmp_map && tmp_map->id)
                    {
                        if (tmp_map->id == c) break;
                        tmp_map++;
                    }
                }


                if ((!tmp_map) || (!tmp_map->id)) {
                    lit_error(ERR_R,
"Unrecognized attribute (0x%08lx) at position %ld.\n"
"\tCurrently processing tag: \"%s\".  Depth is %d.  ",
                        c, index, tag_name, depth);
                    sprintf(numbuf,"?%ld?",c);
                    WRITE_CHAR(' ');
                    WRITE_STRING(numbuf);
                }
                else if (tmp_map->name[0] == '%') {
                    in_censorship = 1;
                    /* Invisible tag starting */
                    state = STATE_GET_VALUE_LENGTH;
                    break;
                }
                else {   
                    WRITE_CHAR(' ');
                    WRITE_STRING((char *)tmp_map->name);
                }
                if (status < 0) goto write_error;

                WRITE_CHAR('=');

                /* Two special cases. */
                
                if (tmp_map && tmp_map->name && 
                    ((strcmp(tmp_map->name,"href") == 0) || 
                    (strcmp(tmp_map->name,"src") == 0))) {
                    state = STATE_GET_HREF_LENGTH;
                    break;
                }
                state = STATE_GET_VALUE_LENGTH;
            }
            break;
        case STATE_GET_VALUE_LENGTH:
            if (!in_censorship) {
                WRITE_CHAR('\"');
            }
            char_count = (int)c - 1;
            if (!char_count) {
                if (!in_censorship) {
                    WRITE_CHAR('\"'); 
                }
                in_censorship = 0;
                state = STATE_GET_ATTR;
                break;
            }
            state = STATE_GET_VALUE;
            if (c == 0xffff) break;
            if ((char_count < 0) || (char_count > ((int)nBytes - index))) {
                lit_error(ERR_R,
"attribute had invalid length (%ld) at position %ld.",c, index);
                return -1;
            }
            break;
        case STATE_GET_VALUE:
            if (char_count == 0xfffe) {
                if (!in_censorship) {
                    /* Yes! There is no opening quote. */
                    sprintf(numbuf,"%ld\"",c-1);
                    WRITE_STRING(numbuf); 
                }
                in_censorship = 0;
                state = STATE_GET_ATTR; 
            }
            else if (char_count) {
                if (!in_censorship) { 
                    status = write_entity(c, htmlwrite, write_data);
                    if (status < 0) goto write_error; 
                }
                char_count--; 

            }
            if (!char_count) {
                if (!in_censorship) {
                    WRITE_CHAR('\"');
                }
                in_censorship = 0;
                state = STATE_GET_ATTR;
            } 
            break;
        case STATE_GET_CUSTOM_LENGTH:
            char_count = c - 1;
            if ((char_count <= 0) || (char_count > ((int)nBytes - index))) {
                lit_error(ERR_R,
"custom element had invalid length (%ld) at position %ld.", c, index);
                return -1;
            }
            tag_index = 0; 
            tag_name = malloc(char_count+1);
            dynamic_tag++;
            if (!tag_name)  goto malloc_error;
            state = STATE_GET_CUSTOM;
            break;
        case STATE_GET_CUSTOM:

            /* Yes, this throws away data. 
             | I hope that UTF8 isn't valid in tags, otherwise this will
             | result in unexpected behavior. */
            tag_name[tag_index++] = (char)(c&0x7F);
            char_count--;
            if (!char_count) {
                tag_name[tag_index] = '\0';
                WRITE_STRING(tag_name);
                state = STATE_GET_ATTR;
            }
            break;  
        case STATE_GET_ATTR_LENGTH:
            char_count = c - 1;
            if ((char_count <= 0) || (char_count > ((int)nBytes - index))) {
                lit_error(ERR_R,
"custom attribute had invalid length (%ld) at position %ld.",
                c, index);
                return -1;
            }
            WRITE_CHAR(' ');
            state = STATE_GET_CUSTOM_ATTR;
            break;
        case STATE_GET_CUSTOM_ATTR:
            status = write_entity(c, htmlwrite, write_data);
            if (status < 0) goto write_error;
            char_count--;
            if (!char_count) {
                WRITE_CHAR('=');
                state = STATE_GET_VALUE_LENGTH;
            }
            break;
        case STATE_GET_HREF_LENGTH:
            char_count = c - 1;
            if ((char_count <= 0) || (char_count > ((int)nBytes - index))) {
                lit_error(ERR_R,
"HREF tag has invalid length (%ld) at position %ld.\n", c, index);
                return -1;
            }
            href_base = index + elsize;
            state = STATE_GET_HREF;
            break;
        case STATE_GET_HREF:
            char_count--;
            if (!char_count) {
                int href_size;
                U8 * href_value, * hash_ptr, * new_href;

                href_size = (index + elsize) - href_base - 1;
                href_value = malloc(href_size + 1);
                if (!href_value) goto malloc_error;
    
                memcpy(href_value,pHtml + href_base + 1, href_size);
                href_value[href_size] = '\0';

                hash_ptr = strchr(href_value,'#');
                if (hash_ptr) 
                    new_href = lit_lookup_mapping(pManifest, href_value, 
                        hash_ptr - href_value);
                else 
                    new_href = lit_lookup_mapping(pManifest, href_value, 
                        href_size);
                WRITE_CHAR('\"');
                if (new_href) {
                    WRITE_STRING(new_href);
                    if (hash_ptr) {
                        WRITE_STRING(hash_ptr);
                    }

                } else {
                    WRITE_STRING(href_value); 
                }
                WRITE_CHAR('\"');
                free(href_value);
                state = STATE_GET_ATTR;
            }
            break;  
        default:
            lit_error(ERR_R,
"Reached an invalid internal state (%d) at position %ld.\n", state, index);
            return -1;
            break;
        }
        index += elsize;
    } 
    lingering_space = space_enabled;
    return (index - base);

write_error:
    lit_error(ERR_R, "Unexpected write error!");
    /* the htmlwrite routine should return a status value */
    return status;
malloc_error:
    lit_error(ERR_R,"Ran out of memory processing an HTML file!");
    return E_LIT_OUT_OF_MEMORY;
 
} 


/*--[write_entity]------------------------------------------------------------
 |
 | This converts an entity into a string representation fit for display, 
 | and writes it out to the callback.
 | 
 | This function returns the number of chars written
 */
int write_entity(U32 c, HTMLWRITEFCN htmlwrite, void * write_data)
{
    int     i, found;
    int     len, status;
    char    ent_buffer[14], ch;


    found = -1;

    for (i = 0; i < sizeof(entities)/sizeof(entities[0]); i++)
    {
        if (entities[i].id == c) { found = i; break;}
    }
    len = 0;
    if (found > -1)
    {
        ch = '&';
        status = htmlwrite(write_data,&ch, 1);
        if (status < 0) return status;
        len += status;

        status = htmlwrite(write_data,entities[found].name,
            strlen(entities[found].name));
        if (status < 0) return status;
        len += status;
        ch = ';';
        status = htmlwrite(write_data,&ch, 1);
        if (status < 0) return status;
        return (len + status);
    }
    else {
        if (c < 0x80) {
            ent_buffer[0] = (char)(c & 0x7f);
            len = 1;
        }
        else  {
            /* 32 bit value, so assumed to never be more than 10! */
            len = sprintf(ent_buffer,"&#%ld;",c);
        }
        return htmlwrite(write_data,ent_buffer,len);
    }
#if 0
/* I don't know if I should ever be writing out raw UTF8. */
        unsigned long int mask;
        mask = (1 << 11) - 1;
        len = 1;
        while (c & ~mask) {
            len++;
            mask = ((mask << 5) - 1)|(1 << 5);
        }
        status = fputc((0xff << (7 - len)) | (c >> (6*len)),fh);
        if (status < 0) return status;
        while (len) {
            status = fputc(0x80 | ((c >> (6*(--len))) & 0x3F), fh);
            if (status < 0) return status;
        }
    }
#endif
    return 0;
}


/*--[read_utf8_char]-----------------------------------------------------------
 |
 | This reads a single UTF8 character from a data stream, returning the
 | number of bytes consumed (element size) and filling in the Integer 
 | value. 
*/
int  read_utf8_char(U8 * pdata, int nBytes, U32 * pvalue)
{
    U32  c;
    unsigned char mask;
    int elsize, i;

    if (pvalue) *pvalue = -1;

    if (nBytes < 1) return -1;
    c = *(pdata);
    mask = 0x80;
    if (c & mask) {
        elsize = 0;
        while (c & mask) { mask >>= 1; elsize++;}
        if ((mask <= 1) || (mask == 0x40))  return -1;
    } else {
        elsize = 1;
    }

    if (elsize > 1) {
        if ((elsize) > nBytes) { return -1; }
        c &= (mask - 1);
        for (i = 1; i < elsize; i++) {
            if ( (*(pdata + i) & 0xC0) != 0x80)  return -1;
            c = (c << 6) | ( *(pdata + i) & 0x3F );
        }
    }
    if (pvalue) *pvalue = c;
    return elsize;       
}