/* $Id: xmlparser.c,v 1.2 2002/03/22 11:00:37 rgbecker Exp $ */ #define DEBUG_FSM 0 #ifndef lint static char vcid[] = "$Id: xmlparser.c,v 1.2 2002/03/22 11:00:37 rgbecker Exp $"; #endif /* lint */ /* * XML (and nSGML) parser. * Author: Richard Tobin. */ #include #include #ifdef FOR_LT #include "lt-memory.h" #include "nsllib.h" #define Malloc salloc #define Realloc srealloc #define Free sfree #else #include "system.h" #endif #include "charset.h" #include "string16.h" #include "ctype16.h" #include "dtd.h" #include "input.h" #include "stdio16.h" #include "url.h" #include "namespaces.h" #include "xmlparser.h" #ifdef FOR_LT #include "lt-hash.h" typedef HashList *HashEntry; typedef HashList HashEntryStruct; #define create_hash_table NewSizedHashStruct #define free_hash_table(table) FreeHashStructM((table), 1) #define hash_map MapHashLists1 #define hash_set_value(entry, value) ((entry)->index = (value)) #define hash_get_value(entry) ((entry)->index) #define hash_get_key(entry) ((entry)->word) #define hash_get_key_len(entry) ((entry)->length * sizeof(Char)) #define HashMapRetType boolean static HashEntry hash_find_or_add(HashTable table, const Char *key, int key_len, int *foundp) { HashEntry entry; key_len /= sizeof(Char); entry = FindWordInTableX(table, key, key_len); if(!entry) { *foundp = 0; entry = AddWordToTableXM(table, key, key_len); if(!entry) return 0; } else *foundp = 1; return entry; } #else #include "hash.h" #define hash_set_value(entry, _value) ((entry)->value = (_value)) #define hash_get_value(entry) ((entry)->value) #define hash_get_key(entry) ((entry)->key) #define hash_get_key_len(entry) ((entry)->key_len) #define HashMapRetType void #endif static int transcribe(Parser p, int back, int count); static void pop_while_at_eoe(Parser p); static void maybe_uppercase(Parser p, Char *s); static void maybe_uppercase_name(Parser p); static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b); static int is_ascii_alpha(int c); static int 
is_ascii_digit(int c); static int parse_external_id(Parser p, int required, char8 **publicid, char8 **systemid, int preq, int sreq); static int parse_conditional(Parser p, Entity ent); static int parse_notation_decl(Parser p, Entity ent); static int parse_entity_decl(Parser p, Entity ent, int line, int chpos); static int parsing_internal(Parser p); static int parsing_external_subset(Parser p); static int parse_attlist_decl(Parser p, Entity ent); static int parse_element_decl(Parser p, Entity ent); static ContentParticle parse_cp(Parser p); static ContentParticle parse_choice_or_seq(Parser p, Entity ent); static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren, char sep, Entity ent); static int check_content_decl(Parser p, ContentParticle cp); static int check_content_decl_1(Parser p, ContentParticle cp); static Char *stringify_cp(ContentParticle cp); static void print_cp(ContentParticle cp, FILE16 *f); static int size_cp(ContentParticle cp); static int check_qualname_syntax(Parser p, const Char *name, const char *type); static int parse_reference(Parser p, int pe, int expand, int allow_external); static int parse_character_reference(Parser p, int expand); static const char8 *escape(int c, char8 *buf); static int parse_name(Parser p, const char8 *where); static int parse_nmtoken(Parser p, const char8 *where); static int looking_at(Parser p, const char8 *string); static void clear_xbit(XBit xbit); static int expect(Parser p, int expected, const char8 *where); static int expect_dtd_whitespace(Parser p, const char8 *where); static void skip_whitespace(InputSource s); static int skip_dtd_whitespace(Parser p, int allow_pe); static int parse_cdata(Parser p); static int process_nsl_decl(Parser p); static int process_xml_decl(Parser p); static int parse_dtd(Parser p); static int read_markupdecls(Parser p); static int error(Parser p, const char8 *format, ...); static int warn(Parser p, const char8 *format, ...); static void verror(char8 *buf, XBit bit, const 
char8 *format, va_list args); enum literal_type { LT_cdata_attr, LT_tok_attr, LT_plain, LT_entity, LT_param_entity, LT_pubid }; static int parse_string(Parser p, const char8 *where, enum literal_type type, int *normalised); static int parse_pi(Parser p, Entity ent); static int parse_comment(Parser p, int skip, Entity ent); static int parse_pcdata(Parser p); static int parse_starttag(Parser p); Namespace LookupNamespace(NamespaceBinding dictionary, const Char *prefix); static int process_namespace(Parser p, AttributeDefinition d, const Char *value); static int parse_attribute(Parser p); static WhiteSpaceMode process_xml_space(Parser p, const Char *value); static int parse_endtag(Parser p); static int parse_markup(Parser p); static int parse(Parser p); static int parse_markupdecl(Parser p); static int validate_dtd(Parser p); static int validate_final(Parser p); static HashMapRetType check_id(const HashEntryStruct *id_entry, void *p); static int validate_attribute(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value); static int validate_xml_lang_attribute(Parser p, ElementDefinition e, const Char *value); static int check_attribute_syntax(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, const char *message); static int check_attribute_token(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, int length, const char *message); #if not_yet static int magically_transform_dtd(Parser p, Char *name, int namelen); #endif static struct element_definition pcdata_element; const ElementDefinition Epsilon = 0, PCDataElement = &pcdata_element; static FSM NewFSM(void); void FreeFSM(FSM fsm); static FSMNode AddNode(FSM fsm); static FSMEdge AddEdge(FSMNode source, FSMNode destination, void *label); static void UnMarkFSM(FSM fsm, int value); static void DeleteNode(FSMNode node); static void DeleteEdge(FSMEdge edge); static void CleanupFSM(FSM fsm); static void CleanupNode(FSMNode node); #if DEBUG_FSM static void 
PrintFSM(FILE16 *out, FSM fsm, int relabelled); #endif static int SimplifyFSM(FSM fsm); static int add_epsilon_closure(FSMNode base, FSMNode node); static FSMNode translate_particle(FSM fsm, ContentParticle cp, FSMNode next); static FSMNode translate_particle_1(FSM fsm, ContentParticle cp, FSMNode next); static FSMNode validate_content(FSMNode context, ElementDefinition e); static int check_deterministic(Parser p, ElementDefinition element); static int check_deterministic_1(Parser p, ElementDefinition element, FSMNode node, ElementDefinition previous); #define validity_error (p->seen_validity_error=1, ParserGetFlag(p, ErrorOnValidityErrors) ? error : warn) #define require(x) if(x >= 0) {} else return -1 #define require0(x) if(x >= 0) {} else return 0 #define Consume(buf) (buf = 0, buf##size = 0) #define ExpandBuf(buf, sz) \ if(buf##size >= (sz)+1) {} else if((buf = Realloc(buf, (buf##size = sz + 1) * sizeof(Char)))) {} else return error(p, "System error") #define CopyName(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else return error(p, "System error"); #define CopyName0(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else {error(p, "System error"); return 0;} #if CHAR_SIZE == 8 #define tochar8(s) s #define duptochar8(s) strdup8(s) #else #define tochar8(s) (p->transbuf = translate_utf16_latin1_m(s, p->transbuf)) #define duptochar8(s) translate_utf16_latin1_m(s, 0) #endif const char8 *XBitTypeName[XBIT_enum_count] = { "dtd", "start", "empty", "end", "eof", "pcdata", "pi", "comment", "cdsect", "error", "warning", "none" }; static Entity xml_builtin_entity; static Entity xml_predefined_entities; static int parser_initialised = 0; int init_parser(void) { Entity e, f; int i; static const Char lt[] = {'l','t',0}, ltval[] = {'&','#','6','0',';',0}; static const Char gt[] = {'g','t',0}, gtval[] = {'>',0}; static const Char amp[] = 
{'a','m','p',0}, ampval[] = {'&','#','3','8',';',0}; static const Char apos[] = {'a','p','o','s',0}, aposval[] = {'\'',0}; static const Char quot[] = {'q','u','o','t',0}, quotval[] = {'"',0}; static const Char *builtins[5][2] = { {lt, ltval}, {gt, gtval}, {amp, ampval}, {apos, aposval}, {quot, quotval} }; if(parser_initialised) return 0; parser_initialised = 1; if(init_charset() == -1 || init_ctype16() == -1 || init_stdio16() == -1 || init_url() == -1 || init_namespaces() == -1) return -1; xml_builtin_entity = NewInternalEntity(0, 0, 0, 0, 0, 0); for(i=0, f=0; i<5; i++, f=e) { e = NewInternalEntity(builtins[i][0], builtins[i][1], xml_builtin_entity, 0, 0, 0); if(!e) return -1; e->next = f; } xml_predefined_entities = e; return 0; } void deinit_parser(void) { Entity e, f; if(!parser_initialised) return; parser_initialised = 0; deinit_charset(); deinit_ctype16(); deinit_stdio16(); deinit_namespaces(); deinit_url(); for(e = xml_predefined_entities; e; e=f) { f = e->next; e->text = 0; /* it wasn't malloced so we mustn't free it */ FreeEntity(e); } FreeEntity(xml_builtin_entity); } static void skip_whitespace(InputSource s) { int c; while((c = get(s)) != XEOE && is_xml_whitespace(c)) ; unget(s); } /* * Skip whitespace and (optionally) the start and end of PEs. Return 1 if * there actually *was* some whitespace or a PE start/end, -1 if * an error occurred, 0 otherwise. 
*/

/*
 * NOTE(review): the '%' branch of skip_dtd_whitespace() below looks
 * truncated.  The comment starting "this complication is needed for" is
 * never closed, so as the text stands it runs on into expect() and the
 * braces of this function no longer balance.  Restore the missing text from
 * a pristine copy of the file before relying on this code.
 *
 * expect(), which follows, reads one character from the current source.
 * It returns 0 when that character equals `expected'; otherwise it pushes
 * the character back and returns the result of error().
 */
static int skip_dtd_whitespace(Parser p, int allow_pe)
{
    int c;
    int got_some = 0;
    InputSource s = p->source;

    while(1)
    {
        c = get(s);

        if(c == XEOE)
        {
            got_some = 1;
            if(s->parent)
            {
                if(!allow_pe)
                    return error(p,
                                 "PE end not allowed here in internal subset");
                if(s->entity->type == ET_external)
                    p->external_pe_depth--;
                ParserPop(p);
                s = p->source;
            }
            else
            {
                unget(s);       /* leave the final EOE waiting to be read */
                return got_some;
            }
        }
        else if(is_xml_whitespace(c))
        {
            got_some = 1;
        }
        else if(c == '%')
        {
            /* this complication is needed for source;
                if(s->entity->type == ET_external)
                    p->external_pe_depth++;
                got_some = 1;
            }
            else
            {
                unget(s);
                return got_some;
            }
        }
        else
        {
            unget(s);
            return got_some;
        }
    }
}

static int expect(Parser p, int expected, const char8 *where)
{
    int c;
    InputSource s = p->source;

    c = get(s);
    if(c != expected)
    {
        unget(s);               /* For error position */
        /* A bad input character gets its own message */
        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);
        else
            return error(p, "Expected %s %s, but got %s",
                         escape(expected, p->escbuf[0]), where,
                         escape(c, p->escbuf[1]));
    }

    return 0;
}

/*
 * Expects whitespace or the start or end of a PE.
*/
static int expect_dtd_whitespace(Parser p, const char8 *where)
{
    /* PE boundaries are only accepted while inside an external PE */
    int r = skip_dtd_whitespace(p, p->external_pe_depth > 0);

    if(r < 0)
        return -1;

    if(r == 0)
        return error(p, "Expected whitespace %s", where);

    return 0;
}

/*
 * Put xbit into its empty state: type XBIT_none, with the string,
 * attribute, element-definition and namespace pointers all null.
 * Nothing is freed.
 */
static void clear_xbit(XBit xbit)
{
    xbit->type = XBIT_none;
    xbit->s1 = 0;
    xbit->S1 = xbit->S2 = 0;
    xbit->attributes = 0;
    xbit->element_definition = 0;
    xbit->ns_dict = 0;
}

/*
 * Free the data hanging off xbit (but not xbit itself), then reset it
 * with clear_xbit().
 */
void FreeXBit(XBit xbit)
{
    Attribute a, b;

    if(xbit->S1) Free(xbit->S1);
    if(xbit->S2) Free(xbit->S2);

    /* s1 is left alone for error and warning bits */
    if(xbit->type != XBIT_error && xbit->type != XBIT_warning && xbit->s1)
        Free(xbit->s1);

    if(xbit->ns_dict && xbit->nsowned)
    {
        /* Only the first nsc bindings of the chain belong to this bit */
        int i;
        NamespaceBinding parent, ns = xbit->ns_dict;

        for(i=0; i<xbit->nsc; i++)
        {
            parent = ns->parent;
            Free(ns);
            ns = parent;
        }
    }

    /* Save the link before freeing each attribute */
    for(a = xbit->attributes; a; a = b)
    {
        b = a->next;
        if(a->value) Free(a->value);
        Free(a);
    }

    clear_xbit(xbit);
}

/*
 * Returns 1 if the input matches string (and consumes the input).
 * Otherwise returns 0 and leaves the input stream where it was.
 * Case-sensitivity depends on the CaseInsensitive flag.
 * A space character at end of string matches any (non-zero) amount of
 * whitespace; spaces are treated literally elsewhere.
 * Never reads beyond an end-of-line, except to consume
 * extra whitespace when the last character of string is a space.
 * Never reads beyond end-of-entity.
*/ static int looking_at(Parser p, const char8 *string) { InputSource s = p->source; int c, d; int save = s->next; if(p->state == PS_error) /* we got a bad character before, don't try again */ return 0; for(c = *string++; c; c = *string++) { if(at_eol(s)) goto fail; /* We would go over a line end */ d = get(s); if(d == BADCHAR) { error(p, "Input error: %s", s->error_msg); goto fail; } if(c == ' ' && *string == 0) { if(d == XEOE || !is_xml_whitespace(d)) goto fail; skip_whitespace(s); } else if((ParserGetFlag(p, CaseInsensitive) && Toupper(d) != Toupper(c)) || (!ParserGetFlag(p, CaseInsensitive) && d != c)) goto fail; } return 1; fail: s->next = save; return 0; } static int parse_name(Parser p, const char8 *where) { InputSource s = p->source; int c, i; c = get(s); if(c == BADCHAR) return error(p, "Input error: %s", s->error_msg); if(c == XEOE || !is_xml_namestart(c)) { unget(s); /* For error position */ error(p, "Expected name, but got %s %s", escape(c, p->escbuf[0]), where); return -1; } i = 1; while(c = get(s), (c != XEOE && is_xml_namechar(c))) i++; unget(s); p->name = s->line + s->next - i; p->namelen = i; return 0; } static int parse_nmtoken(Parser p, const char8 *where) { InputSource s = p->source; int c, i=0; c = get(s); if(c == BADCHAR) return error(p, "Input error: %s", s->error_msg); while(c !=XEOE && is_xml_namechar(c)) { i++; c = get(s); } unget(s); if(i == 0) return error(p, "Expected nmtoken, but got %s %s", escape(c, p->escbuf[0]), where); p->name = s->line + s->next - i; p->namelen = i; return 0; } /* Escape a character for printing n an error message. 
*/ static const char8 *escape(int c, char8 *buf) { #if CHAR_SIZE == 8 if(c != XEOE) c &= 0xff; #endif if(c == XEOE) return ""; else if(c >= 33 && c <= 126) sprintf(buf, "%c", c); else if(c == ' ') sprintf(buf, ""); else sprintf(buf, "<0x%x>", c); return buf; } Parser NewParser(void) { Parser p; static Char xml[] = {'x','m','l',0}; if(init_parser() == -1) return 0; p = Malloc(sizeof(*p)); if(!p) return 0; p->state = PS_prolog1; p->seen_validity_error = 0; p->document_entity = 0; /* Set at first ParserPush */ p->have_dtd = 0; p->standalone = SDD_unspecified; p->flags[0] = p->flags[1] = 0; p->source = 0; clear_xbit(&p->xbit); #ifndef FOR_LT p->xbit.nchildren = 0; /* These three should never be changed */ p->xbit.children = 0; p->xbit.parent = 0; #endif p->pbufsize = p->pbufnext = 0; p->pbuf = 0; p->save_pbufsize = p->save_pbufnext = 0; p->save_pbuf = 0; p->transbuf = 0; p->peeked = 0; p->dtd = NewDtd(); p->dtd_callback = p->warning_callback = 0; p->entity_opener = 0; p->callback_arg = 0; p->external_pe_depth = 0; VectorInit(p->element_stack); p->base_ns.parent = 0; p->base_ns.prefix = xml; p->base_ns.namespace = FindNamespace(p->dtd->namespace_universe, "http://www.w3.org/XML/1998/namespace", 1); if(!p->base_ns.namespace) return 0; p->id_table = create_hash_table(100); if(!p->id_table) return 0; ParserSetFlag(p, XMLSyntax, 1); ParserSetFlag(p, XMLPredefinedEntities, 1); ParserSetFlag(p, XMLExternalIDs, 1); ParserSetFlag(p, XMLMiscWFErrors, 1); ParserSetFlag(p, ErrorOnUnquotedAttributeValues, 1); ParserSetFlag(p, XMLLessThan, 1); ParserSetFlag(p, ExpandGeneralEntities, 1); ParserSetFlag(p, ExpandCharacterEntities, 1); ParserSetFlag(p, NormaliseAttributeValues, 1); ParserSetFlag(p, WarnOnRedefinitions, 1); ParserSetFlag(p, TrustSDD, 1); ParserSetFlag(p, ReturnComments, 1); ParserSetFlag(p, MaintainElementStack, 1); ParserSetFlag(p, XMLSpace, 0); ParserSetFlag(p, XMLNamespaces, 0); return p; } void FreeParser(Parser p) { while (p->source) ParserPop(p); /* Will close file 
*/ Free(p->pbuf); Free(p->save_pbuf); Free(p->transbuf); Free(p->element_stack); free_hash_table(p->id_table); Free(p); } InputSource ParserRootSource(Parser p) { InputSource s; for(s=p->source; s && s->parent; s = s->parent) ; return s; } Entity ParserRootEntity(Parser p) { return ParserRootSource(p)->entity; } void ParserSetCallbackArg(Parser p, void *arg) { p->callback_arg = arg; } void ParserSetDtdCallback(Parser p, CallbackProc cb) { p->dtd_callback = cb; } void ParserSetWarningCallback(Parser p, CallbackProc cb) { p->warning_callback = cb; } void ParserSetEntityOpener(Parser p, EntityOpenerProc opener) { p->entity_opener = opener; } #ifndef FOR_LT XBit ReadXTree(Parser p) { XBit bit, tree, child; XBit *children; bit = ReadXBit(p); switch(bit->type) { case XBIT_error: return bit; case XBIT_start: if(!(tree = Malloc(sizeof(*tree)))) { error(p, "System error"); return &p->xbit; } *tree = *bit; while(1) { child = ReadXTree(p); switch(child->type) { case XBIT_error: FreeXTree(tree); return child; case XBIT_eof: FreeXTree(tree); { error(p, "EOF in element"); return &p->xbit; } case XBIT_end: if(child->element_definition != tree->element_definition) { const Char *name1 = tree->element_definition->name, *name2 = child->element_definition->name; FreeXTree(tree); FreeXTree(child); error(p, "Mismatched end tag: expected , got ", name1, name2); return &p->xbit; } /* Transfer ns records to start bit so that ns gets freed when the tree is freed, rather than now. 
*/ tree->nsowned = 1; child->nsowned = 0; FreeXTree(child); return tree; default: children = Realloc(tree->children, (tree->nchildren + 1) * sizeof(XBit)); if(!children) { FreeXTree(tree); FreeXTree(child); error(p, "System error"); return &p->xbit; } child->parent = tree; children[tree->nchildren] = child; tree->nchildren++; tree->children = children; break; } } default: if(!(tree = Malloc(sizeof(*tree)))) { error(p, "System error"); return &p->xbit; } *tree = *bit; return tree; } } void FreeXTree(XBit tree) { int i; XBitType type = tree->type; for(i=0; inchildren; i++) FreeXTree(tree->children[i]); Free(tree->children); FreeXBit(tree); if(type == XBIT_error) /* error "trees" are always in the Parser structure, not malloced */ return; Free(tree); } #endif /* (not) FOR_LT */ XBit ReadXBit(Parser p) { if(p->peeked) p->peeked = 0; else parse(p); return &p->xbit; } XBit PeekXBit(Parser p) { if(p->peeked) error(p, "Attempt to peek twice"); else { parse(p); p->peeked = 1; } return &p->xbit; } int ParserPush(Parser p, InputSource source) { if(!p->source && !p->document_entity) p->document_entity = source->entity; source->parent = p->source; p->source = source; if(source->entity->type == ET_internal) return 0; /* Look at first few bytes of external entities to guess encoding, then look for an XMLDecl or TextDecl. */ /* Check encoding even if we have already determined it for this entity, because otherwise we might leave a BOM unread. 
*/

    determine_character_encoding(source);

#if CHAR_SIZE == 8
    if(!EncodingIsAsciiSuperset(source->entity->encoding))
        return error(p, "Unsupported character encoding %s",
                     CharacterEncodingName[source->entity->encoding]);
#else
    if(source->entity->encoding == CE_unknown)
        return error(p, "Unknown character encoding");
#endif

    get(source); unget(source); /* To get the first line read */

    /*
     * NOTE(review): the next line is damaged.  Both looking_at() calls have
     * lost their string argument and the code that followed it, so the
     * quotes no longer pair up and the statement boundaries are unclear.
     * It is kept exactly as found; restore it from a pristine copy of the
     * file.  What survives shows three checks: a declaration in the
     * document entity must have a version, and one in any other entity
     * must have no standalone attribute and must have an encoding.
     */
    if(looking_at(p, "entity == p->document_entity && !source->entity->version_decl) return error(p, "XML declaration in document entity lacked " "version number"); if(source->entity != p->document_entity && source->entity->standalone_decl != SDD_unspecified) return error(p, "Standalone attribute not allowed except in " "document entity"); if(source->entity != p->document_entity && source->entity->encoding_decl == CE_unknown) return error(p, "Encoding declaration is required in text " "declaration"); return 0; } if(looking_at(p, "state == PS_error)
        /* looking_at may have set it */
        return -1;
    else
        return 0;
}

/*
 * Remove the current source from the parser's source stack, making its
 * parent current, and close it.
 */
void ParserPop(Parser p)
{
    InputSource source;

    source = p->source;
    p->source = source->parent;
    SourceClose(source);
}

/* Returns true if the source is at EOE. If so, the EOE will have been read. */

static int at_eoe(InputSource s)
{
    /* Not at the end of the current line, so certainly not at EOE */
    if(!at_eol(s))
        return 0;
    if(s->seen_eoe || get_with_fill(s) == XEOE)
        return 1;
    /* get_with_fill read an ordinary character; put it back */
    unget(s);
    return 0;
}

/* Pops any sources that are at EOE.  Leaves source buffer with at least
   one character in it (except at EOF, where it leaves the EOE unread).
*/ static void pop_while_at_eoe(Parser p) { while(1) { InputSource s = p->source; if(!at_eoe(s)) return; if(!s->parent) { unget(s); return; } ParserPop(p); } } void ParserSetFlag(Parser p, ParserFlag flag, int value) { int flagset; unsigned int flagbit; flagset = (flag >> 5); flagbit = (1u << (flag & 31)); if(value) p->flags[flagset] |= flagbit; else p->flags[flagset] &= ~flagbit; if(flag == XMLPredefinedEntities) { if(value) p->dtd->predefined_entities = xml_predefined_entities; else p->dtd->predefined_entities = 0; } } void _ParserPerror(FILE16 *f, Parser p, XBit bit) { int linenum, charnum; InputSource s, root; root = ParserRootSource(p); if(ParserGetFlag(p, SimpleErrorFormat)) { const char8 *d, *e; d = EntityDescription(root->entity); e = d+strlen8(d); while(e > d && e[-1] != '/') --e; if(p->state == PS_validate_dtd) Fprintf(f, "%s:-1(end of prolog):-1: ", e); else if(p->state == PS_validate_final) Fprintf(f, "%s:-1(end of body):-1: ", e); else Fprintf(f, "%s:%d:%d: ", e,root->line_number+1, root->next+1); if(bit->type == XBIT_warning) Fprintf(f, "warning: "); Fprintf(f, "%s\n", bit->error_message); return; } Fprintf(f, "%s: %s\n", bit->type == XBIT_error ? "Error" : "Warning", bit->error_message); if(p->state == PS_validate_dtd || p->state == PS_validate_final) { Fprintf(f, " (detected at end of %s of document %s)\n", p->state == PS_validate_final ? 
"body" : "prolog",
                EntityDescription(root->entity));
        return;
    }

    /* One line for each source on the stack, innermost first */
    for(s=p->source; s; s=s->parent)
    {
        if(s->entity->name)
            Fprintf(f, " in entity \"%S\"", s->entity->name);
        else
            Fprintf(f, " in unnamed entity");

        /* The wording depends on the value SourceLineAndChar returns */
        switch(SourceLineAndChar(s, &linenum, &charnum))
        {
        case 1:
            Fprintf(f, " at line %d char %d of", linenum+1, charnum+1);
            break;
        case 0:
            Fprintf(f, " defined at line %d char %d of",
                    linenum+1, charnum+1);
            break;
        case -1:
            Fprintf(f, " defined in");
            break;
        }

        Fprintf(f, " %s\n", EntityDescription(s->entity));
    }
}

/* Print the error or warning in bit on Stderr. */
void ParserPerror(Parser p, XBit bit)
{
    _ParserPerror(Stderr,p,bit);
}

/*
 * Parse the next bit into p->xbit, dispatching on the first character
 * found: end of entity, '<' (markup), '&' (reference) or anything else
 * (pcdata).
 */
static int parse(Parser p)
{
    int c;
    InputSource s;

    if(p->state == PS_end || p->state == PS_error)
    {
        /* After an error or EOF, just keep returning EOF */
        p->xbit.type = XBIT_eof;
        return 0;
    }

    clear_xbit(&p->xbit);

    /* Whitespace is skipped in the prolog states and in the epilog */
    if(p->state <= PS_prolog2 || p->state == PS_epilog)
        skip_whitespace(p->source);

restart:
    pop_while_at_eoe(p);
    s = p->source;
    SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);

    switch(c = get(s))
    {
    case XEOE:
        /* End of input is only acceptable in the epilog */
        if(p->state != PS_epilog)
            return error(p, "Document ends too soon");
        p->state = PS_end;
        p->xbit.type = XBIT_eof;
        return 0;

    case '<':
        return parse_markup(p);

    case '&':
        if(ParserGetFlag(p, IgnoreEntities))
            goto pcdata;

        if(p->state <= PS_prolog2)
            return error(p, "Entity reference not allowed in prolog");

        if(looking_at(p, "#"))
        {
            /* a character reference - go back and parse as pcdata */
            unget(s);
            goto pcdata;
        }
        if(p->state == PS_error) /* looking_at may have set it */
            return -1;

        if(ParserGetFlag(p, ExpandGeneralEntities))
        {
            /* an entity reference - push it and start again */
            require(parse_reference(p, 0, 1, 1));
            goto restart;
        }

        /* not expanding general entities, so treat as pcdata */
        goto pcdata;

    default:
    pcdata:
        /* Put back the character read by the switch */
        unget(s);
        return parse_pcdata(p);
    }
}

/* Called after reading '<' */

static int parse_markup(Parser p)
{
    InputSource s = p->source;
    int c = get(s);

    switch(c)
    {
    case '!':
        if(looking_at(p, "--"))
        {
            if(ParserGetFlag(p, ReturnComments))
                return parse_comment(p, 0, 0);
else { require(parse_comment(p, 1, 0)); return parse(p); } } else if(looking_at(p, "DOCTYPE ")) return parse_dtd(p); else if(looking_at(p, "[CDATA[")) return parse_cdata(p); else if(p->state == PS_error) /* looking_at may have set it */ return -1; else return error(p, "Syntax error after error_msg); default: unget(s); if(!ParserGetFlag(p, XMLLessThan) && (c == XEOE || !is_xml_namestart(c))) { /* In nSGML, recognise < as stago only if followed by namestart */ unget(s); /* put back the < */ return parse_pcdata(p); } return parse_starttag(p); } } static int parse_endtag(Parser p) { ElementDefinition e; NSElementDefinition nse; Entity ent; p->xbit.type = XBIT_end; require(parse_name(p, "after element_stack) <= 0) return error(p, "End tag outside of any element", p->namelen, p->name); } if(ParserGetFlag(p, Validate)) { struct element_info *info = &VectorLast(p->element_stack); ElementDefinition parent = info->definition; if(parent->type == CT_element && info->context && !info->context->end_node) { require(validity_error(p, "Content model for %S does not " "allow it to end here", parent->name)); } } if(ParserGetFlag(p, MaintainElementStack)) { ent = VectorLast(p->element_stack).entity; e = VectorLast(p->element_stack).definition; nse = VectorLast(p->element_stack).ns_definition; p->xbit.ns_dict = VectorLast(p->element_stack).ns; p->xbit.nsc = VectorLast(p->element_stack).nsc; p->xbit.nsowned = 1; VectorPop(p->element_stack); if(p->namelen != e->namelen || memcmp(p->name, e->name, p->namelen * sizeof(Char)) != 0) return error(p, "Mismatched end tag: expected , got ", e->name, p->namelen, p->name); p->xbit.element_definition = e; p->xbit.ns_element_definition = nse; if(ent != p->source->entity) return error(p, "Element ends in different entity from that " "in which it starts"); if(VectorCount(p->element_stack) == 0) { if(ParserGetFlag(p, Validate)) { p->state = PS_validate_final; require(validate_final(p)); } p->state = PS_epilog; } } else { e = FindElementN(p->dtd, 
p->name, p->namelen); p->xbit.element_definition = e; if(!p->xbit.element_definition) return error(p, "End tag for unknown element %.*S", p->namelen, p->name); } skip_whitespace(p->source); return expect(p, '>', "after name in end tag"); } static int check_qualname_syntax(Parser p, const Char *name, const char *type) { Char *t; t = Strchr(name, ':'); if(!t) return 0; if(t == name) warn(p, "%s name %S has empty prefix", type, name); else if(t[1] == 0) warn(p, "%s name %S has empty local part", type, name); else if(!is_xml_namestart(t[1])) warn(p, "%s name %S has illegal local part", type, name); else if(Strchr(t+1, ':')) warn(p, "%s name %S has multiple colons", type, name); return 0; } static int parse_starttag(Parser p) { int c, is_top_level = 0; ElementDefinition e; AttributeDefinition d; Attribute a, aa, all_attrs; struct element_info *this_info = 0, *parent_info = 0; if(p->state == PS_epilog && !ParserGetFlag(p, AllowMultipleElements)) return error(p, "Document contains multiple elements"); if(p->state < PS_body) { if(ParserGetFlag(p, Validate)) { p->state = PS_validate_dtd; require(validate_dtd(p)); } is_top_level = 1; } p->state = PS_body; require(parse_name(p, "after <")); maybe_uppercase_name(p); #if not_yet if(is_top_level && p->magic_prefix) require(magically_transform_dtd(p, p->name, p->namelen)); #endif e = FindElementN(p->dtd, p->name, p->namelen); if(!e || e->tentative) { if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedElements)) return error(p, "Start tag for undeclared element %.*S", p->namelen, p->name); if(ParserGetFlag(p, Validate) && !(ParserGetFlag(p, RelaxedAny) && VectorCount(p->element_stack) != 0 && VectorLast(p->element_stack).definition->type == CT_any)) { require(validity_error(p, "Start tag for undeclared element %.*S", p->namelen, p->name)); } if(e) RedefineElement(e, CT_any, 0, 0, 0); else { if(!(e = DefineElementN(p->dtd, p->name, p->namelen, CT_any, 0, 0, 0))) return error(p, "System error"); if(ParserGetFlag(p, XMLNamespaces)) { 
require(check_qualname_syntax(p, e->name, "Element"));
            }
        }
    }

    p->xbit.element_definition = e;

    if(ParserGetFlag(p, Validate))
    {
        if(VectorCount(p->element_stack) == 0)
        {
            /* No open element: this is the root, compare with the DTD name */
            if(Strcmp(p->dtd->name, e->name) != 0)
            {
                require(validity_error(p,
                                       "Root element is %S, should be %S",
                                       e->name, p->dtd->name));
            }
        }
        else
        {
            struct element_info *info = &VectorLast(p->element_stack);
            ElementDefinition parent = info->definition;

            if(parent->type == CT_empty)
            {
                require(validity_error(p,
                                       "Content model for %S does not "
                                       "allow anything here",
                                       parent->name));
            }
            else if(info->context)
            {
                /* Advance the parent's content-model context over e */
                info->context = validate_content(info->context, e);
                if(!info->context)
                {
                    require(validity_error(p,
                                           "Content model for %S does not "
                                           "allow element %S here",
                                           parent->name, e->name));
                }
            }
        }
    }

    /* Read attributes until the tag is closed by '>' or "/>" */
    while(1)
    {
        InputSource s = p->source;

        /* We could just do skip_whitespace here, but we will get a
           better error message if we look a bit closer. */

        c = get(s);
        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);

        if(c !=XEOE && is_xml_whitespace(c))
        {
            skip_whitespace(s);
            c = get(s);
        }
        else if(c != '>' &&
                !(ParserGetFlag(p, XMLSyntax) && c == '/'))
        {
            unget(s);           /* For error position */
            return error(p, "Expected whitespace or tag end in start tag");
        }

        if(c == '>')
        {
            p->xbit.type = XBIT_start;
            break;
        }

        if((ParserGetFlag(p, XMLSyntax)) && c == '/')
        {
            require(expect(p, '>', "after / in start tag"));
            p->xbit.type = XBIT_empty;
            break;
        }

        unget(s);

        require(parse_attribute(p));
    }

    if(ParserGetFlag(p, MaintainElementStack))
    {
        if(p->xbit.type == XBIT_start)
        {
            /* Push an entry for the new element; the entry just below it
               (if there is one) belongs to the parent */
            if(!VectorPushNothing(p->element_stack))
                return error(p, "System error");
            if(VectorCount(p->element_stack) > 1)
                parent_info = &VectorLast(p->element_stack) - 1;
            this_info = &VectorLast(p->element_stack);
            this_info->definition = e;
            this_info->context = e->fsm ? e->fsm->start_node : 0;
            this_info->wsm = WSM_unspecified;
            this_info->ns = 0;
            this_info->entity = p->source->entity;
        }
        else
        {
            /* Is this element allowed to be empty? */

            if(ParserGetFlag(p, Validate) && e->fsm &&
               !e->fsm->start_node->end_node)
            {
                require(validity_error(p,
                                       "Content model for %S does not "
                                       "allow it to be empty",
                                       e->name));
            }

            /* Is it the (empty) top-level element? */

            if(VectorCount(p->element_stack) == 0)
            {
                if(ParserGetFlag(p, Validate))
                {
                    p->state = PS_validate_final;
                    require(validate_final(p));
                }
                p->state = PS_epilog;
            }
            else
                /* Nothing was pushed, so the top entry is the parent */
                parent_info = &VectorLast(p->element_stack);
        }
    }

    if(ParserGetFlag(p, Validate))
    {
        /* check for required attributes */

        /* These two shadow the function-level d and a within this block */
        AttributeDefinition d;
        Attribute a;

        for(d=NextAttributeDefinition(e, 0);
            d;
            d=NextAttributeDefinition(e, d))
        {
            if(d->default_type != DT_required)
                continue;
            for(a=p->xbit.attributes; a; a=a->next)
                if(a->definition == d)
                    break;
            if(!a)
            {
                require(validity_error(p,
                                       "Required attribute %S for element %S "
                                       "is not present",
                                       d->name, e->name));
            }
        }
    }

    /* Find defaulted attributes if we need them */

    /* p->xbit.attributes only points to actually present attributes
       until the end of this function. */

    all_attrs = p->xbit.attributes;

    if(ParserGetFlag(p, ReturnDefaultedAttributes) ||
       ParserGetFlag(p, XMLNamespaces))
    {
        for(d=NextAttributeDefinition(e, 0);
            d;
            d=NextAttributeDefinition(e, d))
        {
            if(!d->default_value)
                continue;
            for(a=p->xbit.attributes; a; a=a->next)
                if(a->definition == d)
                    break;
            if(!a)
            {
                /* Not present in the tag: add a copy of the default at the
                   front of all_attrs */
                if(!(a = Malloc(sizeof(*a))))
                    return error(p, "System error");
                a->definition = d;
                if(!(a->value = Strdup(d->default_value)))
                    return error(p, "System error");
                a->specified = 0;
                a->quoted = 1;
                a->next = all_attrs;
                all_attrs = a;
            }
        }
    }

    /* Check no externally-declared defaults in standalone document */

    if(ParserGetFlag(p, Validate) && p->standalone == SDD_yes)
    {
        for(d=NextAttributeDefinition(e, 0);
            d;
            d=NextAttributeDefinition(e, d))
        {
            if(!d->default_value || !d->is_externally_declared)
                continue;
            for(a=p->xbit.attributes; a; a=a->next)
                if(a->definition == d)
                    break;
            if(!a)
            {
                require(validity_error(p,
                                       "Externally declared attribute %S "
                                       "for element %S defaulted in document declared standalone",
                                       d->name, e->name));
            }
        }
    }

    /* Look for xml:space attribute */

    if(ParserGetFlag(p, XMLSpace))
    {
        d = e->xml_space_attribute;
        if(d)
        {
            /* A value given in the tag wins */
            for(a=p->xbit.attributes; a; a=a->next)
                if(a->definition == d)
                {
                    p->xbit.wsm = process_xml_space(p, a->value);
                    goto done;
                }

            /* Otherwise use the declared default, for these default types */
            if(d->default_type == DT_none || d->default_type == DT_fixed)
            {
                p->xbit.wsm = process_xml_space(p, d->default_value);
                goto done;
            }
        }
        /* Otherwise take the parent's mode, if there is a parent */
        p->xbit.wsm = parent_info ? parent_info->wsm : WSM_unspecified;

    done:
        if(this_info)
            this_info->wsm = p->xbit.wsm;
    }
    else
        p->xbit.wsm = WSM_unspecified;

    if(ParserGetFlag(p, XMLNamespaces))
    {
        Attribute *attp;
        Namespace ns;
        NSElementDefinition nselt;
        NSAttributeDefinition nsattr;

        /* Start from the parent's bindings, or from the parser's base
           binding when there is no parent */
        p->xbit.ns_dict = parent_info ? parent_info->ns : &p->base_ns;
        p->xbit.nsc = 0;

        /* Look for xmlns attributes */

        /* attp always addresses the link that points at the current
           attribute, so that the attribute can be unlinked in place */
        for(attp=&all_attrs; *attp; )
        {
            a = *attp;
            if(a->definition->ns_attr_prefix)
            {
                require(process_namespace(p, a->definition, a->value));
                /* nsc is incremented for every namespace attribute seen */
                p->xbit.nsc++;

                /* remove the attribute now we've processed it */

                if(!ParserGetFlag(p, ReturnNamespaceAttributes))
                {
                    if(p->xbit.attributes == a)
                        p->xbit.attributes = a->next;
                    *attp = a->next;
                    Free(a->value);
                    Free(a);
                }
                else
                    attp = &a->next;
            }
            else
                attp = &a->next;
        }

        /* An empty-element bit owns its bindings straight away */
        p->xbit.nsowned = (p->xbit.type == XBIT_empty);

        /* Find namespace for element */

        if(e->prefix)
        {
            ns = LookupNamespace(p->xbit.ns_dict, e->prefix);
            if(!ns)
                warn(p, "Element name %S has unbound prefix", e->name);
        }
        else
            ns = LookupNamespace(p->xbit.ns_dict, 0);

        nselt = 0;
        if(ns)
            if(!(nselt = NamespacifyElementDefinition(e, ns)))
                return error(p, "System error");
        p->xbit.ns_element_definition = nselt;

        /* Record the namespace state in this element's stack entry */
        if(this_info)
        {
            this_info->ns = p->xbit.ns_dict;
            this_info->nsc = p->xbit.nsc;
            this_info->ns_definition = nselt;
        }

        /* Find namespaces for attributes */

        for(a=all_attrs; a; a=a->next)
        {
            d = a->definition;
            nsattr = 0;
            if(!d->ns_attr_prefix) /* Ignore namespace attributes themselves */
            {
                if(d->prefix)
                {
                    ns = LookupNamespace(p->xbit.ns_dict, d->prefix);
                    if(!ns)
                        warn(p, "Attribute name %S has unbound prefix",
                             d->name);
                    else if(!(nsattr =
                              NamespacifyGlobalAttributeDefinition(d, ns)))
                        return error(p, "System error");
                }
                else if(nselt)
                {
                    if(!(nsattr =
                         NamespacifyElementAttributeDefinition(d, nselt)))
                        return error(p, "System error");
                }
            }
            a->ns_definition = nsattr;
        }

        /* Check for repeated qualified attributes */

        /* Each attribute is compared with those before it in the list */
        for(a=all_attrs; a; a=a->next)
        {
            d = a->definition;
            if(a->ns_definition && !a->ns_definition->element)
                for(aa=all_attrs; aa != a; aa=aa->next)
                {
                    if(aa->ns_definition == a->ns_definition)
                        warn(p, "Repeated attribute %S in namespace %s",
                             d->local, a->ns_definition->namespace->uri);
                }
        }

        /* Free defaulted attrs if we only got them for namespace stuff */

        if(!ParserGetFlag(p, ReturnDefaultedAttributes))
        {
            /* The defaulted attributes are the ones in front of the
               present attributes in all_attrs */
            for(a=all_attrs; a != p->xbit.attributes; a = aa)
            {
                aa = a->next;
                Free(a->value);
                Free(a);
            }
            all_attrs = p->xbit.attributes;
        }
    }

    p->xbit.attributes = all_attrs;

    return 0;
}

/*
 * Handle one namespace-declaration attribute: d is its definition and
 * value is the attribute value.
 */
static int process_namespace(Parser p, AttributeDefinition d,const Char *value)
{
    NamespaceBinding nb;
    const Char *prefix;
    Namespace ns;

    /* An empty ns_attr_prefix is turned into a null prefix */
    prefix = *d->ns_attr_prefix ?
d->ns_attr_prefix : 0; if(*value == 0) { if(prefix) { warn(p, "Namespace declaration for %S has empty URI", prefix); return 0; } ns = 0; } else { const char8 *uri = tochar8(value); if(!(ns = FindNamespace(p->dtd->namespace_universe, uri, 1))) return error(p, "System error"); } if(!(nb = Malloc(sizeof(*nb)))) return error(p, "System error"); nb->prefix = prefix; nb->namespace = ns; nb->parent = p->xbit.ns_dict; p->xbit.ns_dict = nb; return 0; } Namespace LookupNamespace(NamespaceBinding dictionary, const Char *prefix) { NamespaceBinding n; for(n=dictionary; n; n=n->parent) { if(prefix == 0) { if(n->prefix == 0) return n->namespace; } else if(n->prefix && Strcmp(prefix, n->prefix) == 0) return n->namespace; } return 0; } static int parse_attribute(Parser p) { InputSource s = p->source; ElementDefinition elt = p->xbit.element_definition; AttributeDefinition def; struct attribute *a; int c; int normalised = 0; static Char xmlns[] = {'x','m','l','n','s',0}; require(parse_name(p, "for attribute")); maybe_uppercase_name(p); def = FindAttributeN(elt, p->name, p->namelen); if(!def) { if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedAttributes)) return error(p, "Undeclared attribute %.*S for element %S", p->namelen, p->name, elt->name); if(ParserGetFlag(p, Validate) && (elt->declared || elt->has_attlist) && !(ParserGetFlag(p, AllowUndeclaredNSAttributes) && p->namelen >= 5 && Strncmp(p->name, xmlns, 5) == 0 && (p->namelen == 5 || p->name[5] == ':'))) { require(validity_error(p, "Undeclared attribute %.*S for element %S", p->namelen, p->name, elt->name)); } if(!(def = DefineAttributeN(elt, p->name, p->namelen, AT_cdata, 0, DT_implied, 0, 0))) return error(p, "System error"); if(ParserGetFlag(p, XMLNamespaces)) { require(check_qualname_syntax(p, def->name, "Attribute")); } } for(a = p->xbit.attributes; a; a = a->next) if(a->definition == def) return error(p, "Repeated attribute %.*S", p->namelen, p->name); if(!(a = Malloc(sizeof(*a)))) return error(p, "System error"); 
a->value = 0; /* in case of error */ a->next = p->xbit.attributes; p->xbit.attributes = a; a->definition = def; a->specified = 1; skip_whitespace(s); require(expect(p, '=', "after attribute name")); skip_whitespace(s); c = get(s); unget(s); switch(c) { case BADCHAR: case '"': case '\'': a->quoted = 1; require(parse_string(p, "in attribute value", a->definition->type == AT_cdata ? LT_cdata_attr : LT_tok_attr, &normalised)); a->value = p->pbuf; Consume(p->pbuf); break; default: if(ParserGetFlag(p, ErrorOnUnquotedAttributeValues)) return error(p, "Value of attribute is unquoted"); a->quoted = 0; require(parse_nmtoken(p, "in unquoted attribute value")); CopyName(a->value); break; } if(ParserGetFlag(p, Validate)) { if(p->standalone == SDD_yes && normalised && a->definition->is_externally_declared) { require(validity_error(p, "Externally declared attribute %S for " "element %S was normalised in document declared standalone", a->definition->name, elt->name)); } require(validate_attribute(p, a->definition, elt, a->value)); } return 0; } static WhiteSpaceMode process_xml_space(Parser p, const Char *value) { static Char _preserve[9] = {'p','r','e','s','e','r','v','e',0}; static Char _default[8] = {'d','e','f','a','u','l','t',0}; Char buf[9]; const Char *v; int i; /* It's possible that it hasn't been normalised (sigh) */ for(v=value; is_xml_whitespace(*v); v++) ; for(i=0; i<8; i++) { if(!v[i] || is_xml_whitespace(v[i])) break; buf[i] = v[i]; } buf[i] = '\0'; for(; v[i]; i++) if(!is_xml_whitespace(v[i])) /* If you want validation, you know where to find it */ return WSM_unspecified; if(Strcmp(v, _preserve) == 0) return WSM_preserve; if(Strcmp(v, _default) == 0) return WSM_default; return WSM_unspecified; } static int transcribe(Parser p, int back, int count) { ExpandBuf(p->pbuf, p->pbufnext + count); memcpy(p->pbuf + p->pbufnext, p->source->line + p->source->next - back, count * sizeof(Char)); p->pbufnext += count; return 0; } /* Called after pushing back the first character 
of the pcdata */

/*
 * Read a run of character data into p->pbuf and set p->xbit up as an
 * XBIT_pcdata bit.
 *
 * Character data is rejected with an error in the prolog and in the
 * epilog.  Otherwise the current input line is scanned; `count' is the
 * number of characters scanned but not yet copied, and they are copied to
 * p->pbuf with transcribe() whenever something other than plain data is met:
 *
 *  - end of line: continue with the next line; at the end of an entity
 *    stop, unless MergePCData is set, in which case finished entities are
 *    popped and scanning continues;
 *  - '<': stop in front of it, except that a comment is parsed and skipped
 *    when ReturnComments is not set.  Without XMLLessThan, a '<' that is
 *    not followed by '!', '/', '?' or a name start character is ordinary
 *    data;
 *  - '&': ordinary data if IgnoreEntities is set.  Without MergePCData a
 *    reference ends the data collected so far and is itself returned
 *    alone; with it, the reference is processed and scanning continues;
 *  - "]]>": an error when XMLMiscWFErrors is set.
 *
 * When validating, data in an EMPTY element and non-whitespace data in
 * element-only content are validity errors; whitespace in element-only
 * content is flagged in p->xbit.pcdata_ignorable_whitespace.
 *
 * Returns 0 on success, otherwise the result of the failing call.
 */
static int parse_pcdata(Parser p)
{
    int count = 0;
    InputSource s;
    Char *buf;
    int next, buflen;

    if(p->state <= PS_prolog2)
        return error(p, "Character data not allowed in prolog");
    if(p->state == PS_epilog)
        return error(p, "Character data not allowed after body");

    s = p->source;
    buf = s->line;
    next = s->next;
    buflen = s->line_length;

    p->pbufnext = 0;

    while(1)
    {
        if(next == buflen)
        {
            /* End of the line: flush what has been scanned so far */
            s->next = next;
            if(count > 0)
            {
                require(transcribe(p, count, count));
            }
            count = 0;
            if(at_eoe(s))
            {
                if(!ParserGetFlag(p, MergePCData))
                    goto done;
                else
                    pop_while_at_eoe(p);
            }
            /* The source, and with it the line, may have changed */
            s = p->source;
            buf = s->line;
            next = s->next;
            buflen = s->line_length;
            if(next == buflen)
                goto done;      /* must be EOF */
        }

        switch(buf[next++])
        {
        case BADCHAR:
            return error(p, "Input error: %s", s->error_msg);

        case '<':
            if(!ParserGetFlag(p, XMLLessThan))
            {
                /* In nSGML, don't recognise < as markup unless it looks ok */
                if(next == buflen)
                    goto deflt;
                if(buf[next] != '!' && buf[next] != '/' && buf[next] != '?' &&
                   !is_xml_namestart(buf[next]))
                    goto deflt;
            }
            s->next = next;
            if(count > 0)
            {
                /* count+1: the '<' has already been stepped over */
                require(transcribe(p, count+1, count));
            }
            count = 0;
            if(!ParserGetFlag(p, ReturnComments) &&
               buflen >= next + 3 &&
               buf[next] == '!' && buf[next+1] == '-' && buf[next+2] == '-')
            {
                s->next = next + 3;
                require(parse_comment(p, 1, 0));
                /* Re-read the line pointer together with its length and
                   position, as the other branches that advance the input
                   do: the comment may have ended on a later input line,
                   and the old pointer need not describe that line. */
                buf = s->line;
                buflen = s->line_length;
                next = s->next;
            }
            else
            {
                /* Leave the '<' to be read as markup next time */
                s->next = next-1;
                goto done;
            }
            break;

        case '&':
            if(ParserGetFlag(p, IgnoreEntities))
                goto deflt;
            if(!ParserGetFlag(p, MergePCData) &&
               (p->pbufnext > 0 || count > 0))
            {
                /* We're returning references as separate bits, and we've
                   come to one, and we've already got some data to return,
                   so return what we've got and get the reference next time.
                 */
                s->next = next-1;
                if(count > 0)
                {
                    require(transcribe(p, count, count));
                }
                goto done;
            }
            if(buflen >= next+1 && buf[next] == '#')
            {
                /* It's a character reference */
                s->next = next+1;
                if(count > 0)
                {
                    /* count+2: "&#" has already been stepped over */
                    require(transcribe(p, count+2, count));
                }
                count = 0;
                require(parse_character_reference(p,
                                ParserGetFlag(p, ExpandCharacterEntities)));
                next = s->next;
                if(!ParserGetFlag(p, MergePCData))
                    goto done;
            }
            else
            {
                /* It's a general entity reference */
                s->next = next;
                if(count > 0)
                {
                    require(transcribe(p, count+1, count));
                }
                count = 0;
                require(parse_reference(p, 0,
                                ParserGetFlag(p, ExpandGeneralEntities), 1));
                /* The reference may have changed the current source */
                s = p->source;
                buf = s->line;
                buflen = s->line_length;
                next = s->next;
                if(!ParserGetFlag(p, MergePCData))
                    goto done;
            }
            break;

        case ']':
            if(ParserGetFlag(p, XMLMiscWFErrors) &&
               buflen >= next + 2 &&
               buf[next] == ']' && buf[next+1] == '>')
                return error(p, "Illegal character sequence ']]>' in pcdata");
            /* fall through */

        default:
        deflt:
            count++;
            break;
        }
    }

 done:
    ExpandBuf(p->pbuf, 0);      /* In case we got nothing */
    p->pbuf[p->pbufnext++] = 0;
    p->xbit.type = XBIT_pcdata;
    p->xbit.pcdata_chars = p->pbuf;
    Consume(p->pbuf);
    p->xbit.pcdata_ignorable_whitespace = 0;

    if(ParserGetFlag(p, Validate))
    {
        ElementDefinition e = VectorLast(p->element_stack).definition;

        if(e->type == CT_empty)
        {
            require(validity_error(p,
                                   "PCDATA not allowed in EMPTY element %S",
                                   e->name));
        }
        else if(e->type == CT_element)
        {
            Char *t;

            /* Find the first character that is not whitespace, if any */
            for(t = p->xbit.pcdata_chars; *t; t++)
                if(!is_xml_whitespace(*t))
                    break;

            if(*t)
            {
                require(validity_error(p,
                                "Content model for %S does not allow PCDATA",
                                e->name));
            }
            else
            {
                p->xbit.pcdata_ignorable_whitespace = 1;
                if(p->standalone == SDD_yes && e->is_externally_declared)
                {
                    require(validity_error(p,
                                "Ignorable whitespace in "
                                "externally declared element %S in document declared standalone",
                                e->name));
                }
            }
        }
    }

    return 0;
}

/* Called after reading '