#include #include #include #ifdef FOR_LT #include "lt-memory.h" #include "nsllib.h" #define ERR(m) LT_ERROR(NECHAR,m) #define ERR1(m,x) LT_ERROR1(NECHAR,m,x) #define ERR2(m,x,y) LT_ERROR2(NECHAR,m,x,y) #define ERR3(m,x,y,z) LT_ERROR3(NECHAR,m,x,y,z) #define Malloc salloc #define Realloc srealloc #define Free sfree #else #include "system.h" #define ERR(m) fprintf(stderr,m) #define ERR1(m,x) fprintf(stderr,m,x) #define ERR2(m,x,y) fprintf(stderr,m,x,y) #define ERR3(m,x,y,z) fprintf(stderr,m,x,y,z) #endif #include "charset.h" #include "string16.h" #include "dtd.h" #include "input.h" #include "url.h" #include "ctype16.h" static void internal_reader(InputSource s); static void external_reader(InputSource s); InputSource SourceFromFILE16(const char8 *description, FILE16 *file16) { Entity e; e = NewExternalEntity(0, 0, description, 0, 0); if(!strchr8(description, '/')) { char8 *base = default_base_url(); EntitySetBaseURL(e, base); Free(base); } return NewInputSource(e, file16); } InputSource SourceFromStream(const char8 *description, FILE *file) { FILE16 *file16; if(!(file16 = MakeFILE16FromFILE(file, "r"))) return 0; return SourceFromFILE16(description, file16); } InputSource EntityOpen(Entity e) { FILE16 *f16; char8 *r_url; if(e->type == ET_external) { const char8 *url = EntityURL(e); if(!url || !(f16 = url_open(url, 0, "r", &r_url))) return 0; if(r_url && !e->base_url) EntitySetBaseURL(e, r_url); Free(r_url); } else { f16 = MakeFILE16FromString((char *)e->text, -1, "r"); } return NewInputSource(e, f16); } InputSource NewInputSource(Entity e, FILE16 *f16) { InputSource source; if(!(source = Malloc(sizeof(*source)))) return 0; source->line = 0; source->line_alloc = 0; source->line_length = 0; source->expecting_low_surrogate = 0; source->complicated_utf8_line = 0; source->line_is_incomplete = 0; source->next = 0; source->seen_eoe = 0; source->entity = e; source->reader = (e->type == ET_external) ? external_reader :internal_reader; source->file16 = f16; source->bytes_consumed = 0; source->bytes_before_current_line = 0; source->line_end_was_cr = 0; source->line_number = 0; source->not_read_yet = 1; source->read_carefully = 0; source->nextin = source->insize = 0; source->parent = 0; source->seen_error = 0; strcpy(source->error_msg, "no error (you should never see this)"); return source; } void SourceClose(InputSource source) { Fclose(source->file16); if(source->entity->type == ET_external) Free(source->line); Free(source); } int SourceLineAndChar(InputSource s, int *linenum, int *charnum) { Entity e = s->entity, f = e->parent; if(e->type == ET_external) { *linenum = s->line_number; *charnum = s->next; return 1; } if(f && f->type == ET_external) { if(e->matches_parent_text) { *linenum = e->line_offset + s->line_number; *charnum = (s->line_number == 0 ? e->line1_char_offset : 0) + s->next; return 1; } else { *linenum = e->line_offset; *charnum = e->line1_char_offset; return 0; } } if(f && f->matches_parent_text) { *linenum = f->line_offset + e->line_offset; *charnum = (e->line_offset == 0 ? f->line1_char_offset : 0) + e->line1_char_offset; return 0; } return -1; } void SourcePosition(InputSource s, Entity *entity, int *byte_offset) { *entity = s->entity; *byte_offset = SourceTell(s); } int SourceTell(InputSource s) { #if CHAR_SIZE == 8 return s->bytes_before_current_line + s->next; #else switch(s->entity->encoding) { case CE_ISO_10646_UCS_2B: case CE_UTF_16B: case CE_ISO_10646_UCS_2L: case CE_UTF_16L: return s->bytes_before_current_line + 2 * s->next; case CE_ISO_8859_1: case CE_ISO_8859_2: case CE_ISO_8859_3: case CE_ISO_8859_4: case CE_ISO_8859_5: case CE_ISO_8859_6: case CE_ISO_8859_7: case CE_ISO_8859_8: case CE_ISO_8859_9: case CE_unspecified_ascii_superset: return s->bytes_before_current_line + s->next; case CE_UTF_8: if(s->complicated_utf8_line) { /* examine earlier chars in line to see how many bytes they used */ int i, c, n; /* We cache the last result to avoid N^2 slowness on very long lines. Thanks to Gait Boxman for suggesting this. */ if(s->next < s->cached_line_char) { /* Moved backwards in line; doesn't happen, I think */ s->cached_line_char = 0; s->cached_line_byte = 0; } n = s->cached_line_byte; for(i = s->cached_line_char; i < s->next; i++) { c = s->line[i]; if(c <= 0x7f) n += 1; else if(c <= 0x7ff) n += 2; else if(c >= 0xd800 && c <= 0xdfff) /* One of a surrogate pair, count 2 each */ n += 2; else if(c <= 0xffff) n += 3; else if(c <= 0x1ffff) n += 4; else if(c <= 0x3ffffff) n += 5; else n += 6; } s->cached_line_char = s->next; s->cached_line_byte = n; return s->bytes_before_current_line + n; } else return s->bytes_before_current_line + s->next; default: return -1; } #endif } int SourceSeek(InputSource s, int byte_offset) { s->line_length = 0; s->next = 0; s->seen_eoe = 0; s->bytes_consumed = s->bytes_before_current_line = byte_offset; s->nextin = s->insize = 0; /* XXX line number will be wrong! */ s->line_number = -999999; return Fseek(s->file16, byte_offset, SEEK_SET); } /* reader for internal entities, doesn't need to do any encoding translation */ static void internal_reader(InputSource s) { /* XXX reconsider use of FILE16 for internal entities */ struct _FILE16 { void *handle; int handle2, handle3; /* we don't need the rest here */ }; Char *p; struct _FILE16 *f16 = (struct _FILE16 *)s->file16; if(!*(Char *)((char *)f16->handle + f16->handle2)) { s->line_length = 0; return; } s->line = (Char *)((char *)f16->handle + f16->handle2); for(p=s->line; *p && *p != '\n'; p++) ; if(*p) p++; f16->handle2 = (char *)p - (char *)f16->handle; s->line_length = p - s->line; s->bytes_before_current_line = f16->handle2; s->next = 0; if(s->not_read_yet) s->not_read_yet = 0; else s->line_number++; return; } /* * Translate bytes starting at s->inbuf[s->nextin] until end of line * or until s->nextin == s->insize. * The output is placed starting at s->line[s->nextout], which must * have enough space. * Returns zero at end of line or error, one if more input is needed. * In the case of an error (encoding error or illegal XML character) we * set s->seen_error and put a SUB (ctrl-Z) in the output as a marker. */ #define SETUP \ int c; /* can't use Char, it might be >0x10000 */ \ \ /* local copies of fields of s, that are not modified */ \ \ unsigned char * const inbuf = s->inbuf; \ const int insize = s->insize; \ const int startin = s->nextin; \ Char * const outbuf = s->line; \ \ /* local copies of fields of s, that are modified (and restored) */ \ \ int nextin = s->nextin; \ int nextout = s->line_length; \ int ignore_linefeed = s->ignore_linefeed; \ #define ERROR_CHECK \ if(c == -1) \ { \ /* There was an error. Put a SUB character (ctl-Z) in \ as a marker, and end the line. */ \ outbuf[nextout++] = BADCHAR; \ s->seen_error = 1; \ goto end_of_line; \ } #define LINEFEED \ if(c == '\n' && ignore_linefeed) \ { \ /* Ignore lf at start of line if last line ended with cr */ \ ignore_linefeed = 0; \ s->bytes_before_current_line += (nextin - startin); \ continue; \ } \ \ ignore_linefeed = 0; \ \ if(c == '\r') \ { \ s->line_end_was_cr = 1; \ c = '\n'; \ } #define OUTPUT \ outbuf[nextout++] = c; \ \ if(c == '\n') \ goto end_of_line #define OUTPUT_WITH_SURROGATES \ if(c >= 0x10000) \ { \ /* Use surrogates */ \ outbuf[nextout++] = ((c - 0x10000) >> 10) + 0xd800; \ outbuf[nextout++] = ((c - 0x10000) & 0x3ff) + 0xdc00; \ } \ else \ outbuf[nextout++] = c; \ \ if(c == '\n') \ goto end_of_line #define MORE_BYTES \ more_bytes: \ s->nextin = nextin; \ s->line_length = nextout; \ s->ignore_linefeed = ignore_linefeed; \ return 1 \ #define END_OF_LINE \ end_of_line: \ s->nextin = nextin; \ s->line_length = nextout; \ s->ignore_linefeed = ignore_linefeed; \ return 0 #if CHAR_SIZE == 8 static int translate_8bit(InputSource s) { SETUP; while(nextin < insize) { c = inbuf[nextin++]; if(!is_xml_legal(c)) { sprintf(s->error_msg, "Illegal character <0x%x> at file offset %d", c, s->bytes_consumed + nextin - startin - 1); c = -1; } ERROR_CHECK; LINEFEED; OUTPUT; } MORE_BYTES; END_OF_LINE; } #else static int translate_latin(InputSource s) { CharacterEncoding enc = s->entity->encoding; int *to_unicode = iso_to_unicode[enc - CE_ISO_8859_2]; SETUP; while(nextin < insize) { c = to_unicode[inbuf[nextin++]]; if(c == -1) { sprintf(s->error_msg, "Illegal byte <0x%x> for encoding %s at file offset %d", inbuf[nextin-1], CharacterEncodingName[enc], s->bytes_consumed + nextin - 1 - startin); } else if(!is_xml_legal(c)) { sprintf(s->error_msg, "Illegal character <0x%x> " "immediately before file offset %d", c, s->bytes_consumed + nextin - startin); c = -1; } ERROR_CHECK; LINEFEED; OUTPUT; } MORE_BYTES; END_OF_LINE; } static int translate_latin1(InputSource s) { SETUP; while(nextin < insize) { c = inbuf[nextin++]; if(!is_xml_legal(c)) { sprintf(s->error_msg, "Illegal character <0x%x> " "immediately before file offset %d", c, s->bytes_consumed + nextin - startin); c = -1; } ERROR_CHECK; LINEFEED; OUTPUT; } MORE_BYTES; END_OF_LINE; } static int translate_utf8(InputSource s) { int more, i, mincode; SETUP; while(nextin < insize) { c = inbuf[nextin++]; if(c <= 0x7f) goto gotit; else if(c <= 0xc0 || c >= 0xfe) { sprintf(s->error_msg, "Illegal UTF-8 start byte <0x%x> at file offset %d", c, s->bytes_consumed + nextin - 1 - startin); c = -1; goto gotit; } else if(c <= 0xdf) { c &= 0x1f; more = 1; mincode = 0x80; } else if(c <= 0xef) { c &= 0x0f; more = 2; mincode = 0x800; } else if(c <= 0xf7) { c &= 0x07; more = 3; mincode = 0x10000; } else if(c <= 0xfb) { c &= 0x03; more = 4; mincode = 0x200000; } else { c &= 0x01; more = 5; mincode = 0x4000000; } if(nextin+more > insize) { nextin--; goto more_bytes; } s->complicated_utf8_line = 1; s->cached_line_char = 0; s->cached_line_byte = 0; for(i=0; ierror_msg, "Illegal UTF-8 byte %d <0x%x> at file offset %d", i+2, t, s->bytes_consumed + nextin - 1 - startin); break; } c = (c << 6) + (t & 0x3f); } if(c < mincode && c != -1) { sprintf(s->error_msg, "Illegal (non-shortest) UTF-8 sequence for " "character <0x%x> " "immediately before file offset %d", c, s->bytes_consumed + nextin - startin); c = -1; } gotit: if(c > 0x110000 || (c >=0 && c < 0x10000 && !is_xml_legal(c))) { sprintf(s->error_msg, "Illegal character <0x%x> " "immediately before file offset %d", c, s->bytes_consumed + nextin - startin); c = -1; } ERROR_CHECK; LINEFEED; OUTPUT_WITH_SURROGATES; if(c == '>' && s->read_carefully) { s->line_is_incomplete = 1; goto end_of_line; } } MORE_BYTES; END_OF_LINE; } static int translate_utf16(InputSource s) { int le = (s->entity->encoding == CE_ISO_10646_UCS_2L || s->entity->encoding == CE_UTF_16L); SETUP; while(nextin < insize) { if(nextin+2 > insize) goto more_bytes; if(le) c = (inbuf[nextin+1] << 8) + inbuf[nextin]; else c = (inbuf[nextin] << 8) + inbuf[nextin+1]; nextin += 2; if(c >= 0xdc00 && c <= 0xdfff) /* low (2nd) surrogate */ { if(s->expecting_low_surrogate) s->expecting_low_surrogate = 0; else { sprintf(s->error_msg, "Unexpected low surrogate <0x%x> " "at file offset %d", c, s->bytes_consumed + nextin - startin - 2); c = -1; } } else if(s->expecting_low_surrogate) { sprintf(s->error_msg, "Expected low surrogate but got <0x%x> " "at file offset %d", c, s->bytes_consumed + nextin - startin - 2); c = -1; } if(c >= 0xd800 && c <= 0xdbff) /* high (1st) surrogate */ s->expecting_low_surrogate = 1; if(c > 0x110000 || /* surrogates are legal in utf-16 */ (c >= 0 && c < 0xd800 && !is_xml_legal(c)) || (c > 0xdfff && c < 0x10000 && !is_xml_legal(c))) { sprintf(s->error_msg, "Illegal character <0x%x> " "immediately before file offset %d", c, s->bytes_consumed + nextin - startin); c = -1; } ERROR_CHECK; LINEFEED; OUTPUT; } MORE_BYTES; END_OF_LINE; } #endif static void external_reader(InputSource s) { int startin = s->nextin; int (*trans)(InputSource); int continuing_incomplete_line = s->line_is_incomplete; if(s->seen_error) return; s->line_is_incomplete = 0; if(!continuing_incomplete_line) { s->ignore_linefeed = s->line_end_was_cr; s->line_end_was_cr = 0; s->complicated_utf8_line = 0; s->line_length = 0; s->bytes_before_current_line = s->bytes_consumed; s->next = 0; } #if CHAR_SIZE == 8 trans = translate_8bit; #else switch(s->entity->encoding) { case CE_ISO_646: /* should really check for >127 in this case */ case CE_ISO_8859_1: case CE_unspecified_ascii_superset: trans = translate_latin1; break; case CE_ISO_8859_2: case CE_ISO_8859_3: case CE_ISO_8859_4: case CE_ISO_8859_5: case CE_ISO_8859_6: case CE_ISO_8859_7: case CE_ISO_8859_8: case CE_ISO_8859_9: trans = translate_latin; break; case CE_UTF_8: trans = translate_utf8; break; case CE_ISO_10646_UCS_2B: case CE_UTF_16B: case CE_ISO_10646_UCS_2L: case CE_UTF_16L: trans=translate_utf16; break; default: assert(1==0); break; } #endif while(1) { /* There are never more characters than bytes in the input */ if(s->line_alloc < s->line_length + (s->insize - s->nextin)) { s->line_alloc = s->line_length + (s->insize - s->nextin); s->line = Realloc(s->line, s->line_alloc * sizeof(Char)); } if(trans(s) == 0) { s->bytes_consumed += (s->nextin - startin); if(s->not_read_yet) s->not_read_yet = 0; else if(!continuing_incomplete_line) s->line_number++; return; } else { int i, bytes_read, remaining = 0; /* more input needed */ /* Copy down any partial character */ remaining = s->insize - s->nextin; for(i=0; iinbuf[i] = s->inbuf[s->nextin + i]; /* Get another block */ s->bytes_consumed += (s->nextin - startin); bytes_read = Readu(s->file16, s->inbuf+remaining, sizeof(s->inbuf)-remaining); s->nextin = startin = 0; if(bytes_read <= 0) { if(remaining > 0) { /* EOF or error in the middle of a character */ sprintf(s->error_msg, "EOF or error inside character at " "file offset %d", s->bytes_consumed + remaining); /* There must be space because there is unconsumed input */ s->line[s->line_length++] = BADCHAR; s->seen_error = 1; } s->insize = 0; if(s->not_read_yet) s->not_read_yet = 0; else if(!continuing_incomplete_line) s->line_number++; return; } s->insize = bytes_read + remaining; } } } void determine_character_encoding(InputSource s) { Entity e = s->entity; int nread; unsigned char *b = (unsigned char *)s->inbuf; b[0] = b[1] = b[2] = b[3] = 0; while(s->insize < 4) { nread = Readu(s->file16, s->inbuf + s->insize, 4 - s->insize); if(nread == -1) return; if(nread == 0) break; s->insize += nread; } #if 0 if(b[0] == 0 && b[1] == 0 && b[2] == 0 && b[3] == '<') e->encoding = CE_ISO_10646_UCS_4B; else if(b[0] == '<' && b[1] == 0 && b[2] == 0 && b[3] == 0) e->encoding = CE_ISO_10646_UCS_4L; else #endif if(b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf) { e->encoding = CE_UTF_8; s->nextin = 3; s->bytes_consumed = 3; } else if(b[0] == 0xfe && b[1] == 0xff) { e->encoding = CE_UTF_16B; s->nextin = 2; s->bytes_consumed = 2; } else if(b[0] == 0 && b[1] == '<' && b[2] == 0 && b[3] == '?') e->encoding = CE_UTF_16B; else if(b[0] == 0xff && b[1] == 0xfe) { e->encoding = CE_UTF_16L; s->nextin = 2; s->bytes_consumed = 2; } else if(b[0] == '<' && b[1] == 0 && b[2] == '?' && b[3] == 0) e->encoding = CE_UTF_16L; else { #if CHAR_SIZE == 8 e->encoding = CE_unspecified_ascii_superset; #else e->encoding = CE_UTF_8; s->read_carefully = 1; #endif } } int get_with_fill(InputSource s) { int old_length = s->next; int old_cu8l = s->complicated_utf8_line; int old_bbcl = s->bytes_before_current_line; int old_ln = s->line_number; assert(!s->seen_eoe); if(s->seen_error) { s->seen_eoe = 1; return XEOE; } s->reader(s); if(s->line_length == 0) { /* Restore old line */ s->line_length = s->next = old_length; s->complicated_utf8_line = old_cu8l; s->bytes_before_current_line = old_bbcl; s->line_number = old_ln; s->seen_eoe = 1; #if 0 fprintf(stderr, "EOE on %s\n", EntityDescription(s->entity)); #endif return XEOE; } if(s->next == s->line_length) { /* "incomplete" line turned out to be at EOF */ #if 0 fprintf(stderr, "EOE on %s\n", EntityDescription(s->entity)); #endif s->seen_eoe = 1; return XEOE; } #if 0 Fprintf(Stderr, "line (len %d, next %d): |%.*S|\n", s->line_length, s->next, s->line_length, s->line); #endif return s->line[s->next++]; }