/*
 * Copyright (c) 1997-2007, OpenFWTK Development Group
 * All rights reserved. See LICENSE.
 */

/* html.c */

/* Copyright 1997-2000 by Eberhard Mattes
   Donated to the public domain.  No warranty.

   1997-07-19 Initial version
   1997-09-06 Accept more broken HTML comments; "href" configuration
   1997-09-10 REJ_COMMENT and "foo--bar"
   1997-09-13 url_check_path() removed
   1997-10-21 does not end , replace "
   1998-06-08 Don't end comment at '">' if preceded by "<"
   1998-11-12 Accept white space (not just SP) in
   1999-01-01 Accept '&' in attribute values (CTYPE_VALUE)
   1999-07-01 Parsing of incorrect tags, comments, and changed
   1999-08-26 allow/block embed; don't remove
   1999-09-16 allow/block style;

   NOTE(review): the text at this point looks truncated.  Several
   changelog entries above end abruptly, and the sentence below starts
   in the middle; it reads like the tail of the description of
   process_cdata().  Presumably everything that looked like an HTML tag
   was stripped from this copy -- compare against a pristine html.c.

   ) according to POLICY.  END_TAG points to the end tag using
   lower-case letters, e.g., "".  Remove the CDATA for CDATA_DROP, just
   return and let HTML parser handle the CDATA for CDATA_HTML (which
   shouldn't be done for CDATA), or copy the CDATA verbatim for
   CDATA_VERBATIM (dangerous). */

/* TODO: Add another policy which automatically wraps the CDATA in an
   HTML comment. */

/* NOTE(review): the body of process_cdata() is damaged in this copy.
   The long comment inside the for loop contains what looks like loop
   code, no statement outside a comment ever assigns `i', and the
   braces that remain do not balance.  The code tokens are left exactly
   as found; restore the function from a pristine source before
   building. */

static int process_cdata (int c, const char *end_tag,
                          enum cdata_policy policy)
{
  int i;

  DEBUG_ASSERT (!tag_end);      /* Avoid recursion */

  /* Let HTML rewriting mangle the CDATA. */

  if (policy == CDATA_HTML)
    return c;

  for (;;)
    {
      /* We skip quickly over the CDATA, without paying attention to HTML syntax. If there's a "<", we have to look closer, it might be "". Web browsers use some heuristic for ignoring "" embedded in scripts (e.g., in strings). Trying to resemble this algorithm, we track single and double quotes (those may be nested) and javascript comments (// ). Please note that, opposing to standards and common sense, commented out *DOES* terminate the script, because browsers behave this way.
         We also replace "' && c != EMI_EOF && lower_table[c] == end_tag[i]) { if (i >= 2) tag_name[i-2] = (char)c; ++i; c = slow_getc (); } if (c == EMI_EOF) { if (policy == CDATA_VERBATIM) { /* Copy the buffered characters (same code as below). */
              emo_write (out, "<\\/", i < 2 ? i : 3);
              if (i >= 2)
                emo_write (out, tag_name, i - 2);

              /* Attempt to prevent the browser's heuristic from
                 interpreting the script as HTML. */

              emo_puts (out, end_tag);
            }
          return c;
        }
      if (end_tag[i] == '>' && !NAME (c))
        break;
      if (policy == CDATA_VERBATIM)
        {
          /* Copy the buffered characters, replacing "= 2) emo_write (out, tag_name, i - 2); } /* Be careful to not miss "" */
    }

/* We found "" */

/* Handle the applet tag.  When cf_html.block_java.v is set the tag is
   handed to dangerous_tag (); otherwise it is handed to
   output_tag (0).  C is returned unchanged.
   NOTE(review): the comment above ("We found ...") looks like the
   merged remains of the end of process_cdata() and of the original
   header comment of this function. */

static int tag_applet (int c)
{
  /* TODO: look at attributes, configuration */

  if (cf_html.block_java.v)
    dangerous_tag ();
  else
    output_tag (0);
  return c;
}


/* Process "". */

/* Handle the script tag.

   If cf_html.block_javascript.v is set, the tag goes to
   dangerous_tag () and, unless this is an end tag (tag_end),
   process_cdata () is called with CDATA_DROP.

   Otherwise, for a start tag, script_lang is set from the LANGUAGE
   attribute: SCRIPT_JAVASCRIPT by default and for "javascript" and
   "jscript", SCRIPT_VBSCRIPT for "vbscript", SCRIPT_PERLSCRIPT for
   "perlscript", SCRIPT_UNKNOWN for any other value.  The tag is then
   emitted with output_tag (0) and, unless it is an end tag or
   tag_slashed is still set, process_cdata () is called with the
   cf_html.script policy.

   Returns C, or the value returned by process_cdata () when that was
   called.

   NOTE(review): both calls pass an empty string as the end tag;
   presumably the original literal was lost together with the other
   tag-like text in this copy -- verify against a pristine source. */

static int tag_script (int c)
{
  /* Well, we block any script languages, not just JavaScript and
     LiveScript.  (TODO: look for the LANGUAGE and TYPE
     attributes.) */

  if (cf_html.block_javascript.v)
    {
      dangerous_tag ();
      if (!tag_end)
        c = process_cdata (c, "", CDATA_DROP); /* Remove */
    }
  else
    {
      if (!tag_end)
        {
          int i;

          /* Default when there is no LANGUAGE attribute. */
          script_lang = SCRIPT_JAVASCRIPT;
          for (i = 0; i < attr_count; ++i)
            {
              struct attr *a = &attr_v[i];
              if (a->name_len == 8
                  && lower_cmpn (a->name, "language", 8) == 0)
                {
                  if (a->value_len == 10
                      && lower_cmpn ((char*) a->value, "javascript", 10) == 0)
                    script_lang = SCRIPT_JAVASCRIPT;
                  else if (a->value_len == 7
                           && lower_cmpn ((char*) a->value, "jscript", 7) == 0)
                    script_lang = SCRIPT_JAVASCRIPT;
                  else if (a->value_len == 8
                           && lower_cmpn ((char*) a->value, "vbscript", 8) == 0)
                    script_lang = SCRIPT_VBSCRIPT;
                  else if (a->value_len == 10
                           && lower_cmpn ((char*) a->value, "perlscript", 10) == 0)
                    script_lang = SCRIPT_PERLSCRIPT;
                  else
                    script_lang = SCRIPT_UNKNOWN;
                }
            }
        }

      /* According to most browsers behavior, we do not allow script tag to
         have embedded termination ( tag, though XHTML 1.0 permits it.
         If we need to change this, we should comment out the next line,
         or (better) make it DOCTYPE - dependent */
      /* Note that the next statement runs for end tags as well; it is
         outside the `!tag_end' block above. */
      if (document_type != DOCTYPE_XHTML)
        tag_slashed = 0;

      output_tag (0);
      if (!tag_end && !tag_slashed)
        c = process_cdata (c, "", cf_html.script);
    }
  return c;
}


/* Process "". */

/* Handle the style tag.  When cf_html.block_style.v is set the tag
   goes to dangerous_tag (), a start tag additionally has its content
   dropped through process_cdata (..., CDATA_DROP), and the function
   returns.
   NOTE(review): the remainder of this function is damaged in this
   copy: the comment that starts with "Protect against" has swallowed
   the rest of the body, so the braces below do not balance.  The
   end-tag literal passed to process_cdata () is empty as well.
   Restore from a pristine source. */

static int tag_style (int c)
{
  if (cf_html.block_style.v)
    {
      dangerous_tag ();
      if (!tag_end)
        c = process_cdata (c, "", CDATA_DROP);
      return c;
    }
  if (!tag_end)
    {
      /* Protect against "", cf_html.style); return c; } /* Flags in the link type table. Note: Numbers ordered by severity (except for LT_STYLESHEET). */

#define LT_OK           0
#define LT_UNKNOWN      1
#define LT_STYLESHEET   2
#define LT_DANGEROUS    3

/* Hash table of REL and REV values for .  This include file is
   generated from "html-rel.tab" by maketable. */

#include "html-rel.h"


/* Parse link types.  Return LT_OK, LT_UNKNOWN, or LT_DANGEROUS. */

/* S points to N characters holding link types separated by spaces
   (only ' ' is treated as a separator).  Each word is lower-cased
   into BUF and looked up in rel_descr; a word that is not found counts
   as LT_UNKNOWN.  LT_STYLESHEET is mapped to LT_DANGEROUS when
   cf_html.block_style.v is set and to LT_OK otherwise.  The result is
   the numerically largest classification seen, or LT_OK when there
   are no words at all.

   A word longer than BUF is looked up in BUF-sized pieces.

   NOTE(review): the scan lets I reach sizeof (buf), i.e. 33, although
   the declaration `32+1' suggests room for a terminator.  Whether
   lower_copy () writes a terminating NUL is not visible here --
   confirm that BUF cannot be overrun by one byte. */

static int parse_link_type (const char *s, size_t n)
{
  char buf[32+1];
  size_t i;
  int rc = LT_OK;
  int nrc;
  const struct hash_entry *he;

  while (n != 0)
    {
      /* Skip the spaces in front of the next word. */
      while (n != 0 && *s == ' ')
        {
          ++s; --n;
        }
      if (n == 0)
        break;

      /* Measure the word, capped at the size of BUF. */
      i = 0;
      while (i < n && i < sizeof (buf) && s[i] != ' ')
        ++i;
      DEBUG_ASSERT (i != 0);
      lower_copy (buf, s, i);
      s += i; n -= i;

      he = find_hash_entry2 (&rel_descr, buf, i);
      nrc = LT_UNKNOWN;
      if (he != NULL)
        nrc = he->flags;
      if (nrc == LT_STYLESHEET)
        nrc = cf_html.block_style.v ? LT_DANGEROUS : LT_OK;

      /* Keep the most severe classification. */
      if (nrc > rc)
        rc = nrc;
    }
  return rc;
}


/* Process "". */

/* Handle the link tag.  For a start tag, every REL and REV attribute
   is classified with parse_link_type (); as soon as one of them is
   not LT_OK the tag is handed to dangerous_tag () and the function
   returns.  Otherwise (and for end tags) the tag is handed to
   output_tag (0).  C is returned unchanged. */

static int tag_link (int c)
{
  if (!tag_end)
    {
      int i;

      for (i = 0; i < attr_count; ++i)
        {
          struct attr *a = &attr_v[i];
          if (a->name_len == 3
              && (lower_cmpn (a->name, "rel", 3) == 0
                  || lower_cmpn (a->name, "rev", 3) == 0))
            {
              if (parse_link_type ((char*) a->value, a->value_len) != LT_OK)
                {
                  dangerous_tag ();
                  return c;
                }
            }
        }
    }
  output_tag (0);
  return c;
}


/* Process . */

/* Handle the object tag: dangerous_tag () when cf_html.block_object.v
   is set, output_tag (0) otherwise.  C is returned unchanged. */

static int tag_object (int c)
{
  if (cf_html.block_object.v)
    dangerous_tag ();
  else
    output_tag (0);
  return c;
}


/* Process .
*/ static int tag_embed (int c) { if (cf_html.block_embed.v) dangerous_tag (); else output_tag (0); return c; } /* Return values for handlers. */ #define MA_OUTPUT 0 #define MA_DANGEROUS 1 #define MA_DROP_SILENT 2 #define MA_UNKNOWN_NAME 3 /* Flags in html-meta.tab and passed to handlers. These flags indicate whether the entry applies to "name", "http-equiv", or both For handlers, these flags indicate the actual type of the META tag. */ #define MF_NAME 0x01 /* "name" */ #define MF_HTTP 0x02 /* "http-equiv" */ /* Keep the tag. */ static int meta_keep (struct attr *v, struct attr *scheme, int flags) { return MA_OUTPUT; } /* Benign tag. */ static int meta_default (struct attr *v, struct attr *scheme, int flags) { return MA_OUTPUT; } /* Drop tag silently. */ static int meta_drop_silent (struct attr *v, struct attr *scheme, int flags) { return MA_DROP_SILENT; } /* Process "". Don't care about whether "http-equiv" or "name" is used. */ static int meta_content_type (struct attr *v, struct attr *scheme, int flags) { /* The content type must be "text/html". */ if (v->value_len < 9 || lower_cmpn ((char*) v->value, "text/html", 9) != 0) return MA_DANGEROUS; if (v->value_len > 9) { /* If there's something after "text/html", it must start with a semicolon. */ if (v->value[9] != ';') return MA_DANGEROUS; /* TODO: Check charset */ /* TODO: Reject EBCDIC :-) */ } if (cf_html.drop_meta_content_type.v) return MA_DROP_SILENT; else { charset_type = multibyte_charset ((char*)v->value); return MA_OUTPUT; } } /* Process "". Don't care about whether "http-equiv" or "name" is used. */ static int meta_pragma (struct attr *v, struct attr *scheme, int flags) { if (v->has_value && check_pragma (v->value, v->value_len) == 0) return MA_OUTPUT; else return MA_DANGEROUS; } /* Process "". Don't care about whether "http-equiv" or "name" is used. 
*/ static int meta_refresh (struct attr *v, struct attr *scheme, int flags) { if (v->has_value && check_refresh (v->value, v->value_len, cf_html.block_javascript.v) == 0) return MA_OUTPUT; else return MA_DANGEROUS; } /* Process "". Don't care about whether "http-equiv" or "name" is used. */ static int meta_set_cookie (struct attr *v, struct attr *scheme, int flags) { int r; size_t nlen; if (!v->has_value) return MA_DANGEROUS; r = cookies_parse_set_cookie (v->value); if (r != 0) return MA_DANGEROUS; nlen = cookies_rebuild_length (); /* Don't overwrite the old value as cookies.c still has pointers into the old value. */ v->value = fastheap_alloc (&attr_heap, nlen + 1); if (nlen > v->value_len) v->value_ref = fastheap_alloc (&attr_heap, nlen); memset (v->value_ref, REF_AUTO, nlen); if (cookies_rebuild (v->value, nlen) != 0) ALWAYS_ASSERT (0); v->value_len = nlen; v->value[nlen] = 0; return MA_OUTPUT; } /* Hash table of "http-equiv" and "name" values for . This include file is generated from "html-meta.tab" by maketable. */ #include "html-meta.h" /* Process . */ static int tag_meta (int c) { int i, rc, log, flags = 0; struct attr *a_content = NULL; struct attr *a_name = NULL; struct attr *a_scheme = NULL; const struct hash_entry *he; int (*handler) (struct attr *, struct attr *, int); if (tag_end) goto danger; /* Collect the attributes. Reject the tag if there are attributes not supported for . Reject the tag if any attribute is given more than once or if the value is missing. */ for (i = 0; i < attr_count; ++i) { struct attr *a = &attr_v[i]; /* As the attributes we're interested in have mostly different lengths, there's no point in replacing lower_cmpn() with lower_copy() and memcmp(). 
*/ if (a->name_len == 7 && lower_cmpn (a->name, "content", 7) == 0) { if (a_content != NULL || !a->has_value) goto danger; a_content = a; } else if ((a->name_len == 10 && lower_cmpn (a->name, "http-equiv", 10) == 0) || (a->name_len == 4 && lower_cmpn (a->name, "name", 4) == 0)) { /* Don't accept both "name" and "http-equiv". */ if (a_name != NULL || !a->has_value) goto danger; a_name = a; if (a->name_len == 4) flags = MF_NAME; /* "name" */ else flags = MF_HTTP; /* "http-equiv" */ } else if (a->name_len == 6 && lower_cmpn (a->name, "scheme", 6) == 0) { if (a_scheme != NULL || !a->has_value) goto danger; a_scheme = a; } else if ((a->name_len == 3 && lower_cmpn (a->name, "dir", 3) == 0) || (a->name_len == 4 && lower_cmpn (a->name, "lang", 4) == 0)) { /* We ignore these attributes. It doesn't matter if there's no value. */ } else goto danger; } /* The "content" and "http-equiv" (or "name") attributes are required. */ if (a_content == NULL || a_name == NULL) goto danger; /* Remove leading and trailing spaces. */ trim_value (a_name); trim_value (a_content); if (a_scheme != NULL) trim_value (a_scheme); /* Dispatch according to the value of the "http-equiv" or "name" attribute. 
*/ DEBUG_ASSERT (a_name->value_len <= ATTR_VALUE_LIMIT); lower_copy ((char*) attr_value, (char*) a_name->value, a_name->value_len); he = find_hash_entry2 (&meta_descr, (char*) attr_value, a_name->value_len); if (he == NULL) { if (flags == MF_NAME) rc = MA_UNKNOWN_NAME; else rc = MA_DANGEROUS; } else if (flags == MF_NAME && !(he->flags & MF_NAME)) { rc = MA_DANGEROUS; syslog (LLEV, "NAME used instead of HTTP-EQUIV for "); } else if (flags == MF_HTTP && !(he->flags & MF_HTTP)) { rc = MA_DANGEROUS; syslog (LLEV, "HTTP-EQUIV used instead of NAME for "); } else { handler = he->handler; ALWAYS_ASSERT (META_HANDLER (handler)); /* Paranoia */ rc = handler (a_content, a_scheme, flags); } if (rc == MA_OUTPUT) output_tag (0); else if (rc != MA_DROP_SILENT) { if (rc == MA_UNKNOWN_NAME) { log = cf_html.meta_unknown_name.log; remove_tag (&cf_html.meta_unknown_name, "unknown", EMI_EOF); } else { log = cf_html.tag_dangerous.log; dangerous_tag (); } if (log) syslog (LLEV, " attributes: %s=%.128s %s=%.128s", a_name->name, a_name->value, a_content->name, a_content->value); } return c; danger: dangerous_tag (); return c; } /* Leave the tag unprocessed. */ static int tag_default (int c) { output_tag (0); return c; } /* HTML levels (currently not used). */ #define HTML_20 0x01 /* HTML 2.0 (RFC 1866) */ #define HTML_32 0x02 /* HTML 3.2 */ #define HTML_40 0x04 /* HTML 4.0 (draft) */ #define HTML_MS 0x08 /* Microsoft extensions */ #define HTML_NS 0x10 /* Netscape extensions */ #define HTML_MO 0x20 /* Mosaic extensions */ #define HTML_UNKNOWN 0x00 /* Unknown origin */ /* Table of HTML tags: `tag_hash'. This include file is generated from "html-tag.tab" by maketable. */ #include "html-tag.h" static int process_tag (int c) { int (*handler)(int); const struct hash_entry *he; if (tag_type == HTML_TAG) { if (!tag_ok) { invalid_tag (c); return c; } else { /* Find the handler for this tag. 
*/ lower_copy (lower_tag_name, tag_name, tag_name_len); he = find_hash_entry2 (&tag_descr, lower_tag_name, tag_name_len); if (he == NULL) { unknown_tag (); /* Unknown tag */ return c; } else { /* Known tag which is to be processed. */ handler = he->handler; ALWAYS_ASSERT (TAG_HANDLER (handler)); /* Paranoia */ return handler (c); } } } return c; } /* ================================= HTML ================================== */ /* Filter HTML while copying from IN to OUT. Limit the number of characters read to SIZE, unless SIZE is -1 (see emi_limit() for this special value). TODO: Limit the size arbitrarily if SIZE is -1. */ void html_copy (EMO_FILE *out0, EMI_FILE *in0, long size) { int c; out = out0; in = in0; in_start = emi_amount (in, 1); out_start = emo_amount (out, 1); emi_limit (in, size); fastheap_init (&attr_heap, 8192); /* TODO: fastheap_add_static() */ attr_value = xmalloc (ATTR_VALUE_LIMIT + 1); attr_value_ref = xmalloc (ATTR_VALUE_LIMIT); c = slow_getc (); while (c != EMI_EOF && !emo_error (out)) { /* Make the most common case fast. No error checking in this loop (see outer loop). */ while (c != EMI_EOF && !ESCAPE (c) && (charset_type == CHARSET_GENERIC)) { fast_putc (c); c = fast_getc (); } switch (c) { case EMI_EOF: break; case '<': c = parse_tag (); c = process_tag (c); fastheap_reset (&attr_heap); break; case '>': output_char (c); c = fast_getc (); break; case '&': c = parse_ref (); process_ref (); break; default: if ((c >= 0x20) && (c < 0x7f)) { fast_putc (c); c = fast_getc (); break; } if ((c <0x20) || (c == 0x7f) || (charset_type == CHARSET_GENERIC)) { output_char (c); c = fast_getc (); break; } else if (charset_type == CHARSET_UTF8) { /* * From RFC2278: * In UTF-8, characters are encoded using sequences of 1 to 6 octets. * The only octet of a "sequence" of one has the higher-order bit set to * 0, the remaining 7 bits being used to encode the character value. 
In * a sequence of n octets, n>1, the initial octet has the n higher-order * bits set to 1, followed by a bit set to 0. The remaining bit(s) of * that octet contain bits from the value of the character to be * encoded. The following octet(s) all have the higher-order bit set to * 1 and the following bit set to 0, leaving 6 bits in each to contain * bits from the character to be encoded. */ int nc; if (!(c & 0x40) && (c > 0xfd)) { syslog(LLEV,"securityalert: bad unicode character"); return; } for (nc = c; nc & 0x80 ; nc <<= 1) { if ((nc != c) && !((c & 0x80) && !(c & 0x40))) { syslog(LLEV,"securityalert: bad unicode character"); return; } fast_putc (c); c = fast_getc (); if (c == EMI_EOF) break; } } else { syslog(LLEV,"fwtksyserr: charset class not implemented"); return; } } } if (emo_error (out)) syslog (LLEV, "write failed: %s", strerror(errno)); } /* ============================= CONFIGURATION ============================= */ /* Initialize `cf_html' to the default configuration. */ void html_default_config (void) { cf_html.block_java.v = 1; cf_html.block_java.force = 0; cf_html.block_object.v = 1; cf_html.block_object.force = 0; cf_html.block_embed.v = 1; cf_html.block_embed.force = 0; cf_html.block_javascript.v = 1; cf_html.block_javascript.force = 0; cf_html.block_style.v = 1; cf_html.block_style.force = 0; cf_html.use_callback = 0; cf_html.log_incorrect_tags = 0; cf_html.log_key = 0; cf_html.log_script_macros = 0; cf_html.attr_value_limit = 1024; /* RFC 1866 */ cf_html.drop_meta_content_type.v = 0; cf_html.drop_meta_content_type.force = 0; cf_html.tag_invalid.output = REJ_ESCAPE; cf_html.tag_invalid.log = 0; cf_html.tag_dangerous.output = REJ_PREFIX; cf_html.tag_dangerous.log = 0; cf_html.tag_unknown.output = REJ_PREFIX; cf_html.tag_unknown.log = 0; cf_html.attr_dangerous.output = REJ_PREFIX; cf_html.attr_dangerous.log = 0; cf_html.attr_on.output = REJ_PREFIX; cf_html.attr_on.log = 1; cf_html.attr_unknown.output = REJ_PREFIX; cf_html.attr_unknown.log = 0; 
cf_html.attr_novalue.output = REJ_COPY; cf_html.attr_novalue.log = 0; cf_html.attr_alphanumeric.output = REJ_PREFIX; cf_html.attr_alphanumeric.log = 0; cf_html.ref_unknown.output = REJ_ESCAPE; cf_html.ref_unknown.log = 0; cf_html.meta_unknown_name.output = REJ_PREFIX; cf_html.meta_unknown_name.log = 0; cf_html.script = CDATA_HTML; cf_html.style = CDATA_HTML; }