/* url.c */ /* Copyright 1997-2000 by Eberhard Mattes Donated to the public domain. No warranty. 1997-07-28 Initial version 1997-09-06 "*:PATTERN"; hex escape and url_compare() 1997-09-10 UPF_NODEFPORT 1997-09-13 URL_HTTPS, URL_HTTP_OR_HTTPS; url_check_path() removed 1997-10-21 url_compare(): Treat IP address specially 1999-03-17 parse_connect() 2000-04-15 Make NAME_data const */ #include #include #include #include "firewall.h" #include "libemfw.h" #include "emio.h" #include "squid-gw.h" #define SELECT_CTYPE_URL #include "tables.h" /* References: RFC 1436 Gopher RFC 1630 URI RFC 1738 URL ftp://boombox.micro.umn.edu/pub/gopher/gopher_protocol/Gopher+/Gopher+.txt */ /* ============================== CHARACTERS =============================== */ #define URL_CHAR_TYPE(c,x) (url_char_type[(unsigned char)c] & (x)) #define SCHEME(c) URL_CHAR_TYPE (c, CTYPE_SCHEME) #define DIGIT(c) URL_CHAR_TYPE (c, CTYPE_DIGIT) #define HEXDIGIT(c) URL_CHAR_TYPE (c, CTYPE_HEXDIGIT) #define HPATH(c) URL_CHAR_TYPE (c, CTYPE_HPATH) #define FPATH(c) URL_CHAR_TYPE (c, CTYPE_FPATH) #define USER(c) URL_CHAR_TYPE (c, CTYPE_USER) /* TODO: CTYPE_HOST */ #define HOST(c) (URL_CHAR_TYPE (c, CTYPE_ALPHA|CTYPE_DIGIT) \ || c == '.' || c == '-') #define FTPTYPE(c) ((c) == 'A' || (c) == 'I' || (c) == 'D' \ || (c) == 'a' || (c) == 'i' || (c) == 'd') #define SEARCH(c) ((c) >= 33 && (c) != '%' && (c) != '<' && (c) != '>' \ && (c) != '#') #define WILDCARD(c,f) ((c) == '*' && ((f) & UPF_WILDCARD)) static const unsigned char url_char_type[UCHAR_MAX+1] = { #include "url-ctype.h" }; /* ============================= PARSING A URL ============================= */ /* Evaluate to true if there's an escape at index I of the string S of length N. It's assumed that I < N, that is, that S[I] can be accessed. */ #define ESCAPE(s,i,n) ((s)[(i)] == '%' && (i)+2 < (n) \ && HEXDIGIT ((s)[(i)+1]) && HEXDIGIT ((s)[(i)+2])) /* Check for "//". 
*/ static int url_has_host (struct url *u, const octet *s, int len, int *pi) { int i = *pi; if (i + 1 < len && s[i] == '/' && s[i+1] == '/') { u->scheme_length += 2; *pi = i + 2; return 1; } else return 0; } /* Parse "host" of RFC 1738. */ static int url_host (struct url *u, const octet *s, int len, int i, unsigned flags) { u->host_start = i; while (i < len && (HOST (s[i]) || WILDCARD (s[i], flags))) ++i; u->host_length = i - u->host_start; /* TODO: "foo.bar.123" is not a valid host name (RFC 1738). */ return i; } /* Parse "hostport" of RFC 1738. */ static int url_hostport (struct url *u, const octet *s, int len, int i, unsigned flags) { long port; i = url_host (u, s, len, i, flags); if (i < 0) return -1; u->port_start = i; if (i < len && s[i] == ':') { ++i; if (!(i < len && DIGIT (s[i]))) return -1; port = 0; while (i < len && DIGIT (s[i])) { port = port * 10 + s[i] - '0'; if (port > 65535) return -1; ++i; } u->port = port; } u->port_length = i - u->port_start; return i; } /* Parse "login" of RFC 1738. */ static int url_login (struct url *u, const octet *s, int len, int i, unsigned flags) { u->user_start = i; while (i < len && (USER (s[i]) || ESCAPE (s, i, len))) ++i; u->user_length = i - u->user_start; if (i < len && s[i] == ':' && u->user_length != 0) { ++i; u->user_length += 1; u->password_start = i; while (i < len && (USER (s[i]) || ESCAPE (s, i, len))) ++i; u->password_length = i - u->password_start; } if (i < len && s[i] == '@' && u->user_length != 0) { /* Subsume the "@" character under "user" or "password". */ if (u->password_length != 0) { /* The password part contains at least a ":" character. */ u->password_length += 1; } else u->user_length += 1; return url_hostport (u, s, len, i + 1, flags); } else { /* Restart parsing. */ u->user_length = 0; u->password_length = 0; return url_hostport (u, s, len, u->user_start, flags); } } /* Parse "?search" of RFC 1738. 
*/ static int url_search (struct url *u, const octet *s, int len, int i) { u->query_start = i; if (i < len && s[i] == '?') { ++i; while (i < len && (SEARCH (s[i]) || ESCAPE (s, i, len))) ++i; } u->query_length = i - u->query_start; return i; } /* Parse a URL with scheme "cache_object" (for Squid). */ static int url_cache_object (struct url *u, const octet *s, int len, int i, unsigned flags) { u->path_start = i; /* No checking, no escape */ i = len; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "clsid". TODO! */ static int url_clsid (struct url *u, const octet *s, int len, int i, unsigned flags) { u->path_start = i; /* No checking, no escape */ i = len; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "file". */ static int url_file (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "file://" of RFC 1738. */ if (!url_has_host (u, s, len, &i)) return -1; /* "host" of RFC 1738. */ i = url_host (u, s, len, i, flags); if (i < 0) return -1; /* "/fpath" of RFC 1738. We include the "/" in the url-path, contrary to RFC 1738. */ u->path_start = i; if (i < len && s[i] == '/') { ++i; while (i < len && (FPATH (s[i]) || ESCAPE (s, i, len))) ++i; u->path_length = i - u->path_start; } return i; } /* Parse a URL with scheme "ftp". */ static int url_ftp (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "ftp:" of RFC 1738. */ if (!url_has_host (u, s, len, &i)) return -1; /* "login" of RFC 1738. */ i = url_login (u, s, len, i, flags); if (i < 0) return -1; if (u->port == -1 && !(flags & UPF_NODEFPORT)) u->port = 21; /* "/fpath" of RFC 1738. We include the "/" in the url-path, contrary to RFC 1738. */ u->path_start = i; if (i < len && s[i] == '/') { ++i; while (i < len && (FPATH (s[i]) || ESCAPE (s, i, len))) ++i; u->path_length = i - u->path_start; } /* ";type=" of RFC 1738. TODO: Convert "type" to lower-case. 
*/ u->type_start = i; if (i < len && s[i] == ';') { if (i + 6 >= len || lower_cmpn ((char*)s+i+1, "type=", 5) != 0 || !FTPTYPE (s[i+6])) return -1; i += 7; } u->type_length = i - u->type_start; return i; } /* Parse a URL with scheme "gopher". */ static int url_gopher (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "gopher://" of RFC 1738. */ if (!url_has_host (u, s, len, &i)) return -1; /* "//hostport" of RFC 1738. */ i = url_hostport (u, s, len, i, flags); if (i < 0) return -1; if (u->port == -1 && !(flags & UPF_NODEFPORT)) u->port = 70; u->path_start = i; if (i < len) { if (s[i] != '/') return -1; ++i; } if (i < len) { /* Check the gopher type. */ /* TODO: Gopher+ */ if (s[i] == 0 || strchr ("012456789+ITg", s[i]) == NULL) return -1; /* No characters are reserved. */ i = len; } u->path_length = i - u->path_start; /* RFC 1436 says that we should limit the length of the selector string to 255 characters. Why add 2? One for the "/", one for the gopher type. */ if (u->path_length > 255 + 2) return -1; return i; } /* Parse a URL with scheme "http" or "https". */ static int url_http (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "http://" of RFC 1738. */ if (u->scheme_length != 0 && url_has_host (u, s, len, &i)) { /* "//hostport" of RFC 1738. No user name or password is allowed (says RFC 1738). */ i = url_hostport (u, s, len, i, flags); if (i < 0) return -1; if (i < len && s[i] != '/') return -1; } if (u->port == -1 && !(flags & UPF_NODEFPORT)) u->port = 80; /* "/hpath" of RFC 1738, plus relative paths. We include the "/" in the url-path, contrary to RFC 1738. */ u->path_start = i; if (i < len) { while (i < len && (HPATH (s[i]) || ESCAPE (s, i, len))) ++i; u->path_length = i - u->path_start; /* "?search" of RFC 1738. */ i = url_search (u, s, len, i); if (i < 0) return -1; /* Oops. The BNF of RFC 1738 forgot about "#fragment"! Go with RFC 1630. Additionally allow SP and '#' in fragments. 
*/ u->fragment_start = i; if (i < len && s[i] == '#') { ++i; while (i < len && (HPATH (s[i]) || s[i] == SP || s[i] == '#' || ESCAPE (s, i, len))) ++i; } u->fragment_length = i - u->fragment_start; } return i; } /* Parse a URL with scheme "java". TODO! */ static int url_java (struct url *u, const octet *s, int len, int i, unsigned flags) { u->path_start = i; /* No checking, no escape */ i = len; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "javascript". */ static int url_javascript (struct url *u, const octet *s, int len, int i, unsigned flags) { u->path_start = i; /* No checking, no escape */ i = len; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "mailto". */ static int url_mailto (struct url *u, const octet *s, int len, int i, unsigned flags) { u->path_start = i; while (i < len && (s[i] != '%' || ESCAPE (s, i, len))) ++i; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "news". */ static int url_news (struct url *u, const octet *s, int len, int i, unsigned flags) { u->path_start = i; /* No checking, no escape */ i = len; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "nntp". */ static int url_nntp (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "nntp://" of RFC 1738. */ if (!url_has_host (u, s, len, &i)) return -1; i = url_hostport (u, s, len, i, flags); if (i < 0) return -1; if (i < len && s[i] != '/') return -1; if (u->port == -1 && !(flags & UPF_NODEFPORT)) u->port = 119; u->path_start = i; /* No checking, no escape */ i = len; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "prospero". */ static int url_prospero (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "prospero://" of RFC 1738. 
*/ if (!url_has_host (u, s, len, &i)) return -1; i = url_hostport (u, s, len, i, flags); if (i < 0) return -1; if (i >= len || s[i] != '/') return -1; if (u->port == -1 && !(flags & UPF_NODEFPORT)) u->port = 1525; u->path_start = i; while (i < len && (s[i] != '%' || ESCAPE (s, i, len))) ++i; u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "telnet". */ static int url_telnet (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "telnet:" of RFC 1738. */ if (!url_has_host (u, s, len, &i)) return -1; i = url_login (u, s, len, i, flags); if (i < 0) return -1; if (u->port == -1 && !(flags & UPF_NODEFPORT)) u->port = 23; u->path_start = i; if (i < len) { if (s[i] != '/') return -1; ++i; } u->path_length = i - u->path_start; return i; } /* Parse a URL with scheme "wais". */ static int url_wais (struct url *u, const octet *s, int len, int i, unsigned flags) { /* "wais://" of RFC 1738. */ if (!url_has_host (u, s, len, &i)) return -1; i = url_hostport (u, s, len, i, flags); if (i < 0) return -1; if (i >= len || s[i] != '/') return -1; if (u->port == -1 && !(flags & UPF_NODEFPORT)) u->port = 210; u->path_start = i; while (i < len && (s[i] != '%' || ESCAPE (s, i, len))) ++i; u->path_length = i - u->path_start; return i; } /* Unknown schemes. The "//" goes into the path, not the scheme. No escape. */ static int url_other (struct url *u, const octet *s, int len, int i, unsigned flags) { u->path_start = i; i = len; u->path_length = i - u->path_start; return i; } /* Hash table for schemes. */ #include "url-scheme.h" /* Parse a (possibly encoded) URL. Accept "*" if FLAGS includes UPF_WILDCARD. Don't assign a default port number if FLAGS includes UPF_NODEFPORT. Return 0 on success, -1 on failure. 
*/ int url_parse (struct url *u, const octet *s, int len, unsigned flags) { int i; const struct hash_entry *he; char lower_scheme[13]; int (*handler) (struct url *, const octet *, int, int, unsigned); DEBUG_ASSERT (len >= 0); u->host_start = 0; u->host_length = 0; u->port_start = 0; u->port_length = 0; u->user_start = 0; u->user_length = 0; u->password_start = 0; u->password_length = 0; u->path_start = 0; u->path_length = 0; u->query_start = 0; u->query_length = 0; u->fragment_start = 0; u->fragment_length = 0; u->type_start = 0; u->type_length = 0; u->total_length = 0; u->port = -1; u->scheme_start = 0; u->scheme_length = 0; if (len == 0) { u->scheme = URL_EMPTY; return 0; } /* Special case for URL patterns starting with "*:". */ if ((flags & UPF_WILDCARD) && len >= 2 && s[0] == '*' && s[1] == ':') { u->scheme = URL_WILDCARD; u->path_start = 2; u->path_length = len - 2; u->total_length = len - 2; return 0; } /* Scheme. Escaping is not allowed in the scheme. Note that we allow uppercase letters in the scheme. The caller should convert them to lowercase. */ i = 0; while (i < len && SCHEME (s[i])) ++i; if (i < len && s[i] == ':') { u->scheme_start = 0; u->scheme_length = i + 1; if (i == 0) return -1; if (i < sizeof (lower_scheme)) { lower_copy (lower_scheme, (char*)s, i); he = find_hash_entry2 (&scheme_descr, lower_scheme, i); } else he = NULL; if (he == NULL) { u->scheme = URL_OTHER_SCHEME; i = url_other (u, s, len, i + 1, flags); } else { u->scheme = he->flags; handler = he->handler; ALWAYS_ASSERT (SCHEME_HANDLER (handler)); i = handler (u, s, len, i + 1, flags); } } else if ((flags & UPF_WILDCARD) && i == 4 && len >= 6 && lower_cmpn ((char*)s, "http*:", 6) == 0) { u->scheme_start = 0; u->scheme_length = i + 2; u->scheme = URL_HTTP_OR_HTTPS; i = url_http (u, s, len, i + 2, flags); } else { /* Heuristic: reject the string if any ":" precedes the first "/" or "?" or "#". Otherwise we would take ",:" as relative URL... 
*/ while (i < len && s[i] != ':' && s[i] != '/' && s[i] != '?' && s[i] != '#') ++i; if (s[i] == ':') return -1; u->scheme_start = 0; u->scheme_length = 0; u->scheme = URL_NO_SCHEME; /* Assume HTTP for relative URLs as this function is also used for processing "HREF" in HTML documents. */ if (len == 0) return -1; u->scheme = URL_HTTP; i = url_http (u, s, len, 0, flags); } if (i < 0 || i != len) return -1; u->total_length = len; return 0; } /* ============================ DECODING ESCAPE ============================ */ /* Convert a single hexadecimal digit to a number. */ static int hex_digit (octet c) { if (DIGIT (c)) return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; if (c >= 'A' && c <= 'F') return c - 'A' + 10; DEBUG_ASSERT (0); exit (1); } /* Decode escapes of an URI. */ int url_decode (char *dst, int dst_size, const char *src, int src_len) { int i = 0; while (src_len > 0) { if (i + 1 >= dst_size) return -1; if (*src == '%') { if (src_len < 3 || !HEXDIGIT (src[1]) || !HEXDIGIT (src[2])) return -1; dst[i++] = hex_digit (src[1]) * 16 + hex_digit (src[2]); src += 3; src_len -= 3; } else { dst[i++] = *src++; --src_len; } } dst[i] = 0; return i; } /* ============================ COMPARING URLS ============================= */ /* Helper function for url_compare(). Return 0 on match. */ static int compare (const octet *s1, int n1, const octet *s2, int n2, unsigned flags) { static char buffer[4096]; int nb; /* TODO: Not all schemes use hex escaping. */ if (memchr (s2, '%', n2) != NULL) { nb = url_decode (buffer, sizeof (buffer), (char*) s2, n2); if (nb == -1) return -1; s2 = (unsigned char*) buffer; n2 = nb; } if (flags & UCF_WILDCARD) return lower_match ((char*) s1, n1, (char*) s2, n2) ? 0 : 1; else { if (n1 != n2) return 1; if (flags & UCF_IGNORE_CASE) return lower_cmpn ((char*) s1, (char*) s2, n1); else return memcmp (s1, s2, n1); } } /* Compare two URLs. 
FLAGS may include UCF_IGNORE_CASE (ignore case; note that case is
   always ignored in the server name), UCF_EXACT (compare even fields
   not present in pattern), and UCF_WILDCARD ("*" for matching any
   number of characters, UCF_WILDCARD for now implies
   UCF_IGNORE_CASE).  Return 0 if the URLs do match, a non-zero value
   otherwise.

   PAT_S and PAT_U are the text and the parsed form of the pattern,
   VAL_S and VAL_U those of the URL to be checked.  Unless UCF_EXACT
   is given, a field which is empty in the pattern (or a port of -1)
   matches anything. */

/* TODO: consider "/./" and "/../" when comparing paths */
/* TODO: http://proxy/http://evil.com/ */
/* TODO: http://proxy/evil.com/ */
/* TODO: http://199.99.99.99/... */
/* TODO: *://host */
/* TODO: http://host:* */

int url_compare (const octet *pat_s, const struct url *pat_u,
                 const octet *val_s, const struct url *val_u,
                 unsigned flags)
{
  /* A "*:PATTERN" pattern is matched against the entire value. */
  if (pat_u->scheme == URL_WILDCARD)
    return compare (pat_s + pat_u->path_start, pat_u->path_length,
                    val_s, val_u->total_length, flags);

  /* The tests are sorted by speed, so that mismatches are detected
     quickly.  For matches, the sequence doesn't matter. */

/* Compare field NAME of the pattern to field NAME of the value. */
#define COMPARE(name) \
    compare (pat_s + pat_u->name##_start, pat_u->name##_length, \
             val_s + val_u->name##_start, val_u->name##_length, flags)

/* True if field NAME is to be compared at all. */
#define CHECK(name) \
    ((pat_u->name##_length != 0) || (flags & UCF_EXACT))

  /* Scheme. */

  if (pat_u->scheme == URL_HTTP_OR_HTTPS)
    {
      /* "http*:" matches "http:" and "https:". */
      if (val_u->scheme != URL_HTTP && val_u->scheme != URL_HTTPS)
        return 1;
    }
  else
    {
      if (pat_u->scheme != val_u->scheme)
        return 1;

      /* Compare the strings for unknown schemes. */

      if (pat_u->scheme == URL_OTHER_SCHEME && COMPARE (scheme) != 0)
        return 1;
    }

  /* Port number. */

  if ((pat_u->port != -1 || (flags & UCF_EXACT))
      && pat_u->port != val_u->port)
    return 1;

  /* User name.  Report a loose match if one of the URLs has a user
     name and the other doesn't. */

  if (CHECK (user) && COMPARE (user) != 0)
    return 1;

  /* Password.  Report a loose match if one of the URLs has a password
     and the other doesn't. */

  if (CHECK (password) && COMPARE (password) != 0)
    return 1;

  /* Path.  TODO: "." and "..".  TODO: Prefix. */

  if (CHECK (path) && COMPARE (path) != 0)
    return 1;

  /* Query. */

  if (CHECK (query) && COMPARE (query) != 0)
    return 1;

  /* Fragment. */

  if (CHECK (fragment) && COMPARE (fragment) != 0)
    return 1;

  /* Type. */

  if (CHECK (type) && COMPARE (type) != 0)
    return 1;

  /* Host.  As this test is the slowest one, it comes last.  TODO:
     Support IP addresses, compare IP addresses (aliases!). */

  if (CHECK (host))
    {
      octet paddr[4], pmask[4], vaddr[4], vmask[4];

      if (ipaddr_parsen (pat_s + pat_u->host_start, pat_u->host_length,
                         paddr, pmask,
                         (flags & UCF_WILDCARD) ? IAP_WILDCARD : 0) == 0)
        {
          /* Pattern is an IP address.  Report a mismatch if the value
             isn't an IP address, */

          if (ipaddr_parsen (val_s + val_u->host_start, val_u->host_length,
                             vaddr, vmask, 0) != 0)
            return 1;

          /* Compare the IP addresses, without asking a name server. */

          if (ipaddr_compare (paddr, pmask, vaddr) != 0)
            return 1;
        }
      else
        {
          /* Pattern is not an IP address.  Just compare as text,
             without asking a name server. */

          if (COMPARE (host) != 0)
            return 1;
        }
    }

  return 0;

#undef COMPARE
#undef CHECK
}


/* This function is defined in url.c instead of connect.c to be able
   to use url_char_type without much ado.  This function assumes that
   the string pointed to by SRC is terminated by SP.

   SRC is expected to look like "HOST:PORT ".  The host name is copied
   to the array of HNSIZE characters pointed to by HN and is
   null-terminated; the port number is stored to *PORT.  Return 0 on
   success, -1 on error (empty host name, host name too long for HN,
   missing ":", port number missing or not starting with a digit, port
   number greater than 65535, or port number not followed by SP). */

int parse_connect (char *hn, size_t hnsize, int *port, const char *src)
{
  size_t i = 0;
  char *end;
  long n;

  while (HOST (src[i]))
    ++i;
  if (i == 0 || i >= hnsize)
    return -1;
  memcpy (hn, src, i);
  hn[i] = 0;
  src += i;
  if (*src++ != ':')
    return -1;                  /* The port number is mandatory */

  /* strtol() skips leading white space and accepts a sign.  Neither
     is acceptable here, so insist on a digit right after the ":". */
  if (!DIGIT (*src))
    return -1;

  /* On overflow, strtol() returns LONG_MAX, which is rejected by the
     range check. */
  n = strtol (src, &end, 10);
  if (n < 0 || n > 65535 || end == src)
    return -1;
  *port = (int)n;
  return *end == ' ' ? 0 : -1;
}