/* url.c */

/* Copyright 1997-2000 by Eberhard Mattes <em-gw@windhager.de>
   Donated to the public domain.  No warranty.

   1997-07-28 Initial version
   1997-09-06 "*:PATTERN"; hex escape and url_compare()
   1997-09-10 UPF_NODEFPORT
   1997-09-13 URL_HTTPS, URL_HTTP_OR_HTTPS; url_check_path() removed
   1997-10-21 url_compare(): Treat IP address specially
   1999-03-17 parse_connect()
   2000-04-15 Make NAME_data const */

#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include "firewall.h"
#include "libemfw.h"
#include "emio.h"
#include "squid-gw.h"
#define SELECT_CTYPE_URL
#include "tables.h"

/* References:
   RFC 1436	Gopher
   RFC 1630	URI
   RFC 1738	URL
   ftp://boombox.micro.umn.edu/pub/gopher/gopher_protocol/Gopher+/Gopher+.txt
   */

/* ============================== CHARACTERS =============================== */

/* Evaluate to non-zero if the character C belongs to at least one of
   the character classes in the bit mask X.  C is converted to
   unsigned char before it is used as index into url_char_type[], so
   plain (possibly signed) char values are safe. */

#define URL_CHAR_TYPE(c,x) (url_char_type[(unsigned char)(c)] & (x))

/* Character class predicates.  Note that HOST, FTPTYPE, SEARCH
   evaluate their argument more than once; don't pass expressions
   with side effects. */

#define SCHEME(c)	URL_CHAR_TYPE (c, CTYPE_SCHEME)
#define DIGIT(c)	URL_CHAR_TYPE (c, CTYPE_DIGIT)
#define HEXDIGIT(c)	URL_CHAR_TYPE (c, CTYPE_HEXDIGIT)
#define HPATH(c)	URL_CHAR_TYPE (c, CTYPE_HPATH)
#define FPATH(c)	URL_CHAR_TYPE (c, CTYPE_FPATH)
#define USER(c)		URL_CHAR_TYPE (c, CTYPE_USER)
/* TODO: CTYPE_HOST */
#define HOST(c)		(URL_CHAR_TYPE (c, CTYPE_ALPHA|CTYPE_DIGIT) \
			 || (c) == '.' || (c) == '-')
/* FTP transfer type letter of ";type=", either case. */
#define FTPTYPE(c)	((c) == 'A' || (c) == 'I' || (c) == 'D' \
			 || (c) == 'a' || (c) == 'i' || (c) == 'd')
/* Character allowed unescaped in the "?search" part. */
#define SEARCH(c)	((c) >= 33 && (c) != '%' && (c) != '<' && (c) != '>' \
			 && (c) != '#')
/* "*" counts as wildcard only if the flags F include UPF_WILDCARD. */
#define WILDCARD(c,f)	((c) == '*' && ((f) & UPF_WILDCARD))

/* Character class table, indexed by the value of an unsigned char.
   Each entry is a set of CTYPE_* bits which is tested by
   URL_CHAR_TYPE().  The initializers are pulled in from
   url-ctype.h. */
static const unsigned char url_char_type[UCHAR_MAX+1] =
{
#include "url-ctype.h"
};

/* ============================= PARSING A URL ============================= */

/* Evaluate to true if there's a complete escape (a "%" followed by
   two hexadecimal digits) at index I of the string S of length N.
   It's assumed that I < N, that is, that S[I] can be accessed.  The
   two characters following the "%" are looked at only after checking
   that they lie within the string.  S and I are evaluated more than
   once, so they must not have side effects. */

#define ESCAPE(s,i,n) ((s)[(i)] == '%' && (i)+2 < (n) \
		       && HEXDIGIT ((s)[(i)+1]) && HEXDIGIT ((s)[(i)+2]))

/* Look for the "//" which introduces the host part at index *PI of
   the string S of length LEN.  If it is there, count the two
   slashes as part of the scheme, move *PI behind them and return 1.
   Otherwise change nothing and return 0. */

static int url_has_host (struct url *u, const octet *s, int len, int *pi)
{
  const int pos = *pi;

  if (pos + 1 >= len || s[pos] != '/' || s[pos+1] != '/')
    return 0;

  u->scheme_length += 2;
  *pi = pos + 2;
  return 1;
}

/* Parse "host" of RFC 1738, starting at index I.  The host extends
   over all consecutive host name characters (and "*" if FLAGS
   includes UPF_WILDCARD); it may be empty.  Record its position in
   U and return the index of the first character behind the host. */

static int url_host (struct url *u, const octet *s, int len, int i,
		     unsigned flags)
{
  u->host_start = i;
  for (; i < len; ++i)
    {
      if (!HOST (s[i]) && !WILDCARD (s[i], flags))
	break;
    }
  u->host_length = i - u->host_start;

  /* TODO: "foo.bar.123" is not a valid host name (RFC 1738). */

  return i;
}

/* Parse "hostport" of RFC 1738, starting at index I: a host,
   optionally followed by ":" and a decimal port number in the range
   0 through 65535.  The port field recorded in U includes the ":".
   U->port is set only if a port number is present.  Return the
   index of the first character behind the hostport, or -1 if the
   ":" isn't followed by a valid port number. */

static int url_hostport (struct url *u, const octet *s, int len, int i,
			 unsigned flags)
{
  i = url_host (u, s, len, i, flags);
  if (i < 0)
    return -1;

  u->port_start = i;
  if (i < len && s[i] == ':')
    {
      long value = 0;

      /* At least one digit is required after the colon. */

      ++i;
      if (i >= len || !DIGIT (s[i]))
	return -1;
      do
	{
	  value = value * 10 + s[i] - '0';
	  if (value > 65535)
	    return -1;
	  ++i;
	}
      while (i < len && DIGIT (s[i]));
      u->port = value;
    }
  u->port_length = i - u->port_start;
  return i;
}

/* Parse "login" of RFC 1738, starting at index I: an optional
   "user@" or "user:password@" prefix followed by a hostport.  The
   ":" is counted as part of the user field.  The "@" is counted as
   part of the password field if that one is not empty, as part of
   the user field otherwise.  If there is no such prefix, the user
   and password lengths are reset to zero and the text is parsed as
   plain hostport.  Return what url_hostport() returns. */

static int url_login (struct url *u, const octet *s, int len, int i,
		      unsigned flags)
{
  /* Tentatively scan a user name. */

  u->user_start = i;
  for (; i < len; ++i)
    {
      if (!USER (s[i]) && !ESCAPE (s, i, len))
	break;
    }
  u->user_length = i - u->user_start;

  /* A non-empty user name may be followed by ":" and a password. */

  if (u->user_length != 0 && i < len && s[i] == ':')
    {
      ++i;
      u->user_length += 1;
      u->password_start = i;
      for (; i < len; ++i)
	{
	  if (!USER (s[i]) && !ESCAPE (s, i, len))
	    break;
	}
      u->password_length = i - u->password_start;
    }

  if (i >= len || s[i] != '@' || u->user_length == 0)
    {
      /* This wasn't a user name after all.  Forget about it and
         parse the same text again as hostport. */

      u->user_length = 0;
      u->password_length = 0;
      return url_hostport (u, s, len, u->user_start, flags);
    }

  /* Attach the "@" to the field preceding it. */

  if (u->password_length != 0)
    u->password_length += 1;
  else
    u->user_length += 1;

  return url_hostport (u, s, len, i + 1, flags);
}

/* Parse the optional "?search" of RFC 1738, starting at index I.
   The query recorded in U includes the "?"; its length is zero if
   there is no "?" at index I.  Return the index of the first
   character behind the query. */

static int url_search (struct url *u, const octet *s, int len, int i)
{
  u->query_start = i;
  if (i < len && s[i] == '?')
    {
      for (++i; i < len; ++i)
	{
	  if (!SEARCH (s[i]) && !ESCAPE (s, i, len))
	    break;
	}
    }
  u->query_length = i - u->query_start;
  return i;
}

/* Parse a URL with scheme "cache_object" (for Squid).  Everything
   from index I up to the end of the string becomes the path; the
   text is neither checked nor searched for escapes.  Return LEN. */

static int url_cache_object (struct url *u, const octet *s, int len, int i,
			     unsigned flags)
{
  u->path_start = i;
  u->path_length = len - u->path_start;
  return len;
}

/* Parse a URL with scheme "clsid".  TODO!  For now, everything from
   index I up to the end of the string becomes the path; the text is
   neither checked nor searched for escapes.  Return LEN. */

static int url_clsid (struct url *u, const octet *s, int len, int i,
		      unsigned flags)
{
  u->path_start = i;
  u->path_length = len - u->path_start;
  return len;
}

/* Parse a URL with scheme "file", starting at index I behind the
   "file:".  The "//" is mandatory, the host may be empty, and the
   path is optional.  Return the index of the first character which
   was not consumed, or -1 if the "//" is missing. */

static int url_file (struct url *u, const octet *s, int len, int i,
		     unsigned flags)
{
  /* "file://" of RFC 1738. */

  if (!url_has_host (u, s, len, &i))
    return -1;

  /* "host" of RFC 1738. */

  i = url_host (u, s, len, i, flags);
  if (i < 0)
    return -1;

  /* "/fpath" of RFC 1738.  We include the "/" in the url-path,
     contrary to RFC 1738.  Without a "/", the path remains empty. */

  u->path_start = i;
  if (i >= len || s[i] != '/')
    return i;

  for (++i; i < len; ++i)
    {
      if (!FPATH (s[i]) && !ESCAPE (s, i, len))
	break;
    }
  u->path_length = i - u->path_start;
  return i;
}

/* Parse a URL with scheme "ftp", starting at index I behind the
   "ftp:".  The "//" is mandatory; login, path, and ";type=X" are
   parsed in that order.  The port defaults to 21 unless FLAGS
   includes UPF_NODEFPORT.  Return the index of the first character
   which was not consumed, or -1 on error. */

static int url_ftp (struct url *u, const octet *s, int len, int i,
		    unsigned flags)
{
  /* "ftp:" of RFC 1738. */

  if (!url_has_host (u, s, len, &i))
    return -1;

  /* "login" of RFC 1738. */

  i = url_login (u, s, len, i, flags);
  if (i < 0)
    return -1;
  if (!(flags & UPF_NODEFPORT) && u->port == -1)
    u->port = 21;

  /* "/fpath" of RFC 1738.  We include the "/" in the url-path,
     contrary to RFC 1738. */

  u->path_start = i;
  if (i < len && s[i] == '/')
    {
      for (++i; i < len; ++i)
	{
	  if (!FPATH (s[i]) && !ESCAPE (s, i, len))
	    break;
	}
      u->path_length = i - u->path_start;
    }

  /* ";type=" of RFC 1738.  TODO: Convert "type" to lower-case.  A
     ";" must be followed by "type=" (in any case) and one type
     letter, which makes seven characters altogether. */

  u->type_start = i;
  if (i < len && s[i] == ';')
    {
      if (i + 6 >= len)
	return -1;
      if (lower_cmpn ((char*)s+i+1, "type=", 5) != 0)
	return -1;
      if (!FTPTYPE (s[i+6]))
	return -1;
      i += 7;
    }
  u->type_length = i - u->type_start;
  return i;
}

/* Parse a URL with scheme "gopher", starting at index I behind the
   "gopher:".  The "//" is mandatory.  The port defaults to 70 unless
   FLAGS includes UPF_NODEFPORT.  The path consists of "/", the
   gopher type character, and the unchecked rest of the string.
   Return LEN on success, -1 on error. */

static int url_gopher (struct url *u, const octet *s, int len, int i,
		       unsigned flags)
{
  /* "gopher://" of RFC 1738. */

  if (!url_has_host (u, s, len, &i))
    return -1;

  /* "//hostport" of RFC 1738. */

  i = url_hostport (u, s, len, i, flags);
  if (i < 0)
    return -1;
  if (!(flags & UPF_NODEFPORT) && u->port == -1)
    u->port = 70;

  /* If anything follows the hostport, it must start with "/". */

  u->path_start = i;
  if (i < len && s[i] != '/')
    return -1;
  if (i < len)
    ++i;

  if (i < len)
    {
      /* Check the gopher type.  The test for 0 is required as
         strchr() would find the terminating null character of the
         list. */
      /* TODO: Gopher+ */

      if (s[i] == 0)
	return -1;
      if (strchr ("012456789+ITg", s[i]) == NULL)
	return -1;

      /* No characters are reserved; take the rest as is. */

      i = len;
    }
  u->path_length = i - u->path_start;

  /* RFC 1436 says that we should limit the length of the selector
     string to 255 characters.  Why add 2?  One for the "/", one for
     the gopher type. */

  return u->path_length > 255 + 2 ? -1 : i;
}

/* Parse a URL with scheme "http" or "https", starting at index I.
   This function is also used for URLs without scheme; "//hostport"
   is parsed only if a scheme has been recorded in U.  The port
   defaults to 80 unless FLAGS includes UPF_NODEFPORT.  Return the
   index of the first character which was not consumed, or -1 on
   error.

   NOTE(review): 80 is assigned no matter what the scheme is; if
   this handler is indeed used for "https" as well, confirm that 80
   is the intended default there. */

static int url_http (struct url *u, const octet *s, int len, int i,
		     unsigned flags)
{
  /* "http://" of RFC 1738. */

  if (u->scheme_length != 0 && url_has_host (u, s, len, &i))
    {
      /* "//hostport" of RFC 1738.  No user name or password is
         allowed (says RFC 1738).  Only "/" or the end of the string
         may follow the hostport. */

      i = url_hostport (u, s, len, i, flags);
      if (i < 0 || (i < len && s[i] != '/'))
	return -1;
    }
  if (!(flags & UPF_NODEFPORT) && u->port == -1)
    u->port = 80;

  /* "/hpath" of RFC 1738, plus relative paths.  We include the "/" in
     the url-path, contrary to RFC 1738. */

  u->path_start = i;
  if (i >= len)
    return i;

  for (; i < len; ++i)
    {
      if (!HPATH (s[i]) && !ESCAPE (s, i, len))
	break;
    }
  u->path_length = i - u->path_start;

  /* "?search" of RFC 1738. */

  i = url_search (u, s, len, i);
  if (i < 0)
    return -1;

  /* Oops.  The BNF of RFC 1738 forgot about "#fragment"!  Go with
     RFC 1630.  Additionally allow SP and '#' in fragments. */

  u->fragment_start = i;
  if (i < len && s[i] == '#')
    {
      for (++i; i < len; ++i)
	{
	  if (!HPATH (s[i]) && s[i] != SP && s[i] != '#'
	      && !ESCAPE (s, i, len))
	    break;
	}
    }
  u->fragment_length = i - u->fragment_start;
  return i;
}

/* Parse a URL with scheme "java".  TODO!  For now, everything from
   index I up to the end of the string becomes the path; the text is
   neither checked nor searched for escapes.  Return LEN. */

static int url_java (struct url *u, const octet *s, int len, int i,
		     unsigned flags)
{
  u->path_start = i;
  u->path_length = len - u->path_start;
  return len;
}

/* Parse a URL with scheme "javascript".  Everything from index I up
   to the end of the string becomes the path; the text is neither
   checked nor searched for escapes.  Return LEN. */

static int url_javascript (struct url *u, const octet *s, int len, int i,
			   unsigned flags)
{
  u->path_start = i;
  u->path_length = len - u->path_start;
  return len;
}

/* Parse a URL with scheme "mailto", starting at index I.  Any
   character is accepted in the path, except for a "%" which does
   not start a complete escape.  Return the index of the first
   character which was not consumed. */

static int url_mailto (struct url *u, const octet *s, int len, int i,
		       unsigned flags)
{
  u->path_start = i;
  for (; i < len; ++i)
    {
      if (s[i] == '%' && !ESCAPE (s, i, len))
	break;
    }
  u->path_length = i - u->path_start;
  return i;
}

/* Parse a URL with scheme "news".  Everything from index I up to
   the end of the string becomes the path; the text is neither
   checked nor searched for escapes.  Return LEN. */

static int url_news (struct url *u, const octet *s, int len, int i,
		     unsigned flags)
{
  u->path_start = i;
  u->path_length = len - u->path_start;
  return len;
}

/* Parse a URL with scheme "nntp", starting at index I behind the
   "nntp:".  The "//" is mandatory; the hostport must be followed by
   "/" or by the end of the string.  The port defaults to 119 unless
   FLAGS includes UPF_NODEFPORT.  The rest of the string becomes the
   path, unchecked.  Return LEN on success, -1 on error. */

static int url_nntp (struct url *u, const octet *s, int len, int i,
		     unsigned flags)
{
  /* "nntp://" of RFC 1738. */

  if (!url_has_host (u, s, len, &i))
    return -1;

  i = url_hostport (u, s, len, i, flags);
  if (i < 0 || (i < len && s[i] != '/'))
    return -1;

  if (!(flags & UPF_NODEFPORT) && u->port == -1)
    u->port = 119;

  /* No checking, no escape. */

  u->path_start = i;
  u->path_length = len - u->path_start;
  return len;
}

/* Parse a URL with scheme "prospero", starting at index I behind
   the "prospero:".  Both the "//" and a "/" behind the hostport are
   mandatory.  The port defaults to 1525 unless FLAGS includes
   UPF_NODEFPORT.  Any character is accepted in the path, except for
   a "%" which does not start a complete escape.  Return the index
   of the first character which was not consumed, or -1 on error. */

static int url_prospero (struct url *u, const octet *s, int len, int i,
			 unsigned flags)
{
  /* "prospero://" of RFC 1738. */

  if (!url_has_host (u, s, len, &i))
    return -1;

  i = url_hostport (u, s, len, i, flags);
  if (i < 0)
    return -1;
  if (i >= len || s[i] != '/')
    return -1;

  if (!(flags & UPF_NODEFPORT) && u->port == -1)
    u->port = 1525;

  u->path_start = i;
  for (; i < len; ++i)
    {
      if (s[i] == '%' && !ESCAPE (s, i, len))
	break;
    }
  u->path_length = i - u->path_start;
  return i;
}

/* Parse a URL with scheme "telnet", starting at index I behind the
   "telnet:".  The "//" is mandatory.  The port defaults to 23
   unless FLAGS includes UPF_NODEFPORT.  The path is either empty or
   a single "/".  Return the index of the first character which was
   not consumed, or -1 on error. */

static int url_telnet (struct url *u, const octet *s, int len, int i,
		       unsigned flags)
{
  /* "telnet:" of RFC 1738. */

  if (!url_has_host (u, s, len, &i))
    return -1;

  i = url_login (u, s, len, i, flags);
  if (i < 0)
    return -1;

  if (!(flags & UPF_NODEFPORT) && u->port == -1)
    u->port = 23;

  /* If anything follows the login, it must start with "/". */

  u->path_start = i;
  if (i < len && s[i] != '/')
    return -1;
  if (i < len)
    ++i;
  u->path_length = i - u->path_start;
  return i;
}

/* Parse a URL with scheme "wais", starting at index I behind the
   "wais:".  Both the "//" and a "/" behind the hostport are
   mandatory.  The port defaults to 210 unless FLAGS includes
   UPF_NODEFPORT.  Any character is accepted in the path, except for
   a "%" which does not start a complete escape.  Return the index
   of the first character which was not consumed, or -1 on error. */

static int url_wais (struct url *u, const octet *s, int len, int i,
		     unsigned flags)
{
  /* "wais://" of RFC 1738. */

  if (!url_has_host (u, s, len, &i))
    return -1;

  i = url_hostport (u, s, len, i, flags);
  if (i < 0)
    return -1;
  if (i >= len || s[i] != '/')
    return -1;

  if (!(flags & UPF_NODEFPORT) && u->port == -1)
    u->port = 210;

  u->path_start = i;
  for (; i < len; ++i)
    {
      if (s[i] == '%' && !ESCAPE (s, i, len))
	break;
    }
  u->path_length = i - u->path_start;
  return i;
}

/* Unknown schemes.  Everything from index I up to the end of the
   string becomes the path.  That includes a "//", which goes into
   the path, not the scheme.  No escape.  Return LEN. */

static int url_other (struct url *u, const octet *s, int len, int i,
		      unsigned flags)
{
  u->path_start = i;
  u->path_length = len - u->path_start;
  return len;
}

/* Hash table for schemes. */

#include "url-scheme.h"

/* Parse a (possibly encoded) URL.  S points to the LEN characters of
   the URL (LEN must not be negative; the string need not be
   null-terminated).  The positions and lengths of the components
   found are stored to U, as offsets into S.  Accept "*" if FLAGS
   includes UPF_WILDCARD.  Don't assign a default port number if
   FLAGS includes UPF_NODEFPORT.  Return 0 on success, -1 on
   failure; the contents of U are not meaningful after a failure.

   Special cases: The empty string yields URL_EMPTY.  With
   UPF_WILDCARD, a string starting with "*:" yields URL_WILDCARD with
   the remainder as path, and "http*:" yields URL_HTTP_OR_HTTPS.  A
   string without scheme is parsed as relative HTTP URL. */

int url_parse (struct url *u, const octet *s, int len, unsigned flags)
{
  int i;
  const struct hash_entry *he;
  char lower_scheme[13];
  int (*handler) (struct url *, const octet *, int, int, unsigned);

  DEBUG_ASSERT (len >= 0);
  u->host_start = 0; u->host_length = 0;
  u->port_start = 0; u->port_length = 0;
  u->user_start = 0; u->user_length = 0;
  u->password_start = 0; u->password_length = 0;
  u->path_start = 0; u->path_length = 0;
  u->query_start = 0; u->query_length = 0;
  u->fragment_start = 0; u->fragment_length = 0;
  u->type_start = 0; u->type_length = 0;
  u->total_length = 0;
  u->port = -1;
  u->scheme_start = 0;
  u->scheme_length = 0;

  if (len == 0)
    {
      u->scheme = URL_EMPTY;
      return 0;
    }

  /* Special case for URL patterns starting with "*:". */

  if ((flags & UPF_WILDCARD) && len >= 2 && s[0] == '*' && s[1] == ':')
    {
      u->scheme = URL_WILDCARD;
      u->path_start = 2;
      u->path_length = len - 2;
      u->total_length = len - 2;
      return 0;
    }

  /* Scheme.  Escaping is not allowed in the scheme.  Note that we
     allow uppercase letters in the scheme.  The caller should convert
     them to lowercase. */

  i = 0;
  while (i < len && SCHEME (s[i]))
    ++i;
  if (i < len && s[i] == ':')
    {
      /* The scheme recorded in U includes the ":". */

      u->scheme_start = 0; u->scheme_length = i + 1;
      if (i == 0)
	return -1;

      /* Look up the scheme, ignoring case.  Schemes which don't fit
         into the buffer are treated as unknown schemes. */

      if (i < sizeof (lower_scheme))
	{
	  lower_copy (lower_scheme, (char*)s, i);
	  he = find_hash_entry2 (&scheme_descr, lower_scheme, i);
	}
      else
	he = NULL;
      if (he == NULL)
	{
	  u->scheme = URL_OTHER_SCHEME;
	  i = url_other (u, s, len, i + 1, flags);
	}
      else
	{
	  u->scheme = he->flags;
	  handler = he->handler;
	  ALWAYS_ASSERT (SCHEME_HANDLER (handler));
	  i = handler (u, s, len, i + 1, flags);
	}
    }
  else if ((flags & UPF_WILDCARD) && i == 4 && len >= 6
	   && lower_cmpn ((char*)s, "http*:", 6) == 0)
    {
      /* The scan above stopped at the "*"; skip both "*" and ":". */

      u->scheme_start = 0; u->scheme_length = i + 2;
      u->scheme = URL_HTTP_OR_HTTPS;
      i = url_http (u, s, len, i + 2, flags);
    }
  else
    {
      /* Heuristic: reject the string if any ":" precedes the first
         "/" or "?" or "#".  Otherwise we would take ",:" as relative
         URL... */

      while (i < len && s[i] != ':' && s[i] != '/'
	     && s[i] != '?' && s[i] != '#')
	++i;

      /* The loop may have run up to the end of the string, therefore
         check the index before looking at s[i]. */

      if (i < len && s[i] == ':')
	return -1;
      u->scheme_start = 0; u->scheme_length = 0;

      /* Assume HTTP for relative URLs as this function is also used
         for processing "HREF" in HTML documents.  (The empty string
         has been handled above.) */

      u->scheme = URL_HTTP;
      i = url_http (u, s, len, 0, flags);
    }

  /* The handler must have consumed the entire string. */

  if (i < 0 || i != len)
    return -1;
  u->total_length = len;
  return 0;
}

/* ============================ DECODING ESCAPE ============================ */

/* Return the value (0 through 15) of the hexadecimal digit C; both
   uppercase and lowercase letters are accepted.  The caller must
   pass a hexadecimal digit; anything else fails the debug assertion
   and terminates the program with exit status 1. */

static int hex_digit (octet c)
{
  if (DIGIT (c))
    return c - '0';
  else if ('a' <= c && c <= 'f')
    return 10 + (c - 'a');
  else if ('A' <= c && c <= 'F')
    return 10 + (c - 'A');

  DEBUG_ASSERT (0);
  exit (1);
}

/* Decode escapes of a URI.  Copy the SRC_LEN characters at SRC to
   the buffer DST of DST_SIZE characters, replacing each "%XX" (XX
   being two hexadecimal digits) with the character having that
   code, and append a null character.  Return the number of
   characters stored, not counting the terminating null character.
   Return -1 if the result (including the null character) doesn't
   fit into DST or if a "%" isn't followed by two hexadecimal digits;
   the contents of DST are unspecified in that case. */

int url_decode (char *dst, int dst_size, const char *src, int src_len)
{
  int i = 0;

  /* There must be room for the terminating null character even if
     there's nothing to copy. */

  if (dst_size < 1)
    return -1;

  while (src_len > 0)
    {
      /* Need room for this character and the null character. */

      if (i + 1 >= dst_size)
	return -1;
      if (*src == '%')
        {
          if (src_len < 3 || !HEXDIGIT (src[1]) || !HEXDIGIT (src[2]))
	    return -1;
          dst[i++] = hex_digit (src[1]) * 16 + hex_digit (src[2]);
          src += 3; src_len -= 3;
        }
      else
	{
	  dst[i++] = *src++; --src_len;
	}
    }
  dst[i] = 0;
  return i;
}

/* ============================ COMPARING URLS ============================= */

/* Helper function for url_compare().  Compare the N1 characters at
   S1 to the N2 characters at S2.  If the second string contains a
   "%", its escapes are decoded (into a static buffer) before
   comparing; the first string is always taken as is.  With
   UCF_WILDCARD in FLAGS, the strings are matched with
   lower_match().  Otherwise, the lengths must be equal and the
   characters are compared with or without regard to case, depending
   on UCF_IGNORE_CASE.  Return 0 on match, a non-zero value
   otherwise (including the case that decoding fails). */

static int compare (const octet *s1, int n1, const octet *s2, int n2,
		    unsigned flags)
{
  static char buffer[4096];

  /* TODO: Not all schemes use hex escaping. */

  if (memchr (s2, '%', n2) != NULL)
    {
      const int decoded = url_decode (buffer, sizeof (buffer),
				      (char*) s2, n2);

      if (decoded == -1)
	return -1;
      s2 = (unsigned char*) buffer;
      n2 = decoded;
    }

  if (flags & UCF_WILDCARD)
    {
      if (lower_match ((char*) s1, n1, (char*) s2, n2))
	return 0;
      return 1;
    }

  if (n1 != n2)
    return 1;
  if (flags & UCF_IGNORE_CASE)
    return lower_cmpn ((char*) s1, (char*) s2, n1);
  return memcmp (s1, s2, n1);
}

/* Compare two URLs.  PAT_S and PAT_U are the text and the parsed
   form of the pattern, VAL_S and VAL_U are those of the value to be
   checked against the pattern.  FLAGS may include UCF_IGNORE_CASE
   (ignore case), UCF_EXACT (compare even fields not present in
   pattern), and UCF_WILDCARD (the comparison is done by
   lower_match() and the pattern's host may be an IP address
   pattern).  Return 0 if the URLs do match, a non-zero value
   otherwise.

   A pattern of scheme URL_WILDCARD ("*:PATTERN") is compared
   against the entire value. */

/* TODO: consider "/./" and "/../" when comparing paths */
/* TODO: http://proxy/http://evil.com/ */
/* TODO: http://proxy/evil.com/ */
/* TODO: http://199.99.99.99/... */
/* TODO: *://host */
/* TODO: http://host:* */

int url_compare (const octet *pat_s, const struct url *pat_u,
		 const octet *val_s, const struct url *val_u, unsigned flags)
{
  if (pat_u->scheme == URL_WILDCARD)
    return compare (pat_s + pat_u->path_start, pat_u->path_length,
		    val_s, val_u->total_length, flags);

  /* COMPARE compares one field of the pattern to the same field of
     the value (0 on match).  CHECK tells whether a field has to be
     compared at all: if it's present in the pattern or if an exact
     comparison has been requested. */

#define COMPARE(name) \
  compare (pat_s + pat_u->name##_start, pat_u->name##_length, \
	   val_s + val_u->name##_start, val_u->name##_length, flags)

#define CHECK(name) \
    ((pat_u->name##_length != 0) || (flags & UCF_EXACT))

  /* The tests are sorted by speed, so that mismatches are detected
     quickly.  For matches, the sequence doesn't matter. */

  /* Scheme.  "http*:" matches "http:" and "https:".  For unknown
     schemes, the strings have to be compared as well. */

  if (pat_u->scheme == URL_HTTP_OR_HTTPS)
    {
      if (val_u->scheme != URL_HTTP && val_u->scheme != URL_HTTPS)
	return 1;
    }
  else if (pat_u->scheme != val_u->scheme)
    return 1;
  else if (pat_u->scheme == URL_OTHER_SCHEME && COMPARE (scheme) != 0)
    return 1;

  /* Port number.  A pattern without port number matches any port
     number unless an exact comparison has been requested. */

  if (pat_u->port != val_u->port
      && (pat_u->port != -1 || (flags & UCF_EXACT)))
    return 1;

  /* User name, password, path, query, fragment, and type, in that
     order.  Fields which are missing in the pattern are skipped
     unless an exact comparison has been requested.  TODO: "." and
     ".." in the path.  TODO: Prefix. */

  if ((CHECK (user) && COMPARE (user) != 0)
      || (CHECK (password) && COMPARE (password) != 0)
      || (CHECK (path) && COMPARE (path) != 0)
      || (CHECK (query) && COMPARE (query) != 0)
      || (CHECK (fragment) && COMPARE (fragment) != 0)
      || (CHECK (type) && COMPARE (type) != 0))
    return 1;

  /* Host.  As this test is the slowest one, it comes last.  TODO:
     Support IP addresses, compare IP addresses (aliases!). */

  if (!CHECK (host))
    return 0;

  {
    octet paddr[4], pmask[4], vaddr[4], vmask[4];

    if (ipaddr_parsen (pat_s + pat_u->host_start, pat_u->host_length,
		       paddr, pmask,
		       (flags & UCF_WILDCARD) ? IAP_WILDCARD : 0) != 0)
      {
	/* Pattern is not an IP address.  Just compare as text,
	   without asking a name server. */

	return COMPARE (host) != 0 ? 1 : 0;
      }

    /* Pattern is an IP address.  Report a mismatch if the value
       isn't an IP address. */

    if (ipaddr_parsen (val_s + val_u->host_start, val_u->host_length,
		       vaddr, vmask, 0) != 0)
      return 1;

    /* Compare the IP addresses, without asking a name server. */

    return ipaddr_compare (paddr, pmask, vaddr) != 0 ? 1 : 0;
  }
#undef COMPARE
#undef CHECK
}

/* Parse the "HOST:PORT" argument of a CONNECT request.  This
   function is defined in url.c instead of connect.c to be able to
   use url_char_type without much ado.  This function assumes that
   the string pointed to by SRC is terminated by SP.

   The host name is copied to HN (an array of HNSIZE characters) as
   null-terminated string, the port number is stored to *PORT.  The
   port number is mandatory; it must be an unsigned decimal number
   in the range 0 through 65535 which immediately follows the ":"
   and is immediately followed by SP.  Return 0 on success, -1 on
   error.  HN may have been modified even if -1 is returned. */

int parse_connect (char *hn, size_t hnsize, int *port, const char *src)
{
  size_t i = 0;
  char *end;
  long n;

  while (HOST (src[i]))
    ++i;
  if (i == 0 || i >= hnsize)
    return -1;
  memcpy (hn, src, i); hn[i] = 0;
  src += i;
  if (*src++ != ':')
    return -1;			/* The port number is mandatory */

  /* strtol() skips leading white space and accepts a sign, neither
     of which is acceptable here, so insist on a digit. */

  if (*src < '0' || *src > '9')
    return -1;
  n = strtol (src, &end, 10);
  if (n < 0 || n > 65535)
    return -1;
  *port = (int)n;
  return *end == ' ' ? 0 : -1;
}