#!/usr/bin/env python
# -*- coding: latin-1; -*-
#
# PgWorksheet - PostgreSQL Front End
# http://pgworksheet.projects.postgresql.org/
#
# Copyright © 2004-2005 Henri Michelon & CML http://www.e-cml.org/
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details (read LICENSE.txt).
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
# $Id: Lexical.py,v 1.6 2005/10/25 17:31:24 hmichelon Exp $
#

# http://www.postgresql.org/docs/8.0/static/sql-syntax.html

# Basic character sets.
# They are kept as plain lists so that they can be concatenated with '+'.

# blank characters; '\r' is listed so that CRLF line endings are skipped
# like any other blank and never end up inside an identifier
SPACES = [ ' ', '\t', '\n', '\r' ]
DIGITS = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ]
# characters accepted inside a numeric constant; the exponent marker is
# accepted in both cases (1e5 and 1E5)
NUMERIC = DIGITS + [ 'e', 'E', '.', '+', '-' ]
OPERATOR_CHARS = [ '+', '-', '*', '/', '<', '>', '=', '~', '!',
                   '@', '#', '%', '^', '&', '|', '`', '?' ]
SPECIAL_CHARS = [ '(', ')', '[', ']', ',', ';', ':', '*', '.' ]
OPERATORS = OPERATOR_CHARS + SPECIAL_CHARS

# not the first character of an identifier
NOT_IDENT_START = SPECIAL_CHARS + OPERATOR_CHARS + DIGITS + [ '$' ]
# not a character of an identifier
NOT_IDENT_CHAR = SPECIAL_CHARS + OPERATOR_CHARS + SPACES + [ "'" ]
# not a character of the tag of a dollar quoted string
NOT_DOLLAR_QUOTED = [ '$' ] + SPACES


class Token:
  """A token produced by the lexical analyser"""

  def __init__(self, token, start_iter, end_iter, value=None):
    # kind of token ('string', 'identifier', 'operator', ...) and the
    # optional value attached to it
    self.token, self.value = token, value
    # bounds of the token in the analysed buffer
    self.start_iter, self.end_iter = start_iter, end_iter


class Eob(Exception):
  """End of Buffer Exception

  Raised when a character is requested while the cursor already is at
  the end of the buffer.  It derives from Exception so that it can be
  raised and caught by every Python version (Python 3 refuses to raise
  a class that does not derive from BaseException)."""


class Lexical:
  """Simplified lexical analyser"""

  def analyse(self, buffer, start, end):
    """Run the lexical analyser from a copy of the start iterator,
    using a copy of the end iterator as limit, and return the list
    of tokens found"""
    self.buffer = buffer
    self.tokens = []
    self.current = start.copy()
    limit = end.copy()
    try:
      self.lexical_analyser(limit)
    except Eob:
      # reaching the end of the buffer simply stops the analysis
      pass
    return self.tokens


  def next_char(self):
    """Returns the character under the cursor and moves the cursor
    one character forward. Raises Eob when the cursor already is at
    the end"""
    cursor = self.current
    if (not cursor.is_end()):
      char = cursor.get_char()
      cursor.forward_char()
      return char
    raise Eob()
  

  def skip_spaces(self, c):
    """Returns c when it is not a blank character, else the first
    non blank character read after it"""
    while (True):
      if (c not in SPACES):
        return c
      c = self.next_char()


  def string(self):
    """Single quoted strings

    Called once the opening quote has been read. Appends a 'string'
    token starting on the opening quote and ending just after the
    closing one, then returns the first character following the string.
    A doubled quote ('') and a quote preceded by a backslash do not
    close the string.
    Raises Eob when the buffer ends before the character following the
    string can be read; the token then extends to the end of the
    buffer."""
    start = self.current.copy()
    start.backward_char()  # back on the opening quote
    # True when the previous character is a backslash that is not itself
    # escaped: the quote closing '\\' must not be seen as escaped
    escaped = False
    try:
      # the first read is guarded too, so that a lone quote ending the
      # buffer still produces a token
      c = self.next_char()
      while (True):
        if (escaped):
          escaped = False
        elif (c == '\\'):
          escaped = True
        elif (c == "'"):
          c = self.next_char()
          if (c != "'"):
            break
          # '' is a single quote inside the string
        c = self.next_char()
    except Eob:
      end = self.current.copy()
      self.tokens.append(Token('string', start, end))
      raise
    end = self.current.copy()
    end.backward_char()  # c, already read, is not part of the string
    self.tokens.append(Token('string', start, end))
    return c

  
  def dollar_string(self):
    """Dollar-quoted strings

    Called once the first '$' has been read. When the characters up to
    the next '$' form an opening delimiter ($tag$), appends a
    'dollarquote' token for the opening delimiter, a 'string' token for
    the content and a 'dollarquote' token for the closing delimiter,
    then returns the first character following the closing delimiter.
    When the tag is ended by a blank instead of a '$', an 'identifier'
    token is appended and the blank is returned.
    Raises Eob when the buffer ends first; a token covering what has
    been read is appended before the exception is propagated."""
    # first bound
    start = self.current.copy()
    start.backward_char()
    string_tag = ''
    try:
      # the first read is guarded too, so that a lone '$' ending the
      # buffer still produces a token
      c = self.next_char()
      while (c not in NOT_DOLLAR_QUOTED):
        string_tag = string_tag + c
        c = self.next_char()
    except Eob:
      end = self.current.copy()
      self.tokens.append(Token('identifier', start, end, string_tag.upper()))
      raise
    end = self.current.copy()
    end.backward_char()
    if (c != '$'):
      self.tokens.append(Token('identifier', start, end, string_tag.upper()))
      return c
    self.tokens.append(Token('dollarquote', start, end, string_tag.upper()))

    # string content
    start = self.current.copy()
    try:
      c = self.next_char()
    except Eob:
      # nothing follows the opening delimiter
      end = self.current.copy()
      self.tokens.append(Token('identifier', start, end, string_tag.upper()))
      raise
    try:
      while (True):
        if (c != '$'):
          c = self.next_char()
          continue
        # candidate closing delimiter: remember where its first '$' is
        string_end = self.current.copy()
        string_end.backward_char()
        c = self.next_char()
        s = ''
        while (c not in NOT_DOLLAR_QUOTED):
          s = s + c
          c = self.next_char()
        # a closing delimiter has the same tag and is ended by a '$'
        if ((c == '$') and (s == string_tag)):
          break
        # else c is examined again: a '$' may start another candidate
    except Eob:
      end = self.current.copy()
      self.tokens.append(Token('string', start, end))
      raise
    self.tokens.append(Token('string', start, string_end))
    # second bound: it starts where the content ends
    end = self.current.copy()
    end.backward_char()
    self.tokens.append(Token('dollarquote', string_end.copy(), end, s.upper()))
    # the final '$' belongs to the delimiter: returning it would make the
    # caller start a new dollar-quoted string on it
    return self.next_char()

  def bit_string_constant(self, start):
    """Binary and Hexadecimal numeric constants using strings

    start is the prefix letter already read. When a quote follows it,
    appends a 'numeric_constant' token going from the prefix letter to
    just after the closing quote and returns the character following
    the constant. Otherwise the prefix letter is the beginning of an
    identifier and the result of identifier() is returned.
    Raises Eob when the buffer ends first; a token covering what has
    been read is appended before the exception is propagated."""
    prefix = start
    try:
      c = self.next_char()
    except Eob:
      # the prefix letter is the last character of the buffer: report
      # it as the identifier it is
      end = self.current.copy()
      begin = self.current.copy()
      begin.backward_char()
      self.tokens.append(Token('identifier', begin, end, prefix.upper()))
      raise
    if (c != "'"):
      return self.identifier(c, prefix)
    # the cursor is after the opening quote: go back on the prefix letter
    begin = self.current.copy()
    begin.backward_char()
    begin.backward_char()
    try:
      c = self.next_char()
      while (c != "'"):
        c = self.next_char()
    except Eob:
      # unterminated constant
      end = self.current.copy()
      self.tokens.append(Token('numeric_constant', begin, end))
      raise
    end = self.current.copy()
    self.tokens.append(Token('numeric_constant', begin, end))
    return self.next_char()


  def identifier(self, c, ident = ''):
    """An identifier, keyword, type name, etc...

    c is the current character and ident the characters of the
    identifier read before it. Appends an 'identifier' token whose
    value is the upper-cased identifier and returns the character
    ending it. Eob is propagated after the token has been appended."""
    first = self.current.copy()
    # go back over c and over the characters already in ident
    steps = len(ident) + 1
    while (steps > 0):
      first.backward_char()
      steps = steps - 1
    try:
      while (True):
        if (c in NOT_IDENT_CHAR):
          break
        ident += c
        c = self.next_char()
    except Eob:
      last = self.current.copy()
      self.tokens.append(Token('identifier', first, last, ident.upper()))
      raise
    last = self.current.copy()
    last.backward_char()  # c is not part of the identifier
    self.tokens.append(Token('identifier', first, last, ident.upper()))
    return c


  def numeric(self, c):
    """A numeric constant

    c is the first character of the constant. Appends a
    'numeric_constant' token and returns the character ending the
    constant. Eob is propagated after the token has been appended."""
    first = self.current.copy()
    first.backward_char()
    try:
      while (True):
        if (c not in NUMERIC):
          break
        c = self.next_char()
    except Eob:
      last = self.current.copy()
      self.tokens.append(Token('numeric_constant', first, last))
      raise
    last = self.current.copy()
    last.backward_char()  # c is not part of the constant
    self.tokens.append(Token('numeric_constant', first, last))
    return c


  def simple_comment(self):
    """One line comment using --

    Called once both dashes have been read. Appends a 'comment' token
    going from the first dash to just after the end of the line, or to
    the end of the buffer when there is no end of line, in which case
    Eob is propagated. Returns nothing."""
    start = self.current.copy()
    start.backward_char()
    start.backward_char()
    try:
      # the first read is guarded too, so that a comment made of the
      # two dashes only, at the end of the buffer, produces a token
      c = self.next_char()
      while (c != '\n'):
        c = self.next_char()
    except Eob:
      end = self.current.copy()
      self.tokens.append(Token('comment', start, end))
      raise
    end = self.current.copy()
    self.tokens.append(Token('comment', start, end))


  def comment(self):
    """Multi lines comments using /* */

    Called once the opening /* has been read. Comments can be nested:
    every /* met inside the comment needs its own */. Appends a
    'comment' token going from the opening / to just after the last */
    and returns the character following the comment.
    Raises Eob when the buffer ends before that character can be read;
    the token then extends to the end of the buffer."""
    start = self.current.copy()
    start.backward_char()
    start.backward_char()
    depth = 1  # number of comments still opened
    # previous character, or None when it can not be the first half of
    # a delimiter (start of the comment, or just after a delimiter)
    prev = None
    try:
      while (depth > 0):
        c = self.next_char()
        if ((prev == '/') and (c == '*')):
          depth = depth + 1
          prev = None  # the '*' of /* can not also start a */
        elif ((prev == '*') and (c == '/')):
          depth = depth - 1
          prev = None  # the '/' of */ can not also start a /*
        else:
          prev = c
      c = self.next_char()
    except Eob:
      end = self.current.copy()
      self.tokens.append(Token('comment', start, end))
      raise
    end = self.current.copy()
    end.backward_char()  # c is not part of the comment
    self.tokens.append(Token('comment', start, end))
    return c


  def psql(self):
    """A PgSQL Command

    Called once the backslash has been read. Reads up to the next end
    of line or semicolon, which is consumed, and appends a 'psql' token
    whose value is the text of the command, backslash included and
    terminator excluded. Returns nothing.
    Raises Eob when the buffer ends first, after the token has been
    appended."""
    start = self.current.copy()
    start.backward_char()
    cmd = '\\'
    try:
      # the first read is guarded too, so that a backslash ending the
      # buffer still produces a token
      c = self.next_char()
      while (c != '\n') and (c != ';'):
        cmd = cmd + c
        c = self.next_char()
    except Eob:
      # only the end of the buffer is expected here
      end = self.current.copy()
      self.tokens.append(Token('psql', start, end, cmd))
      raise
    end = self.current.copy()
    self.tokens.append(Token('psql', start, end, cmd))


  def lexical_analyser(self, fin):
    """A simplified lexical analyser

    Reads the buffer from the cursor while the cursor is not after fin
    and appends the tokens found to self.tokens. Quoted identifiers are
    skipped without producing a token. Returns nothing; the analysis
    usually stops with the Eob raised when the end of the buffer is
    reached."""
    c = self.next_char()
    while (self.current.compare(fin) <= 0):
      c = self.skip_spaces(c)
      # character following c, looked at without being consumed, so
      # that a lone '/', '-' or '.' is still handled as an operator
      if (self.current.is_end()):
        following = None
      else:
        following = self.current.get_char()
      # Multi lines comments
      if ((c == '/') and (following == '*')):
        self.next_char()  # consume the '*'
        c = self.comment()
        continue
      # One line comments
      elif ((c == '-') and (following == '-')):
        self.next_char()  # consume the second '-'
        self.simple_comment()
      # psql commands
      elif (c == '\\'):
        self.psql()
      # numeric
      elif (c in DIGITS):
        c = self.numeric(c)
        continue
      # bit strings: B'...' and X'...' (H is kept for compatibility)
      elif (c in ('B', 'b', 'X', 'x', 'H', 'h')):
        c = self.bit_string_constant(c)
        continue
      # strings
      elif (c == "'"):
        c = self.string()
        continue
      # dollar-quoted strings
      elif (c == '$'):
        c = self.dollar_string()
        continue
      # numeric starting with a dot
      elif ((c == '.') and (following in DIGITS)):
        c = self.numeric(c)
        continue
      # quoted identifiers
      elif (c == '"'):
        c = self.next_char()
        while (c != '"'):
          c = self.next_char()
      # operators
      elif (c in OPERATORS):
        start = self.current.copy()
        start.backward_char()
        end = self.current.copy()
        self.tokens.append(Token('operator', start, end, c))
      # everything else
      elif (c not in NOT_IDENT_START):
        c = self.identifier(c)
        continue
      c = self.next_char()