// ports//devel/subcommander/work/subcommander-1.2.2/util/utf.cpp

/* ====================================================================
 * Copyright (c) 2006,      Martin Hauner
 *                          http://subcommander.tigris.org
 *
 * Subcommander is licensed as described in the file doc/COPYING, which
 * you should have received as part of this distribution.
 * ====================================================================
 */
 
// sc
#include "utf.h"

// apr
#include <apu.h>

// sys
#include <memory.h>

// Describes a single byte order mark: how long it is and where its bytes
// live. The field order matters, the BomData table below fills the entries
// with positional aggregate initialisers.
struct BomInfo
{
  sc::Size             _size;  // number of bytes _data points to, 0 if there is no bom
  const unsigned char* _data;  // the bom bytes, 0 if there is no bom
};

// Byte order marks of the supported encodings. Each array lists the bytes
// in the order they appear at the very beginning of a file.
const unsigned char bom_utf8[]    = { 0xEF, 0xBB, 0xBF };        // utf-8
const unsigned char bom_utf16be[] = { 0xFE, 0xFF };              // utf-16, big endian
const unsigned char bom_utf16le[] = { 0xFF, 0xFE };              // utf-16, little endian
const unsigned char bom_utf32be[] = { 0x00, 0x00, 0xFE, 0xFF };  // utf-32, big endian
const unsigned char bom_utf32le[] = { 0xFF, 0xFE, 0x00, 0x00 };  // utf-32, little endian

// Lookup table of the known byte order marks. Bom::getBom() and
// Bom::getSize() index it directly with the Bom's _type value.
//
// The table is read-only, so it is const: nothing in this file ever
// modifies an entry.
//
// NOTE(review): the entry order presumably mirrors the enumerators of
// Bom::Type (declared in utf.h), with the first entry standing for
// "no bom" - confirm against the header before reordering anything here.
static const BomInfo BomData[] =
{
  { 0, 0 },
  { sizeof(bom_utf8), bom_utf8 },
  { sizeof(bom_utf16be), bom_utf16be },
  { sizeof(bom_utf16le), bom_utf16le },
  { sizeof(bom_utf32be), bom_utf32be },
  { sizeof(bom_utf32le), bom_utf32le }
};

// Creates a Bom of type Bom::none, i.e. one for which isNull() is true.
Bom::Bom()
: _type(Bom::none)
{
}

// Creates a Bom of the given type.
//
// type  the kind of byte order mark this object stands for
Bom::Bom( Type type )
: _type(type)
{
}

// Copy constructor, the new Bom gets the same type as src.
Bom::Bom( const Bom& src )
: _type(src._type)
{
}

// Tells whether this object stands for "no byte order mark".
//
// Returns true if the type is Bom::none, false for every other type.
bool Bom::isNull() const
{
  if( _type == Bom::none )
  {
    return true;
  }

  return false;
}

// Returns the bytes of this byte order mark as stored in the BomData
// entry selected by _type. The pointer refers to static data and must
// not be freed; its length is given by getSize().
const unsigned char* Bom::getBom() const
{
  const unsigned char* bytes = BomData[_type]._data;
  return bytes;
}

// Returns the number of bytes of this byte order mark, taken from the
// BomData entry selected by _type.
sc::Size Bom::getSize() const
{
  sc::Size bytes = BomData[_type]._size;
  return bytes;
}


/* Upper bound, in bytes, for the data checks below: the isUtf*Data() loops
 * stop starting new characters once their offset reaches this value, so a
 * large buffer is only inspected at its beginning. */
static const sc::Size CheckSize = 1024*4;


// Takes a private copy of the given bytes and runs the encoding detection
// on it (see check()).
//
// buf  the bytes to inspect, only read while the constructor runs
// len  number of bytes in buf, may be 0
//
// Until check() recognises something the encoding is "*" and both endian
// flags are false.
utf::utf( const unsigned char* buf, sc::Size len )
: _bigEndian(false), _littleEndian(false), _encoding("*")
{
  // fill a local, writable pointer first so no cast is needed on _buf
  unsigned char* copy = new unsigned char[len];

  // an empty buffer may come with a null pointer, which memcpy must not
  // be handed even for a length of 0
  if( len > 0 )
  {
    memcpy(copy,buf,len);
  }

  _buf = copy;
  _len = len;

  check();
}

// Releases the copy of the data that the constructor allocated.
utf::~utf()
{
  delete [] _buf;
}

// Detects the encoding of the buffered data and stores the result in
// _encoding, _bom, _bigEndian and _littleEndian.
//
// The candidates are tried in this order, the first match wins:
//   - empty buffer:  utf-8 without bom
//   - utf-8:         with or without bom
//   - utf-16:        big endian, then little endian, only with bom
//   - utf-32:        big endian, then little endian, only with bom
//
// If nothing matches, the members keep the values set by the constructor.
void utf::check()
{
  // for an empty file assume utf-8 without bom
  if( _len == 0 )
  {
    _encoding = "utf-8";
    _bom      = Bom();
    return;
  }

  // detect utf-8
  if( isUtf8Size(_len) )
  {
    const bool bom = isUtf8Bom(_buf,_len);

    // the data is validated only once; whether a bom is present merely
    // decides where the scan starts and which Bom is remembered
    if( isUtf8Data(_buf,_len,bom) )
    {
      _encoding = "utf-8";

      if( bom )
      {
        _bom = Bom(Bom::utf8);
      }
      else
      {
        _bom = Bom();
      }
      return;
    }
  }

  // detect utf-16
  if( isUtf16Size(_len) )
  {
    bool beBom = isUtf16BeBom(_buf,_len);
    if( beBom && isUtf16BeData(_buf,_len,beBom) )
    {
      _bigEndian = true;
      _encoding  = "utf-16";
      _bom       = Bom(Bom::utf16be);
      return;
    }

    bool leBom = isUtf16LeBom(_buf,_len);
    if( leBom && isUtf16LeData(_buf,_len,leBom) )
    {
      _littleEndian = true;
      _encoding = "utf-16";
      _bom      = Bom(Bom::utf16le);
      return;
    }

#if 0 // too many false positives without bom
    if( isUtf16BeData(_buf,_len,false) )
    {
      _bigEndian = true;
      _encoding  = "utf-16";
      _bom       = Bom();
      return;
    }
#endif
  }

  // detect utf-32
  if( isUtf32Size(_len) )
  {
    bool beBom = isUtf32BeBom(_buf,_len);
    if( beBom && isUtf32BeData(_buf,_len,beBom) )
    {
      _bigEndian = true;
      _encoding  = "utf-32";
      _bom       = Bom(Bom::utf32be);
      return;
    }

    bool leBom = isUtf32LeBom(_buf,_len);
    if( leBom && isUtf32LeData(_buf,_len,leBom) )
    {
      _littleEndian = true;
      _encoding     = "utf-32";
      _bom          = Bom(Bom::utf32le);
      return;
    }

#if 0 // too many false positives without bom
    if( isUtf32BeData(_buf,_len,false) )
    {
      _bigEndian = true;
      _encoding  = "utf-32";
      _bom       = Bom();
      return;
    }
#endif
  }
}

bool utf::hasEncoding() const
{
  static sc::String locenc("*");
  return _encoding != locenc;
}

// Returns the big endian flag. check() sets it when it detects big endian
// utf-16 or utf-32 data, otherwise it keeps the constructor's false.
bool utf::isBigEndian() const
{
  return _bigEndian;
}

// Returns the little endian flag. check() sets it when it detects little
// endian utf-16 or utf-32 data, otherwise it keeps the constructor's false.
bool utf::isLittleEndian() const
{
  return _littleEndian;
}

// Tells whether a byte order mark was recorded for the data, that is
// whether the stored Bom is anything but the null Bom.
bool utf::hasBom() const
{
  if( _bom.isNull() )
  {
    return false;
  }

  return true;
}

// Returns the Bom recorded by check(). The reference points to a member
// of this object.
const Bom& utf::getBom() const
{
  return _bom;
}

// Returns the encoding name: "utf-8", "utf-16" or "utf-32" if check()
// recognised the data, otherwise the constructor's placeholder "*".
const sc::String& utf::getEncoding() const
{
  return _encoding;
}

// Returns the internal copy of the data made by the constructor. The
// memory is freed by the destructor, callers must not free it.
const unsigned char* utf::getBuffer() const
{
  return _buf;
}

// Returns the number of bytes in the buffer returned by getBuffer().
sc::Size utf::getLength() const
{
  return _len;
}

// Tells whether a buffer of len bytes can hold utf-32 data: it needs room
// for at least one 4 byte unit and must consist of whole units only.
bool utf::isUtf32Size( sc::Size len )
{
  // too short for a single character
  if( len < 4 )
  {
    return false;
  }

  // must be a multiple of 4
  return (len % 4) == 0;
}

// Tells whether a buffer of len bytes can hold utf-16 data: it needs room
// for at least one 2 byte unit and must consist of whole units only.
bool utf::isUtf16Size( sc::Size len )
{
  // too short for a single character
  if( len < 2 )
  {
    return false;
  }

  // must be a multiple of 2
  return (len % 2) == 0;
}

// Tells whether a buffer of len bytes can hold utf-8 data. One byte is
// enough: the check does not reserve room for a bom.
bool utf::isUtf8Size( sc::Size len )
{
  // at least a single byte
  return len > 0;
}

// Tells whether buf starts with the utf-32 big endian bom 00 00 fe ff.
//
// buf  the bytes to inspect
// len  number of bytes in buf, buffers shorter than the bom never match
bool utf::isUtf32BeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 4 )
  {
    return false;
  }

  return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xfe && buf[3] == 0xff;
}

// Tells whether buf starts with the utf-32 little endian bom ff fe 00 00.
//
// buf  the bytes to inspect
// len  number of bytes in buf, buffers shorter than the bom never match
bool utf::isUtf32LeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 4 )
  {
    return false;
  }

  return buf[0] == 0xff && buf[1] == 0xfe && buf[2] == 0x00 && buf[3] == 0x00;
}

// Tells whether buf starts with the utf-16 big endian bom fe ff.
//
// buf  the bytes to inspect
// len  number of bytes in buf, buffers shorter than the bom never match
bool utf::isUtf16BeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 2 )
  {
    return false;
  }

  return buf[0] == 0xfe && buf[1] == 0xff;
}

// Tells whether buf starts with the utf-16 little endian bom ff fe.
//
// buf  the bytes to inspect
// len  number of bytes in buf, buffers shorter than the bom never match
bool utf::isUtf16LeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 2 )
  {
    return false;
  }

  return buf[0] == 0xff && buf[1] == 0xfe;
}

// Tells whether buf starts with the utf-8 bom ef bb bf.
//
// buf  the bytes to inspect
// len  number of bytes in buf, buffers shorter than the bom never match
bool utf::isUtf8Bom( const unsigned char* buf, sc::Size len )
{
  if( len < 3 )
  {
    return false;
  }

  return buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf;
}

// Checks whether buf looks like big endian utf-32 data.
//
// buf  the bytes to inspect
// len  number of bytes in buf
// bom  true if buf starts with a utf-32 bom, its 4 bytes are skipped
//
// Only characters that start before CheckSize are inspected; trailing
// bytes that do not form a complete 4 byte unit are ignored.
//
// Returns false as soon as a character is 0 or larger than 0x0010ffff,
// true otherwise.
bool utf::isUtf32BeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = bom ? 4 : 0;

  // off+3 is the last byte of the current character, so the final
  // character of the buffer is checked too (same bound as the little
  // endian variant)
  for( /*off*/; off+3 < len && off < CheckSize; off+=4 )
  {
    // widen each byte before shifting: shifting a byte >= 0x80 by 24 as
    // an int would overflow
    const unsigned long val =
        (static_cast<unsigned long>(buf[off+0]) << 24)
      | (static_cast<unsigned long>(buf[off+1]) << 16)
      | (static_cast<unsigned long>(buf[off+2]) <<  8)
      |  static_cast<unsigned long>(buf[off+3]);

    // utf-32 max is 0x0010ffff
    if( val == 0 || val > 0x0010ffff )
    {
      return false;
    }
  }

  return true;
}

// Checks whether buf looks like little endian utf-32 data.
//
// buf  the bytes to inspect
// len  number of bytes in buf
// bom  true if buf starts with a utf-32 bom, its 4 bytes are skipped
//
// Only characters that start before CheckSize are inspected; trailing
// bytes that do not form a complete 4 byte unit are ignored.
//
// Returns false as soon as a character is 0 or larger than 0x0010ffff,
// true otherwise.
bool utf::isUtf32LeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = bom ? 4 : 0;

  for( /*off*/; off+3 < len && off < CheckSize; off+=4 )
  {
    // widen each byte before shifting: shifting a byte >= 0x80 by 24 as
    // an int would overflow
    const unsigned long val =
        (static_cast<unsigned long>(buf[off+3]) << 24)
      | (static_cast<unsigned long>(buf[off+2]) << 16)
      | (static_cast<unsigned long>(buf[off+1]) <<  8)
      |  static_cast<unsigned long>(buf[off+0]);

    // utf-32 max is 0x0010ffff
    if( val == 0 || val > 0x0010ffff )
    {
      return false;
    }
  }

  return true;
}

// Checks whether buf looks like big endian utf-16 data.
//
// buf  the bytes to inspect
// len  number of bytes in buf
// bom  true if buf starts with a utf-16 bom, its 2 bytes are skipped
//
// Only characters that start before CheckSize are inspected; a trailing
// single byte is ignored.
//
// Returns false if a 0 unit, a lone low surrogate, or a high surrogate
// without a following low surrogate is found, true otherwise.
bool utf::isUtf16BeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = bom ? 2 : 0;

  for( /*off*/; off+1 < len && off < CheckSize; off+=2 )
  {
    const unsigned long val = (buf[off+0] << 8) + buf[off+1];

    // 0 in buffer does not look like utf-16
    if( val == 0 )
    {
      return false;
    }

    // two byte sequence
    if( val < 0xd800 || val > 0xdfff )
    {
      continue;
    }

    // a low surrogate must not come first
    if( val > 0xdbff )
    {
      return false;
    }

    // four byte sequence: the low surrogate occupies off+2 and off+3, so
    // off+3 has to be inside the buffer. A pair that ends exactly at the
    // end of the buffer is valid.
    if( off+3 >= len )
    {
      return false;
    }

    const unsigned long val2 = (buf[off+2] << 8) + buf[off+3];

    if( val2 < 0xdc00 || val2 > 0xdfff )
    {
      return false;
    }

    // skip the low surrogate, the loop increment skips the high one
    off+=2;
  }

  return true;
}

// Checks whether buf looks like little endian utf-16 data.
//
// buf  the bytes to inspect
// len  number of bytes in buf
// bom  true if buf starts with a utf-16 bom, its 2 bytes are skipped
//
// Only characters that start before CheckSize are inspected; a trailing
// single byte is ignored.
//
// Returns false if a 0 unit, a lone low surrogate, or a high surrogate
// without a following low surrogate is found, true otherwise.
bool utf::isUtf16LeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = bom ? 2 : 0;

  for( /*off*/; off+1 < len && off < CheckSize; off+=2 )
  {
    const unsigned long val = (buf[off+1] << 8) + buf[off+0];

    // 0 in buffer does not look like utf-16
    if( val == 0 )
    {
      return false;
    }

    // two byte sequence
    if( val < 0xd800 || val > 0xdfff )
    {
      continue;
    }

    // a low surrogate must not come first
    if( val > 0xdbff )
    {
      return false;
    }

    // four byte sequence: the low surrogate occupies off+2 and off+3, so
    // off+3 has to be inside the buffer. A pair that ends exactly at the
    // end of the buffer is valid.
    if( off+3 >= len )
    {
      return false;
    }

    const unsigned long val2 = (buf[off+3] << 8) + buf[off+2];

    if( val2 < 0xdc00 || val2 > 0xdfff )
    {
      return false;
    }

    // skip the low surrogate, the loop increment skips the high one
    off+=2;
  }

  return true;
}

bool utf::isUtf8Data( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = 0;

  if( bom )
  {
    off = 3;
  }

  for( /*off*/; off < len && off < CheckSize; ++off )
  {
    //single byte
    if( buf[off] < 0x80 )
    {
      continue;
    }
    // 2 byte sequence
    else if( ((buf[off+0] & 0xe0) == 0xc0) && (off+1 < len) )
    {
      if( (buf[off+1] & 0xc0) == 0x80 )
      {
        off++;
        continue;
      }

      return false;
    }
    // 3 byte sequence
    else if( ((buf[off] & 0xf0) == 0xe0) && (off+2 < len) )
    {
      if(  (buf[off+1] & 0xc0) == 0x80
        && (buf[off+2] & 0xc0) == 0x80)
      {
        off+=2;
        continue;
      }

      return false;
    }
    // 4 byte sequence
    else if( ((buf[off] & 0xf8) == 0xf0) && (off+3 < len) )
    {
      if(  (buf[off+1] & 0xc0) == 0x80
        && (buf[off+2] & 0xc0) == 0x80
        && (buf[off+3] & 0xc0) == 0x80)
      {
        off+=3;
        continue;
      }

      return false;
    }
    // 5 byte sequence
    else if( ((buf[off] & 0xfc) == 0xf8) && (off+4 < len) )
    {
      if(  (buf[off+1] & 0xc0) == 0x80
        && (buf[off+2] & 0xc0) == 0x80
        && (buf[off+3] & 0xc0) == 0x80
        && (buf[off+4] & 0xc0) == 0x80)
      {
        off+=4;
        continue;
      }

      return false;
    }
    // 6 byte sequence
    else if( ((buf[off] & 0xfe) == 0xfc) && (off+5 < len) )
    {
      if(  (buf[off+1] & 0xc0) == 0x80
        && (buf[off+2] & 0xc0) == 0x80
        && (buf[off+3] & 0xc0) == 0x80
        && (buf[off+4] & 0xc0) == 0x80
        && (buf[off+5] & 0xc0) == 0x80)
      {
        off+=5;
        continue;
      }

      return false;
    }
    // 0 byte in buffer
    else if( buf[off] == 0 )
    {
      // .. looks not like utf-8
      return false;
    }
    else
    {
      // not utf-8
      return false;
    }
  }

  return true;
}
// syntax highlighted by Code2HTML, v. 0.9.1