/* ====================================================================
* Copyright (c) 2006, Martin Hauner
* http://subcommander.tigris.org
*
* Subcommander is licensed as described in the file doc/COPYING, which
* you should have received as part of this distribution.
* ====================================================================
*/
// sc
#include "utf.h"
// apr
#include <apu.h>
// sys
#include <memory.h>
/**
 * One row of the BomData table: a byte order mark given as a pointer to
 * its bytes plus the number of bytes.
 */
struct BomInfo
{
// number of bytes _data points to
sc::Size _size;
// the bytes of the byte order mark, 0 for the row without a bom
const unsigned char* _data;
};
// The byte order marks, byte by byte. Note that the utf-16 little endian
// bom (ff fe) is a prefix of the utf-32 little endian bom (ff fe 00 00).
const unsigned char bom_utf8[] = { 0xef, 0xbb, 0xbf };
const unsigned char bom_utf16be[] = { 0xfe, 0xff };
const unsigned char bom_utf16le[] = { 0xff, 0xfe };
const unsigned char bom_utf32be[] = { 0x00, 0x00, 0xfe, 0xff };
const unsigned char bom_utf32le[] = { 0xff, 0xfe, 0x00, 0x00 };
// Lookup table used by Bom::getBom() and Bom::getSize(), which index it
// with the Bom's _type value. Row 0 is the "no bom" row (size 0, no data).
// NOTE(review): the row order presumably mirrors the Bom::Type enum
// declared in utf.h - confirm before adding or reordering rows.
static BomInfo BomData[] =
{
{ 0, 0 },
{ sizeof(bom_utf8), bom_utf8 },
{ sizeof(bom_utf16be), bom_utf16be },
{ sizeof(bom_utf16le), bom_utf16le },
{ sizeof(bom_utf32be), bom_utf32be },
{ sizeof(bom_utf32le), bom_utf32le }
};
/**
 * Creates a Bom of type Bom::none, i.e. one for which isNull() is true.
 */
Bom::Bom()
: _type(Bom::none)
{
}
/**
 * Creates a Bom of the given type.
 *
 * @param type  stored as is; getBom() and getSize() later use it as an
 *              index into BomData.
 */
Bom::Bom( Type type )
: _type(type)
{
}
/**
 * Copy constructor, takes over the type of @a src.
 */
Bom::Bom( const Bom& src )
: _type(src._type)
{
}
/**
 * @return true if this Bom has the type Bom::none.
 */
bool Bom::isNull() const
{
return _type == Bom::none;
}
const unsigned char* Bom::getBom() const
{
return BomData[_type]._data;
}
/**
 * Looks up the length of this byte order mark in the BomData table.
 *
 * @return number of bom bytes of the table row selected by _type
 *         (0 for the first row).
 */
sc::Size Bom::getSize() const
{
  const BomInfo& row = BomData[_type];
  return row._size;
}
/* how many bytes we check at most in the given buffer: the is...Data()
 * scan loops stop starting new characters once their offset reaches this
 * value (4 KiB). */
static const sc::Size CheckSize = 1024*4;
/**
 * Takes a private copy of the first @a len bytes of @a buf and runs the
 * encoding detection (check()) on that copy.
 *
 * The encoding starts out as "*" (which hasEncoding() reports as "no
 * encoding") with both endian flags cleared; check() overwrites them when
 * it recognizes the data.
 *
 * @param buf  bytes to inspect. It is not read when @a len is 0.
 * @param len  number of bytes in @a buf.
 */
utf::utf( const unsigned char* buf, sc::Size len )
: _bigEndian(false), _littleEndian(false), _encoding("*")
{
  // fill the copy through a non-const pointer so that no cast is needed
  unsigned char* copy = new unsigned char[len];

  // memcpy is undefined for a null source even if the size is 0, so an
  // empty input is not copied at all.
  if( len > 0 )
  {
    memcpy( copy, buf, len );
  }

  _buf = copy;
  _len = len;

  check();
}
/**
 * Releases the buffer copy that the constructor allocated with new[].
 */
utf::~utf()
{
delete [] _buf;
}
/**
 * Detects the encoding of the buffer and stores the result in _encoding,
 * _bom, _bigEndian and _littleEndian.
 *
 * The tests run in this order and the first match wins:
 *  - empty buffer: utf-8 without bom
 *  - utf-8: valid data, the bom is optional
 *  - utf-16: bom required, followed by valid data
 *  - utf-32: bom required, followed by valid data
 *
 * If nothing matches the members are left untouched.
 */
void utf::check()
{
  // nothing to look at, an empty buffer counts as utf-8 without bom
  if( _len == 0 )
  {
    _encoding = "utf-8";
    _bom = Bom();
    return;
  }

  // utf-8: the data decides, the bom only selects the Bom type we report
  if( isUtf8Size(_len) )
  {
    const bool withBom = isUtf8Bom(_buf,_len);

    if( isUtf8Data(_buf,_len,withBom) )
    {
      _encoding = "utf-8";

      if( withBom )
      {
        _bom = Bom(Bom::utf8);
      }
      else
      {
        _bom = Bom();
      }
      return;
    }
  }

  // utf-16: only accepted with a bom
  if( isUtf16Size(_len) )
  {
    if( isUtf16BeBom(_buf,_len) && isUtf16BeData(_buf,_len,true) )
    {
      _bigEndian = true;
      _encoding = "utf-16";
      _bom = Bom(Bom::utf16be);
      return;
    }

    if( isUtf16LeBom(_buf,_len) && isUtf16LeData(_buf,_len,true) )
    {
      _littleEndian = true;
      _encoding = "utf-16";
      _bom = Bom(Bom::utf16le);
      return;
    }

#if 0 // too many false positives without bom
    if( isUtf16BeData(_buf,_len,false) )
    {
      _bigEndian = true;
      _encoding = "utf-16";
      _bom = Bom();
      return;
    }
#endif
  }

  // utf-32: only accepted with a bom
  if( isUtf32Size(_len) )
  {
    if( isUtf32BeBom(_buf,_len) && isUtf32BeData(_buf,_len,true) )
    {
      _bigEndian = true;
      _encoding = "utf-32";
      _bom = Bom(Bom::utf32be);
      return;
    }

    if( isUtf32LeBom(_buf,_len) && isUtf32LeData(_buf,_len,true) )
    {
      _littleEndian = true;
      _encoding = "utf-32";
      _bom = Bom(Bom::utf32le);
      return;
    }

#if 0 // too many false positives without bom
    if( isUtf32BeData(_buf,_len,false) )
    {
      _bigEndian = true;
      _encoding = "utf-32";
      _bom = Bom();
      return;
    }
#endif
  }
}
bool utf::hasEncoding() const
{
static sc::String locenc("*");
return _encoding != locenc;
}
/**
 * @return the _bigEndian flag; check() sets it when it accepts data that
 *         starts with a big endian utf-16 or utf-32 bom.
 */
bool utf::isBigEndian() const
{
return _bigEndian;
}
/**
 * @return the _littleEndian flag; check() sets it when it accepts data
 *         that starts with a little endian utf-16 or utf-32 bom.
 */
bool utf::isLittleEndian() const
{
return _littleEndian;
}
/**
 * @return true if the stored Bom is not the null Bom (see Bom::isNull()).
 */
bool utf::hasBom() const
{
return ! _bom.isNull();
}
/**
 * @return reference to the stored Bom. It refers to a member of this
 *         object.
 */
const Bom& utf::getBom() const
{
return _bom;
}
/**
 * @return the encoding name set by check() ("utf-8", "utf-16" or
 *         "utf-32"), or "*" if check() did not recognize the data.
 */
const sc::String& utf::getEncoding() const
{
return _encoding;
}
/**
 * @return pointer to the copy of the input made by the constructor. The
 *         destructor deletes it.
 */
const unsigned char* utf::getBuffer() const
{
return _buf;
}
/**
 * @return number of bytes in the buffer returned by getBuffer().
 */
sc::Size utf::getLength() const
{
return _len;
}
/**
 * Tests whether a buffer of @a len bytes can hold utf-32 data.
 *
 * @return true if @a len is at least 4 and a multiple of 4.
 */
bool utf::isUtf32Size( sc::Size len )
{
  // too short for a single character
  if( len < 4 )
  {
    return false;
  }

  // only whole 4 byte units
  return (len % 4) == 0;
}
/**
 * Tests whether a buffer of @a len bytes can hold utf-16 data.
 *
 * @return true if @a len is at least 2 and a multiple of 2.
 */
bool utf::isUtf16Size( sc::Size len )
{
  // too short for a single character
  if( len < 2 )
  {
    return false;
  }

  // only whole 2 byte units
  return (len % 2) == 0;
}
/**
 * Tests whether a buffer of @a len bytes can hold utf-8 data.
 *
 * @return true if @a len is at least 1.
 */
bool utf::isUtf8Size( sc::Size len )
{
  // a single byte is enough, no room for a bom is required
  if( len >= 1 )
  {
    return true;
  }

  return false;
}
/**
 * Tests whether @a buf starts with the utf-32 big endian bom (00 00 fe ff).
 *
 * @param buf  bytes to test.
 * @param len  number of bytes in @a buf, less than 4 never matches.
 * @return true if the first four bytes are the bom.
 */
bool utf::isUtf32BeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 4 )
  {
    return false;
  }

  return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xfe && buf[3] == 0xff;
}
/**
 * Tests whether @a buf starts with the utf-32 little endian bom
 * (ff fe 00 00).
 *
 * @param buf  bytes to test.
 * @param len  number of bytes in @a buf, less than 4 never matches.
 * @return true if the first four bytes are the bom.
 */
bool utf::isUtf32LeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 4 )
  {
    return false;
  }

  return buf[0] == 0xff && buf[1] == 0xfe && buf[2] == 0x00 && buf[3] == 0x00;
}
/**
 * Tests whether @a buf starts with the utf-16 big endian bom (fe ff).
 *
 * @param buf  bytes to test.
 * @param len  number of bytes in @a buf, less than 2 never matches.
 * @return true if the first two bytes are the bom.
 */
bool utf::isUtf16BeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 2 )
  {
    return false;
  }

  return buf[0] == 0xfe && buf[1] == 0xff;
}
/**
 * Tests whether @a buf starts with the utf-16 little endian bom (ff fe).
 *
 * @param buf  bytes to test.
 * @param len  number of bytes in @a buf, less than 2 never matches.
 * @return true if the first two bytes are the bom.
 */
bool utf::isUtf16LeBom( const unsigned char* buf, sc::Size len )
{
  if( len < 2 )
  {
    return false;
  }

  return buf[0] == 0xff && buf[1] == 0xfe;
}
/**
 * Tests whether @a buf starts with the utf-8 bom (ef bb bf).
 *
 * @param buf  bytes to test.
 * @param len  number of bytes in @a buf, less than 3 never matches.
 * @return true if the first three bytes are the bom.
 */
bool utf::isUtf8Bom( const unsigned char* buf, sc::Size len )
{
  if( len < 3 )
  {
    return false;
  }

  return buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf;
}
/**
 * Checks whether @a buf looks like big endian utf-32 data.
 *
 * Every complete 4 byte unit that starts before CheckSize is decoded and
 * has to be a value in the range 0x00000001 - 0x0010ffff. A zero unit is
 * taken as a sign that the data is not utf-32. Surrogate values are not
 * rejected.
 *
 * @param buf  bytes to check.
 * @param len  number of bytes in @a buf.
 * @param bom  true if @a buf starts with a 4 byte bom that has to be
 *             skipped.
 * @return false on the first unit outside the range, true otherwise (also
 *         if there was no unit to check).
 */
bool utf::isUtf32BeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = 0;
  if( bom )
  {
    off = 4;
  }

  // off+3 is the last byte of the current unit, so this bound includes the
  // final unit of the buffer.
  for( /*off*/; off+3 < len && off < CheckSize; off+=4 )
  {
    // widen before shifting, shifting the promoted int by 24 would run
    // into the sign bit for bytes >= 0x80.
    const unsigned long val =
        (static_cast<unsigned long>(buf[off+0]) << 24)
      + (static_cast<unsigned long>(buf[off+1]) << 16)
      + (static_cast<unsigned long>(buf[off+2]) <<  8)
      +  static_cast<unsigned long>(buf[off+3]);

    // utf-32 max is 0x0010ffff
    if( val == 0 || val > 0x0010ffff )
    {
      return false;
    }
  }
  return true;
}
/**
 * Checks whether @a buf looks like little endian utf-32 data.
 *
 * Every complete 4 byte unit that starts before CheckSize is decoded and
 * has to be a value in the range 0x00000001 - 0x0010ffff. A zero unit is
 * taken as a sign that the data is not utf-32. Surrogate values are not
 * rejected.
 *
 * @param buf  bytes to check.
 * @param len  number of bytes in @a buf.
 * @param bom  true if @a buf starts with a 4 byte bom that has to be
 *             skipped.
 * @return false on the first unit outside the range, true otherwise (also
 *         if there was no unit to check).
 */
bool utf::isUtf32LeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = 0;
  if( bom )
  {
    off = 4;
  }

  // off+3 is the last byte of the current unit
  for( /*off*/; off+3 < len && off < CheckSize; off+=4 )
  {
    // widen before shifting, shifting the promoted int by 24 would run
    // into the sign bit for bytes >= 0x80.
    const unsigned long val =
        (static_cast<unsigned long>(buf[off+3]) << 24)
      + (static_cast<unsigned long>(buf[off+2]) << 16)
      + (static_cast<unsigned long>(buf[off+1]) <<  8)
      +  static_cast<unsigned long>(buf[off+0]);

    // utf-32 max is 0x0010ffff
    if( val == 0 || val > 0x0010ffff )
    {
      return false;
    }
  }
  return true;
}
/**
 * Checks whether @a buf looks like big endian utf-16 data.
 *
 * Every complete 2 byte unit that starts before CheckSize is decoded. The
 * data is rejected on
 *  - a zero unit (taken as a sign that this is not utf-16),
 *  - a low surrogate (dc00-dfff) without a preceding high surrogate,
 *  - a high surrogate (d800-dbff) that is not followed by a low surrogate,
 *    or that is cut off by the end of the buffer.
 *
 * @param buf  bytes to check.
 * @param len  number of bytes in @a buf.
 * @param bom  true if @a buf starts with a 2 byte bom that has to be
 *             skipped.
 * @return true if no checked unit was rejected (also if there was no unit
 *         to check).
 */
bool utf::isUtf16BeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = 0;
  if( bom )
  {
    off = 2;
  }

  for( /*off*/; off+1 < len && off < CheckSize; off+=2 )
  {
    const unsigned long val = (buf[off+0] << 8) + buf[off+1];

    // 0 in buffer .. does not look like utf-16
    if( val == 0 )
    {
      return false;
    }

    // two byte sequence
    if( val < 0xd800 || val > 0xdfff )
    {
      continue;
    }

    // a low surrogate can not start a sequence
    if( val > 0xdbff )
    {
      return false;
    }

    // four byte sequence: the low surrogate occupies off+2 and off+3, so
    // off+3 has to be the last index that is still inside the buffer.
    if( off+3 >= len )
    {
      return false;
    }

    const unsigned long val2 = (buf[off+2] << 8) + buf[off+3];
    if( val2 < 0xdc00 || val2 > 0xdfff )
    {
      return false;
    }

    // skip the low surrogate, the loop skips the high surrogate
    off+=2;
  }
  return true;
}
/**
 * Checks whether @a buf looks like little endian utf-16 data.
 *
 * Every complete 2 byte unit that starts before CheckSize is decoded. The
 * data is rejected on
 *  - a zero unit (taken as a sign that this is not utf-16),
 *  - a low surrogate (dc00-dfff) without a preceding high surrogate,
 *  - a high surrogate (d800-dbff) that is not followed by a low surrogate,
 *    or that is cut off by the end of the buffer.
 *
 * @param buf  bytes to check.
 * @param len  number of bytes in @a buf.
 * @param bom  true if @a buf starts with a 2 byte bom that has to be
 *             skipped.
 * @return true if no checked unit was rejected (also if there was no unit
 *         to check).
 */
bool utf::isUtf16LeData( const unsigned char* buf, sc::Size len, bool bom )
{
  sc::Size off = 0;
  if( bom )
  {
    off = 2;
  }

  for( /*off*/; off+1 < len && off < CheckSize; off+=2 )
  {
    const unsigned long val = (buf[off+1] << 8) + buf[off+0];

    // 0 in buffer .. does not look like utf-16
    if( val == 0 )
    {
      return false;
    }

    // two byte sequence
    if( val < 0xd800 || val > 0xdfff )
    {
      continue;
    }

    // a low surrogate can not start a sequence
    if( val > 0xdbff )
    {
      return false;
    }

    // four byte sequence: the low surrogate occupies off+2 and off+3, so
    // off+3 has to be the last index that is still inside the buffer.
    if( off+3 >= len )
    {
      return false;
    }

    const unsigned long val2 = (buf[off+3] << 8) + buf[off+2];
    if( val2 < 0xdc00 || val2 > 0xdfff )
    {
      return false;
    }

    // skip the low surrogate, the loop skips the high surrogate
    off+=2;
  }
  return true;
}
bool utf::isUtf8Data( const unsigned char* buf, sc::Size len, bool bom )
{
sc::Size off = 0;
if( bom )
{
off = 3;
}
for( /*off*/; off < len && off < CheckSize; ++off )
{
//single byte
if( buf[off] < 0x80 )
{
continue;
}
// 2 byte sequence
else if( ((buf[off+0] & 0xe0) == 0xc0) && (off+1 < len) )
{
if( (buf[off+1] & 0xc0) == 0x80 )
{
off++;
continue;
}
return false;
}
// 3 byte sequence
else if( ((buf[off] & 0xf0) == 0xe0) && (off+2 < len) )
{
if( (buf[off+1] & 0xc0) == 0x80
&& (buf[off+2] & 0xc0) == 0x80)
{
off+=2;
continue;
}
return false;
}
// 4 byte sequence
else if( ((buf[off] & 0xf8) == 0xf0) && (off+3 < len) )
{
if( (buf[off+1] & 0xc0) == 0x80
&& (buf[off+2] & 0xc0) == 0x80
&& (buf[off+3] & 0xc0) == 0x80)
{
off+=3;
continue;
}
return false;
}
// 5 byte sequence
else if( ((buf[off] & 0xfc) == 0xf8) && (off+4 < len) )
{
if( (buf[off+1] & 0xc0) == 0x80
&& (buf[off+2] & 0xc0) == 0x80
&& (buf[off+3] & 0xc0) == 0x80
&& (buf[off+4] & 0xc0) == 0x80)
{
off+=4;
continue;
}
return false;
}
// 6 byte sequence
else if( ((buf[off] & 0xfe) == 0xfc) && (off+5 < len) )
{
if( (buf[off+1] & 0xc0) == 0x80
&& (buf[off+2] & 0xc0) == 0x80
&& (buf[off+3] & 0xc0) == 0x80
&& (buf[off+4] & 0xc0) == 0x80
&& (buf[off+5] & 0xc0) == 0x80)
{
off+=5;
continue;
}
return false;
}
// 0 byte in buffer
else if( buf[off] == 0 )
{
// .. looks not like utf-8
return false;
}
else
{
// not utf-8
return false;
}
}
return true;
}
// syntax highlighted by Code2HTML, v. 0.9.1