/* ==================================================================== * Copyright (c) 2006, Martin Hauner * http://subcommander.tigris.org * * Subcommander is licensed as described in the file doc/COPYING, which * you should have received as part of this distribution. * ==================================================================== */ // sc #include "utf.h" // apr #include // sys #include struct BomInfo { sc::Size _size; const unsigned char* _data; }; const unsigned char bom_utf8[] = { 0xef, 0xbb, 0xbf }; const unsigned char bom_utf16be[] = { 0xfe, 0xff }; const unsigned char bom_utf16le[] = { 0xff, 0xfe }; const unsigned char bom_utf32be[] = { 0x00, 0x00, 0xfe, 0xff }; const unsigned char bom_utf32le[] = { 0xff, 0xfe, 0x00, 0x00 }; static BomInfo BomData[] = { { 0, 0 }, { sizeof(bom_utf8), bom_utf8 }, { sizeof(bom_utf16be), bom_utf16be }, { sizeof(bom_utf16le), bom_utf16le }, { sizeof(bom_utf32be), bom_utf32be }, { sizeof(bom_utf32le), bom_utf32le } }; Bom::Bom() : _type(Bom::none) { } Bom::Bom( Type type ) : _type(type) { } Bom::Bom( const Bom& src ) : _type(src._type) { } bool Bom::isNull() const { return _type == Bom::none; } const unsigned char* Bom::getBom() const { return BomData[_type]._data; } sc::Size Bom::getSize() const { return BomData[_type]._size; } /* how many bytes we check at most in the given buffer */ static const sc::Size CheckSize = 1024*4; utf::utf( const unsigned char* buf, sc::Size len ) : _bigEndian(false), _littleEndian(false), _encoding("*") { _buf = new unsigned char[len]; _len = len; memcpy((void*)_buf,buf,len); check(); } utf::~utf() { delete [] _buf; } void utf::check() { // for an empty file assume utf-8 without bom if( _len == 0 ) { _encoding = "utf-8"; _bom = Bom(); return; } // detect utf-8 if( isUtf8Size(_len) ) { bool bom = isUtf8Bom(_buf,_len); if( bom && isUtf8Data(_buf,_len,bom) ) { _encoding = "utf-8"; _bom = Bom(Bom::utf8); return; } if( isUtf8Data(_buf,_len,bom) ) { _encoding = "utf-8"; _bom = Bom(); return; } } // detect utf-16 if( isUtf16Size(_len) ) { bool beBom = isUtf16BeBom(_buf,_len); if( beBom && isUtf16BeData(_buf,_len,beBom) ) { _bigEndian = true; _encoding = "utf-16"; _bom = Bom(Bom::utf16be); return; } bool leBom = isUtf16LeBom(_buf,_len); if( leBom && isUtf16LeData(_buf,_len,leBom) ) { _littleEndian = true; _encoding = "utf-16"; _bom = Bom(Bom::utf16le); return; } #if 0 // too many false positives without bom if( isUtf16BeData(_buf,_len,false) ) { _bigEndian = true; _encoding = "utf-16"; _bom = Bom(); return; } #endif } // detect utf-32 if( isUtf32Size(_len) ) { bool beBom = isUtf32BeBom(_buf,_len); if( beBom && isUtf32BeData(_buf,_len,beBom) ) { _bigEndian = true; _encoding = "utf-32"; _bom = Bom(Bom::utf32be); return; } bool leBom = isUtf32LeBom(_buf,_len); if( leBom && isUtf32LeData(_buf,_len,leBom) ) { _littleEndian = true; _encoding = "utf-32"; _bom = Bom(Bom::utf32le); return; } #if 0 // too many false positives without bom if( isUtf32BeData(_buf,_len,false) ) { _bigEndian = true; _encoding = "utf-32"; _bom = Bom(); return; } #endif } } bool utf::hasEncoding() const { static sc::String locenc("*"); return _encoding != locenc; } bool utf::isBigEndian() const { return _bigEndian; } bool utf::isLittleEndian() const { return _littleEndian; } bool utf::hasBom() const { return ! _bom.isNull(); } const Bom& utf::getBom() const { return _bom; } const sc::String& utf::getEncoding() const { return _encoding; } const unsigned char* utf::getBuffer() const { return _buf; } sc::Size utf::getLength() const { return _len; } bool utf::isUtf32Size( sc::Size len ) { // at least a single character, multiple of 4 return len >= 4 && ((len % 4) == 0); } bool utf::isUtf16Size( sc::Size len ) { // at least a single character, multiple of 2 return len >= 2 && ((len % 2) == 0); } bool utf::isUtf8Size( sc::Size len ) { // space for bom and at least a single character return len >= 1; } bool utf::isUtf32BeBom( const unsigned char* buf, sc::Size len ) { // utf-32 big endian bom return len >= 4 && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xfe && buf[3] == 0xff; } bool utf::isUtf32LeBom( const unsigned char* buf, sc::Size len ) { // utf-32 little endian bom return len >= 4 && buf[0] == 0xff && buf[1] == 0xfe && buf[2] == 0x00 && buf[3] == 0x00; } bool utf::isUtf16BeBom( const unsigned char* buf, sc::Size len ) { // utf-16 big endian bom return len >= 2 && buf[0] == 0xfe && buf[1] == 0xff; } bool utf::isUtf16LeBom( const unsigned char* buf, sc::Size len ) { // utf-16 little endian bom return len >= 2 && buf[0] == 0xff && buf[1] == 0xfe; } bool utf::isUtf8Bom( const unsigned char* buf, sc::Size len ) { // utf-8 bom return len >= 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf; } bool utf::isUtf32BeData( const unsigned char* buf, sc::Size len, bool bom ) { sc::Size off = 0; if( bom ) { off = 4; } for( /*off*/; off+4 < len && off < CheckSize; off+=4 ) { unsigned long val = (buf[off] << 24) + (buf[off+1] << 16) + (buf[off+2] << 8) + buf[off+3]; // utf-32 max is 0x0010ffff if( val == 0 || val > 0x0010ffff ) { return false; } } return true; } bool utf::isUtf32LeData( const unsigned char* buf, sc::Size len, bool bom ) { sc::Size off = 0; if( bom ) { off = 4; } for( /*off*/; off+3 < len && off < CheckSize; off+=4 ) { unsigned long val = (buf[off+3] << 24) + (buf[off+2] << 16) + (buf[off+1] << 8) + buf[off+0]; // utf-32 max is 0x0010ffff if( val == 0 || val > 0x0010ffff ) { return false; } } return true; } bool utf::isUtf16BeData( const unsigned char* buf, sc::Size len, bool bom ) { sc::Size off = 0; if( bom ) { off = 2; } for( /*off*/; off+1 < len && off < CheckSize; off+=2 ) { unsigned long val = (buf[off+0] << 8) + buf[off+1]; // 0 in buffer if( val == 0 ) { // .. does not look like utf-16 return false; } // two byte sequence else if( val < 0xd800 || val > 0xdfff ) { continue; } // four byte sequence else if( val >= 0xd800 && val <= 0xdbff && off+4 < len ) { unsigned long val2 = (buf[off+2] << 8) + buf[off+3]; if( val2 >= 0xdc00 && val2 <= 0xdfff ) { off+=2; continue; } return false; } else { return false; } } return true; } bool utf::isUtf16LeData( const unsigned char* buf, sc::Size len, bool bom ) { sc::Size off = 0; if( bom ) { off = 2; } for( /*off*/; off+1 < len && off < CheckSize; off+=2 ) { unsigned long val = (buf[off+1] << 8) + buf[off+0]; // 0 in buffer if( val == 0 ) { // .. does not look like utf-16 return false; } // two byte sequence else if( val < 0xd800 || val > 0xdfff ) { continue; } // four byte sequence else if( val >= 0xd800 && val <= 0xdbff && off+4 < len ) { unsigned long val2 = (buf[off+3] << 8) + buf[off+2]; if( val2 >= 0xdc00 && val2 <= 0xdfff ) { off+=2; continue; } return false; } else { return false; } } return true; } bool utf::isUtf8Data( const unsigned char* buf, sc::Size len, bool bom ) { sc::Size off = 0; if( bom ) { off = 3; } for( /*off*/; off < len && off < CheckSize; ++off ) { //single byte if( buf[off] < 0x80 ) { continue; } // 2 byte sequence else if( ((buf[off+0] & 0xe0) == 0xc0) && (off+1 < len) ) { if( (buf[off+1] & 0xc0) == 0x80 ) { off++; continue; } return false; } // 3 byte sequence else if( ((buf[off] & 0xf0) == 0xe0) && (off+2 < len) ) { if( (buf[off+1] & 0xc0) == 0x80 && (buf[off+2] & 0xc0) == 0x80) { off+=2; continue; } return false; } // 4 byte sequence else if( ((buf[off] & 0xf8) == 0xf0) && (off+3 < len) ) { if( (buf[off+1] & 0xc0) == 0x80 && (buf[off+2] & 0xc0) == 0x80 && (buf[off+3] & 0xc0) == 0x80) { off+=3; continue; } return false; } // 5 byte sequence else if( ((buf[off] & 0xfc) == 0xf8) && (off+4 < len) ) { if( (buf[off+1] & 0xc0) == 0x80 && (buf[off+2] & 0xc0) == 0x80 && (buf[off+3] & 0xc0) == 0x80 && (buf[off+4] & 0xc0) == 0x80) { off+=4; continue; } return false; } // 6 byte sequence else if( ((buf[off] & 0xfe) == 0xfc) && (off+5 < len) ) { if( (buf[off+1] & 0xc0) == 0x80 && (buf[off+2] & 0xc0) == 0x80 && (buf[off+3] & 0xc0) == 0x80 && (buf[off+4] & 0xc0) == 0x80 && (buf[off+5] & 0xc0) == 0x80) { off+=5; continue; } return false; } // 0 byte in buffer else if( buf[off] == 0 ) { // .. looks not like utf-8 return false; } else { // not utf-8 return false; } } return true; }