/* ==================================================================== * Copyright (c) 2003-2006, Martin Hauner * http://subcommander.tigris.org * * Subcommander is licensed as described in the file doc/COPYING, which * you should have received as part of this distribution. * ==================================================================== */ // sc #include "utf8.h" // apr #include // apr util #include // sys #include utf8::Buffer::Buffer() : _buffer(0), _size(0) { } utf8::Buffer::Buffer( const char* buf, sc::Size size ) : _buffer(buf), _size(size) { } utf8::Buffer::Buffer( const Buffer& src ) : _buffer(src._buffer), _size(src._size) { } /////////////////////////////////////////////////////////// apr_pool_t* utf8::_pool = 0; const char* XlateHandle_utf8_utf16 = "utf8 to utf16"; int utf8::getXlateHandle( void** handle, const char* frompage, const char* topage, const char* key ) { if( ! _pool ) { apr_status_t status = apr_pool_create( &_pool, 0 ); if( status != APR_SUCCESS ) { return status; } } void *old_handle = 0; apr_pool_userdata_get( &old_handle, key, _pool ); if( old_handle ) { *handle = old_handle; return APR_SUCCESS; } apr_xlate_t* xlate; apr_status_t status; status = apr_xlate_open( &xlate, "utf-16", "utf-8", _pool ); if( status != APR_SUCCESS ) { assert(false); return status; } *handle = xlate; apr_pool_userdata_set( *handle, key, apr_pool_cleanup_null, _pool ); return APR_SUCCESS; } utf8::Buffer utf8::to16( const char* src, sc::Size srcSize, apr_pool_t* pool ) { apr_xlate_t* xlate; apr_status_t status; status = getXlateHandle( (void**)&xlate, "utf-8", "utf-16", XlateHandle_utf8_utf16 ); if( status != APR_SUCCESS ) { return Buffer(); } #if 0 status = apr_xlate_open( &xlate, "utf-16", "utf-8", pool ); if( status != APR_SUCCESS ) { return Buffer(); } #endif apr_size_t tmpSize = srcSize * 2; // just a guess char* tmp = new char[tmpSize]; apr_size_t srcLen = srcSize; apr_size_t dstLen = tmpSize; Buffer result; status = apr_xlate_conv_buffer( xlate, src, &srcLen, tmp, &dstLen ); if( status == APR_SUCCESS ) { sc::Size s = tmpSize - dstLen; result = Buffer( apr_pstrmemdup( pool, tmp, s ), s ); } //status = apr_xlate_close(xlate); delete [] tmp; return result; } utf8::Buffer utf8::to8( const char* src, sc::Size srcSize, const char* srcCP, apr_pool_t* pool ) { apr_xlate_t* xlate; apr_status_t status; status = apr_xlate_open( &xlate, "utf-8", srcCP, pool ); if( status != APR_SUCCESS ) { return Buffer(); } apr_size_t tmpSize = srcSize * 2; // just a guess char* tmp = new char[tmpSize]; apr_size_t srcLen = srcSize; apr_size_t dstLen = tmpSize; Buffer result; status = apr_xlate_conv_buffer( xlate, src, &srcLen, tmp, &dstLen ); if( status == APR_SUCCESS ) { sc::Size s = tmpSize - dstLen; result = Buffer( apr_pstrmemdup( pool, tmp, s ), s ); } status = apr_xlate_close(xlate); delete [] tmp; return result; } // naive implementations of some utf8 string methods // expect well formed utf8 strings /** * \brief find the next char in an utf-8 string. * * \param str an utf-8 encoded 0 terminated string. */ char* utf8::next8( const char* str ) { // loop as long as we find a second, third or forth byte // of an utf8 char while( (*(++str) & 0xc0) == 0x80 ) { } return (char*)str; } char* utf8::next8( const char* str, sc::Size chars ) { for( sc::Size i = 0; i < chars; i++ ) { str = next8( str ); } return (char*)str; } char* utf8::prev8( const char* str, const char* start ) { --str; while( (str >= start) && (*str & 0xc0) == 0x80 ) { --str; } return (char*)str; } char* utf8::prev8( const char* str, const char* start, sc::Size chars ) { for( sc::Size i = 0; i < chars; i++ ) { str = prev8( str, start ); } return (char*)str; } // start of a single byte character const int ascii_mask = 0x80; // 1--- ---- const int ascii = 0x00; // 0--- ---- // start of a 2 byte surrogate const int sur2_mask = 0xe0; // 111- ---- const int sur2 = 0xc0; // 110- ---- // start of a 3 byte surrogate const int sur3_mask = 0xf0; // 1111 ---- const int sur3 = 0xe0; // 1110 ---- // start of a 4 byte surrogate const int sur4_mask = 0xf8; // 1111 1--- const int sur4 = 0xf0; // 1111 0--- size_t utf8::strlen8( const char* str, sc::Size size ) { // handle empty strings if( str == 0 || *str == 0 || size == 0 ) { return 0; } const char* cur = str; unsigned long len = 0; while( cur < str + size ) { if( (*cur & ascii_mask) == ascii ) { cur++; } else if( (*cur & sur2_mask) == sur2 ) { cur += 2; } else if( (*cur & sur3_mask) == sur3 ) { cur += 3; } else if( (*cur & sur4_mask) == sur4 ) { cur += 4; } else { // this is NOT an utf-8 string :( // assume a single byte character and continue with the next char // so we don't enter an endless loop! cur++; } len++; } return len; }