/* ====================================================================
* Copyright (c) 2003-2006, Martin Hauner
* http://subcommander.tigris.org
*
* Subcommander is licensed as described in the file doc/COPYING, which
* you should have received as part of this distribution.
* ====================================================================
*/
// sc
#include "utf8.h"
// apr
#include <apr_strings.h>
// apr util
#include <apr_xlate.h>
// sys
#include <assert.h>
#include <vector>
/**
 * \brief create an empty buffer: a null data pointer and a size of zero.
 */
utf8::Buffer::Buffer() : _buffer(0), _size(0)
{
}
/**
 * \brief create a buffer that refers to the data at \a buf.
 *
 * Only the pointer and the size are stored, the data itself is not
 * copied by this constructor.
 *
 * \param buf  pointer to the data.
 * \param size size of the data; the callers in this file pass a byte count.
 */
utf8::Buffer::Buffer( const char* buf, sc::Size size )
: _buffer(buf), _size(size)
{
}
/**
 * \brief copy constructor.
 *
 * Copies the pointer and the size of \a src, so both buffers refer to
 * the same data afterwards.
 *
 * \param src the buffer to copy.
 */
utf8::Buffer::Buffer( const Buffer& src )
: _buffer(src._buffer), _size(src._size)
{
}
///////////////////////////////////////////////////////////
// pool used by getXlateHandle() for the cached translation handles. It
// starts out as null and is created by getXlateHandle() on first use.
apr_pool_t* utf8::_pool = 0;
// userdata key passed by to16() to getXlateHandle() to identify its
// cached utf-8 to utf-16 translation handle.
const char* XlateHandle_utf8_utf16 = "utf8 to utf16";
int utf8::getXlateHandle( void** handle, const char* frompage, const char* topage, const char* key )
{
if( ! _pool )
{
apr_status_t status = apr_pool_create( &_pool, 0 );
if( status != APR_SUCCESS )
{
return status;
}
}
void *old_handle = 0;
apr_pool_userdata_get( &old_handle, key, _pool );
if( old_handle )
{
*handle = old_handle;
return APR_SUCCESS;
}
apr_xlate_t* xlate;
apr_status_t status;
status = apr_xlate_open( &xlate, "utf-16", "utf-8", _pool );
if( status != APR_SUCCESS )
{
assert(false);
return status;
}
*handle = xlate;
apr_pool_userdata_set( *handle, key, apr_pool_cleanup_null, _pool );
return APR_SUCCESS;
}
/**
 * \brief convert an utf-8 encoded buffer to utf-16.
 *
 * The translation handle is taken from getXlateHandle(), it is cached
 * there and therefore not closed here. The temporary output buffer grows
 * until the whole input has been converted, so the result is never
 * silently cut off.
 *
 * \param src     the utf-8 encoded input.
 * \param srcSize size of \a src in bytes.
 * \param pool    pool the returned data is allocated from.
 * \return the converted data and its size in bytes, or an empty Buffer
 *         if the handle could not be obtained or the conversion failed.
 */
utf8::Buffer utf8::to16( const char* src, sc::Size srcSize, apr_pool_t* pool )
{
  // fetch the handle through a real void* instead of casting an
  // apr_xlate_t** to void**.
  void* handle = 0;
  apr_status_t status = getXlateHandle( &handle, "utf-8", "utf-16", XlateHandle_utf8_utf16 );
  if( status != APR_SUCCESS )
  {
    return Buffer();
  }
  apr_xlate_t* xlate = static_cast<apr_xlate_t*>(handle);

  // initial guess: two output bytes per input byte. The extra bytes leave
  // room for a byte order mark and keep the buffer non-empty for empty
  // input.
  std::vector<char> tmp( static_cast<apr_size_t>(srcSize) * 2 + 4 );
  const char* in = src;
  apr_size_t inLeft = srcSize;
  apr_size_t outUsed = 0;

  for(;;)
  {
    const apr_size_t inBefore = inLeft;
    const apr_size_t outAvail = tmp.size() - outUsed;
    apr_size_t outLeft = outAvail;

    status = apr_xlate_conv_buffer( xlate, in, &inLeft, &tmp[0] + outUsed, &outLeft );

    in += inBefore - inLeft;
    outUsed += outAvail - outLeft;

    if( status != APR_SUCCESS || inLeft == 0 )
    {
      break;
    }

    // APR_SUCCESS with input left over means the output buffer was too
    // small. If nothing moved although there was room for any single
    // character, growing the buffer will not help: give up instead of
    // looping forever.
    if( inLeft == inBefore && outLeft == outAvail && outAvail >= 16 )
    {
      return Buffer();
    }

    tmp.resize( tmp.size() * 2 );
  }

  if( status != APR_SUCCESS )
  {
    return Buffer();
  }

  // copy the result into the callers pool, the temporary buffer is
  // released automatically.
  const sc::Size resultSize = static_cast<sc::Size>(outUsed);
  return Buffer( apr_pstrmemdup( pool, &tmp[0], outUsed ), resultSize );
}
/**
 * \brief convert a buffer from the codepage \a srcCP to utf-8.
 *
 * A translation handle is opened in \a pool for every call and closed
 * again before returning. The temporary output buffer grows until the
 * whole input has been converted, so the result is never silently cut
 * off.
 *
 * \param src     the input in codepage \a srcCP.
 * \param srcSize size of \a src in bytes.
 * \param srcCP   name of the codepage of \a src.
 * \param pool    pool used for the translation handle and for the
 *                returned data.
 * \return the converted data and its size in bytes, or an empty Buffer
 *         if the handle could not be opened or the conversion failed.
 */
utf8::Buffer utf8::to8( const char* src, sc::Size srcSize, const char* srcCP, apr_pool_t* pool )
{
  apr_xlate_t* xlate = 0;
  apr_status_t status = apr_xlate_open( &xlate, "utf-8", srcCP, pool );
  if( status != APR_SUCCESS )
  {
    return Buffer();
  }

  // the factor is only an initial guess, the buffer is enlarged below
  // whenever apr reports left over input. The extra bytes keep the buffer
  // non-empty for empty input.
  std::vector<char> tmp( static_cast<apr_size_t>(srcSize) * 2 + 4 );
  const char* in = src;
  apr_size_t inLeft = srcSize;
  apr_size_t outUsed = 0;
  bool converted = false;

  for(;;)
  {
    const apr_size_t inBefore = inLeft;
    const apr_size_t outAvail = tmp.size() - outUsed;
    apr_size_t outLeft = outAvail;

    status = apr_xlate_conv_buffer( xlate, in, &inLeft, &tmp[0] + outUsed, &outLeft );

    in += inBefore - inLeft;
    outUsed += outAvail - outLeft;

    if( status != APR_SUCCESS )
    {
      break;
    }

    if( inLeft == 0 )
    {
      converted = true;
      break;
    }

    // APR_SUCCESS with input left over means the output buffer was too
    // small. If nothing moved although there was room for any single
    // character, growing the buffer will not help: give up instead of
    // looping forever.
    if( inLeft == inBefore && outLeft == outAvail && outAvail >= 16 )
    {
      break;
    }

    tmp.resize( tmp.size() * 2 );
  }

  Buffer result;
  if( converted )
  {
    const sc::Size resultSize = static_cast<sc::Size>(outUsed);
    result = Buffer( apr_pstrmemdup( pool, &tmp[0], outUsed ), resultSize );
  }

  // the handle is closed on every path; its status does not change the
  // result.
  apr_xlate_close( xlate );
  return result;
}
// naive implementations of some utf8 string methods
// expect well formed utf8 strings
/**
 * \brief find the start of the next char in an utf-8 string.
 *
 * Moves at least one byte forward and then skips every continuation
 * byte (10xx xxxx), that is the second, third or fourth byte of an utf-8
 * char. No length is checked: the byte following \a str is always read
 * and the scan ends at the first byte that is not a continuation byte,
 * for example the terminating 0.
 *
 * \param str an utf-8 encoded 0 terminated string.
 * \return pointer to the first byte of the next char.
 */
char* utf8::next8( const char* str )
{
  const char* cursor = str + 1;

  while( (*cursor & 0xc0) == 0x80 )
  {
    ++cursor;
  }

  return const_cast<char*>(cursor);
}
char* utf8::next8( const char* str, sc::Size chars )
{
for( sc::Size i = 0; i < chars; i++ )
{
str = next8( str );
}
return (char*)str;
}
/**
 * \brief find the start of the previous char in an utf-8 string.
 *
 * Moves one byte back and then further back over continuation bytes
 * (10xx xxxx). No byte in front of \a start is read. If no start byte is
 * found at or after \a start the returned pointer is \a start - 1.
 *
 * \param str   the current position in the string.
 * \param start the first byte of the string.
 * \return pointer to the first byte of the previous char or \a start - 1.
 */
char* utf8::prev8( const char* str, const char* start )
{
  const char* cursor = str - 1;

  for( ; cursor >= start; --cursor )
  {
    // stop at the first byte that is not a continuation byte.
    if( (*cursor & 0xc0) != 0x80 )
    {
      break;
    }
  }

  return const_cast<char*>(cursor);
}
char* utf8::prev8( const char* str, const char* start, sc::Size chars )
{
for( sc::Size i = 0; i < chars; i++ )
{
str = prev8( str, start );
}
return (char*)str;
}
// Bit patterns used by strlen8() to classify the first byte of an utf-8
// character: a byte b starts an n byte character if (b & mask) == value.
// The "sur" names refer to multi byte sequences, these are not utf-16
// surrogates.
// start of a single byte character
const int ascii_mask = 0x80; // 1--- ----
const int ascii = 0x00; // 0--- ----
// start of a 2 byte sequence
const int sur2_mask = 0xe0; // 111- ----
const int sur2 = 0xc0; // 110- ----
// start of a 3 byte sequence
const int sur3_mask = 0xf0; // 1111 ----
const int sur3 = 0xe0; // 1110 ----
// start of a 4 byte sequence
const int sur4_mask = 0xf8; // 1111 1---
const int sur4 = 0xf0; // 1111 0---
/**
 * \brief count the chars in an utf-8 encoded buffer.
 *
 * The length of every char is taken from its first byte only, the
 * following bytes are skipped without being checked. A byte that can not
 * start an utf-8 char is counted as a single byte char.
 *
 * \param str  the utf-8 encoded data, may be null.
 * \param size size of \a str in bytes.
 * \return the number of chars, 0 if \a str is null, starts with a 0 byte
 *         or \a size is 0.
 */
size_t utf8::strlen8( const char* str, sc::Size size )
{
  // nothing to count in an empty string
  if( str == 0 || *str == 0 || size == 0 )
  {
    return 0;
  }

  const char* const end = str + size;
  unsigned long count = 0;

  for( const char* pos = str; pos < end; ++count )
  {
    const int lead = *pos;

    // one byte for ascii and for anything that is NOT valid utf-8, so
    // the loop always moves on and can not run endlessly.
    int step = 1;

    if( (lead & sur2_mask) == sur2 )
    {
      step = 2;
    }
    else if( (lead & sur3_mask) == sur3 )
    {
      step = 3;
    }
    else if( (lead & sur4_mask) == sur4 )
    {
      step = 4;
    }

    pos += step;
  }

  return count;
}
// syntax highlighted by Code2HTML, v. 0.9.1