// ports//devel/subcommander/work/subcommander-1.2.2/util/utf8.cpp

/* ====================================================================
 * Copyright (c) 2003-2006, Martin Hauner
 *                          http://subcommander.tigris.org
 *
 * Subcommander is licensed as described in the file doc/COPYING, which
 * you should have received as part of this distribution.
 * ====================================================================
 */

// sc
#include "utf8.h"

// apr
#include <apr_strings.h>

// apr util
#include <apr_xlate.h>

// sys
#include <assert.h>
#include <string.h>


/**
 * \brief create an empty buffer, it has no data pointer and a size of 0.
 */
utf8::Buffer::Buffer()
: _buffer(0)
, _size(0)
{
}

/**
 * \brief create a buffer that refers to an existing memory block.
 *
 * Only the pointer and the size are remembered, the bytes themselves are
 * not copied.
 *
 * \param buf  the first byte of the memory block.
 * \param size the size of the memory block.
 */
utf8::Buffer::Buffer( const char* buf, sc::Size size )
: _buffer(buf)
, _size(size)
{
}

/**
 * \brief copy a buffer.
 *
 * This is a shallow copy, the new buffer takes over the pointer and the
 * size of \a src, so both refer to the same bytes afterwards.
 *
 * \param src the buffer to copy.
 */
utf8::Buffer::Buffer( const Buffer& src )
: _buffer(src._buffer)
, _size(src._size)
{
}


///////////////////////////////////////////////////////////

// pool that is created by getXlateHandle() on its first call. the
// translation handles opened by getXlateHandle() are allocated from this
// pool and are remembered in its user data.
apr_pool_t* utf8::_pool = 0;


// user data key that to16() passes to getXlateHandle() to identify its
// translation handle.
const char* XlateHandle_utf8_utf16 = "utf8 to utf16";


/**
 * \brief get a translation handle, opening it on first use.
 *
 * An opened handle is stored as user data of the class wide pool under
 * \a key. Later calls with the same \a key return the stored handle
 * instead of opening a new one, so \a key has to be unique for each
 * \a frompage / \a topage pair.
 *
 * \param handle   receives the apr_xlate_t handle on success.
 * \param frompage name of the code page to translate from.
 * \param topage   name of the code page to translate to.
 * \param key      user data key that identifies the handle.
 *
 * \return APR_SUCCESS, or the apr status of the call that failed.
 */
int utf8::getXlateHandle( void** handle, const char* frompage, const char* topage, const char* key )
{
  // create the pool that owns the handles on the first call.
  if( ! _pool )
  {
    apr_status_t status = apr_pool_create( &_pool, 0 );
    if( status != APR_SUCCESS )
    {
      return status;
    }
  }

  // did we already open a handle for this key?
  void* cached = 0;

  apr_pool_userdata_get( &cached, key, _pool );
  if( cached )
  {
    *handle = cached;
    return APR_SUCCESS;
  }

  // no, so open one for the requested code pages. note that
  // apr_xlate_open expects the target page before the source page.
  apr_xlate_t* xlate  = 0;
  apr_status_t status = apr_xlate_open( &xlate, topage, frompage, _pool );
  if( status != APR_SUCCESS )
  {
    assert(false);
    return status;
  }

  // remember the handle for the next call. it needs no extra cleanup
  // function here, so the null cleanup is registered.
  *handle = xlate;
  apr_pool_userdata_set( xlate, key, apr_pool_cleanup_null, _pool );

  return APR_SUCCESS;
}



/**
 * \brief convert utf-8 encoded bytes to utf-16.
 *
 * The translation handle is fetched through getXlateHandle(), so it is
 * shared between calls and is not closed here.
 *
 * \param src     the utf-8 encoded source bytes.
 * \param srcSize the number of bytes in \a src.
 * \param pool    the pool the result is allocated from.
 *
 * \return the converted bytes, or an empty Buffer if the handle could not
 *         be obtained or the conversion failed.
 */
utf8::Buffer utf8::to16( const char* src, sc::Size srcSize, apr_pool_t* pool )
{
  void*        handle = 0;
  apr_status_t status;

  status = getXlateHandle( &handle, "utf-8", "utf-16", XlateHandle_utf8_utf16 );
  if( status != APR_SUCCESS )
  {
    return Buffer();
  }
  apr_xlate_t* xlate = static_cast<apr_xlate_t*>(handle);

  // first guess for the size of the result. the few extra bytes leave room
  // for a byte order mark the converter may write and they make sure the
  // buffer is never empty (doubling an empty buffer would not grow it).
  const apr_size_t srcTotal = srcSize;
  apr_size_t       tmpSize  = srcTotal * 2 + 4;
  char*            tmp      = new char[tmpSize];

  apr_size_t consumed = 0;   // bytes of src already converted
  apr_size_t produced = 0;   // bytes of tmp already written

  for(;;)
  {
    apr_size_t srcLen = srcTotal - consumed;
    apr_size_t dstLen = tmpSize  - produced;

    status = apr_xlate_conv_buffer( xlate, src + consumed, &srcLen,
      tmp + produced, &dstLen );

    consumed = srcTotal - srcLen;
    produced = tmpSize  - dstLen;

    if( status != APR_SUCCESS || srcLen == 0 )
    {
      break;
    }

    // running out of output space is reported as success with input bytes
    // left over. get a bigger buffer and continue with the remaining
    // input instead of silently returning a truncated result.
    apr_size_t biggerSize = tmpSize * 2;
    char*      bigger     = new char[biggerSize];

    memcpy( bigger, tmp, produced );
    delete [] tmp;

    tmp     = bigger;
    tmpSize = biggerSize;
  }

  Buffer result;

  if( status == APR_SUCCESS )
  {
    sc::Size s = produced;
    result = Buffer( apr_pstrmemdup( pool, tmp, s ), s );
  }

  delete [] tmp;

  return result;
}


/**
 * \brief convert bytes in the given code page to utf-8.
 *
 * \param src     the source bytes, encoded in \a srcCP.
 * \param srcSize the number of bytes in \a src.
 * \param srcCP   name of the code page of \a src.
 * \param pool    the pool used for the translation handle and the result.
 *
 * \return the converted bytes, or an empty Buffer if no translation from
 *         \a srcCP could be opened or the conversion failed.
 */
utf8::Buffer utf8::to8( const char* src, sc::Size srcSize, const char* srcCP, apr_pool_t* pool )
{
  apr_xlate_t* xlate = 0;
  apr_status_t status;

  // apr_xlate_open expects the target page before the source page.
  status = apr_xlate_open( &xlate, "utf-8", srcCP, pool );
  if( status != APR_SUCCESS )
  {
    return Buffer();
  }

  // first guess for the size of the result. the few extra bytes make sure
  // the buffer is never empty (doubling an empty buffer would not grow it).
  const apr_size_t srcTotal = srcSize;
  apr_size_t       tmpSize  = srcTotal * 2 + 4;
  char*            tmp      = new char[tmpSize];

  apr_size_t consumed = 0;   // bytes of src already converted
  apr_size_t produced = 0;   // bytes of tmp already written

  for(;;)
  {
    apr_size_t srcLen = srcTotal - consumed;
    apr_size_t dstLen = tmpSize  - produced;

    status = apr_xlate_conv_buffer( xlate, src + consumed, &srcLen,
      tmp + produced, &dstLen );

    consumed = srcTotal - srcLen;
    produced = tmpSize  - dstLen;

    if( status != APR_SUCCESS || srcLen == 0 )
    {
      break;
    }

    // running out of output space is reported as success with input bytes
    // left over, and a single source byte can need more than two utf-8
    // bytes. get a bigger buffer and continue with the remaining input
    // instead of silently returning a truncated result.
    apr_size_t biggerSize = tmpSize * 2;
    char*      bigger     = new char[biggerSize];

    memcpy( bigger, tmp, produced );
    delete [] tmp;

    tmp     = bigger;
    tmpSize = biggerSize;
  }

  Buffer result;

  if( status == APR_SUCCESS )
  {
    sc::Size s = produced;
    result = Buffer( apr_pstrmemdup( pool, tmp, s ), s );
  }

  // the handle was opened for this call only. a failing close does not
  // change the result, so its status is ignored.
  apr_xlate_close(xlate);
  delete [] tmp;

  return result;
}


// naive implementations of some utf8 string methods
// expect well formed utf8 strings

/**
 * \brief find the next char in an utf-8 string.
 *
 * \param str an utf-8 encoded 0 terminated string.
 *
 * \return the first byte of the char that follows the char at \a str, or
 *         \a str itself if it already points to the terminating 0.
 */

char* utf8::next8( const char* str )
{
  // never step over the terminating 0, the bytes behind it do not belong
  // to the string.
  if( *str == 0 )
  {
    return const_cast<char*>(str);
  }

  // skip the current byte and then loop as long as we find a second,
  // third or fourth byte (10-- ----) of an utf8 char. the terminating 0
  // is not such a byte, so the loop stops there at the latest.
  do
  {
    ++str;
  }
  while( (*str & 0xc0) == 0x80 );

  return const_cast<char*>(str);
}

char* utf8::next8( const char* str, sc::Size chars )
{
  for( sc::Size i = 0; i < chars; i++ )
  {
    str = next8( str );
  }
  return (char*)str;
}


/**
 * \brief find the previous char in an utf-8 string.
 *
 * Walks backwards from the byte in front of \a str until it finds a byte
 * that is not a second, third or fourth byte (10-- ----) of an utf8 char.
 *
 * \param str   the current position in the string.
 * \param start the first byte of the string, bytes in front of it are
 *              never read.
 *
 * \return the first byte of the previous char. If there is none at or
 *         behind \a start (for example if \a str equals \a start) the
 *         result is \a start - 1, that is a position in FRONT of the
 *         string which must not be dereferenced.
 */
char* utf8::prev8( const char* str, const char* start )
{
  const char* pos = str - 1;

  for( ; pos >= start; --pos )
  {
    // found a byte that starts a char?
    if( (*pos & 0xc0) != 0x80 )
    {
      break;
    }
  }

  return (char*)pos;
}


char* utf8::prev8( const char* str, const char* start, sc::Size chars )
{
  for( sc::Size i = 0; i < chars; i++ )
  {
    str = prev8( str, start );
  }
  return (char*)str;
}


// bit patterns used by strlen8 to classify the first (lead) byte of an
// utf-8 char. a byte starts a char of the given length if
// (byte & xxx_mask) == xxx. the "sur" names are historic, these are utf-8
// multi byte sequences and not utf-16 surrogates.

// start of a single byte character
const int ascii_mask = 0x80;    // tested bits: 1--- ----
const int ascii      = 0x00;    // lead byte:   0--- ----

// start of a 2 byte sequence
const int sur2_mask  = 0xe0;    // tested bits: 111- ----
const int sur2       = 0xc0;    // lead byte:   110- ----

// start of a 3 byte sequence
const int sur3_mask  = 0xf0;    // tested bits: 1111 ----
const int sur3       = 0xe0;    // lead byte:   1110 ----

// start of a 4 byte sequence
const int sur4_mask  = 0xf8;    // tested bits: 1111 1---
const int sur4       = 0xf0;    // lead byte:   1111 0---

/**
 * \brief count the chars in an utf-8 encoded buffer.
 *
 * Every lead byte counts as one char and the bytes announced by it are
 * skipped without looking at them. A byte that is not a valid lead byte
 * counts as a single byte char. The scan is limited by \a size only, it
 * does not stop at a 0 byte behind the first byte.
 *
 * \param str  the utf-8 encoded bytes, may be 0.
 * \param size the number of bytes in \a str.
 *
 * \return the number of chars, 0 if \a str is 0, if its first byte is 0
 *         or if \a size is 0.
 */
size_t utf8::strlen8( const char* str, sc::Size size )
{
  // nothing to count?
  if( str == 0 || *str == 0 || size == 0 )
  {
    return 0;
  }

  const char* const end   = str + size;
  unsigned long     count = 0;

  for( const char* pos = str; pos < end; ++count )
  {
    const int lead = *pos;

    // if the byte is NOT a valid lead byte this is not an utf-8 string :(
    // in that case we keep a step of one byte, that is we assume a single
    // byte character, so we always move on and never loop endlessly.
    int step = 1;

    if( (lead & ascii_mask) == ascii )
    {
      step = 1;
    }
    else if( (lead & sur2_mask) == sur2 )
    {
      step = 2;
    }
    else if( (lead & sur3_mask) == sur3 )
    {
      step = 3;
    }
    else if( (lead & sur4_mask) == sur4 )
    {
      step = 4;
    }

    pos += step;
  }

  return count;
}
// syntax highlighted by Code2HTML, v. 0.9.1