/* ====================================================================
 * Copyright (c) 2006,      Martin Hauner
 *                          http://subcommander.tigris.org
 *
 * Subcommander is licensed as described in the file doc/COPYING, which
 * you should have received as part of this distribution.
 * ====================================================================
 */
  
#ifndef _SC_UTF_H
#define _SC_UTF_H

// sc
#include "util/types.h"
#include "util/String.h"

/**
 * small value class that names one kind of byte order mark (bom).
 *
 * Only the declarations live in this header; the member functions are
 * defined elsewhere.
 */
class Bom
{
public:
  /**
   * the byte order marks this class can represent. The numeric values
   * are the ones the compiler assigns implicitly, written out here.
   */
  enum Type
  {
    none    = 0,  ///< no byte order mark
    utf8    = 1,
    utf16be = 2,
    utf16le = 3,
    utf32be = 4,
    utf32le = 5
  };

  /**
   * default constructor.
   * NOTE(review): presumably results in Type \c none - confirm in the
   * implementation.
   */
  Bom();

  /**
   * create a Bom of the given \a type. The constructor is not explicit,
   * so a plain Type value converts implicitly to a Bom.
   */
  Bom( Type type );

  /** copy constructor, creates a Bom from \a src. */
  Bom( const Bom& src );

  /**
   * NOTE(review): presumably true if this Bom has Type \c none - confirm
   * in the implementation.
   */
  bool isNull() const;

  /**
   * NOTE(review): presumably the raw bytes of the byte order mark; who
   * owns the returned memory is not visible from this header.
   */
  const unsigned char* getBom() const;

  /**
   * NOTE(review): presumably the number of bytes getBom() points to -
   * confirm in the implementation.
   */
  sc::Size getSize() const;

private:
  /** the kind of byte order mark this object stands for. */
  Type _type;
};

/**
 * helper class to test/detect various utf encoded files.
 *
 * An instance is created for a byte buffer and can then be asked whether
 * an encoding or a byte order mark was found. The static functions offer
 * the individual size, bom and data checks on a caller supplied buffer.
 *
 * NOTE(review): only declarations are visible here; when the detection
 * runs (presumably in the constructor via check()) has to be confirmed
 * in the implementation.
 */
class utf
{
public:
  /**
   * create a detector for the \a len bytes at \a buf.
   * NOTE(review): whether \a buf is copied or only referenced is not
   * visible from this header - presumably it must outlive this object.
   */
  utf( const unsigned char* buf, sc::Size len );
  ~utf();

  /** detected an encoding? */
  bool hasEncoding() const;

  /** detected a byte order mark? */
  bool hasBom() const;
  /** the Bom object held by this instance. */
  const Bom& getBom() const;
  /** the encoding held by this instance (see hasEncoding()). */
  const sc::String& getEncoding() const;

  /** NOTE(review): presumably the buffer passed to the constructor. */
  const unsigned char* getBuffer() const;
  /** NOTE(review): presumably the length passed to the constructor. */
  sc::Size getLength() const;

  /** NOTE(review): presumably true if big endian data was detected. */
  bool isBigEndian() const;
  /** NOTE(review): presumably true if little endian data was detected. */
  bool isLittleEndian() const;

  /** check if \a len fits a utf-32 encoded file */
  static bool isUtf32Size( sc::Size len );

  /** check if \a len fits a utf-16 encoded file */
  static bool isUtf16Size( sc::Size len );

  /** check if \a len fits a utf-8 encoded file */
  static bool isUtf8Size( sc::Size len );

  /** check if \a buf starts with a utf-32be bom */
  static bool isUtf32BeBom( const unsigned char* buf, sc::Size len );

  /** check if \a buf starts with a utf-32le bom */
  static bool isUtf32LeBom( const unsigned char* buf, sc::Size len );

  /** check if \a buf starts with a utf-16be bom */
  static bool isUtf16BeBom( const unsigned char* buf, sc::Size len );

  /** check if \a buf starts with a utf-16le bom */
  static bool isUtf16LeBom( const unsigned char* buf, sc::Size len );

  /** check if \a buf starts with a utf-8 bom */
  static bool isUtf8Bom( const unsigned char* buf, sc::Size len );

  // NOTE(review): for the five data checks below the meaning of the
  // \a bom parameter is not visible from this header - presumably it
  // tells the check that \a buf begins with a bom. Confirm in the
  // implementation.

  /** check first kbyte of \a buf if it contains valid utf-32be data */
  static bool isUtf32BeData( const unsigned char* buf, sc::Size len, bool bom );

  /** check first kbyte of \a buf if it contains valid utf-32le data */
  static bool isUtf32LeData( const unsigned char* buf, sc::Size len, bool bom );

  /** check first kbyte of \a buf if it contains valid utf-16be data */
  static bool isUtf16BeData( const unsigned char* buf, sc::Size len, bool bom );

  /** check first kbyte of \a buf if it contains valid utf-16le data */
  static bool isUtf16LeData( const unsigned char* buf, sc::Size len, bool bom );

  /** check first kbyte of \a buf if it contains valid utf-8 data */
  static bool isUtf8Data( const unsigned char* buf, sc::Size len, bool bom );

private:
  /**
   * NOTE(review): presumably runs the detection on the buffer and fills
   * in the members below - confirm in the implementation.
   */
  void check();

  // endianness flags, see isBigEndian() and isLittleEndian().
  bool       _bigEndian;
  bool       _littleEndian;

  // byte order mark and encoding, see getBom() and getEncoding().
  Bom        _bom;
  sc::String _encoding;

  // buffer pointer and length, see getBuffer() and getLength().
  const unsigned char* _buf;
  sc::Size             _len;
};

#endif // _SC_UTF_H


// syntax highlighted by Code2HTML, v. 0.9.1