// This file is part of The New Aspell // Copyright (C) 2001 by Kevin Atkinson under the GNU LGPL license // version 2.0 or 2.1. You should have received a copy of the LGPL // license along with this library if you did not you can find // it at http://www.gnu.org/. #include #include #include #include "asc_ctype.hpp" #include "convert.hpp" #include "fstream.hpp" #include "getdata.hpp" #include "config.hpp" #include "errors.hpp" #include "stack_ptr.hpp" #include "cache-t.hpp" #include "file_util.hpp" #include "file_data_util.hpp" #include "vararray.hpp" #include "iostream.hpp" #include "gettext.h" namespace acommon { typedef unsigned char byte; typedef unsigned char Uni8; typedef unsigned short Uni16; typedef unsigned int Uni32; ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// // // Lookups // ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// // // ToUniLookup // class ToUniLookup { Uni32 data[256]; static const Uni32 npos = (Uni32)(-1); public: void reset(); Uni32 operator[] (char key) const {return data[(unsigned char)key];} bool have(char key) const {return data[(unsigned char)key] != npos;} bool insert(char key, Uni32 value); }; void ToUniLookup::reset() { for (int i = 0; i != 256; ++i) data[i] = npos; } bool ToUniLookup::insert(char key, Uni32 value) { if (data[(unsigned char)key] != npos) return false; data[(unsigned char)key] = value; return true; } ////////////////////////////////////////////////////////////////////// // // FromUniLookup // // Assumes that the maximum number of items in the table is 256 // Also assumes (unsigned char)i == i % 256 // Based on the iso-8859-* character sets it is very fast, almost all // lookups involving no more than 2 comparisons. // NO looks ups involded more than 3 compassions. // Also, no division (or modules) is done whatsoever. struct UniItem { Uni32 key; char value; }; class FromUniLookup { private: static const Uni32 npos = (Uni32)(-1); UniItem * overflow_end; UniItem data[256*4]; UniItem overflow[256]; // you can never be too careful; public: FromUniLookup() {} void reset(); inline char operator() (Uni32 key, char unknown = '?') const; bool insert(Uni32 key, char value); }; void FromUniLookup::reset() { for (unsigned i = 0; i != 256*4; ++i) data[i].key = npos; overflow_end = overflow; } inline char FromUniLookup::operator() (Uni32 k, char unknown) const { const UniItem * i = data + (unsigned char)k * 4; if (i->key == k) return i->value; ++i; if (i->key == k) return i->value; ++i; if (i->key == k) return i->value; ++i; if (i->key == k) return i->value; if (i->key == npos) return unknown; for(i = overflow; i != overflow_end; ++i) if (i->key == k) return i->value; return unknown; } bool FromUniLookup::insert(Uni32 k, char v) { UniItem * i = data + (unsigned char)k * 4; UniItem * e = i + 4; while (i != e && i->key != npos) { if (i->key == k) return false; ++i; } if (i == e) { for(i = overflow; i != overflow_end; ++i) if (i->key == k) return false; } i->key = k; i->value = v; return true; } ////////////////////////////////////////////////////////////////////// // // CharLookup // class CharLookup { private: int data[256]; public: void reset(); char operator[] (char key) const {return data[(unsigned char)key];} bool insert(char key, char value); }; void CharLookup::reset() { for (int i = 0; i != 256; ++i) data[i] = -1; } bool CharLookup::insert(char key, char value) { if (data[(unsigned char)key] != -1) return false; data[(unsigned char)key] = value; return true; } ////////////////////////////////////////////////////////////////////// // // NormLookup // template struct NormTable { static const unsigned struct_size; unsigned mask; unsigned height; unsigned width; unsigned size; T * end; T data[1]; // hack for data[] }; template const unsigned NormTable::struct_size = sizeof(NormTable) - 1; template struct NormLookupRet { const typename T::To * to; const From * last; NormLookupRet(const typename T::To * t, From * l) : to(t), last(l) {} }; template static inline NormLookupRet norm_lookup(const NormTable * d, From * s, From * stop, const typename T::To * def, From * prev) { loop: if (s != stop) { const T * i = d->data + (static_cast(*s) & d->mask); for (;;) { if (i->from == static_cast(*s)) { if (i->sub_table) { // really tail recursion if (i->to[1] != T::to_non_char) {def = i->to; prev = s;} d = (const NormTable *)(i->sub_table); s++; goto loop; } else { return NormLookupRet(i->to, s); } } else { i += d->height; if (i >= d->end) break; } } } return NormLookupRet(def, prev); } template static void free_norm_table(NormTable * d) { for (T * cur = d->data; cur != d->end; ++cur) { if (cur->sub_table) free_norm_table(static_cast *>(cur->sub_table)); } free(d); } struct FromUniNormEntry { typedef Uni32 From; Uni32 from; typedef byte To; byte to[4]; static const From from_non_char = (From)(-1); static const To to_non_char = 0x10; static const unsigned max_to = 4; void * sub_table; } #ifdef __GNUC__ __attribute__ ((aligned (16))) #endif ; struct ToUniNormEntry { typedef byte From; byte from; typedef Uni16 To; Uni16 to[3]; static const From from_non_char = 0x10; static const To to_non_char = 0x10; static const unsigned max_to = 3; void * sub_table; } #ifdef __GNUC__ __attribute__ ((aligned (16))) #endif ; ////////////////////////////////////////////////////////////////////// // // read in char data // PosibErr read_in_char_data (const Config & config, ParmStr encoding, ToUniLookup & to, FromUniLookup & from) { to.reset(); from.reset(); String dir1,dir2,file_name; fill_data_dir(&config, dir1, dir2); find_file(file_name,dir1,dir2,encoding,".cset"); FStream data; PosibErrBase err = data.open(file_name, "r"); if (err.get_err()) { char mesg[300]; snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."), file_name.c_str()); return make_err(unknown_encoding, encoding, mesg); } unsigned chr; Uni32 uni; String line; char * p; do { p = get_nb_line(data, line); } while (*p != '/'); for (chr = 0; chr != 256; ++chr) { p = get_nb_line(data, line); if (strtoul(p, 0, 16) != chr) return make_err(bad_file_format, file_name); uni = strtoul(p + 3, 0, 16); to.insert(chr, uni); from.insert(uni, chr); } return no_err; } ////////////////////////////////////////////////////////////////////// // // read in norm data // struct Tally { int size; Uni32 mask; int max; int * data; Tally(int s, int * d) : size(s), mask(s - 1), max(0), data(d) { memset(data, 0, sizeof(int)*size); } void add(Uni32 chr) { Uni32 p = chr & mask; data[p]++; if (data[p] > max) max = data[p]; } }; template static PosibErr< NormTable * > create_norm_table(IStream & in, String & buf) { const char * p = get_nb_line(in, buf); assert(*p == 'N'); ++p; int size = strtoul(p, (char **)&p, 10); VARARRAY(T, d, size); memset(d, 0, sizeof(T) * size); int sz = 1 << (unsigned)floor(log(size <= 1 ? 1.0 : size - 1)/log(2.0)); VARARRAY(int, tally0_d, sz); Tally tally0(sz, tally0_d); VARARRAY(int, tally1_d, sz*2); Tally tally1(sz*2, tally1_d); VARARRAY(int, tally2_d, sz*4); Tally tally2(sz*4, tally2_d); T * cur = d; while (p = get_nb_line(in, buf), *p != '.') { Uni32 f = strtoul(p, (char **)&p, 16); cur->from = static_cast(f); assert(f == cur->from); tally0.add(f); tally1.add(f); tally2.add(f); ++p; assert(*p == '>'); ++p; assert(*p == ' '); ++p; unsigned i = 0; if (*p != '-') { for (;; ++i) { const char * q = p; Uni32 t = strtoul(p, (char **)&p, 16); if (q == p) break; assert(i < d->max_to); cur->to[i] = static_cast(t); assert(t == static_cast(cur->to[i])); } } else { cur->to[0] = 0; cur->to[1] = T::to_non_char; } if (*p == ' ') ++p; if (*p == '/') cur->sub_table = create_norm_table(in,buf); ++cur; } assert(cur - d == size); Tally * which = &tally0; if (which->max > tally1.max) which = &tally1; if (which->max > tally2.max) which = &tally2; NormTable * final = (NormTable *)calloc(1, NormTable::struct_size + sizeof(T) * which->size * which->max); memset(final, 0, NormTable::struct_size + sizeof(T) * which->size * which->max); final->mask = which->size - 1; final->height = which->size; final->width = which->max; final->end = final->data + which->size * which->max; final->size = size; for (cur = d; cur != d + size; ++cur) { T * dest = final->data + (cur->from & final->mask); while (dest->from != 0) dest += final->height; *dest = *cur; if (dest->from == 0) dest->from = T::from_non_char; } for (T * dest = final->data; dest < final->end; dest += final->height) { if (dest->from == 0 || dest->from == T::from_non_char && dest->to[0] == 0) { dest->from = T::from_non_char; dest->to[0] = T::to_non_char; } } return final; } PosibErr NormTables::get_new(const String & encoding, const Config * config) { String dir1,dir2,file_name; fill_data_dir(config, dir1, dir2); find_file(file_name,dir1,dir2,encoding,".cmap"); FStream in; PosibErrBase err = in.open(file_name, "r"); if (err.get_err()) { char mesg[300]; snprintf(mesg, 300, _("This could also mean that the file \"%s\" could not be opened for reading or does not exist."), file_name.c_str()); return make_err(unknown_encoding, encoding, mesg); // FIXME } NormTables * d = new NormTables; d->key = encoding; String l; get_nb_line(in, l); remove_comments(l); assert (l == "INTERNAL"); get_nb_line(in, l); remove_comments(l); assert (l == "/"); d->internal = create_norm_table(in, l); get_nb_line(in, l); remove_comments(l); assert (l == "STRICT"); char * p = get_nb_line(in, l); remove_comments(l); if (l == "/") { d->strict_d = create_norm_table(in, l); d->strict = d->strict_d; } else { assert(*p == '='); ++p; ++p; assert(strcmp(p, "INTERNAL") == 0); d->strict = d->internal; } while (get_nb_line(in, l)) { remove_comments(l); d->to_uni.push_back(ToUniTable()); ToUniTable & e = d->to_uni.back(); e.name.resize(l.size()); for (unsigned i = 0; i != l.size(); ++i) e.name[i] = asc_tolower(l[i]); char * p = get_nb_line(in, l); remove_comments(l); if (l == "/") { e.ptr = e.data = create_norm_table(in,l); } else { assert(*p == '='); ++p; ++p; for (char * q = p; *q; ++q) *q = asc_tolower(*q); Vector::iterator i = d->to_uni.begin(); while (i->name != p && i != d->to_uni.end()) ++i; assert(i != d->to_uni.end()); e.ptr = i->ptr; get_nb_line(in, l); } } return d; } NormTables::~NormTables() { free_norm_table(internal); if (strict_d) free_norm_table(strict_d); for (unsigned i = 0; i != to_uni.size(); ++i) { if (to_uni[i].data) free_norm_table(to_uni[i].data); } } ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// // // Convert // ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// bool operator== (const Convert & rhs, const Convert & lhs) { return strcmp(rhs.in_code(), lhs.in_code()) == 0 && strcmp(rhs.out_code(), lhs.out_code()) == 0; } ////////////////////////////////////////////////////////////////////// // // Trivial Conversion // template struct DecodeDirect : public Decode { void decode(const char * in0, int size, FilterCharVector & out) const { const Chr * in = reinterpret_cast(in0); if (size == -1) { for (;*in; ++in) out.append(*in); } else { const Chr * stop = reinterpret_cast(in0 +size); for (;in != stop; ++in) out.append(*in); } } PosibErr decode_ec(const char * in0, int size, FilterCharVector & out, ParmStr) const { DecodeDirect::decode(in0, size, out); return no_err; } }; template struct EncodeDirect : public Encode { void encode(const FilterChar * in, const FilterChar * stop, CharVector & out) const { for (; in != stop; ++in) { Chr c = in->chr; if (c != in->chr) c = '?'; out.append(&c, sizeof(Chr)); } } PosibErr encode_ec(const FilterChar * in, const FilterChar * stop, CharVector & out, ParmStr orig) const { for (; in != stop; ++in) { Chr c = in->chr; if (c != in->chr) { char m[70]; snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); return make_err(invalid_string, orig, m); } out.append(&c, sizeof(Chr)); } return no_err; } bool encode(FilterChar * &, FilterChar * &, FilterCharVector &) const { return true; } }; template struct ConvDirect : public DirectConv { void convert(const char * in0, int size, CharVector & out) const { if (size == -1) { const Chr * in = reinterpret_cast(in0); for (;*in != 0; ++in) out.append(in, sizeof(Chr)); } else { out.append(in0, size); } } PosibErr convert_ec(const char * in0, int size, CharVector & out, ParmStr) const { ConvDirect::convert(in0, size, out); return no_err; } }; ////////////////////////////////////////////////////////////////////// // // Lookup Conversion // struct DecodeLookup : public Decode { ToUniLookup lookup; PosibErr init(ParmStr code, const Config & c) { FromUniLookup unused; return read_in_char_data(c, code, lookup, unused); } void decode(const char * in, int size, FilterCharVector & out) const { if (size == -1) { for (;*in; ++in) out.append(lookup[*in]); } else { const char * stop = in + size; for (;in != stop; ++in) out.append(lookup[*in]); } } PosibErr decode_ec(const char * in, int size, FilterCharVector & out, ParmStr) const { DecodeLookup::decode(in, size, out); return no_err; } }; struct DecodeNormLookup : public Decode { typedef ToUniNormEntry E; NormTable * data; DecodeNormLookup(NormTable * d) : data(d) {} // must be null terminated // FIXME: Why must it be null terminated? void decode(const char * in, int size, FilterCharVector & out) const { const char * stop = in + size; // will word even if size -1 while (in != stop) { if (*in == 0) { if (size == -1) break; out.append(0); ++in; } else { NormLookupRet ret = norm_lookup(data, in, stop, 0, in); for (unsigned i = 0; ret.to[i] && i < E::max_to; ++i) out.append(ret.to[i]); in = ret.last + 1; } } } PosibErr decode_ec(const char * in, int size, FilterCharVector & out, ParmStr) const { DecodeNormLookup::decode(in, size, out); return no_err; } }; struct EncodeLookup : public Encode { FromUniLookup lookup; PosibErr init(ParmStr code, const Config & c) {ToUniLookup unused; return read_in_char_data(c, code, unused, lookup);} void encode(const FilterChar * in, const FilterChar * stop, CharVector & out) const { for (; in != stop; ++in) { out.append(lookup(*in)); } } PosibErr encode_ec(const FilterChar * in, const FilterChar * stop, CharVector & out, ParmStr orig) const { for (; in != stop; ++in) { char c = lookup(*in, '\0'); if (c == '\0' && in->chr != 0) { char m[70]; snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); return make_err(invalid_string, orig, m); } out.append(c); } return no_err; } bool encode(FilterChar * & in0, FilterChar * & stop, FilterCharVector & out) const { FilterChar * in = in0; for (; in != stop; ++in) *in = lookup(*in); return true; } }; struct EncodeNormLookup : public Encode { typedef FromUniNormEntry E; NormTable * data; EncodeNormLookup(NormTable * d) : data(d) {} // *stop must equal 0 void encode(const FilterChar * in, const FilterChar * stop, CharVector & out) const { while (in < stop) { if (*in == 0) { out.append('\0'); ++in; } else { NormLookupRet ret = norm_lookup(data, in, stop, (const byte *)"?", in); for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i) out.append(ret.to[i]); in = ret.last + 1; } } } PosibErr encode_ec(const FilterChar * in, const FilterChar * stop, CharVector & out, ParmStr orig) const { while (in < stop) { if (*in == 0) { out.append('\0'); ++in; } else { NormLookupRet ret = norm_lookup(data, in, stop, 0, in); if (ret.to == 0) { char m[70]; snprintf(m, 70, _("The Unicode code point U+%04X is unsupported."), in->chr); return make_err(invalid_string, orig, m); } for (unsigned i = 0; i < E::max_to && ret.to[i]; ++i) out.append(ret.to[i]); in = ret.last + 1; } } return no_err; } bool encode(FilterChar * & in, FilterChar * & stop, FilterCharVector & buf) const { buf.clear(); while (in < stop) { if (*in == 0) { buf.append(FilterChar(0)); ++in; } else { NormLookupRet ret = norm_lookup(data, in, stop, (const byte *)"?", in); const FilterChar * end = ret.last + 1; unsigned width = 0; for (; in != end; ++in) width += in->width; buf.append(FilterChar(ret.to[0], width)); for (unsigned i = 1; i < E::max_to && ret.to[i]; ++i) { buf.append(FilterChar(ret.to[i],0)); } } } buf.append(0); in = buf.pbegin(); stop = buf.pend(); return true; } }; ////////////////////////////////////////////////////////////////////// // // UTF8 // #define get_check_next \ if (in == stop) goto error; \ c = *in; \ if ((c & 0xC0) != 0x80) goto error; \ ++in; \ u <<= 6; \ u |= c & 0x3F; \ ++w; static inline FilterChar from_utf8 (const char * & in, const char * stop, Uni32 err_char = '?') { Uni32 u = (Uni32)(-1); FilterChar::Width w = 1; // the first char is guaranteed not to be off the end char c = *in; ++in; while (in != stop && (c & 0xC0) == 0x80) {c = *in; ++in; ++w;} if ((c & 0x80) == 0x00) { // 1-byte wide u = c; } else if ((c & 0xE0) == 0xC0) { // 2-byte wide u = c & 0x1F; get_check_next; } else if ((c & 0xF0) == 0xE0) { // 3-byte wide u = c & 0x0F; get_check_next; get_check_next; } else if ((c & 0xF8) == 0xF0) { // 4-byte wide u = c & 0x07; get_check_next; get_check_next; get_check_next; } else { goto error; } return FilterChar(u, w); error: return FilterChar(err_char, w); } static inline void to_utf8 (FilterChar in, CharVector & out) { FilterChar::Chr c = in; if (c < 0x80) { out.append(c); } else if (c < 0x800) { out.append(0xC0 | c>>6); out.append(0x80 | c & 0x3F); } else if (c < 0x10000) { out.append(0xE0 | c>>12); out.append(0x80 | c>>6 & 0x3F); out.append(0x80 | c & 0x3F); } else if (c < 0x200000) { out.append(0xF0 | c>>18); out.append(0x80 | c>>12 & 0x3F); out.append(0x80 | c>>6 & 0x3F); out.append(0x80 | c & 0x3F); } } struct DecodeUtf8 : public Decode { ToUniLookup lookup; void decode(const char * in, int size, FilterCharVector & out) const { const char * stop = in + size; // this is OK even if size == -1 while (*in && in != stop) { out.append(from_utf8(in, stop)); } } PosibErr decode_ec(const char * in, int size, FilterCharVector & out, ParmStr orig) const { const char * begin = in; const char * stop = in + size; // this is OK even if size == -1 while (*in && in != stop) { FilterChar c = from_utf8(in, stop, (Uni32)-1); if (c == (Uni32)-1) { char m[70]; snprintf(m, 70, _("Invalid UTF-8 sequence at position %d."), in - begin); return make_err(invalid_string, orig, m); } out.append(c); } return no_err; } }; struct EncodeUtf8 : public Encode { FromUniLookup lookup; void encode(const FilterChar * in, const FilterChar * stop, CharVector & out) const { for (; in != stop; ++in) { to_utf8(*in, out); } } PosibErr encode_ec(const FilterChar * in, const FilterChar * stop, CharVector & out, ParmStr) const { for (; in != stop; ++in) { to_utf8(*in, out); } return no_err; } }; ////////////////////////////////////////////////////////////////////// // // Cache // static GlobalCache decode_cache("decode"); static GlobalCache encode_cache("encode"); static GlobalCache norm_tables_cache("norm_tables"); ////////////////////////////////////////////////////////////////////// // // new_aspell_convert // void Convert::generic_convert(const char * in, int size, CharVector & out) { buf_.clear(); decode_->decode(in, size, buf_); FilterChar * start = buf_.pbegin(); FilterChar * stop = buf_.pend(); if (!filter.empty()) filter.process(start, stop); encode_->encode(start, stop, out); } const char * fix_encoding_str(ParmStr enc, String & buf) { buf.clear(); buf.reserve(enc.size() + 1); for (size_t i = 0; i != enc.size(); ++i) buf.push_back(asc_tolower(enc[i])); if (strncmp(buf.c_str(), "iso8859", 7) == 0) buf.insert(buf.begin() + 3, '-'); // For backwards compatibility if (buf == "ascii" || buf == "ansi_x3.4-1968") return "iso-8859-1"; else if (buf == "machine unsigned 16" || buf == "utf-16") return "ucs-2"; else if (buf == "machine unsigned 32" || buf == "utf-32") return "ucs-4"; else return buf.c_str(); } bool ascii_encoding(const Config & c, ParmStr enc0) { if (enc0.empty()) return true; if (enc0 == "ANSI_X3.4-1968" || enc0 == "ASCII" || enc0 == "ascii") return true; String buf; const char * enc = fix_encoding_str(enc0, buf); if (strcmp(enc, "utf-8") == 0 || strcmp(enc, "ucs-2") == 0 || strcmp(enc, "ucs-4") == 0) return false; String dir1,dir2,file_name; fill_data_dir(&c, dir1, dir2); file_name << dir1 << enc << ".cset"; if (file_exists(file_name)) return false; if (dir1 == dir2) return true; file_name.clear(); file_name << dir2 << enc << ".cset"; return !file_exists(file_name); } PosibErr internal_new_convert(const Config & c, ParmString in, ParmString out, bool if_needed, Normalize norm) { String in_s; in = fix_encoding_str(in, in_s); String out_s; out = fix_encoding_str(out, out_s); if (if_needed && in == out) return 0; StackPtr conv(new Convert); switch (norm) { case NormNone: RET_ON_ERR(conv->init(c, in, out)); break; case NormFrom: RET_ON_ERR(conv->init_norm_from(c, in, out)); break; case NormTo: RET_ON_ERR(conv->init_norm_to(c, in, out)); break; } return conv.release(); } PosibErr Decode::get_new(const String & key, const Config * c) { StackPtr ptr; if (key == "iso-8859-1") ptr.reset(new DecodeDirect); else if (key == "ucs-2") ptr.reset(new DecodeDirect); else if (key == "ucs-4") ptr.reset(new DecodeDirect); else if (key == "utf-8") ptr.reset(new DecodeUtf8); else ptr.reset(new DecodeLookup); RET_ON_ERR(ptr->init(key, *c)); ptr->key = key; return ptr.release(); } PosibErr Encode::get_new(const String & key, const Config * c) { StackPtr ptr; if (key == "iso-8859-1") ptr.reset(new EncodeDirect); else if (key == "ucs-2") ptr.reset(new EncodeDirect); else if (key == "ucs-4") ptr.reset(new EncodeDirect); else if (key == "utf-8") ptr.reset(new EncodeUtf8); else ptr.reset(new EncodeLookup); RET_ON_ERR(ptr->init(key, *c)); ptr->key = key; return ptr.release(); } Convert::~Convert() {} PosibErr Convert::init(const Config & c, ParmStr in, ParmStr out) { RET_ON_ERR(setup(decode_c, &decode_cache, &c, in)); decode_ = decode_c.get(); RET_ON_ERR(setup(encode_c, &encode_cache, &c, out)); encode_ = encode_c.get(); conv_ = 0; if (in == out) { if (in == "ucs-2") { conv_ = new ConvDirect; } else if (in == "ucs-4") { conv_ = new ConvDirect; } else { conv_ = new ConvDirect; } } if (conv_) RET_ON_ERR(conv_->init(decode_, encode_, c)); return no_err; } PosibErr Convert::init_norm_from(const Config & c, ParmStr in, ParmStr out) { if (!c.retrieve_bool("normalize") && !c.retrieve_bool("norm-required")) return init(c,in,out); RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, out)); RET_ON_ERR(setup(decode_c, &decode_cache, &c, in)); decode_ = decode_c.get(); if (c.retrieve_bool("norm-strict")) { encode_s = new EncodeNormLookup(norm_tables_->strict); encode_ = encode_s; encode_->key = out; encode_->key += ":strict"; } else { encode_s = new EncodeNormLookup(norm_tables_->internal); encode_ = encode_s; encode_->key = out; encode_->key += ":internal"; } conv_ = 0; return no_err; } PosibErr Convert::init_norm_to(const Config & c, ParmStr in, ParmStr out) { String norm_form = c.retrieve("norm-form"); if ((!c.retrieve_bool("normalize") || norm_form == "none") && !c.retrieve_bool("norm-required")) return init(c,in,out); if (norm_form == "none" && c.retrieve_bool("norm-required")) norm_form = "nfc"; RET_ON_ERR(setup(norm_tables_, &norm_tables_cache, &c, in)); RET_ON_ERR(setup(encode_c, &encode_cache, &c, out)); encode_ = encode_c.get(); NormTables::ToUni::const_iterator i = norm_tables_->to_uni.begin(); for (; i != norm_tables_->to_uni.end() && i->name != norm_form; ++i); assert(i != norm_tables_->to_uni.end()); decode_s = new DecodeNormLookup(i->ptr); decode_ = decode_s; decode_->key = in; decode_->key += ':'; decode_->key += i->name; conv_ = 0; return no_err; } PosibErr MBLen::setup(const Config &, ParmStr enc0) { String buf; const char * enc = fix_encoding_str(enc0,buf); if (strcmp(enc, "utf-8") == 0) encoding = UTF8; else if (strcmp(enc, "ucs-2") == 0) encoding = UCS2; else if (strcmp(enc, "ucs-4") == 0) encoding = UCS4; else encoding = Other; return no_err; } unsigned MBLen::operator()(const char * str, const char * stop) { unsigned size = 0; switch (encoding) { case Other: return stop - str; case UTF8: for (; str != stop; ++str) { if ((*str & 0x80) == 0 || (*str & 0xC0) == 0xC0) ++size; } return size; case UCS2: return (stop - str)/2; case UCS4: return (stop - str)/4; } return 0; } }