/*
	utf8dec.cc	UTF-8 to wchar_t Decoder
	Copyright (c) 2000 Kriang Lerdsuwanakij
	email:		lerdsuwa@users.sourceforge.net

	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 2 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include "utf8dec.h"
#include "strmisc.h"

/*
	Feed one input byte to the decoder.

	wc	In/out accumulator for the character being assembled.  The
		partially decoded value is kept in wc between calls, so the
		caller must pass the same object until the sequence ends.
		Set to 0 when the byte or the finished sequence is rejected.
	c	Next byte of input.

	Results are reported through the member flags:
	need_update	1 when wc now holds a complete, accepted character.
	need_char	1 when more continuation bytes are required.
	extra_char	1 when a sequence in progress was abandoned because
			c is not a continuation byte.  Presumably the caller
			is expected to feed c again as the start of a new
			sequence -- confirm against the users of utf8dec.h.

	When USE_UTF8_MODE is not defined, or IsUTF8Mode() is false, every
	byte is passed through unchanged as a complete character.
*/
void	utf8_decode::decode(wchar_t &wc, unsigned char c)
{
#ifdef USE_UTF8_MODE
	if (IsUTF8Mode()) {			// Decode UTF-8
		if (char_left == 0) {		// No sequence in progress
			if (c < 0x80) {		// 0xxxxxxx, single byte character
				wc = c;
				extra_char = 0;
				need_char = 0;		// End of sequence
				need_update = 1;	// Display should be updated
				return;
			}
			if (c < 0xC0 || c >= 0xFE) {
						// 10xxxxxx without a lead byte,
						// or 0xFE/0xFF which never
						// appear in UTF-8
				extra_char = 0;
				need_char = 0;		// End of sequence
				need_update = 0;	// Nothing to display
				wc = 0;			// Ignore
				return;
			}

			// Lead byte of a multi-byte sequence.  char_count is
			// the number of continuation bytes that follow.
			// is_validated is set when the payload bits of the
			// lead byte already prove that the sequence is not an
			// overlong encoding.  Otherwise the first
			// continuation byte decides.
			if (c < 0xE0) {		// 110xxxxx 10yyyyyy
				char_count = 1;
				wc = (c & 0x1F) << 6;
				// 1100000x would encode a value below 0x80
				is_validated = (c & 0x1E) != 0;
			}
			else if (c < 0xF0) {	// 1110xxxx 10yyyyyy 10yyyyyy
				char_count = 2;
				wc = (c & 0xF) << 12;
				is_validated = (c & 0xF) != 0;
			}
			else if (c < 0xF8) {	// 11110xxx 10yyyyyy 10yyyyyy 10yyyyyy
				char_count = 3;
				wc = (c & 0x7) << 18;
				is_validated = (c & 0x7) != 0;
			}
			else if (c < 0xFC) {	// 111110xx 10yyyyyy 10yyyyyy 10yyyyyy
						// 10yyyyyy
				char_count = 4;
				wc = (c & 0x3) << 24;
				is_validated = (c & 0x3) != 0;
			}
			else {			// 1111110x 10yyyyyy 10yyyyyy 10yyyyyy
						// 10yyyyyy 10yyyyyy
				char_count = 5;
				wc = (c & 0x1) << 30;
				is_validated = (c & 0x1) != 0;
			}
			char_left = char_count;
			extra_char = 0;
			need_char = 1;			// Wait for more input
			need_update = 0;
			return;
		}
		else {				// Sequence in progress
			if (c < 0x80 || c >= 0xC0) {
				// Not a continuation byte, the sequence is
				// broken.  Drop the pending state so that the
				// next call starts from scratch; otherwise
				// every following non-continuation byte would
				// be rejected here again.
				char_left = 0;
				extra_char = 1;
				need_char = 0;
				need_update = 0;
				wc = 0;
				return;
			}

			// Overlong check on the first continuation byte: when
			// the lead byte carried no payload, this byte must
			// reach the smallest value that really needs a
			// sequence of this length (0x20, 0x10, 0x8, 0x4 for
			// 2, 3, 4, 5 continuation bytes).  Two-byte sequences
			// are fully checked on the lead byte.
			if (char_count >= 2 && char_left == char_count
			    && (c & 0x3F) >= (0x80 >> char_count))
				is_validated = true;

			wc |= (c & 0x3F) << ((char_left-1)*6);
			--char_left;

			if (char_left) {	// Wait for more input
				extra_char = 0;
				need_char = 1;
				need_update = 0;
				return;
			}

			extra_char = 0;
			need_char = 0;			// End of sequence
			if (!is_validated			// Overlong encoding
			    || (wc >= 0xD800 && wc <= 0xDFFF)	// Invalid surrogate usage
			    || wc == 0xFFFE || wc == 0xFFFF) {	// Invalid character
				need_update = 0;
				wc = 0;
				return;
			}
			need_update = 1;		// Display should be updated
			return;
		}
	}
	else
#endif
	{
		// Not in UTF-8 mode, every byte is a character of its own
		wc = c;
		extra_char = 0;		// Do not leave a stale flag behind
		need_char = 0;		// End of sequence
		need_update = 1;	// Display should be updated
		return;
	}
}