package String::Multibyte::Johab; use vars qw($VERSION); $VERSION = '1.02'; # Hangul Letter Next Trailing Byte use vars qw(@HLNT %HLNT1 %HLNT2 %HLNT3); # Hangul Syllable Next Trailing Byte use vars qw(@HSNT %HSNT1 %HSNT2 %HSNT3); %HLNT1 = ( 0x44, 0x46, 0x47, 0x4A, 0x50, 0x54, 0x54, 0x61, 0x61, 0x81, 0x81, 0xA1, 0xA1, 0xC1, 0xC1, 0xE1, 0xE1, 0x41, ); %HLNT2 = ( 0x41, 0x61, 0x61, 0x81, 0x81, 0xA1, 0xA1, 0xC1, 0xC1, 0xE1, 0xE1, 0x41, ); %HLNT3 = ( 0x41, 0x61, 0x61, 0x81, 0x81, 0xA1, 0xA1, 0x41, ); @HLNT = (\%HLNT1, \%HLNT2, \%HLNT2, \%HLNT3); %HSNT1 = ( 0x41, 0x61, 0x71, 0x73, 0x7D, 0x81, 0x91, 0x93, 0x9D, 0xA1, 0xB1, 0xB3, 0xBD, 0xC1, 0xD1, 0xD3, 0xDD, 0xE1, 0xF1, 0xF3, 0xFD, 0x41, ); %HSNT2 = ( 0x51, 0x53, 0x5D, 0x61, 0x71, 0x73, 0x7D, 0x81, 0x91, 0x93, 0x9D, 0xA1, 0xB1, 0xB3, 0xBD, 0xC1, 0xD1, 0xD3, 0xDD, 0xE1, 0xF1, 0xF3, 0xFD, 0x41, ); %HSNT3 = ( 0x51, 0x53, 0x5D, 0x61, 0x71, 0x73, 0x7D, 0x81, 0x91, 0x93, 0x9D, 0xA1, 0xB1, 0xB3, 0xBD, 0x41, ); @HSNT = (\%HSNT1, \%HSNT2, \%HSNT2, \%HSNT3); +{ charset => 'Johab', regexp => '(?:[\x00-\x7F]|[\xD8-\xDE\xE0-\xF9][\x31-\x7E\x91-\xFE]|' . '\x84[\x44\x46\x47\x4A-\x50\x54\x61\x81\xA1\xC1\xE1]|' . '[\x85\x86][\x41\x61\x81\xA1\xC1\xE1]|\x87[\x41\x61\x81\xA1]|' . '[\x88\x8C\x90\x94\x98\x9C\xA0\xA4\xA8\xAC\xB0\xB4\xB8\xBC' . '\xC0\xC4\xC8\xCC\xD0][\x41\x61-\x71\x73-\x7D\x81-\x91\x93-\x9D' . '\xA1-\xB1\xB3-\xBD\xC1-\xD1\xD3-\xDD\xE1-\xF1\xF3-\xFD]|' . '[\x89\x8A\x8D\x8E\x91\x92\x95\x96\x99\x9A\x9D\x9E\xA1\xA2\xA5\xA6' . '\xA9\xAA\xAD\xAE\xB1\xB2\xB5\xB6\xB9\xBA\xBD\xBE\xC1\xC2\xC5\xC6' . '\xC9\xCA\xCD\xCE\xD1\xD2][\x41-\x51\x53-\x5D\x61-\x71\x73-\x7D' . '\x81-\x91\x93-\x9D\xA1-\xB1\xB3-\xBD\xC1-\xD1\xD3-\xDD\xE1-\xF1' . '\xF3-\xFD]|' . '[\x8B\x8F\x93\x97\x9B\x9F\xA3\xA7\xAB\xAF\xB3\xB7\xBB\xBF\xC3\xC7' . '\xCB\xCF\xD3][\x41-\x51\x53-\x5D\x61-\x71\x73-\x7D\x81-\x91' . '\x93-\x9D\xA1-\xB1\xB3-\xBD])', cmpchar => sub { $_[0] cmp $_[1] }, nextchar => sub { my $ch = shift; my $len = length $ch; if ($len == 1) { return $ch eq "\x7F" ? "\x84\x44" : chr(ord($ch)+1); } elsif ($len == 2) { return undef if $ch eq "\xF9\xFE"; return "\xD8\x31" if $ch eq "\xD3\xBD"; # Hangul to non-Hangul return "\xE0\x31" if $ch eq "\xDE\xFE"; # gap in non-Hangul my ($n, $c, $d); ($c, $d) = unpack('CC', $ch); if (0x84 <= $c && $c <= 0x87 && ($n = $HLNT[$c % 4]{$d}) || 0x88 <= $c && $c <= 0xD3 && ($n = $HSNT[$c % 4]{$d}) ) { return $n == 0x41 ? pack('CC', $c+1, $n) : pack('CC', $c, $n); } else { return $d == 0xFE ? chr($c+1)."\x31" : $d == 0x7E ? chr($c)."\x91" : pack('CC', $c, $d+1); } } else { return; } }, }; __END__ =head1 NAME String::Multibyte::Johab - internally used by String::Multibyte for Johab =head1 SYNOPSIS use String::Multibyte; $johab = String::Multibyte->new('Johab'); $johab_length = $johab->length($johab_string); =head1 DESCRIPTION C is used for manipulation of strings in Johab. Byte range of single-byte characters: C<0x00..0x7F>. Leading byte range of non-Hangul double-byte characters: C<0xD8..0xDE, 0xE0..0xF9>. Trailing byte range of non-Hangul double-byte characters: C<0x31..0x7E, 0x91..0xFE>. Character order (invalid code points are excluded): C<0x00..0x7F>, C, C<0xD831..0xF9FE>. =head1 CAVEAT C<0x7F> (DEL) is included. =head1 SEE ALSO L =cut