#!/usr/bin/perl
# $File: //member/autrijus/Encode-HanConvert/map/map2ucm.pl $ $Author: autrijus $
# $Revision: #2 $ $Change: 3939 $ $DateTime: 2003/01/27 22:52:26 $

# Turns the two HanConvert map files that live next to this script
# (b2g_map.txt and g2b_map.txt) into UCM character maps.  The results are
# written to big5-simp.ucm and gbk-trad.ucm, relative to the current
# working directory.

use strict;
use warnings;
use Encode 1.41;
use File::Spec;
use File::Basename;

my $path = dirname($0);

conv(File::Spec->catfile($path, 'b2g_map.txt') => 'big5-simp', 'gbk');
conv(File::Spec->catfile($path, 'g2b_map.txt') => 'gbk-trad', 'big5');

# conv($src, $target, $enc)
#
# Writes "$target.ucm" from the map file $src.
#
#   $src    - path of the map file to read.
#   $target - name of the generated character set; also the base name of
#             the output file.  A name matching /gbk/i suppresses the
#             B5HEADER and B5FOOTER single-byte rows.
#   $enc    - Encode name used to decode the two bytes found at offset 3
#             of every data line.
#
# The first two lines of $src are skipped.  For each remaining line, the
# bytes at offsets 0 and 1 become the multi-byte sequence of the UCM row
# and the decoded bytes at offsets 3-4 supply its Unicode code point.
# (The byte at offset 2 is never read; presumably a separator -- confirm
# against the map files.)
#
# Dies if either file cannot be opened or the output cannot be flushed.
sub conv {
    my ($src, $target, $enc) = @_;

    my $ucm    = "$target.ucm";
    my $is_gbk = $target =~ /gbk/i;

    # The map holds raw encoded bytes that are handed to decode() below,
    # so no I/O layer may touch them on the way in.
    open my $in,  '<:raw', $src or die "Cannot read $src: $!";
    open my $out, '>',     $ucm or die "Cannot write $ucm: $!";

    print {$out} <<"END_PREAMBLE";
# This is generated from $src -- please change that file instead.
# Yes, this .ucm map is not round-trip safe; HanConvert is a lossy operation.
<code_set_name> "$target"
END_PREAMBLE

    print {$out} HEADER();
    print {$out} B5HEADER() unless $is_gbk;

    # Discard the two leading lines of the map file.
    scalar readline($in) for 1 .. 2;

    while (my $line = <$in>) {
        # A usable line carries at least "XX?YY": two source bytes, one
        # ignored byte and two replacement bytes.
        next if length($line) < 5;

        my $replacement = substr($line, 3, 2);
        my $uchar       = decode($enc, $replacement);
        next unless defined $uchar and length $uchar;

        printf {$out} "<U%04X> \\x%02X\\x%02X |%u\n",
            ord($uchar),
            ord(substr($line, 0, 1)),
            ord(substr($line, 1, 1)),
            0;    # XXX - suggestions welcome to the fallback char here
    }

    print {$out} B5FOOTER() unless $is_gbk;
    print {$out} FOOTER();

    # Buffered write errors only surface here, so the result is checked.
    close $out or die "Cannot finish writing $ucm: $!";
    close $in;

    return;
}

# UCM header keywords plus the single-byte rows \x00-\x80, emitted for
# every target.
use constant HEADER => <<'END_HEADER';
<mb_cur_min> 1
<mb_cur_max> 2
<subchar> \x3F
#
CHARMAP
<U0000> \x00 |0 # NULL
<U0001> \x01 |0 # START OF HEADING
<U0002> \x02 |0 # START OF TEXT
<U0003> \x03 |0 # END OF TEXT
<U0004> \x04 |0 # END OF TRANSMISSION
<U0005> \x05 |0 # ENQUIRY
<U0006> \x06 |0 # ACKNOWLEDGE
<U0007> \x07 |0 # BELL
<U0008> \x08 |0 # BACKSPACE
<U0009> \x09 |0 # HORIZONTAL TABULATION
<U000A> \x0A |0 # LINE FEED
<U000B> \x0B |0 # VERTICAL TABULATION
<U000C> \x0C |0 # FORM FEED
<U000D> \x0D |0 # CARRIAGE RETURN
<U000E> \x0E |0 # SHIFT OUT
<U000F> \x0F |0 # SHIFT IN
<U0010> \x10 |0 # DATA LINK ESCAPE
<U0011> \x11 |0 # DEVICE CONTROL ONE
<U0012> \x12 |0 # DEVICE CONTROL TWO
<U0013> \x13 |0 # DEVICE CONTROL THREE
<U0014> \x14 |0 # DEVICE CONTROL FOUR
<U0015> \x15 |0 # NEGATIVE ACKNOWLEDGE
<U0016> \x16 |0 # SYNCHRONOUS IDLE
<U0017> \x17 |0 # END OF TRANSMISSION BLOCK
<U0018> \x18 |0 # CANCEL
<U0019> \x19 |0 # END OF MEDIUM
<U001A> \x1A |0 # SUBSTITUTE
<U001B> \x1B |0 # ESCAPE
<U001C> \x1C |0 # FILE SEPARATOR
<U001D> \x1D |0 # GROUP SEPARATOR
<U001E> \x1E |0 # RECORD SEPARATOR
<U001F> \x1F |0 # UNIT SEPARATOR
<U0020> \x20 |0 # SPACE
<U0021> \x21 |0 # EXCLAMATION MARK
<U0022> \x22 |0 # QUOTATION MARK
<U0023> \x23 |0 # NUMBER SIGN
<U0024> \x24 |0 # DOLLAR SIGN
<U0025> \x25 |0 # PERCENT SIGN
<U0026> \x26 |0 # AMPERSAND
<U0027> \x27 |0 # APOSTROPHE
<U0028> \x28 |0 # LEFT PARENTHESIS
<U0029> \x29 |0 # RIGHT PARENTHESIS
<U002A> \x2A |0 # ASTERISK
<U002B> \x2B |0 # PLUS SIGN
<U002C> \x2C |0 # COMMA
<U002D> \x2D |0 # HYPHEN-MINUS
<U002E> \x2E |0 # FULL STOP
<U002F> \x2F |0 # SOLIDUS
<U0030> \x30 |0 # DIGIT ZERO
<U0031> \x31 |0 # DIGIT ONE
<U0032> \x32 |0 # DIGIT TWO
<U0033> \x33 |0 # DIGIT THREE
<U0034> \x34 |0 # DIGIT FOUR
<U0035> \x35 |0 # DIGIT FIVE
<U0036> \x36 |0 # DIGIT SIX
<U0037> \x37 |0 # DIGIT SEVEN
<U0038> \x38 |0 # DIGIT EIGHT
<U0039> \x39 |0 # DIGIT NINE
<U003A> \x3A |0 # COLON
<U003B> \x3B |0 # SEMICOLON
<U003C> \x3C |0 # LESS-THAN SIGN
<U003D> \x3D |0 # EQUALS SIGN
<U003E> \x3E |0 # GREATER-THAN SIGN
<U003F> \x3F |0 # QUESTION MARK
<U0040> \x40 |0 # COMMERCIAL AT
<U0041> \x41 |0 # LATIN CAPITAL LETTER A
<U0042> \x42 |0 # LATIN CAPITAL LETTER B
<U0043> \x43 |0 # LATIN CAPITAL LETTER C
<U0044> \x44 |0 # LATIN CAPITAL LETTER D
<U0045> \x45 |0 # LATIN CAPITAL LETTER E
<U0046> \x46 |0 # LATIN CAPITAL LETTER F
<U0047> \x47 |0 # LATIN CAPITAL LETTER G
<U0048> \x48 |0 # LATIN CAPITAL LETTER H
<U0049> \x49 |0 # LATIN CAPITAL LETTER I
<U004A> \x4A |0 # LATIN CAPITAL LETTER J
<U004B> \x4B |0 # LATIN CAPITAL LETTER K
<U004C> \x4C |0 # LATIN CAPITAL LETTER L
<U004D> \x4D |0 # LATIN CAPITAL LETTER M
<U004E> \x4E |0 # LATIN CAPITAL LETTER N
<U004F> \x4F |0 # LATIN CAPITAL LETTER O
<U0050> \x50 |0 # LATIN CAPITAL LETTER P
<U0051> \x51 |0 # LATIN CAPITAL LETTER Q
<U0052> \x52 |0 # LATIN CAPITAL LETTER R
<U0053> \x53 |0 # LATIN CAPITAL LETTER S
<U0054> \x54 |0 # LATIN CAPITAL LETTER T
<U0055> \x55 |0 # LATIN CAPITAL LETTER U
<U0056> \x56 |0 # LATIN CAPITAL LETTER V
<U0057> \x57 |0 # LATIN CAPITAL LETTER W
<U0058> \x58 |0 # LATIN CAPITAL LETTER X
<U0059> \x59 |0 # LATIN CAPITAL LETTER Y
<U005A> \x5A |0 # LATIN CAPITAL LETTER Z
<U005B> \x5B |0 # LEFT SQUARE BRACKET
<U005C> \x5C |0 # REVERSE SOLIDUS
<U005D> \x5D |0 # RIGHT SQUARE BRACKET
<U005E> \x5E |0 # CIRCUMFLEX ACCENT
<U005F> \x5F |0 # LOW LINE
<U0060> \x60 |0 # GRAVE ACCENT
<U0061> \x61 |0 # LATIN SMALL LETTER A
<U0062> \x62 |0 # LATIN SMALL LETTER B
<U0063> \x63 |0 # LATIN SMALL LETTER C
<U0064> \x64 |0 # LATIN SMALL LETTER D
<U0065> \x65 |0 # LATIN SMALL LETTER E
<U0066> \x66 |0 # LATIN SMALL LETTER F
<U0067> \x67 |0 # LATIN SMALL LETTER G
<U0068> \x68 |0 # LATIN SMALL LETTER H
<U0069> \x69 |0 # LATIN SMALL LETTER I
<U006A> \x6A |0 # LATIN SMALL LETTER J
<U006B> \x6B |0 # LATIN SMALL LETTER K
<U006C> \x6C |0 # LATIN SMALL LETTER L
<U006D> \x6D |0 # LATIN SMALL LETTER M
<U006E> \x6E |0 # LATIN SMALL LETTER N
<U006F> \x6F |0 # LATIN SMALL LETTER O
<U0070> \x70 |0 # LATIN SMALL LETTER P
<U0071> \x71 |0 # LATIN SMALL LETTER Q
<U0072> \x72 |0 # LATIN SMALL LETTER R
<U0073> \x73 |0 # LATIN SMALL LETTER S
<U0074> \x74 |0 # LATIN SMALL LETTER T
<U0075> \x75 |0 # LATIN SMALL LETTER U
<U0076> \x76 |0 # LATIN SMALL LETTER V
<U0077> \x77 |0 # LATIN SMALL LETTER W
<U0078> \x78 |0 # LATIN SMALL LETTER X
<U0079> \x79 |0 # LATIN SMALL LETTER Y
<U007A> \x7A |0 # LATIN SMALL LETTER Z
<U007B> \x7B |0 # LEFT CURLY BRACKET
<U007C> \x7C |0 # VERTICAL LINE
<U007D> \x7D |0 # RIGHT CURLY BRACKET
<U007E> \x7E |0 # TILDE
<U007F> \x7F |0 # DELETE
<U0080> \x80 |0 # <control>
END_HEADER

# Single-byte rows \x81-\xA0, emitted only when the target name does not
# match /gbk/i.
use constant B5HEADER => <<'END_B5HEADER';
<U0081> \x81 |0 # <control>
<U0082> \x82 |0 # BREAK PERMITTED HERE
<U0083> \x83 |0 # NO BREAK HERE
<U0084> \x84 |0 # <control>
<U0085> \x85 |0 # NEXT LINE
<U0086> \x86 |0 # START OF SELECTED AREA
<U0087> \x87 |0 # END OF SELECTED AREA
<U0088> \x88 |0 # CHARACTER TABULATION SET
<U0089> \x89 |0 # CHARACTER TABULATION WITH JUSTIFICATION
<U008A> \x8A |0 # LINE TABULATION SET
<U008B> \x8B |0 # PARTIAL LINE DOWN
<U008C> \x8C |0 # PARTIAL LINE UP
<U008D> \x8D |0 # REVERSE LINE FEED
<U008E> \x8E |0 # SINGLE SHIFT TWO
<U008F> \x8F |0 # SINGLE SHIFT THREE
<U0090> \x90 |0 # DEVICE CONTROL STRING
<U0091> \x91 |0 # PRIVATE USE ONE
<U0092> \x92 |0 # PRIVATE USE TWO
<U0093> \x93 |0 # SET TRANSMIT STATE
<U0094> \x94 |0 # CANCEL CHARACTER
<U0095> \x95 |0 # MESSAGE WAITING
<U0096> \x96 |0 # START OF GUARDED AREA
<U0097> \x97 |0 # END OF GUARDED AREA
<U0098> \x98 |0 # START OF STRING
<U0099> \x99 |0 # <control>
<U009A> \x9A |0 # SINGLE CHARACTER INTRODUCER
<U009B> \x9B |0 # CONTROL SEQUENCE INTRODUCER
<U009C> \x9C |0 # STRING TERMINATOR
<U009D> \x9D |0 # OPERATING SYSTEM COMMAND
<U009E> \x9E |0 # PRIVACY MESSAGE
<U009F> \x9F |0 # APPLICATION PROGRAM COMMAND
<U00A0> \xA0 |0 # NO-BREAK SPACE
END_B5HEADER

# Trailing single-byte rows, emitted only when the target name does not
# match /gbk/i.
# NOTE(review): the row for byte \xFC is labelled with the character name
# of U+00FB, and there is no row for byte \xFB.  The code point below
# follows the name -- confirm against the table this was taken from.
use constant B5FOOTER => <<'END_B5FOOTER';
<U00FA> \xFA |0 # LATIN SMALL LETTER U WITH ACUTE
<U00FB> \xFC |0 # LATIN SMALL LETTER U WITH CIRCUMFLEX
<U00FD> \xFD |0 # LATIN SMALL LETTER Y WITH ACUTE
<U00FE> \xFE |0 # LATIN SMALL LETTER THORN
END_B5FOOTER

# Final single-byte row and the CHARMAP terminator, emitted for every
# target.
use constant FOOTER => <<'END_FOOTER';
<U00FF> \xFF |0 # LATIN SMALL LETTER Y WITH DIAERESIS
END CHARMAP
END_FOOTER