#
# genmap_tchinese.py: Traditional Chinese Codecs Map Generator
#
# Copyright (C) 2003-2004 Hye-Shik Chang .
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# $Id: genmap_tchinese.py,v 1.8 2004/07/07 15:35:45 perky Exp $
#
"""Generate C mapping tables for the Traditional Chinese codecs.

Inputs (read from the current directory):
    BIG5.TXT, CP950.TXT, big5-iso.txt, cns-11643-1992.ucm

Outputs (written to the current directory):
    mappings_tw.h        big5 and cp950ext decode/encode maps
    mappings_hk.h        big5hkscs decode/encode maps and plane hints
    mappings_cns11643.h  cns11643 per-plane decode maps and encode maps

The script runs top to bottom when executed; the table-emitting helpers
(loadmap, genmap_decode, print_decmap, BufferedFiller, ...) come from
genmap_support.
"""

import sys
import re
from genmap_support import *

# (first, last) values handed to genmap_decode for the lead byte (C1) and
# trail byte (C2) of each charset.
BIG5_C1 = (0xa1, 0xfe)
BIG5_C2 = (0x40, 0xfe)

# big5 map doesn't have 0xA3E1 (EURO SIGN), but we ignore
# that for forward compatibility. "Hey! we have the euro-big5!" :)
CP950_C1 = BIG5_C1
CP950_C2 = BIG5_C2

BIG5HKSCS_C1 = (0x88, 0xfe)
BIG5HKSCS_C2 = (0x40, 0xfe)

CNS11643_C1 = (0x21, 0x7e)
CNS11643_C2 = (0x21, 0x7e)


def _open_mapping(filename, url):
    """Open mapping table *filename* for reading.

    If the file cannot be opened, print a hint telling the user where to
    download it (*url*) and terminate by raising SystemExit.
    """
    try:
        return open(filename)
    except IOError:
        print("=>> Please download mapping table from " + url)
        raise SystemExit


big5map = _open_mapping(
    'BIG5.TXT',
    "http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/"
    "BIG5.TXT")
cp950map = _open_mapping(
    'CP950.TXT',
    "http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/"
    "CP950.TXT")
big5hkscsmap = _open_mapping(
    'big5-iso.txt',
    "http://www.info.gov.hk/digital21/eng/hkscs/download/big5-iso.txt")
cns11643map = _open_mapping(
    "cns-11643-1992.ucm",
    "http://oss.software.ibm.com/cvs/icu/~checkout~/icu/source/data/"
    "mappings/cns-11643-1992.ucm")


class CNS11643MapReader(UCMReader):
    """UCM reader that splits each CNS 11643 entry into (plane, code)."""

    def parsedata(self, data):
        """Return (plane, 16-bit code) for one parsed UCM byte sequence.

        The first item of the base-class result, minus 0x80, is used as
        the plane; the next two items form the high and low byte of the
        code.
        """
        d = UCMReader.parsedata(self, data)
        return ord(d[0]) - 0x80, (ord(d[1]) << 8) + ord(d[2])

    def readmap(self):
        """Build decode and encode maps from the UCM token stream.

        Returns (decmaps, encmap_bmp, encmap_nonbmp):
          decmaps       -- list of 8 dicts indexed by plane (index 0 is a
                           placeholder); decmaps[plane][c1][c2] = value
          encmap_bmp    -- [hi][lo] -> (plane, code) for values < 0x10000
          encmap_nonbmp -- same layout for values in 0x20000-0x2ffff,
                           keyed by the low 16 bits only

        Raises ValueError when a code has bit 0x8000 set or when the
        token's first element (presumably the Unicode code point yielded
        by UCMReader.itertokens) is outside both accepted ranges.
        """
        print("Loading UCM mapping from  %s" % (self.file,))
        decmaps = [{} for _ in range(8)]  # 0(placeholder) 1-7
        encmap_bmp = {}
        encmap_nonbmp = {}

        for ucode, (plane, code) in self.itertokens():
            if code & 0x8000:
                # The 0x80 bit of the lead byte is reserved below to mark
                # entries whose value lies in the 0x2xxxx range.
                raise ValueError("high bit set in code 0x%04x" % code)
            if ucode < 0x10000:
                c1i = code >> 8
                encmap = encmap_bmp
            elif 0x20000 <= ucode < 0x30000:
                c1i = (code >> 8) | 0x80
                ucode &= 0xffff
                encmap = encmap_nonbmp
            else:
                raise ValueError(ucode)
            decmaps[plane].setdefault(c1i, {})[code & 0xff] = ucode
            encmap.setdefault(ucode >> 8, {})[ucode & 0xff] = (plane, code)

        return decmaps, encmap_bmp, encmap_nonbmp


def bh2s(code):
    """Convert a two-byte Big5-HKSCS code into a linear index.

    The index counts cells row by row, with lead bytes starting at 0x88
    and trail bytes running from 0x40 to 0xfe inclusive.
    """
    return ((code >> 8) - 0x88) * (0xfe - 0x40 + 1) + ((code & 0xff) - 0x40)


def loadhkscsmap(fo):
    """Parse the HKSCS table in file object *fo*.

    Only lines consisting of four hexadecimal columns are used; anything
    else is skipped.  Of the four columns, the first (the HKSCS code) and
    the last are used.

    Returns (decmap, encmap_bmp, encmap_nonbmp, isbmpmap):
      decmap        -- [lead][trail] -> low 16 bits of the last column
      encmap_bmp    -- [hi][lo] -> hkscs code, for values <= 0xffff
      encmap_nonbmp -- [hi][lo] -> hkscs code, for values > 0xffff
                       (keyed by the low 16 bits)
      isbmpmap      -- bh2s(hkscs) -> 1 for every entry whose value is
                       > 0xffff (despite the name, it flags the entries
                       that are *not* in the BMP)
    """
    print("Loading from %s" % (fo,))
    fo.seek(0, 0)
    decmap = {}
    encmap_bmp, encmap_nonbmp = {}, {}
    isbmpmap = {}
    lineparse = re.compile(r'^([0-9A-F]{4})\s*([0-9A-F]{4})\s*([0-9A-F]{4})\s*'
                           r'(2?[0-9A-F]{4})$')
    for line in fo:
        matched = lineparse.match(line)
        if matched is None:
            continue
        # u1993 and ubmp are parsed but not used by this generator.
        hkscs, u1993, ubmp, u2001 = [int(x, 16) for x in matched.groups()]
        if u2001 > 0xffff:
            encmap = encmap_nonbmp
            isbmpmap[bh2s(hkscs)] = 1
            u2001 &= 0xffff
        else:
            encmap = encmap_bmp
        decmap.setdefault(hkscs >> 8, {})[hkscs & 0xff] = u2001
        encmap.setdefault(u2001 >> 8, {})[u2001 & 0xff] = hkscs

    return decmap, encmap_bmp, encmap_nonbmp, isbmpmap


def _invert_decmap(decmap):
    """Return the [hi][lo] -> code encode map derived from *decmap*.

    When several codes decode to the same value, the one met first while
    iterating *decmap* is kept.
    """
    encmap = {}
    for c1, row in decmap.items():
        for c2, code in row.items():
            cell = encmap.setdefault(code >> 8, {})
            if (code & 0xff) not in cell:
                cell[code & 0xff] = c1 << 8 | c2
    return encmap


def _drop_shared_entries(extmap, basemap):
    """Delete from *extmap* every entry that *basemap* maps identically.

    Rows of *extmap* that become empty are left in place.
    """
    for c1, row in extmap.items():
        baserow = basemap.get(c1)
        if baserow is None:
            continue
        # Iterate over a snapshot: entries are deleted from row as we go.
        for c2, code in list(row.items()):
            if c2 in baserow and baserow[c2] == code:
                del row[c2]


print("Loading Mapping File...")
cp950decmap = loadmap(cp950map)
big5decmap = loadmap(big5map)
hkscsdecmap, hkscsencmap_bmp, hkscsencmap_nonbmp, isbmpmap = \
    loadhkscsmap(big5hkscsmap)
cns11643decmaps, cns11643encmap_bmp, cns11643encmap_nonbmp = \
    CNS11643MapReader(cns11643map).readmap()

# Everything has been read into memory; release the input files.
for _mapfile in (big5map, cp950map, big5hkscsmap, cns11643map):
    _mapfile.close()

# big5 mapping fix (see doc/NOTES.big5)
BIG5_DECODE_FIXES = (
    (0xA15A, 0x2574),
    (0xA1C3, 0xFFE3),
    (0xA1C5, 0x02CD),
    (0xA1FE, 0xFF0F),
    (0xA240, 0xFF3C),
    (0xA2CC, 0x5341),
    (0xA2CE, 0x5345),
)
for bcode, ucode in BIG5_DECODE_FIXES:
    big5decmap[bcode >> 8][bcode & 0xff] = ucode

big5encmap = _invert_decmap(big5decmap)
cp950encmap = _invert_decmap(cp950decmap)

# fix unicode->big5 duplicated mapping priority
big5encmap[0xFF][0x0F] = 0xA241
big5encmap[0xFF][0x3C] = 0xA242
big5encmap[0x53][0x41] = 0xA451
big5encmap[0x53][0x45] = 0xA4CA
cp950encmap[0x53][0x41] = 0xA451
cp950encmap[0x53][0x45] = 0xA4CA

# The cp950 maps are emitted as "cp950ext": keep only what differs from
# the big5 maps.
_drop_shared_entries(cp950encmap, big5encmap)
_drop_shared_entries(cp950decmap, big5decmap)

omap = open('mappings_tw.h', 'w')
printcopyright(omap)

print("Generating BIG5 decode map...")
filler = BufferedFiller()
genmap_decode(filler, "big5", BIG5_C1, BIG5_C2, big5decmap)
print_decmap(omap, filler, "big5", big5decmap)

print("Generating BIG5 encode map...")
filler = BufferedFiller()
genmap_encode(filler, "big5", big5encmap)
print_encmap(omap, filler, "big5", big5encmap)

print("Generating CP950 extension decode map...")
filler = BufferedFiller()
genmap_decode(filler, "cp950ext", BIG5_C1, BIG5_C2, cp950decmap)
print_decmap(omap, filler, "cp950ext", cp950decmap)

print("Generating CP950 extension encode map...")
filler = BufferedFiller()
genmap_encode(filler, "cp950ext", cp950encmap)
print_encmap(omap, filler, "cp950ext", cp950encmap)

# Close explicitly so the header is flushed before the next one is opened.
omap.close()

omap = open('mappings_hk.h', 'w')
printcopyright(omap)

print("Generating BIG5HKSCS decode map...")
filler = BufferedFiller()
genmap_decode(filler, "big5hkscs", BIG5HKSCS_C1, BIG5HKSCS_C2, hkscsdecmap)
print_decmap(omap, filler, "big5hkscs", hkscsdecmap)

print("Generating BIG5HKSCS decode map Unicode plane hints...")
filler = BufferedFiller()


def fillhints(hintfrom, hintto):
    """Write one big5hkscs_phint_<hintfrom> byte array to omap.

    The bh2s indexes from *hintfrom* through *hintto* are packed eight
    per byte, least significant bit first; a bit is set when isbmpmap
    holds 1 for that index.  Uses the module-level omap and filler.
    """
    omap.write("static const unsigned char big5hkscs_phint_%d[] = {\n"
               % hintfrom)
    for msbcode in range(hintfrom, hintto + 1, 8):
        v = 0
        for c in range(msbcode, msbcode + 8):
            v |= isbmpmap.get(c, 0) << (c - msbcode)
        filler.write('%d,' % v)
    filler.printout(omap)
    omap.write("};\n\n")


fillhints(bh2s(0x8840), bh2s(0xa0fe))
fillhints(bh2s(0xc6a1), bh2s(0xc8fe))
fillhints(bh2s(0xf9d6), bh2s(0xfefe))

print("Generating BIG5HKSCS encode map (BMP)...")
filler = BufferedFiller()
genmap_encode(filler, "big5hkscs_bmp", hkscsencmap_bmp)
print_encmap(omap, filler, "big5hkscs_bmp", hkscsencmap_bmp)

print("Generating BIG5HKSCS encode map (non-BMP)...")
filler = BufferedFiller()
genmap_encode(filler, "big5hkscs_nonbmp", hkscsencmap_nonbmp)
print_encmap(omap, filler, "big5hkscs_nonbmp", hkscsencmap_nonbmp)

omap.close()

omap = open('mappings_cns11643.h', 'w')
printcopyright(omap)

for plane in range(1, 8):
    print("Generating CNS11643 plane %d decode map..." % plane)
    charset = "cns11643_%d" % plane
    filler = BufferedFiller()
    genmap_decode(filler, charset, CNS11643_C1, CNS11643_C2,
                  cns11643decmaps[plane])
    # Second pass over the lead bytes with bit 0x80 set, which is where
    # CNS11643MapReader.readmap files the 0x2xxxx-range entries.
    genmap_decode(filler, charset, [x | 0x80 for x in CNS11643_C1],
                  CNS11643_C2, cns11643decmaps[plane])
    print_decmap(omap, filler, charset, cns11643decmaps[plane])


class CNS11643EncodeMapWriter(EncodeMapWriter):
    """Encode-map writer that emits three byte values per entry."""

    elemtype = 'unsigned char'
    indextype = 'struct unim_index_bytebased'

    def write_nochar(self):
        """Emit three zero bytes for an unmapped cell."""
        self.filler.write('0,', '0,', '0,')

    def write_char(self, point):
        """Always raise ValueError: entries here are (plane, code) pairs,
        which are written by write_multic."""
        raise ValueError

    def write_multic(self, point):
        """Emit plane, high byte of code, low byte of code for *point*."""
        self.filler.write('%d,' % point[0],
                          '%d,' % (point[1] >> 8),
                          '%d,' % (point[1] & 0xff))


print("Generating CNS11643 encode map (BMP)...")
CNS11643EncodeMapWriter(omap, 'cns11643_bmp', cns11643encmap_bmp)
print("Generating CNS11643 encode map (non-BMP)...")
CNS11643EncodeMapWriter(omap, 'cns11643_nonbmp', cns11643encmap_nonbmp)

# Table of per-plane decode maps; slot 0 is NULL so the array can be
# indexed directly by plane number.
omap.write("static const struct dbcs_index *cns11643_decmap[] = {\n")
omap.write("NULL,\n")
for plane in range(1, 8):
    omap.write("cns11643_%d_decmap,\n" % plane)
omap.write("};\n")

omap.close()

print("\nDone!")