#!/usr/local/bin/perl
# $Id: /mirror/trunk/bin/b2g.pl 39 2006-12-12T15:44:19.891338Z kcwu $

$VERSION = '0.12';

=head1 NAME

b2g.pl - Traditional to Simplified Chinese converter

=head1 SYNOPSIS

B<b2g.pl> [ B<-h> ] [ B<-p> ] [ B<-u> ] [ I<inputfile> ...] > I<outputfile>

=head1 USAGE

    % b2g.pl -p < big5.txt > gbk.txt
    % b2g.pl -pu < trad.txt > simp.txt

=head1 DESCRIPTION

The B<b2g.pl> utility reads files sequentially, converts them from
Traditional to Simplified Chinese, then writes them to the standard
output.  The I<inputfile> arguments are processed in command-line order.
If I<inputfile> is a single dash (C<->) or absent, this program reads
from the standard input.

The C<-p> switch enables rudimentary phrase-oriented substitution via a
small built-in lexicon.

The C<-u> switch specifies that both the input and output streams should
be UTF-8 encoded.  If not specified, the input stream is assumed to be
in Big5, and the output will be encoded in GBK.

The C<-h> switch displays this documentation and exits.

=head1 CAVEATS

In pure-perl implementations (pre-5.8 perl or without a C compiler),
C<-p> and C<-u> cannot be used together.

=cut

use strict;
use Getopt::Std;

# MAP (the phrase lexicon) is defined at the bottom of this file; predeclare
# it so that it can be called before its definition is seen by the parser.
sub MAP ();

my %opts;
BEGIN {
    getopts('hup', \%opts);
    if ($opts{h}) { system("perldoc", $0); exit }

    # Every warning is discarded for the rest of the run.  Consequently any
    # diagnostic intended for the user must be printed to STDERR directly;
    # warn() would be silently swallowed by this handler.
    $SIG{__WARN__} = sub {};
}

# -u: input and output are UTF-8 instead of Big5 / GBK.
use constant UTF8 => $opts{u};

# -p: phrase substitution; unavailable together with -u before perl 5.8.
use constant DICT => ($opts{p} and (!UTF8 or $] >= 5.008));

# True when conversion is done through PerlIO encoding layers (perl 5.8+),
# in which case every line read is a decoded character string.
use constant HAS_PERLIO => ($] >= 5.008);

use Encode::HanConvert;

# The lexicon below is decoded at compile time on perl 5.8+, so make sure
# Encode itself is loaded no matter how Encode::HanConvert was built.
BEGIN { require Encode if HAS_PERLIO }

# Lexicon hash and the precompiled pattern matching any of its keys.  Longer
# phrases are listed first so that they win over their own prefixes.
my ($MAP, $PHRASE_RE);
if (DICT) {
    $MAP = MAP();
    my $alternatives = join '|',
        map  { quotemeta($_) }
        sort { length($b) <=> length($a) }
        keys %$MAP;
    $PHRASE_RE = qr/($alternatives)/;
}

# Configure the output encoding exactly once.  Pushing ':encoding(gbk)' again
# for every input file would stack layers on STDOUT.
if (HAS_PERLIO) {
    binmode(STDOUT, UTF8 ? ':utf8' : ':encoding(gbk)');
}

if (@ARGV) {
    for my $file (@ARGV) {
        # A single dash means standard input, as documented above.
        if ($file eq '-') {
            convert(\*STDIN);
            next;
        }

        # Three-argument open: the file name is never interpreted as a mode
        # or as a command to run.
        my $fh;
        unless (open $fh, '<', $file) {
            print STDERR "$0: Can't open $file: $!\n";
            next;
        }
        convert($fh);
        close $fh;
    }
}
else {
    convert(\*STDIN);
}

# convert($fh)
#
# Reads the filehandle $fh line by line, converts each line from Traditional
# to Simplified Chinese, applies the phrase lexicon when DICT is enabled, and
# prints the result to STDOUT.  Returns nothing useful.
sub convert {
    my ($fh) = @_;

    if (HAS_PERLIO) {
        # The decoding layer performs the Traditional -> Simplified mapping
        # while reading.
        binmode($fh, UTF8 ? ':encoding(trad-simp)' : ':encoding(big5-simp)');
    }

    while (my $line = <$fh>) {
        unless (HAS_PERLIO) {
            # No PerlIO layers available: convert the line in place.
            if (UTF8) { Encode::HanConvert::trad_to_simp($line) }
            else      { Encode::HanConvert::big5_to_gb($line) }
        }

        $line =~ s/$PHRASE_RE/$MAP->{$1}/g if DICT;

        print $line;
    }
}

# Phrase lexicon applied after the per-character conversion: each key is a
# phrase as it appears in the converted text, each value is its replacement.
#
# The literals are passed through Encode::decode(gbk => ...), i.e. they are
# expected to be GBK bytes in this source file.  They are decoded whenever
# input lines are decoded character strings (perl 5.8+, with or without -u)
# so that keys and text are compared in the same form; on older perls both
# sides stay as raw GBK bytes.
use constant MAP => DICT && {
    map { HAS_PERLIO ? Encode::decode(gbk => $_) : $_ } (
        '乙太' => '以太', '乙太网路' => '以太网络', '九九乘法表' => '九九表',
        '中断点' => '断点', '介面' => '接口', '元件' => '组件',
        '公事包' => '公文包', '升等考试' => '升级考试', '升幂' => '升序',
        '太空人' => '航天员', '太空衣' => '宇宙飞行服', '太空梭' => '航天飞机',
        '太空船' => '宇宙飞船', '戈巴契夫' => '戈尔巴乔夫', '户口名簿' => '户口簿',
        '支援' => '支持', '文件夹' => '活页夹', '日尔曼民族' => '日耳曼民族',
        '比萨斜塔' => '比塞塔', '火拼' => '火并', '片语' => '词组',
        '功能表' => '菜单', '包谷' => '苞谷', '史达林' => '斯大林',
        '外太空' => '外层空间', '布希' => '布什', '平行作业' => '并行操作',
        '平行埠' => '并行端口', '平行线' => '并行线', '幼稚园' => '幼儿园',
        '母音' => '元音', '资料' => '数据', '休士顿' => '休斯敦',
        '仲介' => '中介', '光碟机' => '光驱', '全形' => '全角',
        '共用' => '共享', '冰棒' => '棒冰', '列印' => '打印',
        '印表机' => '打印机', '向光性' => '向旋光性', '因数' => '因子',
        '回圈' => '循环', '回应' => '响应', '多明尼加' => '多米尼加',
        '字串' => '字符串', '字首' => '前缀', '存档' => '存盘',
        '安甯' => '安宁', '收银机' => '收款机', '羽量级' => '轻量级',
        '衣索比亚' => '埃塞俄比亚', '西元' => '公元', '位址' => '地址',
        '伫列' => '队列', '佐证' => '左证', '伺服器' => '服务器',
        '作业系统' => '操作系统', '伯明罕' => '伯明翰', '低阶语言' => '低级语言',
        '即时' => '实时', '吸光性' => '吸旋光性', '困在' => '捆在',
        '困来困去' => '捆来捆去', '困暸' => '捆了', '宏都拉斯' => '洪都拉斯',
        '序列埠' => '串行端口', '沙拉油' => '色拉油', '沙乌地阿拉伯' => '沙特阿拉伯',
        '沈括' => '沉括', '身历声' => '立体声', '防写' => '写保护',
        '来福线' => '来复线', '其他' => '其它', '协定' => '协议',
        '卷轴' => '滚动条', '周边' => '外围', '屈光性' => '屈旋光性',
        '注脚' => '脚注', '注解' => '批注', '物件' => '对象',
        '知识份子' => '知识分子', '矽石' => '硅石', '矽晶片' => '硅芯片',
        '矽电晶体' => '硅晶体管', '空白键' => '空格键', '邱吉尔' => '丘吉尔',
        '门迳' => '门径', '阿姆斯壮' => '阿姆斯特朗', '阿斯匹灵' => '阿司匹林',
        '南瓜' => '番瓜', '指标' => '指针', '括弧' => '括号',
        '映射' => '映像', '畏光性' => '畏旋光性', '相容' => '兼容',
        '耶诞节' => '圣诞节', '背光性' => '背旋光性', '计时器' => '定时器',
        '计程车' => '出租车', '计算机' => '计算器', '迪斯可' => '迪斯科',
        '重播' => '回放', '食具' => '餐具', '倒楣' => '倒霉',
        '倡狂' => '猖狂', '候机室' => '候机楼', '套装软体' => '软件包',
        '座标' => '坐标', '弱光性' => '弱旋光性', '浮水印' => '水印',
        '乌沈沈' => '乌沉沉', '破音字' => '多音字', '秘笈' => '秘籍',
        '索马利亚' => '索马里', '航太总署' => '航天总署', '草菴' => '草庵',
        '记忆体' => '内存', '酒齇鼻' => '酒齄鼻', '阵列' => '数组',
        '偺们' => '咱们', '副程式' => '子程序', '副档名' => '扩展名',
        '啦啦队' => '拉拉队', '唯读' => '只读', '堆叠' => '堆栈',
        '专案' => '项目', '常式' => '例程', '康乃狄格' => '康涅狄克',
        '捷径' => '快捷方式', '扫描器' => '扫描仪', '启动' => '激活',
        '毫安培' => '毫安', '毕氏定理' => '勾股定理', '毕卡索' => '毕加索',
        '莫札特' => '莫扎特', '软片' => '胶卷', '通讯录' => '通讯簿',
        '通道' => '信道', '连线' => '联机', '速食' => '快餐',
        '钗钸' => '钗钚', '阴沈沈' => '阴沉沉', '顶呱呱' => '顶刮刮',
        '麻塞诸塞' => '马萨诸塞', '麻痹不暸' => '麻痹不了', '晶片' => '芯片',
        '智慧' => '智能', '游标' => '光标', '番茄' => '西红柿',
        '登出' => '注销', '硬体' => '硬件', '程式' => '程序',
        '程序控制' => '过程控制', '答录机' => '录音机', '答覆' => '答复',
        '结夥' => '结伙', '菸毒' => '烟毒', '视窗' => '窗口',
        '象模象样' => '像模象样', '超连结' => '超级链接', '邮递区号' => '邮政编码',
        '黑沈沈' => '黑沉沉', '乱数' => '随机数', '汇流排' => '总线',
        '塑胶' => '塑料', '奥克拉荷马州' => '俄克拉荷马州', '奥会' => '奥委会',
        '感光性' => '感旋光性', '新罕布夏' => '新罕布什尔', '暗沈沈' => '暗沉沉',
        '滑鼠' => '鼠标', '义大利' => '意大利', '圣地牙哥' => '圣地亚哥',
        '解析度' => '分辨率', '解码' => '译码', '试算表' => '电子表格',
        '资讯' => '信息', '载入' => '加载', '运算元' => '操作数',
        '运算式' => '表达式', '钜富' => '巨富', '闸道' => '网关',
        '雷射印表机' => '激光打印机', '雷根' => '里根', '电晶体' => '晶体管',
        '电传视讯' => '图文电视', '电脑' => '计算机', '电脑程式' => '计算机程序',
        '电锅' => '电饭锅', '图示' => '图标', '惨澹' => '惨淡',
        '摺合' => '折合', '演算法' => '算法', '磁片' => '磁盘',
        '磁轨' => '磁道', '磁区' => '扇区', '磁碟' => '磁盘',
        '磁碟机' => '磁盘驱动器', '磁碟档' => '磁盘文件', '碳粉匣' => '墨粉盒',
        '福马林' => '福尔马林', '管龠' => '管钥', '网路' => '网络',
        '蒐购' => '搜购', '宾士' => '奔驰', '赫鲁雪夫' => '赫鲁晓夫',
        '远端' => '远程', '凤梨' => '菠萝', '徵收' => '征收',
        '数据机' => '调制解调器', '暂存器' => '缓存器', '标签' => '卷标',
        '模组' => '模块', '模拟' => '仿真', '模拟计算机' => '仿真计算器',
        '磐石' => '盘石', '范本' => '模板', '糊里糊涂' => '胡里胡涂',
        '糊涂' => '胡涂', '线上作业' => '联机操作', '赐諡' => '赐谥',
        '遮罩' => '屏蔽', '醉薰薰' => '醉熏熏', '墨沈沈' => '墨沉沉',
        '壁报' => '墙报', '暸如' => '了如', '积体电路' => '集成电路',
        '萤幕' => '屏幕', '谘询' => '咨询', '诺曼地' => '诺曼底',
        '诺曼第' => '诺曼底', '录影' => '录像', '趋光性' => '趋旋光性',
        '鍊钢' => '炼钢', '锺县' => '钟县', '霜淇淋' => '冰淇淋',
        '渖阳' => '沈阳', '简报' => '演示文稿', '藉机' => '借机',
        '薰天' => '熏天', '转捩点' => '转折点', '杂讯' => '噪声',
        '鼕鼕' => '冬冬', '罗布林卡' => '罗布尔卡', '罗德岛' => '罗得岛',
        '识别字' => '标识符', '跷课' => '逃课', '辞汇' => '词汇',
        '关键字' => '关键词', '类比' => '模拟', '麴菌' => '曲菌',
        '携带型' => '便携式', '栏位' => '字段', '镭射' => '激光',
        '权杖' => '令牌', '读卡机' => '卡片阅读机', '佘太君' => '畲太君',
        '氂牛' => '牦牛', '衚同' => '胡同', '擣衣' => '捣衣',
        '回覆' => '回复', '位元组' => '字节', '链结' => '链接',
        '密码' => '口令',
        '「' => '‘', '」' => '’', '『' => '“', '』' => '”',
        '讚美' => '贊美', '讚頌' => '贊頌', '讚詞' => '贊詞',
        '檯前' => '台前', '檯燈' => '台燈', '檯上' => '台上',
        '臺灣' => '台灣', '臺北' => '台北', '臺南' => '台南',
        '臺東' => '台東', '颱風' => '台風',
        '麵粉' => '面粉', '麵包' => '面包', '麵條' => '面條',
        '炒麵' => '炒面', '撈麵' => '撈面',
        '王後' => '王后', '母後' => '母后',
    )
};

__END__

=head1 SEE ALSO

L<g2b.pl>, L<Encode::HanConvert>

=head1 AUTHORS

Currently maintained by Kuang-che Wu E<lt>kcwu@csie.orgE<gt>.
Original author: Audrey Tang E<lt>cpan@audreyt.orgE<gt>

=head1 COPYRIGHT

Copyright 2002, 2003, 2004 by Audrey Tang E<lt>cpan@audreyt.orgE<gt>.
Copyright 2006 by Kuang-che Wu E<lt>kcwu@csie.orgE<gt>.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

See L<http://www.perl.com/perl/misc/Artistic.html>

=cut