/* Time-stamp: <2007-06-30 00:07:33 poser> * * Prints the names of the characters in a Unicode file. * * * Copyright (C) 2003-2007 William J. Poser (billposer@alum.mit.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * or go to the web page: http://www.gnu.org/licenses/gpl.txt. */ #include "config.h" #include #include #ifdef HAVE_LOCALE_H #include #endif #ifdef HAVE_LIBINTL_H #include #else #define gettext(x) (x) #endif #include #include #include #include #include #include #include "unicode.h" #include "utf8error.h" #include "unirange.h" #include "offsettbl.h" #include "nelson.h" #define MSGSIZE 128 char version[]=PACKAGE_VERSION; char compdate[]="Compiled " __DATE__ " " __TIME__ ; char pgname[]="uniname"; char msg [MSGSIZE]; extern int Offset_Entries; extern struct oft Offset_Table[]; static short CharacterOffsetP = 1; /* Show character offset? */ static short ByteOffsetP = 1; /* Show byte offset? */ static short UTF32CodeP = 1; /* Show UTF32 code? */ static short EncodingP = 1; /* Show encoding? */ static short UnicodeRangeP =0; /* Show Unicode range? */ static short UnicodeNameP =1; /* Show Unicode name? */ static short ShowGlyphP = 1; /* Display glyph? */ static short LineNumberP = 0; /* Display line number? */ static short SkipAsciiP = 0; /* Skip ASCII characters */ static short SkipAsciiSpaceP = 0; /* Skip ASCII whitespace characters */ static short SkipBMPP =0; /* Skip characters within BMP */ void ShowVersion(void){ fprintf(stderr,"\n%s %s\n",pgname,PACKAGE_VERSION); fprintf(stderr,"%s\n",compdate); fprintf(stderr,"Copyright (C) 2003-2007 William J. Poser\n"); fprintf(stderr,"Released under the terms of the GNU General Public License.\n\n"); } void ShowUsage(void){ fprintf(stderr,"Name the characters in a Unicode file.\n"); fprintf(stderr,"For each character, prints the character and byte offsets,\n"); fprintf(stderr,"the hex UTF32 character code, the encoding as a sequence of hex byte values\n"); fprintf(stderr,"the glyph, the range, and the character's Unicode name.\n"); fprintf(stderr,"Usage: %s [flags] ()\n",pgname); fprintf(stderr," -a Skip ASCII characters.\n"); fprintf(stderr," -A Skip ASCII whitespace characters.\n"); fprintf(stderr," -b Suppress printing of byte offset.\n"); fprintf(stderr," -B Skip characters within the BMP.\n"); fprintf(stderr," -c Suppress printing of character offset.\n"); fprintf(stderr," -e Suppress printing of encoding.\n"); fprintf(stderr," -g Suppress printing of glyph.\n"); fprintf(stderr," -h Print help information.\n"); fprintf(stderr," -l Print line number.\n"); fprintf(stderr," -n Suppress printing of Unicode name.\n"); fprintf(stderr," -p Suppress printing of headers every screenfull.\n"); fprintf(stderr," -q Input is native-order UTF-32 [default is UTF-8].\n"); fprintf(stderr," -r Print Unicode range.\n"); fprintf(stderr," -s skip to specified character.\n"); fprintf(stderr," -S skip to specified byte.\n"); fprintf(stderr," -u Suppress printing of UTF-32 code.\n"); fprintf(stderr," -v Print version information.\n"); fprintf(stderr," -V Validate input (determine whether it is valid UTF-8).\n"); putc('\n',stderr); fprintf(stderr,"Report bugs to: billposer@alum.mit.edu\n"); putc('\n',stderr); } long GetOffset(UTF32 ch) { int l; /* Lower bound of region */ int u; /* Upper bound of region */ int m; /* Midpoint of region */ l = 0; u = Offset_Entries -1; /* Standard binary search */ while(l <= u){ m = (l + u) / 2; if(ch < Offset_Table[m].b) u = m - 1; else if(ch > Offset_Table[m].e) l = m + 1; else return(Offset_Table[m].offset); /* NOTREACHED */ } } /* Used for transliterating Hangul blocks */ static char *Choseong[]={ "G","KK","N","D","TT","L","M","B","PP","S","SS","","J","CC","C","K","T","P","H"}; static char *Jungseong[]={"A","AE","YA","YAE","EO","E","YEO","YE","O","WA","WAE","OE","YO","U","WEO","WE","WI","YU","EU","YI","I"}; static char *Jongseong[]={"","G","KK","GS","N","NJ","NH","D","L","LG","LM","LB","LS","LT","LP","LH","M","B","BS","S","SS","NG","J","C","K","T","P","H"}; static int JongseongSize = sizeof(Jongseong)/sizeof(char *); static int MiddleBlockSize = (sizeof(Jungseong)/sizeof(char *)) * (sizeof(Jongseong)/sizeof(char *)); #define HANGUL_BLOCK_BASE 0xAC00 char * TransliterateHangulSyllable(UTF32 c) { unsigned long sc; unsigned long ChoseongIndex; unsigned long JungseongIndex; unsigned long JongseongIndex; unsigned long Next; static char buf[9]; sc = c - HANGUL_BLOCK_BASE; ChoseongIndex = sc/MiddleBlockSize; Next = sc % MiddleBlockSize; JungseongIndex = Next / JongseongSize; JongseongIndex = Next % JongseongSize; strcpy(buf,Choseong[ChoseongIndex]); strcat(buf,Jungseong[JungseongIndex]); strcat(buf,Jongseong[JongseongIndex]); return(buf); } int HandleReadError(UTF32 c, unsigned char *rp, long LineCnt, long CharCnt, long ByteCnt) { extern void ExplicateBadUTF8(FILE *, unsigned char *); switch (c) { case UTF8_NOTENOUGHBYTES: fprintf(stderr,gettext("Truncated UTF-8 sequence encountered at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt); exit(1); break; case UTF8_BADINCODE: fprintf(stderr,gettext("Invalid UTF-8 code encountered at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt); ExplicateBadUTF8(stderr,rp); exit(1); break; case UTF8_BADOUTCODE: fprintf(stderr,gettext("Encountered invalid Unicode at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt); exit(1); break; case UTF8_IOERROR: snprintf(msg,MSGSIZE-1,gettext("Error reading input at line %ld, character %ld, byte %ld.\n"),LineCnt,CharCnt,ByteCnt); perror(msg); exit(1); break; default: /* Normal EOF */ return(0); break; /* NOTREACHED */ } } long SlurpBytes (long StartByte, long *InputLine) { int c; long CurrentByteOffset; long InputLineNumber; CurrentByteOffset = 0L; InputLineNumber = 1L; while( (c = getc(stdin)) != EOF){ if(c == 0x0A) InputLineNumber++; CurrentByteOffset++; if(CurrentByteOffset == StartByte) break; } if(c == EOF){ fprintf(stderr,"Reached end of file before specified byte offset.\n"); exit(3); } *InputLine = InputLineNumber; return CurrentByteOffset; } /* Decides whether a character is displayable */ short DisplayableP(UTF32 c){ if(c <= 0x20) return 0; /* ASCII control character */ if((c >= 0x7F) && (c <= 0xA0) ) return 0; /* ASCII DEL and Unicode control characterss */ if(c == 0x3000) return 0; /* Ideographic space */ if(c == 0xFEFF) return 0; /* Zero width no break space */ if((c >= 0x2000) && (c <= 0x200F) ) return 0; /* Various spaces and direction codes */ else return 1; } void PutHeader(){ if(CharacterOffsetP) printf("character "); if(ByteOffsetP) printf("byte "); if(LineNumberP) printf("line "); if(UTF32CodeP) printf("UTF-32 "); if(EncodingP) printf("encoded as "); if(ShowGlyphP) printf("glyph "); if(UnicodeRangeP) printf("range "); if(UnicodeNameP) printf("name"); fputs("\n",stdout); } int main (int ac, char *av[]) { UTF32 c; int oc; /* Command line option flag */ int UCBytes; long int Current_Byte_Offset; long int Current_Char_Offset; int infd; unsigned char *rawptr; char rawstr[24]; char *rawstrptr; char *nameptr; char *endptr; /* Used by strtoul */ char *tmpstr; /* Temp string */ int prpos; int tmpint; /* Temporary value */ int i; long NameOffset; /* Must be signed */ int RangeIndex; UTF32 Maximum_Known_Code; unsigned int FirstNelson; /* Index of first entry in array of Nelson codes */ int NelsonEntries; /* The number of Nelson codes matching a Unicode code */ long StartCharacterOffset = 0L; long StartByteOffset = 0L; long OutputLineNumber; long PreviousHeaderOutputLineNumber; long InputLineNumber = 1L; int Screen_Lines = 24; short NelsonP = 1; /* Show Nelson code for kanji if available? */ short Header_Each_Page_P = 1; /* Print column headers every screenfull? */ short ValidateP = 0; /* Just check to see if the file is all valid UTF-8? */ char *udefstr = "Undefined"; char *nospecinfstr = "No character name available"; char *charudefrangestr = "Character in undefined range"; char *beyondstrptr ="Codepoint beyond known range"; /* char *privateusestr ="Unknown character in Private Use Area"; */ char *eptr; /* For checking strtol conversions */ UTF32 (*rfunc)(int,int *,unsigned char **); extern char *unames[]; extern UTF32 Get_UTF32_From_UTF8 (int,int *,unsigned char **); extern UTF32 Get_UTF32(int,int *,unsigned char **); extern int optind; extern int opterr; extern int optopt; extern int GetRange(UTF32); extern struct cr Range_Table []; extern int GetNelson(UTF32,unsigned int *); extern void putu8(UTF32); extern struct unmap NelsonTbl[]; opterr = 0; rfunc = Get_UTF32_From_UTF8; CharacterOffsetP = 1; /* Show character offset? */ ByteOffsetP = 1; /* Show byte offset? */ UTF32CodeP = 1; /* Show UTF32 code? */ EncodingP = 1; /* Show encoding? */ UnicodeRangeP =0; /* Show Unicode range? */ UnicodeNameP =1; /* Show Unicode name? */ ShowGlyphP = 1; /* Display glyph? */ Current_Byte_Offset = 0L; Current_Char_Offset = 0L; Maximum_Known_Code = Offset_Table[Offset_Entries-1].e; #ifdef HAVE_SETLOCALE setlocale(LC_ALL,""); #endif #ifdef HAVE_LIBINTL_H bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); #endif /* Handle command line arguments */ while( (oc = getopt(ac,av,"aAbBceghlnpqrs:S:uvV")) != EOF){ switch(oc){ case 'a': SkipAsciiP = 1; break; case 'A': SkipAsciiSpaceP = 1; break; case 'b': ByteOffsetP = 0; break; case 'B': SkipBMPP =1; break; case 'c': CharacterOffsetP = 0; break; case 'e': EncodingP = 0; break; case 'g': ShowGlyphP = 0; break; case 'h': ShowUsage(); exit(2); break; /* NOTREACHED */ case 'l': LineNumberP = 1; break; case 'n': UnicodeNameP = 0; break; case 'p': Header_Each_Page_P = 0; break; case 'q': rfunc = Get_UTF32; EncodingP = 0; break; case 'r': UnicodeRangeP = 1; break; case 's': StartCharacterOffset = strtol(optarg, &eptr,0); if( (*eptr != '\0') || (errno == EINVAL)){ fprintf(stderr,"Start character offset %s ill-formed.\n",optarg); exit(2); } if(errno == ERANGE){ fprintf(stderr,"Start character offset out of range.\n"); exit(2); } if(StartCharacterOffset < 0){ fprintf(stderr,"Invalid negative start character offset %ld\n",StartCharacterOffset); exit(2); } break; case 'S': StartByteOffset = strtol(optarg, &eptr,0); if( (*eptr != '\0') || (errno == EINVAL)){ fprintf(stderr,"Start byte offset %s ill-formed.\n",optarg); exit(2); } if(errno == ERANGE){ fprintf(stderr,"Start byte offset out of range.\n"); exit(2); } if(StartByteOffset < 0L){ fprintf(stderr,"Invalid negative start byte offset.\n"); exit(2); } break; case 'u': UTF32CodeP = 0; break; case 'V': ValidateP = 1; break; case 'v': ShowVersion(); exit(2); break; /* NOTREACHED */ default: fprintf(stderr,"%s: invalid option flag %c\n",pgname,optopt); ShowVersion(); ShowUsage(); exit(2); } } if(optind < ac){ infd = open(av[optind],O_RDONLY); if(infd < 0){ snprintf(msg,MSGSIZE-1,"%s: unable to open input file.\n%s",pgname,av[optind]); perror(msg); exit(3); } } else infd = fileno(stdin); if(StartByteOffset > 0L) { CharacterOffsetP = 0; fprintf(stderr, "Disabling printing of character offset since initial bytes are being skipped.\n"); } if(ValidateP) { while ( (c = (*rfunc)(infd,&UCBytes,&rawptr)) <= UNI_MAX_UTF32){ Current_Char_Offset++; Current_Byte_Offset+=UCBytes; if(c == 0x000A) InputLineNumber++; } if(c > UNI_MAX_UTF32){ (void)HandleReadError(c,rawptr,InputLineNumber,Current_Char_Offset,Current_Byte_Offset-UCBytes); exit(1); } exit(0); } /* Skip any characters to be skipped */ if(StartCharacterOffset > 0){ while ( (c = (*rfunc)(infd,&UCBytes,&rawptr)) <= UNI_MAX_UTF32){ Current_Char_Offset++; Current_Byte_Offset+=UCBytes; if(c == 0x000A) InputLineNumber++; if(Current_Char_Offset == StartCharacterOffset) break; } if(c > UNI_MAX_UTF32){ if(HandleReadError(c,rawptr,InputLineNumber,Current_Char_Offset,Current_Byte_Offset-UCBytes) == 0){ fprintf(stderr,"Reached end of file before specified character offset.\n"); exit(3); } } } if(StartByteOffset > 0L){ if(LineNumberP) Current_Byte_Offset = SlurpBytes(StartByteOffset,&InputLineNumber); else { /* Attempt to seek. If that fails because input is a pipe, slurp bytes */ if(lseek(infd,(off_t) StartByteOffset,SEEK_SET) < 0){ if(errno == ESPIPE) Current_Byte_Offset = SlurpBytes(StartByteOffset,&InputLineNumber); else{ fprintf(stderr,"Error seeking on input.\n"); exit(3); } } /* If we get here seek was successful */ Current_Byte_Offset = StartByteOffset; } } /* Find out how many lines per screenfull */ if(Header_Each_Page_P){ if( (tmpstr = getenv("LINES")) != NULL){ tmpint = strtoul(tmpstr,&endptr,0); if( (errno == EINVAL) || (errno == ERANGE)){ fprintf(stderr,"Invalid value in LINES environment variable."); fprintf(stderr,"Using default of %d.\n",Screen_Lines); } else Screen_Lines = tmpint; } else{ fprintf(stderr,"No LINES variable in environment so unable to determine lines per page.\n"); fprintf(stderr,"Using default of %d.\n",Screen_Lines); } } if(!Header_Each_Page_P) PutHeader(); PreviousHeaderOutputLineNumber = OutputLineNumber = 0L; while ( (c = (*rfunc)(infd,&UCBytes,&rawptr)) <= UNI_MAX_UTF32){ if(c == 0x000A) InputLineNumber++; Current_Char_Offset++; Current_Byte_Offset+=UCBytes; OutputLineNumber++; if(Header_Each_Page_P && ( (OutputLineNumber != PreviousHeaderOutputLineNumber) || (OutputLineNumber == 0)) && (OutputLineNumber % (Screen_Lines - 2) == 1)) { PreviousHeaderOutputLineNumber = OutputLineNumber; PutHeader(); } if(c > Maximum_Known_Code){ if(CharacterOffsetP) printf("%9ld ",Current_Char_Offset-1); if(ByteOffsetP) printf("%9ld ",Current_Byte_Offset-UCBytes); if(LineNumberP) printf("%8ld ",InputLineNumber); if(UTF32CodeP) printf("%06lX ",c); if(EncodingP) printf("%-14s ",rawstr); if(ShowGlyphP){ if(DisplayableP(c)){ putu8(c); printf(" "); } else printf(" "); } if(UnicodeRangeP)printf("%-39s ",beyondstrptr); if(UnicodeNameP && !UnicodeRangeP) printf("%-35s",beyondstrptr); } else{ if(SkipAsciiP && (c < 0x80)) { OutputLineNumber--; continue; } if(SkipBMPP && (c <= UNI_MAX_BMP)) { OutputLineNumber--; continue; } if(SkipAsciiSpaceP && (c <= 0xFF && isspace (c))) { OutputLineNumber--; continue; } if(EncodingP){ rawstrptr=rawstr;prpos=0; for(i=0; i= 0xAC00) && (c <= 0xD7A3) ){ /* Hangul syllable */ sprintf(msg,"HANGUL SYLLABLE %s",TransliterateHangulSyllable(c)); nameptr = msg; } else{ if(UnicodeRangeP) nameptr = nospecinfstr; else{ sprintf(msg,"Unknown character in range %s",Range_Table[RangeIndex].name); nameptr = msg; } } break; default: nameptr = unames[c - NameOffset]; break; } if(CharacterOffsetP) printf("%9ld ",Current_Char_Offset-1); if(ByteOffsetP)printf("%9ld ",Current_Byte_Offset-UCBytes); if(LineNumberP) printf("%8ld ",InputLineNumber); if(UTF32CodeP)printf("%06lX ",c); if(EncodingP)printf("%-14s ",rawstr); if(ShowGlyphP){ if(DisplayableP(c)){ putu8(c); printf(" "); } else printf(" "); /* Edited 2004/12/11 */ } if(UnicodeRangeP)printf("%-39s ",Range_Table[RangeIndex].name); /* if(UnicodeNameP)printf("%-35s",nameptr); */ if(UnicodeNameP)printf("%s",nameptr); } fputs("\n",stdout); } /* End of while */ (void)HandleReadError(c,rawptr,InputLineNumber,Current_Char_Offset,Current_Byte_Offset-UCBytes); exit(0); }