/* Time-stamp: <2006-09-19 11:17:34 poser> */ /* * Copyright (C) 2005-2006 William J. Poser (billposer@alum.mit.edu) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * or go to the web page: http://www.gnu.org/licenses/gpl.txt. */ #include "config.h" /* NOTE(review): the header names of the bare #include directives that follow appear to have been lost (nothing follows each "#include") -- restore them from the upstream file before building. */ #include #include #include #ifdef HAVE_LOCALE_H #include #endif #ifdef HAVE_LIBINTL_H #include #else #define gettext(x) (x) #endif #include "unicode.h" #include "utf8error.h" #ifdef BMPONLY #define CHARSETSIZE 65536 /* 16 bits */ #else #define CHARSETSIZE 2097152 /* Full Unicode = 21 bits */ #endif #define MSGSIZE 128 char pgname[]="unihist"; char compdate[]=__DATE__ " " __TIME__ ; /* Scratch buffer; HandleReadError formats its I/O-error message into it. */ char msg [MSGSIZE]; /* Occurrence count for each code point, indexed by code point value. */ static long CharacterCounts[CHARSETSIZE]; /* Print the help text (one line of description plus one line per option) to stderr. */ void ShowUsage(void){ fprintf(stderr, gettext("Filter to generate a histogram of the Unicode characters in the input.\n")); fprintf(stderr, gettext(" -c Suppress printing of counts and percentages.\n")); fprintf(stderr, gettext(" -g Suppress printing of glyphs.\n")); fprintf(stderr, gettext(" -u Suppress printing of code as text.\n")); fprintf(stderr, gettext(" -h Print this help message.\n")); fprintf(stderr, gettext(" -v Print version.\n")); } /* Print program name, package version, build date/time, copyright, licence and bug-report address to stderr; also notes a BMP-only build when BMPONLY is defined. */ void ShowVersion(void){ fprintf(stderr,"\n%s %s\n",pgname,PACKAGE_VERSION); #ifdef BMPONLY fprintf(stderr,gettext("This version compiled to handle only the BMP.\n")); #endif 
fprintf(stderr,gettext("Compiled %s.\n"),compdate); fprintf(stderr,"Copyright 2005-2006 William J. Poser\n"); fprintf(stderr,gettext("Released under the terms of the GNU General Public License.\n")); fprintf(stderr,gettext("Report bugs to: billposer@alum.mit.edu.\n")); } /* Act on the status value c left by the reader. For each of the four UTF8_* error codes handled below, print a diagnostic giving the line, character and byte position to stderr and exit(1); for UTF8_BADINCODE, rp is additionally handed to ExplicateBadUTF8. Any other value is treated as normal end of input and the function returns 0. */ int HandleReadError(UTF32 c, unsigned char *rp, long LineCnt, long CharCnt, long ByteCnt) { extern void ExplicateBadUTF8(FILE *, unsigned char *); switch (c) { case UTF8_NOTENOUGHBYTES: fprintf(stderr,gettext("Truncated UTF-8 sequence encountered at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt); exit(1); break; case UTF8_BADINCODE: fprintf(stderr,gettext("Invalid UTF-8 code encountered at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt); ExplicateBadUTF8(stderr,rp); exit(1); break; case UTF8_BADOUTCODE: fprintf(stderr,gettext("Encountered invalid Unicode at line %ld, character %ld, byte %ld.\n"),LineCnt, CharCnt, ByteCnt); exit(1); break; case UTF8_IOERROR: /* NOTE(review): the formatted message already ends in a newline, so the text perror appends will start on the following line -- confirm this is intended. */ snprintf(msg,MSGSIZE-1,gettext("Error reading input at line %ld, character %ld, byte %ld.\n"),LineCnt,CharCnt,ByteCnt); perror(msg); exit(1); break; default: /* Normal EOF */ return(0); break; /* NOTREACHED */ } } /* Decides whether a character is displayable: returns 0 for the code points and ranges excluded below, 1 for everything else. */ short DisplayableP(wchar_t c){ if(c <= 0x20) return 0; /* ASCII control characters */ if((c >= 0x7F) && (c <= 0xA0) ) return 0; /* ASCII DEL and Unicode control characters */ if(c == 0x3000) return 0; /* Ideographic space */ if(c == 0xFEFF) return 0; /* Zero width no break space */ if((c >= 0x2000) && (c <= 0x200F) ) return 0; /* Various spaces and direction codes */ return 1; } /* Entry point: parse the option flags (c, g, h, u, v), tally characters into CharacterCounts, then print one line per code point that occurred, with optional percentage and count, hex code and glyph columns. Input is taken from the descriptor of stdin (presumably read through Get_UTF32_From_UTF8 -- the call itself is not visible in this copy). */ int main(int ac, char **av) { wchar_t c; /* Input character as UTF-32 */ long i; /* Loop variable */ int infd; /* Input file descriptor */ int oc; /* Getopt option flag */ long TotalChars; /* Total number of characters in input */ long ByteOffset = 0L; /* Offset from beginning of input in bytes, counting from zero */ long LineNumber = 0L; int UCBytes; /* Bytes of input 
occupied by the current input character */ unsigned char *rawptr; short CntP =1; /* Output counts and percentages? */ short CodeP = 1; /* Output UTF-32 codes in ascii hex?*/ short GlyphP = 1; /* Output UTF-8 glyphs? */ #ifdef HAVE_NUMBER_GROUP_SEPARATOR char *fmtstr = "\t%7.3f\t%'8ld"; #else char *fmtstr = "\t%7.3f\t%8ld"; #endif extern int optopt; extern int opterr; extern UTF32 Get_UTF32_From_UTF8 (int,int *,unsigned char **); extern void putu8 (wchar_t); opterr = 0; /* We'll handle errors ourselves */ /* Each of -c, -g and -u switches off one output column; -h and -v print their text and exit with status 1; any other flag is reported and exits with status 2. */ while( (oc = getopt(ac,av,"cghuv")) != EOF){ switch(oc){ case 'c': CntP = 0; break; case 'g': GlyphP = 0; break; case 'h': ShowUsage(); exit(1); case 'u': CodeP = 0; break; case 'v': ShowVersion(); exit(1); default: fprintf(stderr,gettext("%s: invalid option flag %c\n"),pgname,optopt); ShowVersion(); ShowUsage(); exit(2); } } #ifdef HAVE_SETLOCALE setlocale(LC_ALL,""); #endif #ifdef HAVE_LIBINTL_H bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); #endif infd = fileno(stdin); /* Initialize counts */ for(i=0L;i /* NOTE(review): source text appears to be missing at this point (between "i" and "CHARSETSIZE"): the remainder of the initialization loop, the start of the read loop and the condition guarding the out-of-range diagnostic below are not visible in this copy. Restore from the upstream file; do not build as is. */ CHARSETSIZE){ fprintf(stderr, gettext("Encountered input outside the BMP (plane 0): 0x%06X at byte offset %ld.\n"), c,ByteOffset-UCBytes); fprintf(stderr, gettext("This version of %s has been compiled to handle only characters in the BMP.\n"),pgname); fprintf(stderr, gettext("Recompile with BMPONLY undefined to handle all of Unicode.\n")); exit(3); } #endif /* Tally this character. */ CharacterCounts[c]+=1L; ++TotalChars; } /* Diagnose whatever value ended the loop: exits on a read error, returns on normal EOF. */ (void)HandleReadError(c,rawptr,LineNumber,TotalChars,ByteOffset); /* Output: one line per code point with a non-zero count; each enabled column is preceded by a tab, and the glyph column is emitted only when DisplayableP accepts the code point. */ for(i=0L;i < CHARSETSIZE; i++){ if(CharacterCounts[i] == 0) continue; if(CntP) printf(fmtstr,100.0 * (((double)CharacterCounts[i])/(double)TotalChars),CharacterCounts[i]); if(CodeP) printf("\t0x%06lX",i); if(GlyphP && DisplayableP(i)){ putchar('\t');putu8(i);} putchar('\n'); } exit(0); }