/* Time-stamp: <2006-12-29 21:36:57 poser>
 *
 * Copyright (C) 2003-2006 William J. Poser (billposer@alum.mit.edu)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 * or go to the web page: http://www.gnu.org/licenses/gpl.txt.
 *
 * Given a sequence of bytes, this program determines whether that sequence
 * constitutes a valid UTF-8 code. If not, it explains why not.
 * If so, it explains why and shows how the UTF32 value is assembled.
 *
 * Author: Bill Poser (wjposer@unagi.cis.upenn.edu)
 *
 */

#include "config.h"
/*
 * NOTE(review): the header names on the six system includes below were
 * missing and have been reconstructed from the library calls this file
 * makes (exit/strtol, printf/perror, getopt/read/lseek, off_t, open).
 * Confirm against the project's original source.
 */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include "unicode.h"

#define FALSE 0
#define TRUE 1

char *pgname ="ExplicateUTF8";
char compdate[]="Compiled " __DATE__ " " __TIME__ ;

/*
 * Formatting helpers defined elsewhere in the project. They are used
 * below to print the bit pattern of a byte and of the assembled value.
 */
extern char * binfmtc(unsigned char);
extern char * binfmtl(unsigned long);

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of bytes that should follow.
 */
static const char TrailingBytesForUTF8[256] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 *
 * The last two entries only give the right answer when the subtraction
 * is carried out modulo 2^32, so main() masks the result to 32 bits.
 */
static const UTF32 OffsetsFromUTF8[6] = {
  0x00000000UL, 0x00003080UL, 0x000E2080UL,
  0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/* Print a short description of the program and its invocation to stderr. */
void
Usage(void){
  fprintf(stderr,"Determine and explain the validity of a potential UTF-8 byte sequence\n");
  fprintf(stderr,"%s: ((-o Offset) )\n",pgname);
}

/* Print the program name, package version, build date and licence to stderr. */
void
ShowVersion(void){
  fprintf(stderr,"\n%s  %s\n",pgname,PACKAGE_VERSION);
  fprintf(stderr,"%s\n",compdate);
  fprintf(stderr,"Copyright (C) 2003-2006 William J. Poser\n");
  fprintf(stderr,"Released under the terms of the GNU General Public License.\n\n");
}

/*
 * Read up to `want` bytes from `fd` into `buf`, calling read() again after
 * a short read (which is normal on pipes and terminals).
 *
 * Returns the number of bytes obtained, which is less than `want` only if
 * end of input was reached, or -1 if read() reported an error.
 */
static int
ReadBytes(int fd, unsigned char *buf, int want){
  int got = 0;

  while(got < want){
    ssize_t n = read(fd,(void *) (buf + got),(size_t) (want - got));
    if(n < 0) return -1;
    if(n == 0) break;		/* End of input */
    got += (int) n;
  }
  return got;
}

/*
 * Read one candidate UTF-8 sequence from the named file (starting at the
 * byte offset given with -o) or from standard input, and print an
 * explanation of why it is or is not well formed. For a well-formed
 * sequence, also show how its bits are assembled into the UTF32 value.
 *
 * Options:
 *   -h         print usage and exit with status 2
 *   -o Offset  non-negative decimal byte offset at which to start reading
 *              (only applied when a file name is given)
 *   -v         print version information and exit with status 2
 *
 * Exit status: 0 after a verdict based on the first byte alone or after a
 * valid multi-byte sequence; 1 for a truncated sequence, a bad continuation
 * byte, a bad option or a failed seek; 2 if the input cannot be opened or
 * read.
 */
int
main(int ac, char **av){
  int BytesNeeded;		/* Continuation bytes announced by c[0] */
  int BytesRead;
  int UsefulBits;		/* Payload bits carried by the first byte */
  int GotBits;			/* Total payload bits in the sequence */
  int i;
  int oc;
  int infd;
  int spaces;			/* Column at which the 32-bit pattern starts */
  long RequestedOffset;
  char *endptr;
  unsigned long Offset = 0L;
  unsigned char c[6];		/* First byte plus at most 5 continuation bytes */
  unsigned char ShiftedByte;
  UTF32 ch;
  char tempstr[33];
  char hexdigits[9];

  extern int opterr;
  extern int optind;
  extern char *optarg;
  extern int optopt;

  opterr=0;			/* We print our own message for bad options */

  /* getopt() signals the end of the options with -1 */
  while( (oc = getopt(ac,av,"ho:v")) != -1){
    switch(oc){
    case 'h':
      Usage();
      exit(2);
      break;			/* NOTREACHED */
    case 'o':
      /* Reject non-numeric, partially numeric and negative offsets */
      RequestedOffset = strtol(optarg,&endptr,10);
      if(endptr == optarg || *endptr != '\0' || RequestedOffset < 0){
	fprintf(stderr,"%s: invalid offset %s\n",pgname,optarg);
	exit(1);
      }
      Offset = (unsigned long) RequestedOffset;
      break;
    case 'v':
      ShowVersion();
      exit(2);
      break;			/* NOTREACHED */
    case '?':
    default:
      fprintf(stderr,"Unrecognized option %c\n",(char) optopt);
      exit(1);
    }
  }

  if(optind < ac) {
    infd = open(av[optind], O_RDONLY);
    if(infd < 0){
      perror(NULL);
      fprintf(stderr,"%s: unable to open file %s\n",pgname,av[optind]);
      exit(2);
    }
    if(lseek(infd, (off_t)Offset,SEEK_SET) < 0){
      perror(NULL);
      exit(1);
    }
  }
  else infd = fileno(stdin);

  /* Get the first byte */
  BytesRead = read(infd,(void *) c,1);
  if (BytesRead == 0){
    fprintf(stderr,"%s: could not read first byte from input.\n",pgname);
    exit(2);
  }
  if (BytesRead < 0){
    perror(NULL);
    exit(2);
  }

  /* A byte of the form 10nnnnnn can only be a continuation byte */
  if( (c[0] & 0xC0) == 0x80){
    printf("The first byte, value 0x%02X, with bit pattern %s,\nis not a valid first byte of a UTF-8\nsequence because its high bits are 10.\nA valid first byte must be of the form 0nnnnnnn or 11nnnnnn.\n",c[0],binfmtc(c[0]));
    exit(0);
  }

  /* A byte of the form 0nnnnnnn is a complete sequence on its own */
  if(c[0] <= 0x7F){
    printf("The first byte, value 0x%02X, bit pattern %s, is a valid UTF-8 code by itself\nsince its high bit is 0.\n",c[0],binfmtc(c[0]));
    exit(0);
  }

  /*
   * 0xFE and 0xFF have seven and eight leading 1s. The table maps them to
   * 5 trailing bytes, but no UTF-8 lead byte has more than six leading 1s,
   * so explaining them as six-byte sequences would be wrong.
   */
  if(c[0] >= 0xFE){
    printf("The first byte, value 0x%02X, with bit pattern %s,\nis not a valid first byte of a UTF-8\nsequence because it begins with more than six contiguous 1s.\n",c[0],binfmtc(c[0]));
    exit(0);
  }

  BytesNeeded = (int) TrailingBytesForUTF8[c[0]];
  BytesRead = ReadBytes(infd,&c[1],BytesNeeded);
  if(BytesRead < 0){
    perror(NULL);
    exit(2);
  }
  if(BytesRead != BytesNeeded){
    printf("The sequence is not a valid UTF-8 character\nbecause the first byte, value 0x%02X, bit pattern %s,\ntells us that a total of %d bytes are needed but\n only %d bytes are present.\n",
	   c[0],binfmtc(c[0]),BytesNeeded+1,BytesRead+1);
    exit(1);
  }

  /* Every byte after the first must have the form 10nnnnnn */
  for (i = 1; i <= BytesNeeded; i++){
    if (( c[i] & 0xC0) != 0x80){
      printf("The sequence is not a valid UTF-8 character\nbecause byte %d, value 0x%02X, bit pattern %s\nis not a valid continuation byte, whose high bits must be 10.\n",i+1,c[i],binfmtc(c[i]));
      exit(1);
    }
  }

  /*
   * If we get here, everything is okay, so assemble the UTF32 value:
   * accumulate the raw bytes six bits apart, then subtract the combined
   * marker bits in one step.
   */
  ch = 0;
  for (i = 0; i <= BytesNeeded; i++) ch = (ch << 6) + c[i];
  ch -= OffsetsFromUTF8[BytesNeeded];
  /*
   * The offsets assume 32-bit wrap-around. Masking keeps the result
   * correct if UTF32 is wider than 32 bits and is a no-op otherwise.
   */
  ch &= 0xFFFFFFFFUL;

  printf("The sequence ");
  for (i = 0; i <= BytesNeeded; i++) printf("0x%02X ",c[i]);
  printf("\n ");
  for (i = 0; i <= BytesNeeded; i++) printf("%s ",binfmtc(c[i]));
  printf("\n");
  printf("is a valid UTF-8 character encoding equivalent to UTF32 0x%08lX.\n",(unsigned long) ch);
  printf("The first byte tells us that there should be %d\ncontinuation bytes since it begins with %d contiguous 1s.\nThere are %d following bytes and all are valid\ncontinuation bytes since they all have high bits 10.\n",BytesNeeded,BytesNeeded+1,BytesNeeded);

  UsefulBits = 6 - BytesNeeded;
  printf("The first byte contributes its low %d bits.\n",UsefulBits);
  GotBits = UsefulBits + (6 * BytesNeeded);
  printf("The remaining bytes each contribute their low 6 bits,\nfor a total of %d bits: ",GotBits);

  /*
   * Shift the marker bits off the top of each byte so that the payload
   * bits come first in the formatted pattern, then cut the string after
   * the payload.
   */
  ShiftedByte = (unsigned char) (c[0] << (BytesNeeded + 2));
  snprintf(tempstr,sizeof tempstr,"%s",binfmtc(ShiftedByte));
  tempstr[UsefulBits] = '\0';
  printf("%s ",tempstr);
  for(i = 1; i <= BytesNeeded; i++){
    ShiftedByte = (unsigned char) (c[i] << 2);
    snprintf(tempstr,sizeof tempstr,"%s",binfmtc(ShiftedByte));
    tempstr[6] = '\0';
    printf("%s ",tempstr);
  }
  printf("\n");

  /*
   * printf() returns the number of characters written, which is the
   * column where the bit pattern starts. This avoids the %n conversion,
   * which some C libraries reject.
   */
  spaces = printf("This is padded to 32 places with %d zeros: ",(32-GotBits));
  if(spaces < 0) spaces = 0;
  printf("%s\n",binfmtl(ch));

  /*
   * Build the line printed beneath the bit pattern: one hex digit every
   * fourth column, with spaces in between. The buffer is space-filled
   * first so that no uninitialised byte is printed.
   */
  snprintf(hexdigits,sizeof hexdigits,"%08lX",(unsigned long) ch);
  for (i = 0; i < 32; i++) tempstr[i] = ' ';
  for (i = 0; i < 8; i++) tempstr[4 * i] = hexdigits[i];
  tempstr[29] = '\0';
  printf("%*s%s\n",spaces,"",tempstr);

  exit(0);
}