#include #include #include #include #include "safestring.h" typedef struct _HzPhrase { u_char hz[MAX_PHRASE_LEN*2+1]; u_char freq; struct _HzPhrase *next; }HzPhrase; typedef struct _KeyPhrase { u_short len; u_char key[2*MAX_PHRASE_LEN+1]; u_short count; // number of Phrase items HzPhrase *hzph; struct _KeyPhrase *next; }KeyPhrase, *PKeyPhrase; PinYin pytab[26][MAX_EACH_PY]; PKeyPhrase phtab[MAX_PY_NUM]; u_short phcount[MAX_PY_NUM]; //Get PYahead for key u_short KeyAhead(u_char *key) { u_short py; py = '\0'; py |= key[0] << 2 & 0x3ff; py |= key[1] >> 6; return py; } //Convert Phrase_Key to PY_Key void Key2PYKey(u_char *key, u_short *pykey, u_char len) { int i,j,p; for (i=0; i> (6 - p); p += 2; if ( (p %= 8) == 0 ) j++; } } //Convert PY_Key to Phrase_Key void PYKey2Key(u_char *key, u_short *pykey, u_char len) { int i,j,p; int klen=(len*10)/8 +1; for (j=0; j> (2 + p); key[j] |= pykey[i] << (6 - p) ; p += 2; if ( (p %= 8) == 0 ) j++; } } int LoadTable(char* pathname) { FILE *stream; char str[1024], *strpy, *strhz; int i=1, j=0, lastpy=0, curpy; if ( (stream = fopen( pathname, "r" )) == NULL ) { fprintf(stderr,"%s file not found\n",pathname); exit(1); } while ( !feof( stream )) { if ( fgets(str,1024,stream) != NULL) { strpy = strtok(str, " \f\n\r\t\v"); curpy = strpy[0]-'a'; if (curpy != lastpy) j = 0; safe_strncpy(pytab[curpy][j].py, strpy, MAX_PY_LEN); pytab[curpy][j].key = i; lastpy = curpy; i++,j++; } } fclose(stream); return 0; } /* divide the string strbuf into string arrays according to space and Tab */ int String2Array(char *strbuf,int len, char strarr[][len]) { int i=0, cursor=0, count=0, buflen = strlen(strbuf); while(i < buflen) { while(i < buflen && (strbuf[i] == ' ' || strbuf[i] == '\011')) i++; // skip space cursor = i; while(i < buflen && strbuf[i] != ' ' && strbuf[i] != '\011') i++; // skip non-space if (i > cursor) { strncpy(strarr[count],strbuf+cursor,i-cursor); strarr[count++][i-cursor] = '\0'; } } return count; } int SavePhraseToMem(char *str,u_char *key,u_char len,u_char freq) { PKeyPhrase kph,tmpkph; HzPhrase *hzph; int first; short ahead; if (len<2) return 0; /* single char phrase ignored */ if (len > MAX_PHRASE_LEN) { fprintf(stderr, "buffer overrun\n"); abort(); } ahead = (short)KeyAhead(key); kph = phtab[ahead]; if (kph != NULL) // first phrase of this pinyin { first=1; do { if (first) first = 0; else kph = kph->next; /* find the matched pinyin keyphrase */ if (kph->len == len && !memcmp(kph->key,key,KEYLEN(len))) { for(hzph = kph->hzph; hzph != NULL; hzph = hzph->next) if (!memcmp(hzph->hz,str,2*len)) // same phrase { fprintf(stderr,"Duplicate phrase %s detected, ignored!\n", hzph->hz); return 0; } hzph = kph->hzph; while(hzph->next != NULL) hzph = hzph->next; // reach the end of the link list if ((hzph->next = (HzPhrase *)malloc(sizeof(HzPhrase)))==NULL) { fprintf(stderr,"no enough memory\n"); exit(1); } kph->count++; hzph = hzph->next; hzph->freq = 0; hzph->next = NULL; memcpy(hzph->hz,str,len*2); /* len < MAX_PHRASE_LEN */ hzph->hz[len*2] = '\0'; return 1; // insert a new Hanzi Phrase at the end of the link list } }while(kph->next != NULL); } // not found , no matched pinyin keyphrase, allocate a new one if ((tmpkph = (KeyPhrase *)malloc(sizeof(KeyPhrase))) == NULL) { fprintf(stderr,"no enough memory\n"); exit(1); } if (phtab[ahead] == NULL) phtab[ahead] = tmpkph; else kph->next = tmpkph; tmpkph->len = len; tmpkph->count = 1; memcpy(tmpkph->key,key,KEYLEN(len)); /* len < MAX_PHRASE_LEN */ tmpkph->next = NULL; if ((tmpkph->hzph = (HzPhrase *)malloc(sizeof(HzPhrase))) == NULL) { fprintf(stderr,"no enough memory\n"); exit(1); } tmpkph->hzph->freq = freq; tmpkph->hzph->next = NULL; memcpy(tmpkph->hzph->hz,str,len*2); /* len < MAX_PHRASE_LEN */ tmpkph->hzph->hz[len*2] = '\0'; phcount[ahead]++; return 1; } int file_size = 0; int SavePhraseToFile(char *pathname) { KeyPhrase *kph,*kphtmp; HzPhrase *hzph,*hzphtmp; int j, k; u_char key[MAX_KEY_LEN],freq; u_short len,count; FILE *out; if ((out = fopen( pathname, "wb" )) == NULL) { fprintf(stderr,"%s cant open.\n",pathname); exit(1); } for (j=1; jhzph; kphtmp = kph; kph = kph->next; len = kphtmp->len; if (len > MAX_PHRASE_LEN) { fprintf(stderr, "buffer overrun\n"); abort(); } memcpy(key,kphtmp->key,KEYLEN(len)); fwrite(&len,sizeof(len),1, out); fwrite(&(kphtmp->count),sizeof(kphtmp->count),1,out); fwrite(key,sizeof(u_char),KEYLEN(len),out); /* len, key[len+1], count, phrase, freq , phrase, freq ...*/ file_size += SizeOfPhrase(len, kphtmp->count); while (hzph != NULL) { hzphtmp = hzph; hzph = hzph->next; fwrite(hzphtmp->hz,sizeof(u_char),len*2,out); fwrite(&(hzphtmp->freq),sizeof(hzphtmp->freq),1,out); free(hzphtmp); } free(kphtmp); } } fwrite(&file_size,sizeof(file_size),1,out); printf("File size=%d\n\n",file_size); fclose(out); return 1; } int LoadPhraseFromFile(char *pathname) { FILE *stream; int i,j; char str[250]; u_short len; u_char key[MAX_KEY_LEN]; unsigned short pykey[MAX_PHRASE_LEN]; int count,ahead,flag =0 ,freq; char strarr[MAX_PHRASE_LEN+4][2*MAX_PHRASE_LEN+1]; if ( (stream = fopen( pathname, "r" )) == NULL ) { fprintf(stderr,"%s cant open.\n",pathname); exit(1); } while ( !feof( stream )) { if ( fgets(str,250,stream) != NULL) { str[strlen(str)-1] = '\0'; count = String2Array(str,2*MAX_PHRASE_LEN+1,strarr); len = strlen(strarr[0])/2; /* len+1 = count, freq = 0 len+2 = count, freq = xx */ if ((len != count-1 && len != count-2) || len > MAX_PHRASE_LEN) { fprintf(stderr,"Phrase %s error!!!\n",str); continue; } if (len == count-2) { freq = atoi(strarr[count-1]); if (freq > 255) freq = 255; count--; } else freq = 0; for (i=1; i25) { fprintf(stderr,"Phrase %s error!!!\n",str); break; } for(j=0; pytab[ahead][j].key; j++) { if (!strcmp(pytab[ahead][j].py,strarr[i])) { pykey[i-1] = pytab[ahead][j].key; flag = 1; break; } } if (!flag) break; } // for if (!flag) { fprintf(stderr,"Phrase %s error!!!\n",str); continue; } PYKey2Key(key, pykey, len); /* printf("%s, len=%d, key0=%d, key1 =%d, key=%d\n", str,len,(int)key[0],(int)key[1],(int)key[2]); */ SavePhraseToMem(str,key,len,freq); } } fclose(stream); return (0); } int main(int argc,char **argv) { int i; if (argc != 3) { fprintf(stderr,"usage: %s \n",argv[0]); return 1; } for(i = 0; i < MAX_PY_NUM; i++) { phtab[i] = NULL; phcount[i] = 0; } LoadTable("./pinyin.map"); LoadPhraseFromFile(argv[1]); SavePhraseToFile(argv[2]); return 0; }