/* humanzip - compresses text files in a human readable way Copyright (C) 2007 Matthew Strait This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ using namespace std; #include #include #include #include #include #include #include #include "humanzip.h" #define TABLESIZE 55 const string utftable[TABLESIZE] = { "ă", "ḇ", "č", "đ", "ę", "ḟ", "ġ", "ħ", "ĩ", "ĵ", "ķ", "ŀ", "ṁ", "ń", "ō", "ṕ", "ƍ", "ŕ", "ş", "ŧ", "ů", "↺", "ŵ", "ж", "ŷ", "ž", "æ", "ḃ", "ç", "ð", "ɘ", "ϕ", "ǥ", "ȟ", "ḭ", "ʝ", "ǩ", "ł", "ḿ", "ṋ", "ø", "Ƿ", "ƣ", "ṟ", "ƨ", "θ", "ű", "ṽ", "ẉ", "ϫ", "ȳ", "ƶ", "œ", "ƌ", "å" }; const string UTFtable[TABLESIZE] = { "Ă", "Ḇ", "Č", "Đ", "Ę", "Ḟ", "Ġ", "Ħ", "Ĩ", "Ĵ", "Ķ", "Ŀ", "Ṁ", "Ń", "Ō", "Ṕ", "ℚ", "Ŕ", "Ş", "Ŧ", "Ů", "℣", "Ŵ", "Ж", "Ŷ", "Ž", "Æ", "Ḃ", "Ç", "Ð", "Ǝ", "Φ", "Ǥ", "Ȟ", "Ḭ", "ℷ", "Ǩ", "Ł", "Ḿ", "Ṋ", "Ø", "₱", "Ƣ", "Ṟ", "Ƨ", "Θ", "Ű", "Ṽ", "Ẉ", "Ϫ", "Ȳ", "Ƶ", "Œ", "Ƌ", "Å" }; #define BLACKSIZE 35 const string blacklist[BLACKSIZE] = { "hers", "they", "them", "mine", "ours", "whom", "when", "that", "from", "with", "this", "were", "will", "been", "have", "what", "your", "into", "which", "where", "would", "could", "these", "there", "about", "might", "their", "theirs", "should", "didn't" "of the", "in the", "for the", "to the", "with the" }; struct candidate{ string s; int words; int goodness; }; static int useblacklist = 1; static unsigned int nabbrs = 10; static int maxwords = 1; static int verbose = 0; // output detailed progress information? static int quiet = 0; // output only errors? static int keeporig = 0; // keep original files? static int tostdout = 0; // write to stdout? static int overwrite = 0; // overwrite existing files? static void handle_cmdline(vector & filenames, int argc, char ** argv) { int done = 0; const char * opts = "fcqvkl:n:ah?"; while(!done){ char c; switch(c = getopt(argc, argv, opts)) { case -1: done = 1; break; case 'f': overwrite = 1; break; case 'c': keeporig = 1; tostdout = 1; break; case 'k': keeporig = 1; break; case 'v': verbose = 1; break; case 'q': quiet = 1; break; case 'a': useblacklist = 0; break; case 'l': maxwords = atoi(optarg); if(maxwords < 1){ cerr << "Max words can't be less than one.\n"; exit(1); } if(maxwords > 256) cerr << "Max words is very large! Are you sure?\n"; break; case 'n': nabbrs = atoi(optarg); if(nabbrs > TABLESIZE){ cerr << "Can only handle " << TABLESIZE << " abbreviations\n"; nabbrs = TABLESIZE; } if(nabbrs < 1){ cerr << "Invalid number of abbreviations\n"; exit(1); } break; case 'h': case '?': default: cerr << "humanzip v" << HUMANVERSION << " © 2007 Matthew Strait\n" << "humanzip comes with ABSOLUTELY NO WARRANTY. This is free software\n" << "and you may redistribute it under the terms of the GPLv2.\n" << "\n" << "Basic syntax: humanzip file1 [file2]\n" << "\n" << "If no file is specified, humanzip reads from stdin.\n" << "The compressed version is written to, e.g., file1.hz\n" << "or stdout if the input was stdin. Options:\n" << "\n" << "-a\tDo not use the blacklist of common English words and phrases.\n" << "-n #\tUse at most # abbreviations, max "<< TABLESIZE << ", default " << nabbrs << "." << endl << "-l #\tLook for abbreviations of at most # words, default " << maxwords << "." << endl << "-k\tKeep the uncompressed files instead of deleting them.\n" << "-c\tSend output to stdout and keep uncompressed files.\n" << "-f\tForce overwriting of existing output files.\n" << "-v\tBe verbose.\n" << "-q\tBe quiet. Output only errors.\n" << "-h\tPrint this help and exit.\n" << "\n"; exit(1); break; } } for(int i = optind; i < argc; i++) filenames.push_back(argv[i]); if(filenames.size() == 0) filenames.push_back("/dev/stdin"); } // Allows for a sort by alphabet (doesn't matter which way) // and then by position in the file (does matter than it's accending) static int pf(phrase a, phrase b) { int foo = a.s.compare(b.s); if(foo < 0) return 0; else if(foo > 0) return 1; else return(a.pos < b.pos); } static int checkascii(const string & s) { for(unsigned int i = 0; i < s.size(); i++) if(s[i] < 0) return 0; return 1; } // We want to catch words that appear in the document in both their signular // and plural forms and recognize them as the same word. static void handleplural(string & word, string & delims, const vector & dictionary) { // Word of 1 or 2 letters can't be plurals ending in 's' or "es". if(word.size() < 3) return; // If it doesn't end in 's', it's too hard to handle. if(word[word.size()-1] != 's') return; // if the word isn't in the dictionary, forget about it. if(!binary_search(dictionary.begin(), dictionary.end(), word)) return; // if taking the 's' off the end yields another valid word, assume that // it's a plural string wordwithouts = word.substr(0, word.size()-1); if(binary_search(dictionary.begin(), dictionary.end(), wordwithouts)){ word.erase(word.size()-1); delims = 's' + delims; return; } // if that didn't work and it doesn't end in "es", it's too hard to handle. if(word[word.size()-2] != 'e') return; // if taking the "es" off the end yields another word, assume plural string wordwithoutes = word.substr(0, word.size()-2); if(binary_search(dictionary.begin(), dictionary.end(), wordwithoutes)){ word.erase(word.size()-1); word.erase(word.size()-1); delims = "es" + delims; return; } } /* reads file into two vectors, one of words and one of delimeters. For instance a file with the content "hello, this is a test." would produce words={"hello", "this", "is", "a", "test"} delims={"", ", ", " ", " ", " ", "."} delims *always* has one more element than words. Returns number of characters read. */ int readfile(vector & words, vector & delims, const string & filename) { int charsin = 0; ifstream infile(filename.c_str()); if(!infile.is_open()){ cerr << "Couldn't open " << filename << " skipping.\n"; return -1; } vector dictionary; ifstream dict("/usr/share/dict/words"); if(!dict.is_open()){ cerr<<"Couldn't open /usr/share/dict/words. Won't try to handle plurals.\n"; } else{ string temp; while(dict >> temp) dictionary.push_back(temp); dict.close(); } string trailingdelims; while(1){ string word, leadingspace; char c; // First get whitespace and punctuation, if any. while(isspace(infile.peek()) || ispunct(infile.peek())){ infile.get(c); leadingspace += c; } // store this along with any puctuation from the end of the last word delims.push_back(trailingdelims + leadingspace); charsin += (trailingdelims + leadingspace).size(); // Then get non-delims with possible trailing punctuation. // If there aren't any, we're done. if(!(infile >> word)) break; if(!checkascii(word)){ cerr << "In " << filename << ", got non-ascii string " << word << ". Cannot operate on non-ascii, skipping file.\n"; return -1; } //now strip off and store the punctuation at the end of the word trailingdelims = ""; while(ispunct(word[word.size()-1])){ trailingdelims = word[word.size()-1] + trailingdelims; word.erase(word.size()-1); } //now figure out if words that end in "s" or "es" are plural and, if //so, move the "s"/"es" to the delims. handleplural(word, trailingdelims, dictionary); words.push_back(word); charsin += word.size(); } infile.close(); return charsin; } // strings are sorted by position when they get here. Returns number of times // that the phrase in question appears not overlapping. e.g. if the phrase // is "happy happy" and the file contains "happy happy happy happy happy", this // returns 2. static int nnonoverlapping(const vector & phrases, int first, int last) { int length = phrases[first].words; int count = 1; // first one is obviously ok unsigned int firstvalidposition = phrases[first].pos + length; for(int i = first+1; i <= last; i++){ if(phrases[i].pos >= firstvalidposition){ count++; firstvalidposition = phrases[i].pos + length; } } return count; } // Take a vector of strings and make it into a vector of ordered pairs // of unique strings and number of times that string occured in the input. static void sortuniqc(vector & cands, vector & p) { sort(p.begin(), p.end(), pf); // this is the slow part for(unsigned int i = 0; i < p.size();){ candidate sct; sct.s = p[i].s; sct.words = p[i].words; int first = i; do{ i++; }while(i < p.size() && sct.s == p[i].s); sct.goodness = nnonoverlapping(p, first, i-1); // goodness now = # reps if(sct.goodness > 1) cands.push_back(sct); // make very loose cut } } // Sanitizes strings by converting line breaks into paragraph symbols. static string convertlinebreaks(const string & s) { string answer; for(unsigned int i = 0; i < s.size(); i++){ if(s[i] == '\n') answer += "¶"; else if(s[i] == '\r') answer += "⁋"; else answer += s[i]; } return answer; } // Assign values to how good a candidate replacement is static void weight(candidate & cand) { // how many characters will be saved? cand.goodness *= (cand.s.size()-1); // subtract from this the space used by the key // this approximately finds space usage by assuming 80 character columns cand.goodness -= ((6 + cand.s.size())/80 + 1)*80; // If the string to be replaced is less than 4 characters long // do not use it. if(cand.s.size() < 4){ cand.goodness = 0; return; } // If the string is on the black list, do not use it. if(useblacklist){ for(int j = 0; j < BLACKSIZE; j++){ if(cand.s == blacklist[j]){ cand.goodness = 0; break; } } } //cerr<<"Tested: "< b.words); } // Given the text of the file, a starting position and a number of words, // returns a string consisting of the words starting at that position, // including the delimiters between them string yoink(const vector & words, const vector & delims, unsigned int pos, unsigned int len) { string answer; if(pos + len - 1 >= words.size()) { cerr << "Bad yoink!"; return ""; } for(unsigned int i = 0; i < len; i++){ answer += words[pos+i]; if(i != len-1) answer += delims[pos+i+1]; } //cerr << "Yoinked: " << answer << endl; return answer; } static int doreplacement(ofstream & outfile, vector & rtab, const vector & words, const vector & delims) { string s; int charsout = 0; sort(rtab.begin(), rtab.end(), rtf); if(verbose){ for(unsigned int i = 0; i < rtab.size(); i++) cerr << "\nSearching for: " << rtab[i].search << " " << rtab[i].timesused; cerr << endl; } outfile << delims[0]; // always exists, never part of a replacement charsout += delims[0].size(); for(unsigned int w = 0; w < words.size();){ int gotit = 0; for(unsigned int r = 0; r < rtab.size(); r++){ // can't match because the search string is longer than the remaining text if(rtab[r].words > words.size() - w) continue; string text = yoink(words, delims, w, rtab[r].words); if(text == rtab[r].search){ outfile << convertlinebreaks(rtab[r].replacelower); gotit = 1; w += rtab[r].words; rtab[r].timesused++; break; } if(lowerfirstletter(text) == rtab[r].search){ outfile << convertlinebreaks(rtab[r].replaceupper); gotit = 1; w += rtab[r].words; rtab[r].timesused++; break; } } if(!gotit){ outfile << words[w]; charsout += words[w].size(); w++; } else charsout++; outfile << delims[w]; charsout += delims[w].size(); } return charsout; } static int finalcheckandprintkey(ofstream & outfile, vector & rt) { int charsout = 13; // ĦŨӍĄŅŽȈƤǷΈÐ\n\n outfile << "ĦŨӍĄŅŽȈƤǷΈÐ\n"; //used as a magic number for trivially zipped files for(vector::iterator i = rt.begin(); i != rt.end(); i++){ if((*i).timesused > 1){ // The key looks nicer with an upper case first letter outfile << (*i).replacelower << "/" << (*i).replaceupper << " - " << raisefirstletter(convertlinebreaks((*i).search)) << "\n"; charsout += (*i).search.size() + 7; } else{ if(verbose) cerr << "Eliminated: " << (*i).search << endl; rt.erase(i); i--; } } outfile << endl; // end marker. Also looks nice. return charsout; } // cands is sorted to put the best replacements at the front static void makereplacementtable(vector & replacementtable, const vector & cands) { searchreplace temp; temp.timesused = 0; // set when the replacement is used vector utftableusage(TABLESIZE, 0); for(unsigned int i=0; replacementtable.size()= TABLESIZE) continue; // typically happens if temp.search[0|1] isn't a letter if(!utftableusage[choice[c]]){ temp.replacelower = utftable[choice[c]]; temp.replaceupper = UTFtable[choice[c]]; utftableusage[choice[c]] = 1; gotit = 1; break; } } if(!gotit){ int lastavailable = TABLESIZE-1; while(utftableusage[lastavailable]){ lastavailable--; if(lastavailable < 0){ cerr << "NOT REACHED: table exhausted\n";} } temp.replacelower = utftable[lastavailable]; temp.replaceupper = UTFtable[lastavailable]; utftableusage[lastavailable] = 1; } replacementtable.push_back(temp); } } static void lowerfirstletter(vector & phrases) { for(unsigned int i = 0; i < phrases.size(); i++) if(phrases[i].s.size() > 0) phrases[i].s[0] = tolower(phrases[i].s[0]); } // rip off of bzip2 output static void printstats(float charsin, float charsout, const string & filename) { if(quiet) return; cerr << " " << filename << ":\t" << setprecision(3) << charsin/charsout << ":1, " << (1-charsout/charsin)*100 << "% saved, " << int(charsin) << " in, " << int(charsout) << " out.\n"; } vector makephrases(const vector & words, const vector & delims, int len) { vector phrases; int nwords = words.size(); for(int iword = 0; iword < nwords-len+1; iword++){ phrase ph; ph.words = len; ph.pos = iword; for(int word = 0; word < len; word++){ ph.s += words[iword+word]; if(word != len-1) ph.s += delims[iword+word+1]; } phrases.push_back(ph); } return phrases; } static int wcf(candidate a, candidate b) { return(a.goodness > b.goodness); } static int writecompressed(vector & replacementtable, const vector & words, const vector & delims, const string & filename) { if(verbose) cerr << "\nOutputting compressed text..."; string outfilename; if(filename == "/dev/stdin" || tostdout) outfilename = "/dev/stdout"; else outfilename = filename + ".hz"; ofstream outfile(outfilename.c_str()); if(!outfile.is_open()){ cerr << "Couldn't open " << outfilename << " for writing!\n"; return -1; } ofstream devnull("/dev/null"); if(!devnull.is_open()){ cerr << "Couldn't open /dev/null for writing!\n"; return -1; } // dry run to eliminate unused abbreviations if(verbose) cerr << "dry run:\n"; doreplacement(devnull, replacementtable, words, delims); int charsout = finalcheckandprintkey(outfile, replacementtable); if(verbose) cerr << "Ok, for real this time:\n"; charsout += doreplacement(outfile, replacementtable, words, delims); outfile.close(); if(!keeporig){ if(verbose) cerr << "Deleting " << filename << endl; unlink(filename.c_str()); } if(verbose) cerr << "\n"; return charsout; } // Checks if we're ok with writing out the compressed file: returns 1 on // sucess, 0 on failure. As a side effect, this creates an empty output file. static int checkoutfile(const string & filename) { if(tostdout || filename == "/dev/stdin") return 1; // stdout had better always be ok string outfilename = filename + ".hz"; // if the file already exists and we're not willing to ovewrite it, fail struct stat buffer; if(!overwrite && (-1 != stat(outfilename.c_str(), &buffer))){ cerr << "Output file " << outfilename << " already exists, skipping.\n"; return 0; } // if the file does not exist, or it does and we're willing to overwrite // it, check if we can actually write to it ofstream outfile(outfilename.c_str()); if(!outfile.is_open()){ cerr << "Couldn't open " << outfilename << " for writing!\n"; return 0; } else{ outfile.close(); return 1; } } // Returns 1 on failure, 0 on sucess static int doit(const string & filename) { vector words, delims; vector replacementtable; int charsin = 0, charsout = 0; if(verbose) cerr << "Reading file..."; if(-1 == (charsin = readfile(words, delims, filename))) return 1; if(!checkoutfile(filename)) return 1; if(verbose) cerr << "\nMaking and judging abbreviations"; vector cands; // master candidates for(int len = 1; len <= maxwords; len++){ vector phrases = makephrases(words, delims, len); vector candcands; // temporary candidates lowerfirstletter(phrases); sortuniqc(candcands, phrases); for(unsigned int i = 0; i < candcands.size(); i++){ weight(candcands[i]); // not too bad if(candcands[i].goodness > 0) // keep only vaugly reasonable ones cands.push_back(candcands[i]); } cerr << "."; } // sort master array of candidates if(verbose) cerr << "\nSorting candidates..."; sort(cands.begin(), cands.end(), wcf); // slow, but not as bad as the above if(verbose) cerr << "\nMaking replacement table..."; makereplacementtable(replacementtable, cands); // fast if(-1 == (charsout = writecompressed(replacementtable,words,delims,filename))) return 1; printstats(charsin, charsout, filename); return 0; } int main(int argc, char ** argv) { vector filenames; handle_cmdline(filenames, argc, argv); int nfailed = 0; for(unsigned int i = 0; i < filenames.size(); i++) nfailed += doit(filenames[i]); return nfailed; }