############################################################################ # # Name: makeind.icn # # Title: makeind.icn # # Author: Richard L. Goerwitz # # Version: 2.4 # ############################################################################ # # This file, makeind.icn, compiles into an indexing program which # creates a series of files offering the user rapid access to # individual elements (usually words) within a text file. Access is # gained through a set of basic retrieval utilities contained in the # file retrieve.icn, bmp2text.icn, retrops.icn, and others included # with this package. In order to be indexable, files must interleave # string coded bitfield-style designators with text in the following # manner: # # ::001:001:001 # This is text. # ::001:001:002 # This is more text. # # The lines beginning with :: (a double colon) mark bitfield-style # location-designators. Location designators are strings with digit # fields of fixed number and length separated either by nothing (as # in, say 001001002), or better yet by non-digits (e.g. 001:001:002). # NOTE WELL: The bitmaps must come in ascending order. For example, # if we assume three-field bitmaps, 002:001:014 would come before # 003:001:013. If your file is not sorted properly, depending on # the structure of the file, retrieve() may 1) abort, 2) retrieve the # wrong text, or 3) work perfectly fine. Unless you're absolutely # sure of what you're doing, write a quick sort routine and put the # file in order before invoking makind on it. # # usage: makeind -f filename -m int -n int [-l int] [-s] # # When calling makeind, you must specify the filename to be indexed # (-f filename), the maximum field value (-m max-value; e.g. if # fields can go from 0 to 255, then -m 255 would be used), and the # number of fields (-n field-number). The -s switch directs makeind # to create a case-sensitive index. The default is case-insensitive. # -l [int] tells makeind to create a .LIM file, which is only needed # if you want to retrieve text by location marker, and not just via # the index (for this, you'll need something to translate human- # readable references into retrieve's native format). # # BUGS: This indexing routine is going to eat up a _tremendous_ # amount of memory when used on large files, since every token in the # input file gets its own entry in wordtbl, and each entry gets a set # as its corresponding key. If you don't have the memory, then you # could use strings instead of sets (the insert routines will be just # a tiny bit more complicated). Intermediate files could also be # used. Drop me a line if you want help. Otherwise, make sure you # have at *least* two megabytes core for every megabyte of text in # the file you wish to index (or else a very, very good virtual # memory management system). # # NOTE: The -S [field-sep] option is currently disabled because using # it slows things down drastically. If you want to be able to # specify what separator to use when breaking files down into # individual words, consult ./gettokens.icn. # # NOTE ALSO: Makeind compresses the input file somewhat, and in the # process, backs the old file up. If you need the old file back again, # look for a file with the extension .BAK (the original name may be # slightly permutated). Makeind will not overwrite this .BAK file, so # please either erase or move it when you are finished. If you try to # remake without doing this, makeind (ever cautious) will abort. # ############################################################################ # # Links: options.icn, codeobj.icn, ./indexutl.icn ./gettokens.icn # # See also: retrieve.icn, bmp2text.icn, expandrf.icn # ############################################################################ # IPL files to be linked in at compile time. link options, codeobj # Global variable (for OS-dependencies). # global IS # declared in indexutl.icn # Is is a record containing vital information on an indexed file, such # as the field separator, the string-length of fields, etc. I've re- # moved the record declaration from this file, and placed it in index- # utl.icn. # record is(FS, s_len, len, no, is_case_sensitive, r_field, hufftree) # # Main procedure. # procedure main(a) local usage, opt_table, fname, rollover_field, index_fname, bitmap_fname, upto_field, ofs_filename, bitmap_offset_table, out_IS, limits_fname, char_tbl, backup_filename, unt_filename, huffman_table, index_table # global IS # IS contains stats for file being indexed # # Initialize global OS-related parameters, such as the directory # separator (_slash) and the maximum permissible filename length # minus four (to make room for extensions makeind tacks on). # initialize_os_params() # # Read in and check command argument list. Insert FS and no # parameters into (global) record IS. Calculate s_len, len, and # bitmap_length parameters as well. Returns table of options # (keys are option letters). # usage:= "usage: makeind -f filename -m int -n int [-l int] [-s]" opt_table := initialize_IS(a) fname := \opt_table["f"] | stop(usage) rollover_field := opt_table["l"] # (optional) # # Begin the process of tokenizing, recording token locations, and # of storing this information in two separate files. # # Read input file, making a table of words and their locations. # While we're at it, build a table of character frequencies for # text (not bitmaps) within the file (char_tbl), and then create # a huffman tree and table out of this character frequency list. # index_table := table(); char_tbl := table() create_index(fname, index_table, char_tbl) # Use the char_tbl to generate a Huffman tree & a code table. IS.hufftree := heap_2_huffman_tree(heap_init(char_tbl)) huffman_table := hash_huffcodes(IS.hufftree) # # Write keys to one file, with pointers into another file # containing the bitmaps for each key. Use the index_table # created just above (contains words & their locations). # index_fname := dir_name(fname)||create_fname(fname, "IND") bitmap_fname := dir_name(fname)||create_fname(fname, "BMP") write_tokens_and_offsets(index_fname, bitmap_fname, index_table) # # Backup the original text file. Prepare to re-use the original # filename to store compressed text. # backup_filename := create_fname(fname,"BAK") backup_filename ?:= (tab(many('x')), tab(0)) backup_filename := dir_name(fname)||backup_filename if close(open(backup_filename)) then abort("makeind", "backup filename collision; aborting", 6) rename(fname, backup_filename) | abort("makeind", "cannot back up file to disk; aborting", 7) # # Open backup file (i.e. the original text file), then run through # it, writing the bitmaps directly to a .UNT file (using the # original, human-readable form). Compress and write the text # associated with those bitmaps to a file having the same name as # the original text file (not the backup name), tacking onto the # bitmaps in the .UNT file their offset within the main text file. # Write the offsets of the major divisions in the .UNT file to the # .OFS file. # upto_field := 1 < (IS.no * 2) / 3 | 1 ofs_filename := dir_name(fname)||create_fname(fname, "OFS") unt_filename := dir_name(fname)||create_fname(fname, "UNT") bitmap_offset_table := store_bitmaps_and_offsets(fname, unt_filename, backup_filename, upto_field, huffman_table) # Write .OFS file. The .UNT and main text files have already been # written out by store_bitmaps_and_offsets(). The original main file # name now holds compressed text. write_bitmaps_and_offsets(ofs_filename, bitmap_offset_table, upto_field) # # Re-open UNT file. Read it, find the pre-rollover bitmaps, and # store them in the .LIM file. Obviously this procedure could be # stuffed into another one above (e.g. store_bitmaps_and_offsets()). # if \rollover_field then { # # Let's say we are using the Bible as our text, and we want to # create all the bitmaps for Genesis 1:9-2:10. We need to know # what verse chapter 1 goes up to. By supplying makeind # with a "-l 3" argument, you are telling it to store this in- # formation for later use by expandrf(). # limits_fname := dir_name(fname)||create_fname(fname, "LIM") write_limits(limits_fname, unt_filename, rollover_field) IS.r_field := rollover_field } # # Write IS record to the .IS file. # out_IS := open(dir_name(fname)||create_fname(fname, "IS"), "w") | abort("makeind","can't open .IS file",2) writes(out_IS, encode(IS)) close(out_IS) # All is well. Exit with zero status. exit(0) end # # initialize_IS # # Sets up main parameters for the current index file, such as the # field separator to be used in tokenizing the file, the string and # bit lengths of bitmap fields, the number of fields, and the size of # the actual bitmaps (in bytes) as written to disk (comes out to the # smallest multiple of eight greater than the field length times the # field number. The marker length has to be set in the main # procedure, so initialize_IS leaves it null for now. # procedure initialize_IS(a) local usage, fname, opt_table # global IS usage:="usage: makeind -f filename -m int -n int [-l int] [-s]" IS := is() # set up some IS fields opt_table := options(a, "f:m:n+sS:l+") 3 <= *opt_table <= 6 | stop(usage) IS.no := \opt_table["n"] | stop(usage) IS.FS := \opt_table["S"] | "['.]?[^-0-9A-Za-z']+'?" IS.is_case_sensitive := opt_table["s"] # normally is &null # # Calculate string representation length for fields, as well as # the number of bits required for their integer representation. # I.e. if the opt_table["m"] value is 99, this will take two chars to # represent as a string ("99"), but 7 binary "digits" to represent # internally as a base-two integer. # IS.s_len := *string(opt_table["m"]) IS.len := *exbase10(opt_table["m"], 2) return opt_table end # # create_index # # (A better name would be fill_out_index_and_char_frequency_table.) # # Places tokens in fname (the full text file) in a table (supplied as # arg 2), with the set of each token's locations recorded as values # for those tokens. IS.FS is not used. IS.s_len is the location # marker string-representation field length. IS.len is the number of # binary digits needed for an integer representation of a given field. # IS.no is the number of fields. While creating a table for tokens in # fname, create_index ALSO fills out a table of character frequencies # in the entries (i.e. it doesn't count frequencies of chars used in # the bitmaps). # procedure create_index(fname, wordtbl, char_tbl) local intext, line, bitmap, token, value intext := open(fname) | abort("create_index","can't open index file, "||fname, 9) # Dummy key to hold all bitmaps in the text. insert(wordtbl, "", set()) # Seek past any garbage. Take first :: initial line. match("::", line := !intext) repeat { line ? { if ="::" then { bitmap := digits_2_bitmap(tab(0)) # Insert every bitmap into the dummy entry for "". insert(wordtbl[""], bitmap) line := read(intext) } else { value := line || "\n" # Concatenate every line in this entry into a single value. while line := read(intext) do { line ? { match("::") & break value ||:= line || "\n" } } value := trim(value, '\n') # Maintain character frequency table (arg 3 above). count_chars("" ~== value, char_tbl) # Build a table of tokens. NB gettokens() resides # in a separate file. The table is arg 2. value ? { every token := gettokens(IS.is_case_sensitive) do { /wordtbl[token] := set() insert(wordtbl[token], \bitmap) } } # If :: doesn't appear first on line, we're at EOF. If # line == &subject, then to prevent an infinite loop, we # break. line == &subject & break match("::", line) | break } } } \line | abort("create_index", "empty input file, "||fname, 8) close(intext) return "tokenized " || *wordtbl || " words; filled out char_tbl" end # # write_tokens_and_offsets # # Writes to one file (the .IND file) a list of all tokens collected # from the input file, one to a line, followed by a tab, and then a # byte offset into another file (the .BMP file) where the bitmaps for # that token are kept. # # token tab offset # # A seek to "offset" in the .BMP file will put you at the start of a # block of bitmaps. # procedure write_tokens_and_offsets(index_fname, bitmap_fname, t) local outtokens, outbitmaps, index_lst, i, bitmap_length, how_many_bitmaps, bits_needed, inverse_signal, inverse_set outtokens := open(index_fname, "w") | abort("write_tokens_and_offsets","can't open "||index_fname,6) outbitmaps := open(bitmap_fname, "w") | abort("write_tokens_and_offsets","can't open "||bitmap_fname,5) # Calculate the length of bitmaps (must be the smallest multiple of # 8 >= (IS.len * IS.no)). bitmap_length := ((IS.len * IS.no) <= seq(0,8)) index_lst := sort(t, 3) bits_needed := 24 # bytes needed to hold no of bitmaps for keys inverse_signal := 8388608 # 24th bit, which signals inverse storage every i := 1 to *index_lst-1 by 2 do { # Write token to index file with the offset of that token's # bitmaps in the bitmap file. write(outtokens, index_lst[i], "\t", where(outbitmaps)) # Now write the bitmaps for the above token to the bitmap file. # First write out the number of bitmaps in this block. 4 bytes # are allotted to hold this count (23 bits). If the number of # bitmaps for the current token exceeds 3/5 of the total number # of bitmaps for the entire file, then add bits_needed-1 to the # number. That is, set the highest bit to 1. how_many_bitmaps := *index_lst[i+1] if how_many_bitmaps > (inverse_signal-1) then { # just in case abort("write_tokens_and_offsets", "too many bitmaps for"||index_lst[i], bits_needed) } # "" is a dummy key containing all the bitmaps in the text. # If the number of bitmaps for any key other than "" exceeds # 3/5 that of "", then store those bitmaps where the key does # NOT occur, and set the inverse_signal bit... if index_lst[i] ~== "" & how_many_bitmaps >= integer(*t[""] * 0.60) then { inverse_set := (index_lst[2] -- index_lst[i+1]) how_many_bitmaps := ior(*inverse_set, inverse_signal) write_int(outbitmaps, how_many_bitmaps, bits_needed) every write_int(outbitmaps, !inverse_set, bitmap_length) } else { # ...otherwise (if we have a reasonable number of keys), store # the locations "straight" and don't set the inverse_signal bit. write_int(outbitmaps, how_many_bitmaps, bits_needed) # Having written the bitmap count, now write the bitmaps proper # to the bitmap file. if index_lst[i] == "" then # Bitmaps for "" are guaranteed to be sorted. every write_int(outbitmaps, !sort(index_lst[i+1]),bitmap_length) else every write_int(outbitmaps, !index_lst[i+1], bitmap_length) } } # Close files. Return number of keys processed (any better ideas??) every close(outtokens | outbitmaps) return *index_lst / 2 # return number of keys in index file end # # store_bitmaps_and_offsets # # Open the original text file (backup_file). Read in bitmaps and # write out compressed texts to fname. Write out bitmaps and the # offsets of the text they go with in fname to the .UNT file. Store # locations of bitmaps in the UNT file to the .OFS file. Note that # the full bitmap is not stored. Rather only the first upto_field # fields are stored. Normally upto_field = IS.no - 1. The scheme is # thus as follows: bitmap -> .OFS file -> .UNT file -> main text file. # The OFS file is read into memory at run-time, and stores only the # locations of major blocks in the .UNT file. The .UNT file has a # full list of pointers into the main text file, and once a bitmap # is located there, the exact location for it in the main file can # be retrieved. # procedure store_bitmaps_and_offsets(fname, unt_fname, backup_fname, upto_field, huffman_table) local last_major_division, major_division, bitmap_offset_table, out_compress, in_main_text, out_locations, location, firstline, line, value out_compress := open(fname, "w") out_locations := open(unt_fname, "w") in_main_text := open(backup_fname) | abort("store_bitmaps_and_offsets","can't open "||backup_fname,5) bitmap_offset_table := table() last_major_division := -1 # initialize to an impossible value until match("::", firstline := read(in_main_text)) every line := (firstline | !in_main_text) do { line ? { if not ="::" then value ||:= line || "\n" else { location := tab(0) major_division := ishift(digits_2_bitmap(location), -((IS.no - upto_field) * IS.len)) if last_major_division ~= major_division then { # Write to the bitmap offset table the major division, # and then our position in the UNT file (which records # the locations for all bitmaps). insert(bitmap_offset_table, major_division, where(out_locations)) last_major_division := major_division } # Write compressed text to original filename; then write # the current location marker to the UNT file, followed by # our position in out_compress (i.e. the place where the # next block of compressed text will begin). \value & writes(out_compress, block_encode(trim(value,"\n"), huffman_table)) | # We should abort with an I/O error if there is no # on the device. I ran into a problem, though, in # which writes were repeatedly attempted to a full # filesystem. abort("store_bitmaps...", "can't write to "|| fname, 6) write(out_locations, location, "\t", where(out_compress)) value := "" } } } \value & writes(out_compress, block_encode(trim(value,"\n"), huffman_table)) | abort("store_bitmaps...", "can't write to "|| fname, 6) writes(out_compress, block_encode(trim(\value,"\n"), huffman_table)) every close(in_main_text | out_compress | out_locations) return bitmap_offset_table end # # write_bitmaps_and_offsets # # Does the actual writing of bitmaps and offsets to a file. Receives # a table of bitmaps cut down to upto_field fields. Shinking the # bitmaps lessens the size of the resulting file, but requires a bit # more I/O when it comes time to look something up. # procedure write_bitmaps_and_offsets(ofs_filename, t, upto_field) local outtext, tmp_list, i, offset_length, block_size, stored_bitmap_length outtext := open(ofs_filename, "w") | abort("write_bitmaps_and_offsets","can't open "||ofs_filename,5) stored_bitmap_length := ((IS.len * upto_field) <= seq(0,8)) tmp_list := sort(t, 3) every i := 1 to *tmp_list-1 by 2 do { # Number of bits needed to hold offset. offset_length := (*exbase10(tmp_list[i+1], 2) <= seq(0,8)) # Number of bytes needed to hold bitmap and offset (both). block_size := (stored_bitmap_length + offset_length) / 8 # We could just code the length of the offset, since the bitmap's # length is fixed (and known). Seems better to code the block's # total length just in case something gets screwed up. An 8-bit # limit means the bitmap+offset length cannot exceed 2^9-1 (255) # characters. if block_size > 255 then abort("write_bitmaps_and_offsets","bitmap+offset too big",15) write_int(outtext, block_size, 8) write_int(outtext, tmp_list[i], stored_bitmap_length) write_int(outtext, tmp_list[i+1], offset_length) } return end # # write_limits # # Writes out the bitmaps that will be needed in order for expandrf() # to be able to know when the rollover field rolls over. # procedure write_limits(out_fname, in_fname, r_field) local in, out, shift_bits_out, bitmap_length, bitmaps_read, line, bitmap, short_bitmap, old_bitmap in := open(in_fname) | abort("write_limits","can't open "||in_fname,5) out := open(out_fname, "w") | abort("write_limits","can't open "||out_fname,5) r_field <= IS.no | abort("write_limits","-l value should not exceed that of -n",50) shift_bits_out := -(((IS.no-r_field)+ 1) * IS.len) bitmap_length := ((IS.len * IS.no) <= seq(0,8)) bitmaps_read := 0 while line := read(in) do { line ? { bitmaps_read +:= 1 bitmap := digits_2_bitmap(tab(find("\t"))) # in indexutl.icn short_bitmap := ishift(bitmap, shift_bits_out) if ishift(\old_bitmap, shift_bits_out) ~== short_bitmap then write_int(out, old_bitmap, bitmap_length) old_bitmap := bitmap } } write_int(out, \old_bitmap, bitmap_length) every close(in | out) return bitmaps_read end