############################################################################
#
#	Name:	 makeind.icn
#
#	Title:	 makeind.icn
#
#	Author:	 Richard L. Goerwitz
#
#	Version: 2.4
#
############################################################################
#
#  This file, makeind.icn, compiles into an indexing program which
#  creates a series of files offering the user rapid access to
#  individual elements (usually words) within a text file.  Access is
#  gained through a set of basic retrieval utilities contained in the
#  file retrieve.icn, bmp2text.icn, retrops.icn, and others included
#  with this package.  In order to be indexable, files must interleave
#  string coded bitfield-style designators with text in the following
#  manner:
#
#  ::001:001:001
#  This is text.
#  ::001:001:002
#  This is more text.
#
#  The lines beginning with :: (a double colon) mark bitfield-style
#  location-designators.  Location designators are strings with digit
#  fields of fixed number and length separated either by nothing (as
#  in, say 001001002), or better yet by non-digits (e.g. 001:001:002).
#  NOTE WELL: The bitmaps must come in ascending order.  For example,
#  if we assume three-field bitmaps, 002:001:014 would come before
#  003:001:013.  If your file is not sorted properly, depending on
#  the structure of the file, retrieve() may 1) abort, 2) retrieve the
#  wrong text, or 3) work perfectly fine.  Unless you're absolutely
#  sure of what you're doing, write a quick sort routine and put the
#  file in order before invoking makind on it.
#
#  usage:  makeind -f filename -m int -n int [-l int] [-s]
#
#  When calling makeind, you must specify the filename to be indexed
#  (-f filename), the maximum field value (-m max-value; e.g. if
#  fields can go from 0 to 255, then -m 255 would be used), and the
#  number of fields (-n field-number).  The -s switch directs makeind
#  to create a case-sensitive index.  The default is case-insensitive.
#  -l [int] tells makeind to create a .LIM file, which is only needed
#  if you want to retrieve text by location marker, and not just via
#  the index (for this, you'll need something to translate human-
#  readable references into retrieve's native format).
#
#  BUGS: This indexing routine is going to eat up a _tremendous_
#  amount of memory when used on large files, since every token in the
#  input file gets its own entry in wordtbl, and each entry gets a set
#  as its corresponding key.  If you don't have the memory, then you
#  could use strings instead of sets (the insert routines will be just
#  a tiny bit more complicated).  Intermediate files could also be
#  used.  Drop me a line if you want help.  Otherwise, make sure you
#  have at *least* two megabytes core for every megabyte of text in
#  the file you wish to index (or else a very, very good virtual
#  memory management system).
#
#  NOTE: The -S [field-sep] option is currently disabled because using
#  it slows things down drastically.  If you want to be able to
#  specify what separator to use when breaking files down into
#  individual words, consult ./gettokens.icn.
#
#  NOTE ALSO:  Makeind compresses the input file somewhat, and in the
#  process, backs the old file up.  If you need the old file back again,
#  look for a file with the extension .BAK (the original name may be
#  slightly permutated).  Makeind will not overwrite this .BAK file, so
#  please either erase or move it when you are finished.  If you try to
#  remake without doing this, makeind (ever cautious) will abort.
#
############################################################################
#
#  Links: options.icn, codeobj.icn, ./indexutl.icn ./gettokens.icn
#
#  See also: retrieve.icn, bmp2text.icn, expandrf.icn
#
############################################################################

# IPL files to be linked in at compile time.
link options, codeobj

# Global variable (for OS-dependencies).
# global IS			# declared in indexutl.icn

# Is is a record containing vital information on an indexed file, such
# as the field separator, the string-length of fields, etc.  I've re-
# moved the record declaration from this file, and placed it in index-
# utl.icn.
# record is(FS, s_len, len, no, is_case_sensitive, r_field, hufftree)

#
# Main procedure.
#
procedure main(a)

    local usage, opt_table, fname, rollover_field, index_fname,
	bitmap_fname, upto_field, ofs_filename, bitmap_offset_table,
	out_IS, limits_fname, char_tbl, backup_filename, unt_filename,
	huffman_table, index_table

    # global IS			# IS contains stats for file being indexed

    #
    # Initialize global OS-related parameters, such as the directory
    # separator (_slash) and the maximum permissible filename length
    # minus four (to make room for extensions makeind tacks on).
    #
    initialize_os_params()

    #
    # Read in and check command argument list.  Insert FS and no
    # parameters into (global) record IS.  Calculate s_len, len, and
    # bitmap_length parameters as well.  Returns table of options
    # (keys are option letters).
    #
    usage:= "usage: makeind -f filename -m int -n int [-l int] [-s]"
    opt_table := initialize_IS(a)
    fname := \opt_table["f"]				| stop(usage)
    rollover_field := opt_table["l"]			# (optional)

    #
    # Begin the process of tokenizing, recording token locations, and
    # of storing this information in two separate files.
    #
    # Read input file, making a table of words and their locations.
    # While we're at it, build a table of character frequencies for
    # text (not bitmaps) within the file (char_tbl), and then create
    # a huffman tree and table out of this character frequency list.
    #
    index_table := table(); char_tbl := table()
    create_index(fname, index_table, char_tbl)
    # Use the char_tbl to generate a Huffman tree & a code table.
    IS.hufftree := heap_2_huffman_tree(heap_init(char_tbl))
    huffman_table := hash_huffcodes(IS.hufftree)

    #
    # Write keys to one file, with pointers into another file
    # containing the bitmaps for each key.  Use the index_table
    # created just above (contains words & their locations).
    #
    index_fname  := dir_name(fname)||create_fname(fname, "IND")
    bitmap_fname := dir_name(fname)||create_fname(fname, "BMP")
    write_tokens_and_offsets(index_fname, bitmap_fname, index_table)

    #
    # Backup the original text file.  Prepare to re-use the original
    # filename to store compressed text.
    #
    backup_filename := create_fname(fname,"BAK")
    backup_filename ?:= (tab(many('x')), tab(0))
    backup_filename := dir_name(fname)||backup_filename
    if close(open(backup_filename)) then
	abort("makeind", "backup filename collision; aborting", 6)
    rename(fname, backup_filename) |
	abort("makeind", "cannot back up file to disk; aborting", 7)
    #
    # Open backup file (i.e. the original text file), then run through
    # it, writing the bitmaps directly to a .UNT file (using the
    # original, human-readable form).  Compress and write the text
    # associated with those bitmaps to a file having the same name as
    # the original text file (not the backup name), tacking onto the
    # bitmaps in the .UNT file their offset within the main text file.
    # Write the offsets of the major divisions in the .UNT file to the
    # .OFS file.
    #
    upto_field := 1 < (IS.no * 2) / 3 | 1
    ofs_filename := dir_name(fname)||create_fname(fname, "OFS")
    unt_filename := dir_name(fname)||create_fname(fname, "UNT")
    bitmap_offset_table := 
	store_bitmaps_and_offsets(fname, unt_filename, backup_filename,
				  upto_field, huffman_table)
    # Write .OFS file.  The .UNT and main text files have already been
    # written out by store_bitmaps_and_offsets().  The original main file
    # name now holds compressed text.
    write_bitmaps_and_offsets(ofs_filename, bitmap_offset_table, upto_field)

    #
    # Re-open UNT file.  Read it, find the pre-rollover bitmaps, and
    # store them in the .LIM file.  Obviously this procedure could be
    # stuffed into another one above (e.g. store_bitmaps_and_offsets()).
    #
    if \rollover_field then {
	#
	# Let's say we are using the Bible as our text, and we want to
	# create all the bitmaps for Genesis 1:9-2:10.  We need to know
	# what verse chapter 1 goes up to.  By supplying makeind
	# with a "-l 3" argument, you are telling it to store this in-
	# formation for later use by expandrf().
	#
	limits_fname := dir_name(fname)||create_fname(fname, "LIM")
	write_limits(limits_fname, unt_filename, rollover_field)
	IS.r_field := rollover_field
    }

    #
    # Write IS record to the .IS file.
    #
    out_IS := open(dir_name(fname)||create_fname(fname, "IS"), "w") |
	abort("makeind","can't open .IS file",2)
    writes(out_IS, encode(IS))
    close(out_IS)

    # All is well.  Exit with zero status.
    exit(0)

end


#
# initialize_IS
#
# Sets up main parameters for the current index file, such as the
# field separator to be used in tokenizing the file, the string and
# bit lengths of bitmap fields, the number of fields, and the size of
# the actual bitmaps (in bytes) as written to disk (comes out to the
# smallest multiple of eight greater than the field length times the
# field number.  The marker length has to be set in the main
# procedure, so initialize_IS leaves it null for now.
#
procedure initialize_IS(a)

    local usage, fname, opt_table
    # global IS

    usage:="usage: makeind -f filename -m int -n int [-l int] [-s]"

    IS := is()			# set up some IS fields
    opt_table := options(a, "f:m:n+sS:l+")
    3 <= *opt_table <= 6			| stop(usage)
    IS.no := \opt_table["n"]			| stop(usage)
    IS.FS := \opt_table["S"] | "['.]?[^-0-9A-Za-z']+'?"
    IS.is_case_sensitive := opt_table["s"]      # normally is &null

    #
    # Calculate string representation length for fields, as well as
    # the number of bits required for their integer representation.
    # I.e. if the opt_table["m"] value is 99, this will take two chars to
    # represent as a string ("99"), but 7 binary "digits" to represent
    # internally as a base-two integer.
    #
    IS.s_len := *string(opt_table["m"])
    IS.len := *exbase10(opt_table["m"], 2)

    return opt_table

end


#
# create_index
#
# (A better name would be fill_out_index_and_char_frequency_table.)
#
# Places tokens in fname (the full text file) in a table (supplied as
# arg 2), with the set of each token's locations recorded as values
# for those tokens.  IS.FS is not used.  IS.s_len is the location
# marker string-representation field length.  IS.len is the number of
# binary digits needed for an integer representation of a given field.
# IS.no is the number of fields.  While creating a table for tokens in
# fname, create_index ALSO fills out a table of character frequencies
# in the entries (i.e. it doesn't count frequencies of chars used in
# the bitmaps).
# 
procedure create_index(fname, wordtbl, char_tbl)

    local intext, line, bitmap, token, value

    intext := open(fname) |
	abort("create_index","can't open index file, "||fname, 9)
    # Dummy key to hold all bitmaps in the text.
    insert(wordtbl, "", set())

    # Seek past any garbage.  Take first :: initial line.
    match("::", line := !intext)
    repeat {
	line ? {

	    if ="::" then {
		bitmap := digits_2_bitmap(tab(0))
		# Insert every bitmap into the dummy entry for "".
		insert(wordtbl[""], bitmap)
		line := read(intext)
	    } else {

		value := line || "\n"
		# Concatenate every line in this entry into a single value.
		while line := read(intext) do {
		    line ? {
			match("::") & break
			value ||:= line || "\n"
		    }
		}

		value := trim(value, '\n')
		# Maintain character frequency table (arg 3 above).
		count_chars("" ~== value, char_tbl)
		# Build a table of tokens. NB gettokens() resides
		# in a separate file.  The table is arg 2.
		value ? {
		    every token := gettokens(IS.is_case_sensitive) do {
			/wordtbl[token] := set()
			insert(wordtbl[token], \bitmap)
		    }
		}

		# If :: doesn't appear first on line, we're at EOF.  If
		# line == &subject, then to prevent an infinite loop, we
		# break.
		line == &subject & break
		match("::", line) | break
	    }
	}
    }

    \line | abort("create_index", "empty input file, "||fname, 8)
    close(intext)
    return "tokenized " || *wordtbl || " words; filled out char_tbl"

end


#
# write_tokens_and_offsets
#
# Writes to one file (the .IND file) a list of all tokens collected
# from the input file, one to a line, followed by a tab, and then a
# byte offset into another file (the .BMP file) where the bitmaps for
# that token are kept.
#
#     token tab offset
#
# A seek to "offset" in the .BMP file will put you at the start of a
# block of bitmaps.
#
procedure write_tokens_and_offsets(index_fname, bitmap_fname, t)

    local outtokens, outbitmaps, index_lst, i, bitmap_length,
	how_many_bitmaps, bits_needed, inverse_signal, inverse_set

    outtokens := open(index_fname, "w") |
	abort("write_tokens_and_offsets","can't open "||index_fname,6)
    outbitmaps := open(bitmap_fname, "w") |
	abort("write_tokens_and_offsets","can't open "||bitmap_fname,5)
    # Calculate the length of bitmaps (must be the smallest multiple of
    # 8 >= (IS.len * IS.no)).
    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
    index_lst := sort(t, 3)
    bits_needed := 24		# bytes needed to hold no of bitmaps for keys
    inverse_signal := 8388608	# 24th bit, which signals inverse storage

    every i := 1 to *index_lst-1 by 2 do {

	# Write token to index file with the offset of that token's
	# bitmaps in the bitmap file.
	write(outtokens, index_lst[i], "\t", where(outbitmaps))

	# Now write the bitmaps for the above token to the bitmap file.
	# First write out the number of bitmaps in this block.  4 bytes
	# are allotted to hold this count (23 bits).  If the number of
	# bitmaps for the current token exceeds 3/5 of the total number
	# of bitmaps for the entire file, then add bits_needed-1 to the
	# number.  That is, set the highest bit to 1.
	how_many_bitmaps := *index_lst[i+1]
	if how_many_bitmaps > (inverse_signal-1) then {        # just in case
	    abort("write_tokens_and_offsets",
		  "too many bitmaps for"||index_lst[i], bits_needed)
	}
	# "" is a dummy key containing all the bitmaps in the text.
	# If the number of bitmaps for any key other than "" exceeds
	# 3/5 that of "", then store those bitmaps where the key does
	# NOT occur, and set the inverse_signal bit...
	if index_lst[i] ~== "" &
	    how_many_bitmaps >= integer(*t[""] * 0.60)
	then {
	    inverse_set := (index_lst[2] -- index_lst[i+1])
	    how_many_bitmaps :=	ior(*inverse_set, inverse_signal)
	    write_int(outbitmaps, how_many_bitmaps, bits_needed)
	    every write_int(outbitmaps, !inverse_set, bitmap_length)
	} else {
	    # ...otherwise (if we have a reasonable number of keys), store
	    # the locations "straight" and don't set the inverse_signal bit.
	    write_int(outbitmaps, how_many_bitmaps, bits_needed)
	    # Having written the bitmap count, now write the bitmaps proper
	    # to the bitmap file.
	    if index_lst[i] == ""
	    then
	      # Bitmaps for "" are guaranteed to be sorted.
	      every write_int(outbitmaps, !sort(index_lst[i+1]),bitmap_length)
	    else
	      every write_int(outbitmaps, !index_lst[i+1], bitmap_length)
	}
    }

    # Close files.  Return number of keys processed (any better ideas??)
    every close(outtokens | outbitmaps)
    return *index_lst / 2	# return number of keys in index file

end


#
# store_bitmaps_and_offsets
#
# Open the original text file (backup_file).  Read in bitmaps and
# write out compressed texts to fname.  Write out bitmaps and the
# offsets of the text they go with in fname to the .UNT file.  Store
# locations of bitmaps in the UNT file to the .OFS file.  Note that
# the full bitmap is not stored.  Rather only the first upto_field
# fields are stored.  Normally upto_field = IS.no - 1.  The scheme is
# thus as follows: bitmap -> .OFS file -> .UNT file -> main text file.
# The OFS file is read into memory at run-time, and stores only the
# locations of major blocks in the .UNT file.  The .UNT file has a
# full list of pointers into the main text file, and once a bitmap
# is located there, the exact location for it in the main file can
# be retrieved.
#
procedure store_bitmaps_and_offsets(fname, unt_fname, backup_fname,
	  upto_field, huffman_table)

    local last_major_division, major_division, bitmap_offset_table,
	out_compress, in_main_text, out_locations, location, firstline,
	line, value

    out_compress := open(fname, "w")
    out_locations := open(unt_fname, "w")
    in_main_text := open(backup_fname) |
	abort("store_bitmaps_and_offsets","can't open "||backup_fname,5)
    bitmap_offset_table := table()

    last_major_division := -1	# initialize to an impossible value
    until match("::", firstline := read(in_main_text))
    every line := (firstline | !in_main_text) do {
	line ? {
	    if not ="::" then
		value ||:= line || "\n"
	    else {
		location := tab(0)
		major_division :=
		    ishift(digits_2_bitmap(location),
			   -((IS.no - upto_field) * IS.len))
		if last_major_division ~= major_division
		then {
		    # Write to the bitmap offset table the major division,
		    # and then our position in the UNT file (which records
		    # the locations for all bitmaps).
		    insert(bitmap_offset_table, major_division,
			   where(out_locations))
		    last_major_division := major_division
		}
		# Write compressed text to original filename; then write
		# the current location marker to the UNT file, followed by
		# our position in out_compress (i.e. the place where the
		# next block of compressed text will begin).
		\value & writes(out_compress,
		    block_encode(trim(value,"\n"), huffman_table)) |
		        # We should abort with an I/O error if there is no
		        # on the device.  I ran into a problem, though, in
		        # which writes were repeatedly attempted to a full
			# filesystem.
		        abort("store_bitmaps...", "can't write to "|| fname, 6)
		write(out_locations, location, "\t", where(out_compress))
		value := ""
	    }
	}
    }

    \value & writes(out_compress,
	block_encode(trim(value,"\n"), huffman_table)) |
	    abort("store_bitmaps...", "can't write to "|| fname, 6)
    writes(out_compress, block_encode(trim(\value,"\n"), huffman_table))
    every close(in_main_text | out_compress | out_locations)
    return bitmap_offset_table

end


#
# write_bitmaps_and_offsets
#
# Does the actual writing of bitmaps and offsets to a file.  Receives
# a table of bitmaps cut down to upto_field fields.  Shinking the
# bitmaps lessens the size of the resulting file, but requires a bit
# more I/O when it comes time to look something up.
#
procedure write_bitmaps_and_offsets(ofs_filename, t, upto_field)

    local outtext, tmp_list, i, offset_length,
	block_size, stored_bitmap_length

    outtext := open(ofs_filename, "w") |
	abort("write_bitmaps_and_offsets","can't open "||ofs_filename,5)
    stored_bitmap_length := ((IS.len * upto_field) <= seq(0,8))
    tmp_list := sort(t, 3)

    every i := 1 to *tmp_list-1 by 2 do {

	# Number of bits needed to hold offset.
	offset_length := (*exbase10(tmp_list[i+1], 2) <= seq(0,8))
	# Number of bytes needed to hold bitmap and offset (both).
	block_size := (stored_bitmap_length + offset_length) / 8

	# We could just code the length of the offset, since the bitmap's
	# length is fixed (and known).  Seems better to code the block's
	# total length just in case something gets screwed up.  An 8-bit
	# limit means the bitmap+offset length cannot exceed 2^9-1 (255)
	# characters.
	if block_size > 255 then
	    abort("write_bitmaps_and_offsets","bitmap+offset too big",15)
	write_int(outtext, block_size, 8)
	write_int(outtext, tmp_list[i], stored_bitmap_length)
	write_int(outtext, tmp_list[i+1], offset_length)

    }

    return

end


#
# write_limits
#
# Writes out the bitmaps that will be needed in order for expandrf()
# to be able to know when the rollover field rolls over.
#
procedure  write_limits(out_fname, in_fname, r_field)

    local in, out, shift_bits_out, bitmap_length, bitmaps_read,
	 line, bitmap, short_bitmap, old_bitmap

    in := open(in_fname) |
	abort("write_limits","can't open "||in_fname,5)
    out := open(out_fname, "w") |
	abort("write_limits","can't open "||out_fname,5)
    r_field <= IS.no |
	abort("write_limits","-l value should not exceed that of -n",50)
    shift_bits_out := -(((IS.no-r_field)+ 1) * IS.len)
    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
    bitmaps_read := 0

    while line := read(in) do {
	line ? {
	    bitmaps_read +:= 1
	    bitmap := digits_2_bitmap(tab(find("\t"))) # in indexutl.icn
	    short_bitmap := ishift(bitmap, shift_bits_out)
	    if ishift(\old_bitmap, shift_bits_out) ~== short_bitmap
	    then write_int(out, old_bitmap, bitmap_length)
	    old_bitmap := bitmap
	}
    }

    write_int(out, \old_bitmap, bitmap_length)
    every close(in | out)
    return bitmaps_read
		    
end