package Search::OpenFTS::Index; use strict; =head1 NAME Search::OpenFTS::Index - Provides functions for indexing =head1 SYNOPSIS =head2 API my $fts=Search::OpenFTS::Index->new( DBI ); my $fts=Search::OpenFTS::Index->new( DBI, prefix ); my $fts=Search::OpenFTS::Index->init( dbi=>DBI, txttid=>NAME_TXT_ID, dict=>[DICT1, DICT2, ...], parser=>PARSER, map=>'{IDTYPELEXEM1=>[IDDICT1, ...], ...}', tsvector_field=>FIELD_NAME, ignore_id_index=>"IDTYPELEXEM1 [IDTYPELEXEM2 [...]]", ignore_headline=>"IDTYPELEXEM1 [IDTYPELEXEM2 [...]]", prefix=>PREFIX ); This is the initialization function. It is called only once, at the creation of a new search index, to create the configuration and indexing tables. =over 2 =item txttid The table where the documents are stored together with its primary key (e.g. messages.msg_id) =item dict List of available dictionaries. Dictionaries should support three methods: lemms, is_stoplexem, drop and init. init is used for the initialization of the dictionary. lemms returns an array of lexems for a given word and is_stoplexem answers whether the given lexeme corresponds to a stop word or not. drop is used for clearing dictionaries tables (if any) while dropping OpenFTS instance. Methods is_stoplexem, drop and init are optional. =item parser The full name of the parser in use. Parser should have the same interface as Search::OpenFTS::Parser module. =item map A mapping from types of lexemes to dictionaries. This is helpful for optimizing the search engine and it is also helpful for indexing multi-languages or exotic-text documents. =item tsvector_field The field name that holds the text index of integers for each document. This field must have tsvector type( from contrib/tsearch ) =item ignore_id_index Type IDs of lexemes to ignore while indexing documents. =item ignore_id_headline Type IDs of lexemes to ignore while constructing headlines of the search results. =item prefix If more than one content tables require indexing and searching functionality the user can pass a special parameter named prefix which is a character value from a-z. The given prefix is used, as a naming convention, to create different instances of the configuration and indexing table. To specify dictionary which requires parameters (snowball stemmer, for example), use following syntax: dict=>[ # example how to use snowball stemmer { mod=>'Search::OpenFTS::Dict::Snowball', param=>'{lang=>"english"}' }, 'Search::OpenFTS::Dict::UnknownDict', ] =back =cut use Search::OpenFTS::Base; use vars qw(@ISA); @ISA = ('Search::OpenFTS::Base'); sub new { my ( $class, $dbi, $prefix ) = @_; $class = ref($class) || $class; my $self = Search::OpenFTS::Base->new( $dbi, index => 1, prefix => $prefix ); return undef unless defined $self; $self->{INIT} = 0; bless( $self, $class ); return $self; } sub init { my ( $class, %opt ) = @_; $class = ref($class) || $class; local $_; my $dbi = $opt{dbi}; die "There is no DBI object" if !ref $dbi; my $nameid = $opt{txttid}; my $dict = []; my $rv = $opt{dict}; my $parser = $opt{parser}; $parser ||= 'Search::OpenFTS::Parser'; my $prefix = $opt{prefix}; $prefix ||= ''; if ( length $prefix ) { $prefix =~ s/^([a-z]).*$/$1/ || die "Wrong prefix"; $prefix .= '_'; } if ( !length $opt{tsvector_field} ) { die "There is not tsvector_field option"; } my $success = 1; #Creation of tables $success &&= $dbi->do( "create table ${prefix}fts_conf ( name varchar not null, did int not null default -1, mod varchar not null, param varchar, primary key (name,did) );" ); if ( defined $opt{map} && length $opt{map} ) { #Check syntax of map and save it my $map = eval $opt{map}; die "Can't eval '$opt{map}': $@" if $@; $opt{map} =~ s/'/\\'/g; $success &&= $dbi->do( "insert into ${prefix}fts_conf ( name, mod, param ) values ( 'map', '', '$opt{map}' );" ); } { #Check the syntax of mapping and insert it into the configuration table $parser = { mod => $parser, param => '' } if !ref $parser; my $prs_param = {}; if ( exists $parser->{param} && length $parser->{param} ) { $prs_param = eval $parser->{param}; die "Can't eval '$parser->{param}': $@" if $@; } eval( 'use ' . $parser->{mod} . ';' ); die "Can't use parser: $@" if ($@); my $prs; eval { $prs = $parser->{mod}->new(%$prs_param); }; die "Can't call new method in parser: $@" if ($@); $success &&= $dbi->do( "insert into ${prefix}fts_conf ( name, mod, param ) values ( 'parser', '$parser->{mod}', '$parser->{param}' );" ); undef $prs; } #Other configuration parameters used by Search::OpenFTS::Base $success &&= $dbi->do( "insert into ${prefix}fts_conf ( name, mod ) values ( 'ignore_id_index', '" . join( ' ', @{ $opt{ignore_id_index} } ) . "' );" ) if ref $opt{ignore_id_index}; $success &&= $dbi->do( "insert into ${prefix}fts_conf ( name, mod ) values ( 'ignore_headline', '" . join( ' ', @{ $opt{ignore_headline} } ) . "' );" ) if ref $opt{ignore_headline}; $success &&= $dbi->do( "insert into ${prefix}fts_conf ( name, mod ) values ( 'tsvector_field', '$opt{tsvector_field}' );" ); $success &&= $dbi->do( "insert into ${prefix}fts_conf ( name, mod ) values ( 'txttid', '$nameid' );" ); if ( !( ref $rv && $#{$rv} >= 0 ) ) { $rv = ['Search::OpenFTS::Dict::UnknownDict']; } @$rv = map { ( ref $_ ) ? $_ : { mod => $_ } } @$rv; foreach my $i ( 0 .. $#{$rv} ) { my $param = $rv->[$i]{param}; $param ||= ''; if ( length $param ) { $rv->[$i]{param} = eval $param; die "Can't eval '$param': $@" if $@; } $rv->[$i]{param}{DBI} = $dbi; eval( 'use ' . $rv->[$i]{mod} . ';' ); die "Can't use $rv->[$i]{mod} module: $@" if ($@); eval { $dict->[$i] = $rv->[$i]{mod}->init( %{ $rv->[$i]{param} } ); }; if ($@) { eval { $dict->[$i] = $rv->[$i]{mod}->new( %{ $rv->[$i]{param} } ); }; die "Can' call new/init method in $rv->[$i]{mod}: $@" if ($@); } if ( !$dict->[$i]->can('lemms') ) { die "Dictionary $rv->[$i]{mod} has no method lemms"; } $param =~ s/'/\\'/g; $success &&= $dbi->do( "insert into ${prefix}fts_conf ( name, did, mod, param ) values ( 'dict', $i, '$rv->[$i]{mod}', '$param' );" ); } undef $dict; #destroy dictionaries die "Can't insert row(s) in ${prefix}fts_conf" if !$success; my $self = Search::OpenFTS::Index->new( $dbi, $prefix ); die "Search::OpenFTS::Index->new() returns undef" unless defined $self; $self->{INIT} = 1; return $self; } sub DESTROY { my ($self) = @_; if ( $self->{INIT} ) { warn "Creating index...\n"; if ( !$self->create_index ) { warn "Can't create index !!!\n"; } } } =head2 Methods =over 4 =item index( $txt_id, [ $FH | $text | $reftext ] ); =item index( $txt_id, [ $FH | $text | $reftext ], $title ); Used for indexing text. =cut sub index { my ( $self, $tid, $txt, $title ) = @_; local $_; # %struct = ( # lexem=>[ posisions ] # ); # my ( $num, %struct ); $num = 1; if ( defined $title && length $title ) { $num = $self->_addtxt( \%struct, \$title, $num, undef, 'A' ); } $num += $self->_addtxt( \%struct, ( ref $txt ) ? $txt : \$txt, $num ); return $self->_flush( \%struct, $tid, $num ); } sub _flush { my ( $self, $rs, $tid, $num ) = @_; my $success = 1; local $_; my $orig; my $list = join( ' ', map { $orig = $_; $_ =~ s/([\r\t\n '\\:])/\\\\$1/g; "$_:" . join( ',', @{ $rs->{$orig} } ) } keys %$rs ); if ( defined $list && length $list ) { $success &&= $self->{DBI}->do( "update $self->{TABLE} set $self->{TSVECTOR_FIELD} = '$list' where $self->{IDNAME} = $tid;" ); } else { $success &&= $self->{DBI}->do( "update $self->{TABLE} set $self->{TSVECTOR_FIELD} = null where $self->{IDNAME} = $tid;" ); } return $success; } sub _addtxt { my ( $self, $rs, $rtxt, $num, $length, $wclass ) = @_; $num ||= 1; $length ||= 0; $wclass ||= ''; $self->{PRS}->start_parser( $rtxt, $length ); my ( $type, $word, $lexems ); while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type ) { next if exists $self->{IGNOREID}{$type}; $lexems = $self->_get_lexems( $type, $word ); if ( defined $lexems && $#$lexems >= 0 ) { foreach my $l (@$lexems) { $rs->{$l} ||= []; push( @{ $rs->{$l} }, $num . $wclass ); } } $num++; } $self->{PRS}->end_parser; return $num; } =pod =item delete ( $txt_id ) Deletes all records of the given identifier. =cut sub delete { my ( $self, $tid ) = @_; my $success = 1; $success &&= $self->{DBI}->do( "update $self->{TABLE} set $self->{TSVECTOR_FIELD} = null where $self->{IDNAME} = $tid;" ); return $success; } =pod =item create_index =item create_index(1); Creates indices for fast searching, non-zero option - verbose mode =cut sub create_index { my ( $self, $verbose ) = @_; my $success = 1; print "Creating index $self->{PREFIX}gist_key on $self->{TABLE}\n" if $verbose; $success &&= $self->{DBI}->do( "CREATE INDEX $self->{PREFIX}gist_key on $self->{TABLE} using gist ( $self->{TSVECTOR_FIELD} );" ); $self->{INIT} = 0 if $success; return $success; } =pod =item drop_index() Removes all indices on tables correspoding current instance of OpenFTS. Any error are ignored, only warn. This method is opposite for create_index. This is usefull for bulk uploading. =cut sub drop_index { my $self = shift; $self->{DBI}->do("drop index $self->{PREFIX}gist_key;") || warn "Can't drop index $self->{PREFIX}gist_key"; } =pod =item drop() Removes all tables correspoding current instance of OpenFTS. Any error are ignored, only warn. =cut sub drop { my $self = shift; foreach my $dict ( @{ $self->{DICT} } ) { $dict->drop if $dict->can('drop'); } $self->{DBI}->do("drop table $self->{PREFIX}fts_conf;") || warn "Can't drop table $self->{PREFIX}fts_conf"; } =pod =item start_index( $tid ) Opening a session for indexing Use: my $idx = Search::OpenFTS::Index->new( ... ); my $idx_chunk = $idx->start_index( ID ); foreach my $f ( glob <*.html> ) { $idx_chunk->index_chunk( IO::File->new( $f ) ); } $idx_chunk->flush; =cut sub start_index { my ( $self, $tid ) = @_; bless { index => $self, tid => $tid, struct => {}, nummax => 0 }, 'Search::OpenFTS::Index::Indexer'; } =item B($user) Grant r/o access on indexes and search table to user $user or to PUBLIC if $user doesn't specified. Return TRUE on success or error message if fails. Please, check return value explicitly for '1' ! Calls fix_permissions for each dictionary if it can. =cut sub fix_permissions { my ( $self, $user ) = @_; my $dbh = $self->{DBI}; return undef if !$dbh; my $autocommit = $dbh->{AutoCommit}; $dbh->{AutoCommit} = 0; $user ||= 'PUBLIC'; my $success = 1; $success &&= $dbh->do("grant select on $self->{PREFIX}fts_conf to $user;"); foreach my $dict ( @{ $self->{DICT} } ) { $success &&= $dict->fix_permissions($user) if $dict->can('fix_permissions'); } $dbh->commit if $success; $dbh->{AutoCommit} = $autocommit; return ($success) ? 1 : $dbh->errstr; } { package Search::OpenFTS::Index::Indexer; =pod =item index_chunk( [FH|REFTXT|TXT], direction=>[1|-1] ) =item index_chunk( [FH|REFTXT|TXT], wclass=>[A|B|C|D] ) =item index_chunk( FH, direction=>[1|-1], offset=>$offset, length=>$length ); =item index_chunk( FH, wclass=>[A|B|C|D], offset=>$offset, length=>$length ); Adds a part to an index. Option 'direction' is to store compatibility with old version of OpenFTS. wclass option has defaults 'D'. =cut sub index_chunk { my ( $self, $txt, %opt ) = @_; my $direct = $opt{direction}; local $_; $direct ||= 1; my $wclass = $opt{wclass}; if ( !defined $wclass ) { $wclass = ( $direct < 0 ) ? 'A' : ''; } else { $wclass =~ /^\s*(\S).*$/; } my $length = undef; if ( ref $txt ne 'SCALAR' ) { seek( $txt, $opt{offset}, 0 ) if defined $opt{offset}; } $length = $opt{length} if defined $opt{length}; $self->{nummax} = $self->{index}->_addtxt( $self->{struct}, ( ref $txt ) ? $txt : \$txt, $self->{nummax}, $length, $wclass ); return 1; } =pod =item flush Dump in base of an index =back =cut sub flush { my ($self) = @_; $self->{index}->_flush( $self->{struct}, $self->{tid}, $self->{nummax} - $self->{nummin} + 1 ); } } #package Indexer =head1 DESCRIPTION =head1 SEE ALSO The OpenFTS Primer ( see doc/ subdirectory ) The Crash-course to OpenFTS ( in examples/ subdirectory ) perldoc Search::OpenFTS::Search perldoc Search::OpenFTS::Parser perldoc Search::OpenFTS::Dict::PorterEng perldoc Search::OpenFTS::Dict::Snowball perldoc Search::OpenFTS::Dict::UnknownDict perldoc Search::OpenFTS::Morph::ISpell =cut 1; __END__