package Search::OpenFTS; use strict; =head1 NAME Search::OpenFTS::Search - Provides functions for searching =head1 SYNOPSIS =head2 API my $fts=Search::OpenFTS->S; my $fts=Search::OpenFTS->S( DBI, relfunc=>name_of_rel_function_in_psql, txttid=>NAME_OF_TXTID, prefix=>PREFIX ); Example of relfunc: relfunc=>q[ rank( '{0.1, 0.2, 0.4, 1.0}', $TSVECTOR, $QUERY, 0 )] first option is weights (see tsearch documentation), Last argument defines how to normalize weight of document: 0 - no normalization (default) 1 - normalized by log(length of document) 2 - normalized by length of document =cut use Search::OpenFTS::Base; use vars qw(@ISA $VERSION); @ISA = ('Search::OpenFTS::Base'); $VERSION = '0.39'; sub new { my ( $class, $dbi, %opt ) = @_; $class = ref($class) || $class; my $self = Search::OpenFTS::Base->new( $dbi, index => 0, prefix => $opt{prefix} ); return undef unless defined $self; $self->{FUNC} = ( $opt{relfunc} || q[rank( '{0.1, 0.2, 0.4, 1.0}', $TSVECTOR, $QUERY )] ); $self->{TXTID} = $opt{txttid} if $opt{txttid}; $self->{VERSION} = $VERSION; bless( $self, $class ); return $self; } =head2 Methods =over 4 =item get_sql( \@ARRAY_WORD ); =item get_sql( $STRING ); =item get_sql( \$STRING ); =item get_sql( *, %opt ); %opt - as in the constructor (see above), plus a key dict_opt = > {}, transmitted to dictionaries Returns parts of SQL: ($out, $condition, $order) Here is how they can be combined in an SQL statement: SELECT $opt{txttid}$out FROM table WHERE $condition $order; =cut sub _prsQuery { my ( $self, $sstr, $opt ) = @_; my $data = []; #parse input string if ( ( !ref $sstr ) || ref $sstr eq 'SCALAR' ) { my ( $type, $word ); $self->{PRS}->start_parser( ( ref $sstr ) ? $sstr : \$sstr ); while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type ) { if ( exists $self->{IGNOREID}{$type} ) { push @{ $opt->{ignored} }, { word => $word, type => $type } if ( $opt->{ignored} && ref $opt->{ignored} eq 'ARRAY' ); } else { push @$data, { word => $word, type => $type }; } } $self->{PRS}->end_parser; } elsif ( ref $sstr eq 'ARRAY' ) { $data = [ map { { word => $_, type => 0 } } @$sstr ]; } else { return; } return undef if ( $#$data < 0 ); my @dlex; foreach my $w (@$data) { my $lexems = $self->_get_lexems( $w->{type}, $w->{word}, $opt->{dict_opt} ); if ( !defined $lexems ) { #unkown word - didn't recognize by any dicts return; } elsif ( ref $lexems && $#$lexems >= 0 ) { #word is recognized and it isn't stopword push @dlex, $lexems; } elsif ( $opt->{rejected} && ref $opt->{rejected} eq 'ARRAY' ) { #word is recognized and it is stopword push @{ $opt->{rejected} }, $w->{word}; } } return \@dlex; } sub get_sql { my ( $self, $sstr, %opt ) = @_; local $_; $opt{txttid} ||= $self->{TXTID}; $opt{relfunc} ||= $self->{FUNC}; $opt{tsvector_field} ||= $self->{TSVECTOR_FIELD}; my $dlex = $self->_prsQuery( $sstr, \%opt ); return if ( !( ref $dlex eq 'ARRAY' && $#$dlex >= 0 ) ); my ( $out, $condition, $order ); my $query = ''; my $plainlexem = ''; foreach my $lexem (@$dlex) { $query .= ' & ' if length $query; $plainlexem .= ', ' if length $plainlexem; if ( $#$lexem > 0 ) { #Some versions of lexemes $query .= '( \\\'' . join( '\\\' | \\\'', map { $_ =~ s/'/\\\\'/g; $_ } @$lexem ) . '\\\' )'; $plainlexem .= '"' . join( '", "', map { $_ =~ s/"/\\\\"/g; $_ =~ s/'/\\'/g; $_ } @$lexem ) . '"'; } else { #one version of lexem $_ = $lexem->[0]; $_ =~ s/'/\\\\'/g; $query .= "\\'${_}\\'"; $_ = $lexem->[0]; $_ =~ s/"/\\\\"/g; $_ =~ s/'/\\'/g; $plainlexem .= "\"${_}\""; } } my ( $tablename, $tidname ) = split( /\./, $opt{txttid}, 2 ); $condition = "$tablename.$opt{tsvector_field} @@ '$query'"; $order = "pos desc"; $out = $opt{relfunc}; $out =~ s/\$TSVECTOR/$tablename.$opt{tsvector_field}/g; $out =~ s/\$QUERY/'$query'/g; $out .= ' as pos'; return ( $out, $condition, $order ); } sub _sql { my ( $self, $data, %opt ) = @_; $opt{txttid} ||= $self->{TXTID}; my ( $out, $condition, $order ) = $self->get_sql( $data, %opt ); return undef if !$condition; $order = ( defined $order && length $order ) ? "order by $order" : ''; $out = ( defined $out && length $out ) ? ",\n" . $out : ''; my ($txtbl) = split( /\./, $opt{txttid}, 2 ); my $sql = " select $opt{txttid}$out from $txtbl where $condition $order"; return $sql; } =pod =item search( SEARCH_AS_IN_GET_SQL ) =item search( SEARCH_AS_IN_GET_SQL, %opt ) Returns the reference to the list of identifiers sorted by relevance =cut sub search { my ( $self, $data, %opt ) = @_; local $_; $opt{txttid} ||= $self->{TXTID}; my $sql = $self->_sql( $data, %opt ); return () if !defined $sql; my $sth = $self->{DBI}->prepare($sql) || return (); $sth->execute || return (); my @wow = map { $_->[0] } @{ $sth->fetchall_arrayref }; $sth->finish; return \@wow; } =pod =item get_headline( query=>$query, src=>[$FH|$txt|$reftxt], maxlen=>$maxlen, maxread=>$maxread, otag=>$opentag, ctag=>$closetag, replace_ignore_headline=>$str_to_replace, dict_opt=>{}) Returns fragment of the document with search terms hilighted. S bytes reads from the document to generates headline with length S. S,S denote strings used for hilighting, for example, , . S - string used to replace html markups, space by default. =cut sub get_headline { my ( $self, %opt ) = @_; $opt{maxread} ||= 0; $opt{maxlen} ||= 0; $opt{otag} ||= ''; $opt{ctag} ||= ''; local $_; $self->{PRS}->start_parser( \$opt{query} ); my ( $type, $word ); my %qdata; while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type ) { next if exists $self->{IGNOREID}{$type}; my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} ); if ( ref $lexems && $#$lexems >= 0 ) { map { $qdata{$_}++ } @$lexems; } } $self->{PRS}->end_parser; my $replace_ignore_headline = ' '; $replace_ignore_headline = $opt{replace_ignore_headline} if exists $opt{replace_ignore_headline}; my $headline; my $maxpower = -1; my $maxlen = -1; my @buf; my ( $length, $power ) = ( 0, 0 ); my $totallen = 0; $self->{PRS}->start_parser( ( ref $opt{src} ) ? $opt{src} : \$opt{src} ); while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type && ( $opt{maxread} <= 0 || ( $opt{maxread} > 0 && $totallen < $opt{maxread} ) ) ) { if ( exists $self->{IGNOREIDHEADLINE}{$type} ) { push @buf, { w => $replace_ignore_headline, l => ( $replace_ignore_headline =~ /\S/ ) ? length $replace_ignore_headline : 0, o => 0, n => 1, } if ( defined $replace_ignore_headline ); next; } if ( !exists $self->{IGNOREID}{$type} ) { my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} ); if ( ref $lexems && $#$lexems >= 0 && scalar grep { exists $qdata{$_} } @$lexems ) { my $str = $opt{otag} . $word . $opt{ctag}; push @buf, { w => $str, l => length $word, o => 1, n=>0 }; } else { push @buf, { w => $word, l => length $word, o => 0, n=>0 }; } } else { push @buf, { w => $word, l => ( $word =~ /\S/ ) ? length $word : 0, o => 0, n=>1 }; } $totallen += $buf[-1]->{l}; if ( $opt{maxlen} > 0 ) { if ( $buf[-1]->{l} ) { $length += $buf[-1]->{l}; $power += $buf[-1]->{o}; if ( $length > $opt{maxlen} && !scalar keys %qdata ) { last; } while ( $length > $opt{maxlen} || $buf[0]->{n} ) { my $f = shift(@buf); $length -= $f->{l}; $power -= $f->{o}; } if ( $power > $maxpower || ( $power==$maxpower && ($maxlen < 0 || $length>$maxlen) ) ) { $headline = join( '', map { $_->{w} } @buf ); $maxpower = $power; $maxlen = $length; } } } else { my $w; while( defined( $w=shift(@buf) ) ) { $headline .= $w->{w}; } } } $self->{PRS}->end_parser; if ( !defined $headline ) { $headline = join( '', map { $_->{w} } @buf ); } return $headline; } =pod =item get_headline2( query=>$query, src=>[$FH|$txt|$reftxt], min_words=>$min_words, max_words=>$max_words, otag=>$opentag, ctag=>$closetag,) Another method for getting headline. This method should be a little bit slower but more accurate. The following parameters are recognized: S - minimal number of words in headline; S - maximal number of words in headline; S - maximal length of word that will be rejected at the end of headline; S - the expected length of sentence to cut headline at the end of sentence; S - a list of token ids (separated by spaces) which are not counted as words; S - a list of token ids that are not considered as words but that will appear in the output; S - a list of token ids that will be split on smaller tokens; S - token ids which are not desirable at the end of headline; S - token ids which represents end-of-sentence punctuation signs. S - if set to a scalar ref stores true value if S has been found in processed text. =cut sub get_headline2 { my ( $self, %opt ) = @_; $opt{maxread} ||= 0; $opt{otag} ||= ''; $opt{ctag} ||= ''; $opt{min_words} ||= 15; $opt{max_words} ||= 35; $opt{shortword} ||= 3; $opt{sentence_length} ||= 0; $opt{nonword_tokens} ||= $self->{IGNOREID}; $opt{leave_tokens} ||= '12'; $opt{complex_tokens} ||= '5 15 16 17'; $opt{noend_tokens} ||= '7 8 20 21 22'; $opt{endpunct_tokens} ||= '12'; my %ignore_tokens; local $_; for my $i ( grep { /_tokens$/ } keys %opt ) { $opt{$i} = { map { $_ => 1 } split( /\s+/, $opt{$i} ) } unless ref( $opt{$i} ) eq 'HASH'; } %ignore_tokens = map { $_ => 1 } grep { !$opt{leave_tokens}{$_} } keys %{ $opt{nonword_tokens} }; $self->{PRS}->start_parser( \$opt{query} ); my ( $type, $word ); my %qdata; while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type ) { next if exists $self->{IGNOREID}{$type}; my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} ); if ( ref $lexems && $#$lexems >= 0 ) { map { $qdata{$_}++ } @$lexems; } } $self->{PRS}->end_parser; my $replace_ignore_headline = ' '; $replace_ignore_headline = $opt{replace_ignore_headline} if exists $opt{replace_ignore_headline}; my $headline; my $maxpower = -1; my ( @buf, @tail ); my ( $power, $tailpower ) = ( 0, 0 ); my $totallen = 0; my ( $wait_word, $allow_postword ); my $prewords = ''; my $accum_str = ''; $self->{PRS}->start_parser( ( ref $opt{src} ) ? $opt{src} : \$opt{src} ); while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type && ( $opt{maxread} <= 0 || ( $opt{maxread} > 0 && $totallen < $opt{maxread} ) ) ) { $totallen += length $word; push @buf, @tail if @tail; $power += $tailpower; @tail = (); $tailpower = 0; if ( $ignore_tokens{$type} ) { $prewords .= $replace_ignore_headline if defined $replace_ignore_headline; } if ( defined $wait_word ) { $accum_str .= $word; undef $wait_word if $accum_str eq $wait_word; next; } if ( $opt{endpunct_tokens}{$type} && $allow_postword && ( $word =~ /[")?!.]/ ) ) { $buf[-1]->{postwords} .= $word if (@buf); next; } elsif ( !$opt{nonword_tokens}{$type} ) { my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} ); if ( ref $lexems && $#$lexems >= 0 && scalar grep { exists $qdata{$_} } @$lexems ) { my $l = length $word; push @buf, { w => $opt{otag} . $word . $opt{ctag}, l => $l, o => 1, t => $type }; } elsif ( $opt{complex_tokens}{$type} ) { next; } else { push @buf, { w => $word, l => length($word), o => 0, t => $type }; } $buf[-1]{prewords} = $prewords; $prewords = $buf[-1]{postwords} = ''; $allow_postword = 1; $power += $buf[-1]->{o}; } elsif ( $opt{leave_tokens}{$type} ) { $prewords .= $word; $allow_postword = 0; next; } if ( $opt{complex_tokens}{$type} ) { $wait_word = $word; $accum_str = ''; } while ( @buf > $opt{min_words} ) { next if $buf[0]{prewords} =~ /^\s*[:;,!?.)"]/; next if $buf[0]{postwords} =~ /^\s*[").!?]*[.!?]+\s*$/ && !$buf[0]->{o}; last; } continue { my $f = shift @buf; $power -= $f->{o}; } while ( @buf > $opt{min_words} ) { last if $buf[-1]{postwords} =~ /^\s*[").!?]*[.!?]+\s*$/; next if $buf[-1]->{l} <= $opt{shortword}; next if $opt{noend_tokens}{ $buf[-1]{t} }; last if @buf < $opt{max_words} - $opt{sentence_length}; } continue { my $f = pop @buf; $power -= $f->{o}; $tailpower += $f->{o}; unshift @tail, $f; } if ( $power >= $maxpower && @buf > $opt{min_words} ) { my ( $so, $eo ); for ( $so = 0 ; $so < @buf ; $so++ ) { last if $buf[$so]->{o}; } for ( $eo = 0 ; $eo < @buf ; $eo++ ) { last if $buf[ $#buf - $eo ]->{o}; } if ( $so - $eo > 0 ) { $headline = join '', map { $_->{prewords} . $_->{w} . $_->{postwords} } @buf; $maxpower = $power; } } last if @buf + @tail > $opt{max_words} && !keys %qdata; while ( @buf + @tail > $opt{max_words} ) { my $f = shift @buf; $power -= $f->{o}; } } $self->{PRS}->end_parser; if ( !defined $headline ) { $headline = join '', map { $_->{prewords} . $_->{w} . $_->{postwords} } @buf; } if ( $opt{met_query} && ref( $opt{met_query} ) eq 'SCALAR' ) { ${ $opt{met_query} } = $maxpower > 0 || $power > 0 ? 1 : 0; } return $headline; } =back =head1 DESCRIPTION =head1 SEE ALSO The OpenFTS Primer ( see doc/ subdirectory ) The Crash-course to OpenFTS ( in examples/ subdirectory ) perldoc Search::OpenFTS::Index perldoc Search::OpenFTS::Parser perldoc Search::OpenFTS::Dict::PorterEng perldoc Search::OpenFTS::Dict::Snowball perldoc Search::OpenFTS::Dict::UnknownDict perldoc Search::OpenFTS::Morph::ISpell =cut 1;