package Search::OpenFTS;
use strict;

=head1 NAME

Search::OpenFTS::Search - Provides functions for searching

=head1 SYNOPSIS

=head2 API

my $fts=Search::OpenFTS->S<new( DBI )>;

my $fts=Search::OpenFTS->S<new>(
   DBI,
   relfunc=>name_of_rel_function_in_psql,
   txttid=>NAME_OF_TXTID,
   prefix=>PREFIX

);

Example of relfunc:

relfunc=>q[ rank( '{0.1, 0.2, 0.4, 1.0}', $TSVECTOR, $QUERY, 0 )]

first option is weights (see tsearch documentation),
Last argument defines how to normalize weight of document:
 0 - no normalization (default)
 1 - normalized by log(length of document)
 2 - normalized by length of document

=cut

use Search::OpenFTS::Base;
use vars qw(@ISA $VERSION);
@ISA = ('Search::OpenFTS::Base');

$VERSION = '0.39';

sub new {
    my ( $class, $dbi, %opt ) = @_;
    $class = ref($class) || $class;
    my $self =
      Search::OpenFTS::Base->new( $dbi, index => 0, prefix => $opt{prefix} );
    return undef unless defined $self;

    $self->{FUNC} =
      ( $opt{relfunc} || q[rank( '{0.1, 0.2, 0.4, 1.0}', $TSVECTOR, $QUERY )] );
    $self->{TXTID}   = $opt{txttid} if $opt{txttid};
    $self->{VERSION} = $VERSION;

    bless( $self, $class );
    return $self;
}

=head2 Methods

=over 4

=item get_sql( \@ARRAY_WORD );

=item get_sql( $STRING );

=item get_sql( \$STRING );

=item get_sql( *, %opt );

%opt - as in the constructor (see above), plus a key dict_opt = > {}, transmitted to dictionaries

Returns parts of SQL:

($out, $condition, $order)

Here is how they can be combined in an SQL statement:

 SELECT
    $opt{txttid}$out
 FROM
    table
 WHERE
    $condition
 $order;


=cut

sub _prsQuery {
    my ( $self, $sstr, $opt ) = @_;

    my $data = [];

    #parse input string
    if ( ( !ref $sstr ) || ref $sstr eq 'SCALAR' ) {
        my ( $type, $word );
        $self->{PRS}->start_parser( ( ref $sstr ) ? $sstr : \$sstr );
        while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type ) {
            if ( exists $self->{IGNOREID}{$type} ) {
                push @{ $opt->{ignored} }, { word => $word, type => $type }
                  if ( $opt->{ignored} && ref $opt->{ignored} eq 'ARRAY' );
            } else {
                push @$data, { word => $word, type => $type };
            }
        }
        $self->{PRS}->end_parser;
    } elsif ( ref $sstr eq 'ARRAY' ) {
        $data = [ map { { word => $_, type => 0 } } @$sstr ];
    } else {
        return;
    }
    return undef if ( $#$data < 0 );

    my @dlex;
    foreach my $w (@$data) {
        my $lexems =
          $self->_get_lexems( $w->{type}, $w->{word}, $opt->{dict_opt} );
        if ( !defined $lexems ) {

            #unkown word - didn't recognize by any dicts
            return;
        } elsif ( ref $lexems && $#$lexems >= 0 ) {

            #word is recognized and it isn't stopword
            push @dlex, $lexems;
        } elsif ( $opt->{rejected} && ref $opt->{rejected} eq 'ARRAY' ) {

            #word is recognized and it is stopword
            push @{ $opt->{rejected} }, $w->{word};
        }
    }
    return \@dlex;
}

sub get_sql {
    my ( $self, $sstr, %opt ) = @_;
    local $_;

    $opt{txttid}         ||= $self->{TXTID};
    $opt{relfunc}        ||= $self->{FUNC};
    $opt{tsvector_field} ||= $self->{TSVECTOR_FIELD};

    my $dlex = $self->_prsQuery( $sstr, \%opt );
    return if ( !( ref $dlex eq 'ARRAY' && $#$dlex >= 0 ) );

    my ( $out, $condition, $order );
    my $query      = '';
    my $plainlexem = '';
    foreach my $lexem (@$dlex) {
        $query      .= ' & ' if length $query;
        $plainlexem .= ', '  if length $plainlexem;
        if ( $#$lexem > 0 ) {    #Some versions of lexemes
            $query .= '( \\\'' . join(
                '\\\' | \\\'',
                map {
                    $_ =~ s/'/\\\\'/g;
                    $_
                  } @$lexem
              )
              . '\\\' )';
            $plainlexem .= '"' . join(
                '", "',
                map {
                    $_ =~ s/"/\\\\"/g;
                    $_ =~ s/'/\\'/g;
                    $_
                  } @$lexem
              )
              . '"';
        } else {    #one version of lexem
            $_ = $lexem->[0];
            $_ =~ s/'/\\\\'/g;
            $query .= "\\'${_}\\'";
            $_ = $lexem->[0];
            $_ =~ s/"/\\\\"/g;
            $_ =~ s/'/\\'/g;
            $plainlexem .= "\"${_}\"";
        }
    }

    my ( $tablename, $tidname ) = split( /\./, $opt{txttid}, 2 );
    $condition = "$tablename.$opt{tsvector_field} @@ '$query'";

    $order = "pos desc";

    $out = $opt{relfunc};
    $out =~ s/\$TSVECTOR/$tablename.$opt{tsvector_field}/g;
    $out =~ s/\$QUERY/'$query'/g;
    $out .= ' as pos';
    return ( $out, $condition, $order );
}

sub _sql {
    my ( $self, $data, %opt ) = @_;

    $opt{txttid} ||= $self->{TXTID};
    my ( $out, $condition, $order ) = $self->get_sql( $data, %opt );
    return undef if !$condition;
    $order = ( defined $order && length $order ) ? "order by $order" : '';
    $out   = ( defined $out   && length $out )   ? ",\n" . $out      : '';

    my ($txtbl) = split( /\./, $opt{txttid}, 2 );
    my $sql = "
select
	$opt{txttid}$out
from
	$txtbl
where
	$condition
$order";

    return $sql;
}

=pod

=item search( SEARCH_AS_IN_GET_SQL )

=item search( SEARCH_AS_IN_GET_SQL, %opt )

Returns the reference to the list of identifiers sorted by relevance

=cut

sub search {
    my ( $self, $data, %opt ) = @_;
    local $_;
    $opt{txttid} ||= $self->{TXTID};
    my $sql = $self->_sql( $data, %opt );
    return () if !defined $sql;
    my $sth = $self->{DBI}->prepare($sql) || return ();
    $sth->execute || return ();
    my @wow = map { $_->[0] } @{ $sth->fetchall_arrayref };
    $sth->finish;
    return \@wow;
}

=pod

=item get_headline(

    query=>$query,
    src=>[$FH|$txt|$reftxt],
    maxlen=>$maxlen,
    maxread=>$maxread,
    otag=>$opentag,
    ctag=>$closetag,
    replace_ignore_headline=>$str_to_replace,
    dict_opt=>{})

Returns fragment of the document with search terms hilighted.
S<maxread> bytes reads from the document to generates headline
with length S<maxlen>.

S<otag>,S<ctag> denote strings used for hilighting, for example,
<b>, </b>.

S<replace_ignore_headline> - string used to replace html markups, space
by default.

=cut

sub get_headline {
    my ( $self, %opt ) = @_;
    $opt{maxread} ||= 0;
    $opt{maxlen}  ||= 0;
    $opt{otag}    ||= '<b>';
    $opt{ctag}    ||= '</b>';
    local $_;

    $self->{PRS}->start_parser( \$opt{query} );
    my ( $type, $word );
    my %qdata;
    while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type ) {
        next if exists $self->{IGNOREID}{$type};
        my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} );
        if ( ref $lexems && $#$lexems >= 0 ) {
            map { $qdata{$_}++ } @$lexems;
        }
    }
    $self->{PRS}->end_parser;

    my $replace_ignore_headline = ' ';
    $replace_ignore_headline = $opt{replace_ignore_headline}
      if exists $opt{replace_ignore_headline};
    my $headline;
    my $maxpower = -1;
    my $maxlen = -1;
    my @buf;
    my ( $length, $power ) = ( 0, 0 );
    my $totallen = 0;
    $self->{PRS}->start_parser( ( ref $opt{src} ) ? $opt{src} : \$opt{src} );

    while (
           ( ( $type, $word ) = $self->{PRS}->get_word )
        && $type
        && ( $opt{maxread} <= 0
            || ( $opt{maxread} > 0 && $totallen < $opt{maxread} ) )
      )
    {
        if ( exists $self->{IGNOREIDHEADLINE}{$type} ) {
            push @buf,
              {
                w => $replace_ignore_headline,
                l => ( $replace_ignore_headline =~ /\S/ )
                ? length $replace_ignore_headline
                : 0,
                o => 0,
		n => 1,
              }
              if ( defined $replace_ignore_headline );
            next;
        }

        if ( !exists $self->{IGNOREID}{$type} ) {

            my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} );
            if (   ref $lexems
                && $#$lexems >= 0
                && scalar grep { exists $qdata{$_} } @$lexems )
            {
                my $str = $opt{otag} . $word . $opt{ctag};
                push @buf, { w => $str, l => length $word, o => 1, n=>0 };
            } else {
                push @buf, { w => $word, l => length $word, o => 0, n=>0 };
            }
        } else {
            push @buf,
              { w => $word, l => ( $word =~ /\S/ ) ? length $word : 0, o => 0, n=>1 };
        }
        $totallen += $buf[-1]->{l};

        if ( $opt{maxlen} > 0 ) {
            if ( $buf[-1]->{l} ) {
                $length += $buf[-1]->{l};
                $power  += $buf[-1]->{o};
                if ( $length > $opt{maxlen} && !scalar keys %qdata ) {
                    last;
                }
                while ( $length > $opt{maxlen} ||  $buf[0]->{n} ) {
                    my $f = shift(@buf);
                    $length -= $f->{l};
                    $power  -= $f->{o};
                }
		if (   $power > $maxpower || ( $power==$maxpower && ($maxlen < 0 || $length>$maxlen) ) ) {
                    $headline = join( '', map { $_->{w} } @buf );
                    $maxpower = $power;
		    $maxlen   = $length;
                }
            }
        } else {
	    my $w;
	    while(  defined( $w=shift(@buf) ) ) { 
            	$headline .= $w->{w};
	    }
        }
    }
    $self->{PRS}->end_parser;

    if ( !defined $headline ) {
        $headline = join( '', map { $_->{w} } @buf );
    }
    return $headline;
}

=pod

=item get_headline2(

    query=>$query,
    src=>[$FH|$txt|$reftxt],
    min_words=>$min_words,
    max_words=>$max_words,
    otag=>$opentag,
    ctag=>$closetag,)

Another method for getting headline.
This method should be a little bit slower but more accurate.
The following parameters are recognized:

S<min_words> - minimal number of words in headline;

S<max_words> - maximal number of words in headline;

S<shortword> - maximal length of word that will be rejected at the end
of headline;

S<sentence_length> - the expected length of sentence to cut headline
at the end of sentence;

S<nonword_tokens> - a list of token ids (separated by spaces) which
are not counted as words;

S<leave_tokens> - a list of token ids that are not considered as words
but that will appear in the output;

S<complex_tokens> - a list of token ids that will be split on smaller
tokens;

S<noend_tokens> - token ids which are not desirable at the end of headline;

S<endpunct_tokens> - token ids which represents end-of-sentence punctuation signs.

S<met_query> - if set to a scalar ref stores true value if S<query> has been
found in processed text.

=cut

sub get_headline2 {
    my ( $self, %opt ) = @_;
    $opt{maxread}         ||= 0;
    $opt{otag}            ||= '<b>';
    $opt{ctag}            ||= '</b>';
    $opt{min_words}       ||= 15;
    $opt{max_words}       ||= 35;
    $opt{shortword}       ||= 3;
    $opt{sentence_length} ||= 0;
    $opt{nonword_tokens}  ||= $self->{IGNOREID};
    $opt{leave_tokens}    ||= '12';
    $opt{complex_tokens}  ||= '5 15 16 17';
    $opt{noend_tokens}    ||= '7 8 20 21 22';
    $opt{endpunct_tokens} ||= '12';

    my %ignore_tokens;
    local $_;

    for my $i ( grep { /_tokens$/ } keys %opt ) {
        $opt{$i} = { map { $_ => 1 } split( /\s+/, $opt{$i} ) }
          unless ref( $opt{$i} ) eq 'HASH';
    }
    %ignore_tokens =
      map { $_ => 1 }
      grep { !$opt{leave_tokens}{$_} } keys %{ $opt{nonword_tokens} };

    $self->{PRS}->start_parser( \$opt{query} );
    my ( $type, $word );
    my %qdata;
    while ( ( ( $type, $word ) = $self->{PRS}->get_word ) && $type ) {
        next if exists $self->{IGNOREID}{$type};
        my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} );
        if ( ref $lexems && $#$lexems >= 0 ) {
            map { $qdata{$_}++ } @$lexems;
        }
    }
    $self->{PRS}->end_parser;

    my $replace_ignore_headline = ' ';
    $replace_ignore_headline = $opt{replace_ignore_headline}
      if exists $opt{replace_ignore_headline};
    my $headline;
    my $maxpower = -1;
    my ( @buf, @tail );
    my ( $power, $tailpower ) = ( 0, 0 );
    my $totallen = 0;
    my ( $wait_word, $allow_postword );
    my $prewords  = '';
    my $accum_str = '';
    $self->{PRS}->start_parser( ( ref $opt{src} ) ? $opt{src} : \$opt{src} );

    while (
           ( ( $type, $word ) = $self->{PRS}->get_word )
        && $type
        && ( $opt{maxread} <= 0
            || ( $opt{maxread} > 0 && $totallen < $opt{maxread} ) )
      )
    {
        $totallen += length $word;
        push @buf, @tail if @tail;
        $power += $tailpower;
        @tail      = ();
        $tailpower = 0;
        if ( $ignore_tokens{$type} ) {
            $prewords .= $replace_ignore_headline
              if defined $replace_ignore_headline;
        }

        if ( defined $wait_word ) {
            $accum_str .= $word;
            undef $wait_word if $accum_str eq $wait_word;
            next;
        }
        if (   $opt{endpunct_tokens}{$type}
            && $allow_postword
            && ( $word =~ /[")?!.]/ ) )
        {
            $buf[-1]->{postwords} .= $word if (@buf);
            next;
        } elsif ( !$opt{nonword_tokens}{$type} ) {
            my $lexems = $self->_get_lexems( $type, $word, $opt{dict_opt} );
            if (   ref $lexems
                && $#$lexems >= 0
                && scalar grep { exists $qdata{$_} } @$lexems )
            {
                my $l = length $word;
                push @buf,
                  {
                    w => $opt{otag} . $word . $opt{ctag},
                    l => $l,
                    o => 1,
                    t => $type
                  };
            } elsif ( $opt{complex_tokens}{$type} ) {
                next;
            } else {
                push @buf,
                  { w => $word, l => length($word), o => 0, t => $type };
            }
            $buf[-1]{prewords} = $prewords;
            $prewords = $buf[-1]{postwords} = '';
            $allow_postword = 1;
            $power += $buf[-1]->{o};
        } elsif ( $opt{leave_tokens}{$type} ) {
            $prewords .= $word;
            $allow_postword = 0;
            next;
        }
        if ( $opt{complex_tokens}{$type} ) {
            $wait_word = $word;
            $accum_str = '';
        }

        while ( @buf > $opt{min_words} ) {
            next if $buf[0]{prewords} =~ /^\s*[:;,!?.)"]/;
            next
              if $buf[0]{postwords} =~ /^\s*[").!?]*[.!?]+\s*$/
              && !$buf[0]->{o};
            last;
        } continue {
            my $f = shift @buf;
            $power -= $f->{o};
        }
        while ( @buf > $opt{min_words} ) {
            last if $buf[-1]{postwords} =~ /^\s*[").!?]*[.!?]+\s*$/;
            next if $buf[-1]->{l} <= $opt{shortword};
            next if $opt{noend_tokens}{ $buf[-1]{t} };
            last if @buf < $opt{max_words} - $opt{sentence_length};
        } continue {
            my $f = pop @buf;
            $power -= $f->{o};
            $tailpower += $f->{o};
            unshift @tail, $f;
        }
        if ( $power >= $maxpower && @buf > $opt{min_words} ) {
            my ( $so, $eo );
            for ( $so = 0 ; $so < @buf ; $so++ ) {
                last if $buf[$so]->{o};
            }
            for ( $eo = 0 ; $eo < @buf ; $eo++ ) {
                last if $buf[ $#buf - $eo ]->{o};
            }
            if ( $so - $eo > 0 ) {
                $headline = join '',
                  map { $_->{prewords} . $_->{w} . $_->{postwords} } @buf;
                $maxpower = $power;
            }
        }
        last if @buf + @tail > $opt{max_words} && !keys %qdata;
        while ( @buf + @tail > $opt{max_words} ) {
            my $f = shift @buf;
            $power -= $f->{o};
        }
    }
    $self->{PRS}->end_parser;

    if ( !defined $headline ) {
        $headline = join '',
          map { $_->{prewords} . $_->{w} . $_->{postwords} } @buf;
    }
    if ( $opt{met_query} && ref( $opt{met_query} ) eq 'SCALAR' ) {
        ${ $opt{met_query} } = $maxpower > 0 || $power > 0 ? 1 : 0;
    }
    return $headline;
}

=back

=head1 DESCRIPTION

=head1 SEE ALSO

    The OpenFTS Primer          (  see doc/ subdirectory )

    The Crash-course to OpenFTS ( in examples/ subdirectory )

    perldoc Search::OpenFTS::Index

    perldoc Search::OpenFTS::Parser

    perldoc Search::OpenFTS::Dict::PorterEng

    perldoc Search::OpenFTS::Dict::Snowball

    perldoc Search::OpenFTS::Dict::UnknownDict

    perldoc Search::OpenFTS::Morph::ISpell


=cut

1;