/* The University of Liverpool Modifications to Zebra 1.1 / YAZ 1.7 to enable ranking by attribute weight. Copyright (c) 2001-2002 The University of Liverpool. All rights reserved. Licensed under the Academic Free License version 1.1. http://opensource.org/licenses/academic.php $Id: livcode.c,v 1.1 2003/03/26 16:41:48 adam Exp $ */ #include #include #ifdef WIN32 #include #else #include #endif #include #include "index.h" #include "zserver.h" /* ** These functions/routines ** 1. reads in and builds a linked list of rank attr/rank score pairs ** 2. expand a simple query into a paired list of complex/simple nodes. */ typedef struct rstype { struct rstype *next_rsnode ; int rank ; int score ; char *rankstr ; } rsnode, *refrsnode ; refrsnode start_rsnode = NULL ; /* ** Function/Routine prototypes */ static int search_for_score( char *rankstr ) ; static char *search_for_rankstr( int rank ) ; static int search_for_rank( int rank ) ; static refrsnode set_rsnode( int rank, int score ) ; static int read_zrank_file(ZebraHandle zh) ; static void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ; static void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ; static Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop ) ; static Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp ) ; static Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 ) ; static Z_Operand *set_operand( Z_Operand *thisop, int newattr ) ; static int check_operand_attrs( Z_Operand *thisop ) ; /* ** search_for_score() ** given a rank-string traverse down the linked list ; ** return its score if found otherwise return -1. */ int search_for_score( char *rankstr ) { refrsnode node = start_rsnode ; int rank ; if ( sscanf( rankstr,"%d",&rank ) ) { while ( node ) { if ( node->rank == rank ) return node->score ; node = node->next_rsnode ; } } return -1 ; } /* ** search_for_rankstr() ** given a rank traverse down the linked list ; ** return its string if found otherwise return NULL. */ char *search_for_rankstr( int rank ) { refrsnode node = start_rsnode ; while ( node ) { if ( node->rank == rank ) return node->rankstr ; node = node->next_rsnode ; } return "rank" ; } /* ** search_for_rank() ** given a rank traverse down the linked list ; ** return 1 if found otherwise return 0. */ int search_for_rank( int rank ) { refrsnode node = start_rsnode ; while ( node ) { if ( node->rank == rank ) return 1 ; node = node->next_rsnode ; } return 0 ; } /* ** set_rsnode() ** given a rank and a score, build the rest of the rsnode structure. */ refrsnode set_rsnode( int rank, int score ) { #define BUFFMAX 128 refrsnode node = (refrsnode)malloc( sizeof(rsnode) ) ; char buff[BUFFMAX] ; node->next_rsnode = NULL ; node->rank = rank ; node->score = score ; sprintf( buff,"%d",rank ) ; node->rankstr = (char *)malloc( strlen(buff)+1 ) ; strcpy( node->rankstr, buff ) ; return node ; } /* ** read_zrank_file(zh) ** read in the rankfile and build the rank/score linked list ; ** return 0 : can't open the zebra config. file ** return 0 : can't find the rankfile entry in the zebra config. file ** return 0 : can't open the rankfile itself ** return the number of distinct ranks read in. */ int read_zrank_file(ZebraHandle zh) { #define LINEMAX 256 char line[ LINEMAX ] ; char rname[ LINEMAX ] ; char *lineptr ; FILE *ifd ; int rank = 0 ; int score = 0 ; int numranks = 0 ; /* ** open the zebra configuration file and look for the "rankfile:" ** entry which contains the path/name of the rankfile */ const char *rankfile = res_get_def(zh->res, "rankfile", 0); const char *profilePath = res_get_def(zh->res, "profilePath", DEFAULT_PROFILE_PATH); if (!rankfile) { yaz_log(LOG_LOG, "rankfile entry not found in config file" ) ; return 0 ; } ifd = yaz_path_fopen(profilePath, rankfile, "r" ) ; if ( ifd ) { while ( (lineptr = fgets( line,LINEMAX,ifd )) ) { if ( sscanf( lineptr,"rankfile: %s", rname ) == 1 ) rankfile = rname ; } /* ** open the rankfile and read the rank/score pairs ** ignore 1016 ** ignore duplicate ranks ** ignore ranks without +ve scores */ if ( rankfile ) { if ( !(ifd = fopen( rankfile, "r" )) ) { logf( LOG_LOG, "unable to open rankfile %s",rankfile ) ; return 0; } while ( (lineptr = fgets( line,LINEMAX,ifd )) ) { sscanf( lineptr,"%d : %d", &rank,&score ) ; if ( ( score > 0 ) && ( rank != 1016 ) ) { refrsnode new_rsnode ; if ( search_for_rank( rank ) == 0 ) { new_rsnode = set_rsnode( rank,score ) ; new_rsnode->next_rsnode = start_rsnode ; start_rsnode = new_rsnode ; numranks++ ; } } } } else { yaz_log(LOG_WARN|LOG_ERRNO, "unable to open config file (%s)", rankfile); } } return numranks ; } /* ** set_operand() ** build an operand "node" - hav to make a complete copy of thisop and ** then insert newattr in the appropriate place ** */ Z_Operand *set_operand( Z_Operand *thisop, int newattr ) { Z_Operand *operand ; Z_AttributesPlusTerm *attributesplusterm ; Z_AttributeList *attributelist ; Z_AttributeElement *attributeelement ; Z_AttributeElement *attrptr ; Z_AttributeElement **attrptrptr ; Z_Term *term ; Odr_oct *general ; int i ; operand = (Z_Operand *) malloc( sizeof( Z_Operand ) ) ; attributesplusterm = (Z_AttributesPlusTerm *) malloc( sizeof( Z_AttributesPlusTerm ) ) ; attributelist = (Z_AttributeList *) malloc( sizeof( Z_AttributeList ) ) ; attributeelement = (Z_AttributeElement *) malloc( sizeof( Z_AttributeElement ) ) ; term = (Z_Term *) malloc( sizeof( Z_Term ) ) ; general = (Odr_oct *) malloc( sizeof( Odr_oct ) ) ; operand->which = Z_Operand_APT ; operand->u.attributesPlusTerm = attributesplusterm ; attributesplusterm->attributes = attributelist ; attributesplusterm->term = term ; attributelist->num_attributes = thisop->u.attributesPlusTerm-> attributes->num_attributes ; attrptr = (Z_AttributeElement *) malloc( sizeof(Z_AttributeElement) * attributelist->num_attributes ) ; attrptrptr = (Z_AttributeElement **) malloc( sizeof(Z_AttributeElement) * attributelist->num_attributes ) ; attributelist->attributes = attrptrptr ; for ( i = 0 ; i < attributelist->num_attributes ; i++ ) { *attrptr = *thisop->u.attributesPlusTerm->attributes->attributes[i] ; attrptr->attributeType = (int *)malloc( sizeof(int *) ) ; *attrptr->attributeType = *thisop->u.attributesPlusTerm->attributes-> attributes[i]->attributeType; attrptr->value.numeric = (int *)malloc( sizeof(int *) ) ; *attrptr->value.numeric = *thisop->u.attributesPlusTerm->attributes-> attributes[i]->value.numeric; if ( (*attrptr->attributeType == 1) && (*attrptr->value.numeric == 1016) ) { *attrptr->value.numeric = newattr ; } *attrptrptr++ = attrptr++ ; } term->which = Z_Term_general ; term->u.general = general ; general->len = thisop->u.attributesPlusTerm->term->u.general->len ; general->size = thisop->u.attributesPlusTerm->term->u.general->size ; general->buf = malloc( general->size ) ; strcpy( general->buf, thisop->u.attributesPlusTerm->term->u.general->buf ) ; return operand ; } /* ** set_2operands() ** build a complex "node" with two (simple) operand "nodes" as branches */ Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 ) { Z_Complex *top ; Z_RPNStructure *s1 ; Z_RPNStructure *s2 ; Z_Operator *roperator ; top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ; s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ; top->roperator = roperator ; top->roperator->which = Z_Operator_or ; top->roperator->u.op_or = odr_nullval() ; top->s1 = s1 ; top->s1->which = Z_RPNStructure_simple ; top->s1->u.simple = sim1 ; top->s2 = s2 ; top->s2->which = Z_RPNStructure_simple ; top->s2->u.simple = sim2 ; return top ; } /* ** set_1complex_1operand() ** build a complex "node" with a complex "node" branch and an ** operand "node" branch */ Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp ) { Z_Complex *top ; Z_RPNStructure *s1 ; Z_RPNStructure *s2 ; Z_Operator *roperator ; top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ; s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ; roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ; top->roperator = roperator ; top->roperator->which = Z_Operator_or ; top->roperator->u.op_or = odr_nullval() ; top->s1 = s1 ; top->s1->which = Z_RPNStructure_complex ; top->s1->u.complex = comp ; top->s2 = s2 ; top->s2->which = Z_RPNStructure_simple ; top->s2->u.simple = simp ; return top ; } /* ** expand_query() ** expand a simple query into a number of complex queries */ Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop ) { Z_Complex *top ; int numattrs = 0 ; /* ** start_rsnode will be set if we have already read the rankfile ** so don't bother again but we need to know the number of attributes ** in the linked list so traverse it again to find out how many. */ if ( start_rsnode ) { refrsnode node = start_rsnode ; while ( node ) { numattrs++ ; node = node->next_rsnode ; } } /* ** only expand the query if there are 2 or more attributes */ if ( numattrs >= 2 ) { refrsnode node = start_rsnode ; int attr1 ; int attr2 ; attr1 = node->rank ; node = node->next_rsnode ; attr2 = node->rank ; node = node->next_rsnode ; /* ** this is the special case and has to be done first because the ** last complex node in the linear list has two simple nodes whereas ** all the others have a complex and a simple. */ top = set_2operands( set_operand( thisop,attr1 ), set_operand( thisop,attr2 ) ) ; /* ** do the rest as complex/simple pairs */ while ( node ) { attr1 = node->rank ; node = node->next_rsnode ; top = set_1complex_1operand( top,set_operand( thisop,attr1 ) ) ; } /* ** finally add the 1016 rank attribute at the top of the tree */ top = set_1complex_1operand( top,set_operand( thisop,1016 ) ) ; return top ; } else return NULL ; } /* ** check_operand_attrs() ** loop through the attributes of a particular operand ** return 1 if (type==1 && value==1016) && (type==2 && value==102) ** otherwise return 0 */ int check_operand_attrs( Z_Operand *thisop ) { Z_AttributeElement *attrptr ; int cond1 = 0 ; int cond2 = 0 ; int numattrs ; int i ; numattrs = thisop->u.attributesPlusTerm->attributes->num_attributes ; for ( i = 0 ; i < numattrs ; i++ ) { attrptr = thisop->u.attributesPlusTerm->attributes->attributes[i] ; if ( (*attrptr->attributeType == 1) && (*attrptr->value.numeric == 1016) ) cond1 = 1 ; if ( (*attrptr->attributeType == 2) && (*attrptr->value.numeric == 102) ) cond2 = 1 ; } return (cond1 & cond2) ; } /* ** convert_simple2complex() ** */ void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct ) { Z_Complex *complex = NULL ; Z_Operand *operand = rpnstruct->u.simple ; if ( check_operand_attrs( operand ) ) { complex = expand_query(zh, operand ) ; if ( complex ) { /* ** Everything is complete so replace the original ** operand with the newly built complex structure ** This is it ... no going back!! */ rpnstruct->which = Z_RPNStructure_complex ; rpnstruct->u.complex = complex ; } } } /* ** walk_complex_query() ** recursively traverse the tree expanding any simple queries we find */ void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct ) { if ( rpnstruct->which == Z_RPNStructure_simple ) { convert_simple2complex(zh, rpnstruct ) ; } else { walk_complex_query(zh, rpnstruct->u.complex->s1 ) ; walk_complex_query(zh, rpnstruct->u.complex->s2 ) ; } } void zebra_livcode_transform(ZebraHandle zh, Z_RPNQuery *query) { /* ** Got a search request, ** 1. if it is a simple query, see if it suitable for expansion ** i.e. the attributes are of the form ... ** (type==1 && value==1016) && (type==2 && value==102) ** or ** 2. if it is complex, traverse the complex query tree and expand ** any simples simples as above */ #if LIV_CODE Z_RPNStructure *rpnstruct = query->RPNStructure ; if ( rpnstruct->which == Z_RPNStructure_simple ) { convert_simple2complex(zh, rpnstruct ) ; } else if ( rpnstruct->which == Z_RPNStructure_complex ) { walk_complex_query(zh, rpnstruct ) ; } #endif } struct rank_class_info { int dummy; }; struct rank_term_info { int local_occur; int global_occur; int global_inv; int rank_flag; }; struct rank_set_info { int last_pos; int no_entries; int no_rank_entries; struct rank_term_info *entries; }; static int log2_int (unsigned g) { int n = 0; while ((g = g>>1)) n++; return n; } /* * create: Creates/Initialises this rank handler. This routine is * called exactly once. The routine returns the class_handle. */ static void *create (ZebraHandle zh) { struct rank_class_info *ci = (struct rank_class_info *) xmalloc (sizeof(*ci)); logf (LOG_DEBUG, "livrank create"); read_zrank_file(zh) ; return ci; } /* * destroy: Destroys this rank handler. This routine is called * when the handler is no longer needed - i.e. when the server * dies. The class_handle was previously returned by create. */ static void destroy (struct zebra_register *reg, void *class_handle) { struct rank_class_info *ci = (struct rank_class_info *) class_handle; logf (LOG_DEBUG, "livrank destroy"); xfree (ci); } /* * begin: Prepares beginning of "real" ranking. Called once for * each result set. The returned handle is a "set handle" and * will be used in each of the handlers below. */ static void *begin (struct zebra_register *reg, void *class_handle, RSET rset) { struct rank_set_info *si = (struct rank_set_info *) xmalloc (sizeof(*si)); int i; logf (LOG_DEBUG, "livrank begin"); si->no_entries = rset->no_rset_terms; si->no_rank_entries = 0; si->entries = (struct rank_term_info *) xmalloc (sizeof(*si->entries)*si->no_entries); for (i = 0; i < si->no_entries; i++) { const char *flags = rset->rset_terms[i]->flags; int g = rset->rset_terms[i]->nn; const char *cp = strstr(flags, ",u="); si->entries[i].rank_flag = 1; if (cp) { char *t = search_for_rankstr(atoi(cp+3)); if (t) si->entries[i].rank_flag = search_for_score(t) ; } if ( si->entries[i].rank_flag ) (si->no_rank_entries)++; si->entries[i].local_occur = 0; si->entries[i].global_occur = g; si->entries[i].global_inv = 32 - log2_int (g); logf (LOG_DEBUG, "-------- %d ------", 32 - log2_int (g)); } return si; } /* * end: Terminates ranking process. Called after a result set * has been ranked. */ static void end (struct zebra_register *reg, void *set_handle) { struct rank_set_info *si = (struct rank_set_info *) set_handle; logf (LOG_DEBUG, "livrank end"); xfree (si->entries); xfree (si); } /* * add: Called for each word occurence in a result set. This routine * should be as fast as possible. This routine should "incrementally" * update the score. */ static void add (void *set_handle, int seqno, int term_index) { struct rank_set_info *si = (struct rank_set_info *) set_handle; logf (LOG_DEBUG, "rank-1 add seqno=%d term_index=%d", seqno, term_index); si->last_pos = seqno; si->entries[term_index].local_occur++; } /* * calc: Called for each document in a result. This handler should * produce a score based on previous call(s) to the add handler. The * score should be between 0 and 1000. If score cannot be obtained * -1 should be returned. */ static int calc (void *set_handle, int sysno) { int i, lo, divisor, score = 0; struct rank_set_info *si = (struct rank_set_info *) set_handle; logf (LOG_DEBUG, "livrank calc sysno=%d", sysno); if (!si->no_rank_entries) return -1; for (i = 0; i < si->no_entries; i++) { score += si->entries[i].local_occur * si->entries[i].rank_flag ; } for (i = 0; i < si->no_entries; i++) if (si->entries[i].rank_flag && (lo = si->entries[i].local_occur)) score += (8+log2_int (lo)) * si->entries[i].global_inv; score *= 34; divisor = si->no_rank_entries * (8+log2_int (si->last_pos/si->no_entries)); score = score / divisor; if (score > 1000) score = 1000; for (i = 0; i < si->no_entries; i++) si->entries[i].local_occur = 0; return score; } /* * Pseudo-meta code with sequence of calls as they occur in a * server. Handlers are prefixed by --: * * server init * -- create * foreach search * rank result set * -- begin * foreach record * foreach word * -- add * -- calc * -- end * -- destroy * server close */ static struct rank_control rank_control = { "livrank", create, destroy, begin, end, calc, add, }; struct rank_control *rankliv_class = &rank_control;