/*
* 例文の中を検索できるデータ構造を作る
* 現時点では例文をすべて入れているが、そのうちフィルターすることも考えられる
*
* Copyright (C) 2007 TABATA Yusuke
*
*/
/*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <anthy/corpus.h>
#define MAX_NR_VAL 8
#define BUCKET_SIZE 8192
#define MAX_COLLISION 8
/* word in source */
struct node {
int nr;
int val[MAX_NR_VAL];
int flags;
};
/* word feature in corpus file */
struct element {
/* hash値 */
int val;
/* 有効な(ELM_INVALIDの無い)エントリとしてのindex */
int idx;
/* このhash値の次の出現場所 */
int next_idx;
/**/
int flags;
};
/* index */
struct bucket {
/* 検索のキー */
int key;
/* 最初の出現場所 */
int first_idx;
/**/
int last_idx;
};
struct corpus {
/**/
struct node *array;
int nr_node;
int array_size;
/**/
int nr_values;
struct element *elms;
/**/
int nr_buckets;
struct bucket *buckets;
/**/
int bucket_collision;
};
static void
corpus_setup_bucket(struct corpus *c, int nr)
{
int i;
free(c->buckets);
/**/
c->nr_buckets = nr;
c->buckets = malloc(sizeof(struct bucket) * nr);
for (i = 0; i < nr; i++) {
c->buckets[i].key = -1;
c->buckets[i].first_idx = -1;
c->buckets[i].last_idx = -1;
}
}
struct corpus *
corpus_new(void)
{
struct corpus *c = malloc(sizeof(*c));
c->nr_node = 0;
c->array_size = 0;
c->array = NULL;
c->nr_values = 0;
c->elms = NULL;
c->buckets = NULL;
c->bucket_collision = 0;
return c;
}
static void
corpus_ensure_array(struct corpus *c, int nr)
{
int i, size;
if (c->array_size >= nr) {
return ;
}
size = nr * 2;
c->array = realloc(c->array, sizeof(struct node) * size);
for (i = c->array_size; i < size; i++) {
c->array[i].nr = 0;
}
c->array_size = nr;
}
void
corpus_dump(struct corpus *c)
{
int i;
for (i = 0; i < c->nr_values; i++) {
if (c->elms[i].flags & ELM_WORD_BORDER) {
printf("%d:", i);
}
printf("%d(%d) ", c->elms[i].val, c->elms[i].next_idx);
}
printf("\n");
}
static int
count_nr_valid_values(struct corpus *c)
{
int i;
int nr = 0;
for (i = 0; i < c->nr_node; i++) {
struct node *nd = &c->array[i];
if (nd->flags & ELM_INVALID) {
continue;
}
nr += nd->nr;
}
return nr;
}
static void
corpus_build_flatten(struct corpus *c)
{
int i, j;
int idx = 0;
int nr_valid_elms = count_nr_valid_values(c);
c->elms = malloc(sizeof(struct element) * nr_valid_elms);
for (i = 0; i < c->nr_node; i++) {
struct node *nd = &c->array[i];
if (nd->flags & ELM_INVALID) {
continue;
}
for (j = 0; j < nd->nr; j++) {
c->elms[idx].val = nd->val[j];
c->elms[idx].next_idx = -1;
c->elms[idx].flags = nd->flags;
if (j == 0) {
c->elms[idx].flags |= ELM_WORD_BORDER;
}
c->elms[idx].idx = idx;
idx++;
}
}
}
static struct bucket *
find_bucket(struct corpus *c, int val)
{
int i;
int h = val % c->nr_buckets;
for (i = 0; i < MAX_COLLISION; i++) {
struct bucket *bkt = &c->buckets[h];
if (bkt->key == val) {
return bkt;
}
if (bkt->key == -1) {
bkt->key = val;
return bkt;
}
/**/
h ++;
h %= c->nr_buckets;
}
c->bucket_collision ++;
return NULL;
}
static void
corpus_build_link(struct corpus *c)
{
int i;
for (i = 0; i < c->nr_values; i++) {
struct element *elm = &c->elms[i];
struct bucket *bkt = find_bucket(c, elm->val);
if (!bkt) {
continue;
}
if (bkt->first_idx < 0) {
/* 最初の出現 */
bkt->first_idx = c->elms[i].idx;
} else {
c->elms[bkt->last_idx].next_idx = c->elms[i].idx;
}
bkt->last_idx = c->elms[i].idx;
c->elms[i].next_idx = -1;
}
}
void
corpus_build(struct corpus *c)
{
corpus_setup_bucket(c, BUCKET_SIZE);
/**/
corpus_build_flatten(c);
corpus_build_link(c);
/**/
}
void
corpus_push_back(struct corpus *c, int *val, int nr, int flags)
{
struct node nd;
int i;
for (i = 0; i < nr; i++) {
nd.val[i] = val[i];
}
nd.nr = nr;
nd.flags = flags;
/**/
corpus_ensure_array(c, c->nr_node+1);
c->array[c->nr_node] = nd;
c->nr_node++;
c->nr_values += nd.nr;
}
void
corpus_write_bucket(FILE *fp, struct corpus *c)
{
int i;
fprintf(fp, "0 %d\n", c->nr_buckets);
for (i = 0; i < c->nr_buckets; i++) {
fprintf(fp, "%d,%d\n", (c->buckets[i].key & CORPUS_KEY_MASK),
c->buckets[i].first_idx);
}
printf(" %d collisions in corpus bucket\n", c->bucket_collision);
}
void
corpus_write_array(FILE *fp, struct corpus *c)
{
int i;
fprintf(fp, "0 %d\n", c->nr_values);
for (i = 0; i < c->nr_values; i++) {
struct element *elm = &c->elms[i];
int val;
val = elm->val;
val &= CORPUS_KEY_MASK;
if (elm->flags & ELM_BOS) {
val |= ELM_BOS;
}
if (elm->flags & ELM_WORD_BORDER) {
val |= ELM_WORD_BORDER;
}
fprintf(fp, "%d,%d\n", val,
c->elms[i].next_idx);
}
}
syntax highlighted by Code2HTML, v. 0.9.1