/*
* 文を文節にsplitするsplitter
*
* 文節の境界を検出する
* anthy_init_split_context() 分割用のコンテキストを作って
* anthy_mark_border() 分割をして
* anthy_release_split_context() コンテキストを解放する
*
* anthy_commit_border() コミットされた内容に対して学習をする
*
* Funded by IPA未踏ソフトウェア創造事業 2001 9/22
*
* Copyright (C) 2004 YOSHIDA Yuichi
* Copyright (C) 2000-2004 TABATA Yusuke
* Copyright (C) 2000-2001 UGAWA Tomoharu
*
* $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $
*/
/*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <stdlib.h>
#include <string.h>
#include <anthy/alloc.h>
#include <anthy/record.h>
#include <anthy/splitter.h>
#include <anthy/logger.h>
#include "wordborder.h"
#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000
static int splitter_debug_flags;
/**/
wtype_t anthy_wtype_noun;
wtype_t anthy_wtype_name_noun;
wtype_t anthy_wtype_num_noun;
wtype_t anthy_wtype_prefix;
wtype_t anthy_wtype_num_prefix;
wtype_t anthy_wtype_num_postfix;
wtype_t anthy_wtype_name_postfix;
wtype_t anthy_wtype_sv_postfix;
wtype_t anthy_wtype_a_tail_of_v_renyou;
wtype_t anthy_wtype_v_renyou;
wtype_t anthy_wtype_noun_tail;/* いれ「たて」とか */
wtype_t anthy_wtype_n1;
wtype_t anthy_wtype_n10;
/** make_word_cacheで作成した文節情報を解放する
*/
static void
release_info_cache(struct splitter_context *sc)
{
struct word_split_info_cache *info = sc->word_split_info;
anthy_free_allocator(info->MwAllocator);
anthy_free_allocator(info->WlAllocator);
free(info->cnode);
free(info->seq_len);
free(info->rev_seq_len);
free(info);
}
static void
metaword_dtor(void *p)
{
struct meta_word *mw = (struct meta_word*)p;
if (mw->cand_hint.str) {
free(mw->cand_hint.str);
}
}
static void
alloc_char_ent(xstr *xs, struct splitter_context *sc)
{
int i;
sc->char_count = xs->len;
sc->ce = (struct char_ent*)
malloc(sizeof(struct char_ent)*(xs->len + 1));
for (i = 0; i <= xs->len; i++) {
sc->ce[i].c = &xs->str[i];
sc->ce[i].seg_border = 0;
sc->ce[i].initial_seg_len = 0;
sc->ce[i].best_seg_class = SEG_HEAD;
sc->ce[i].best_mw = NULL;
}
/* 左右両端は文節の境界である */
sc->ce[0].seg_border = 1;
sc->ce[xs->len].seg_border = 1;
}
/* ここで確保した内容はrelease_info_cacheで解放される
*/
static void
alloc_info_cache(struct splitter_context *sc)
{
int i;
struct word_split_info_cache *info;
/* キャッシュのデータを確保 */
sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
info = sc->word_split_info;
info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor);
info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
info->cnode =
malloc(sizeof(struct char_node) * (sc->char_count + 1));
info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));
/* 各文字インデックスに対して初期化を行う */
for (i = 0; i <= sc->char_count; i++) {
info->seq_len[i] = 0;
info->rev_seq_len[i] = 0;
info->cnode[i].wl = NULL;
info->cnode[i].mw = NULL;
info->cnode[i].max_len = 0;
}
}
/** 外から呼び出されるwordsplitterのトップレベルの関数 */
void
anthy_mark_border(struct splitter_context *sc,
int from, int from2, int to)
{
int i;
struct word_split_info_cache *info;
/* sanity check */
if ((to - from) <= 0) {
return ;
}
/* 境界マーク用とlatticeの検索で用いられるクラス用の領域を確保 */
info = sc->word_split_info;
info->seg_border = alloca(sizeof(int)*(sc->char_count + 1));
info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1));
info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1));
for (i = 0; i < sc->char_count + 1; ++i) {
info->seg_border[i] = sc->ce[i].seg_border;
info->best_seg_class[i] = sc->ce[i].best_seg_class;
info->best_mw[i] = sc->ce[i].best_mw;
}
/* 境界を決定する */
anthy_eval_border(sc, from, from2, to);
for (i = from; i < to; ++i) {
sc->ce[i].seg_border = info->seg_border[i];
sc->ce[i].best_seg_class = info->best_seg_class[i];
sc->ce[i].best_mw = info->best_mw[i];
}
}
/* 文節が拡大されたので,それを学習する */
static void
proc_expanded_segment(struct splitter_context *sc,
int from, int len)
{
int initial_len = sc->ce[from].initial_seg_len;
int i, nr;
xstr from_xs, to_xs, *xs;
from_xs.str = sc->ce[from].c;
from_xs.len = initial_len;
to_xs.str = sc->ce[from].c;
to_xs.len = len;
if (anthy_select_section("EXPANDPAIR", 1) == -1) {
return ;
}
if (anthy_select_row(&from_xs, 1) == -1) {
return ;
}
nr = anthy_get_nr_values();
for (i = 0; i < nr; i ++) {
xs = anthy_get_nth_xstr(i);
if (!xs || !anthy_xstrcmp(xs, &to_xs)) {
/* 既にある */
return ;
}
}
anthy_set_nth_xstr(nr, &to_xs);
anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
}
/* 文節のマージと語尾を学習する */
void
anthy_commit_border(struct splitter_context *sc, int nr_segments,
struct meta_word **mw, int *seg_len)
{
int i, from = 0;
/* 伸ばした文節 */
for (i = 0; i < nr_segments; i++) {
/* それぞれの文節に対して */
int len = seg_len[i];
int initial_len = sc->ce[from].initial_seg_len;
int real_len = 0;
int l2;
if (!initial_len || from + initial_len == sc->char_count) {
/* そこは境界ではない */
goto tail;
}
l2 = sc->ce[from + initial_len].initial_seg_len;
if (initial_len + l2 > len) {
/* 隣の文節を含むほど拡大されたわけではない */
goto tail;
}
if (mw[i]) {
real_len = mw[i]->len;
}
if (real_len <= initial_len) {
goto tail;
}
/* 右の文節を含む長さに拡張された文節がコミットされた */
proc_expanded_segment(sc, from, real_len);
tail:
from += len;
}
}
int
anthy_splitter_debug_flags(void)
{
return splitter_debug_flags;
}
void
anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse)
{
alloc_char_ent(xs, sc);
alloc_info_cache(sc);
sc->is_reverse = is_reverse;
/* 全ての部分文字列をチェックして、文節の候補を列挙する
word_listを構成してからmetawordを構成する */
anthy_lock_dic();
anthy_make_word_list_all(sc);
anthy_unlock_dic();
anthy_make_metaword_all(sc);
}
void
anthy_release_split_context(struct splitter_context *sc)
{
if (sc->word_split_info) {
release_info_cache(sc);
sc->word_split_info = 0;
}
if (sc->ce) {
free(sc->ce);
sc->ce = 0;
}
}
/** splitter全体の初期化を行う */
int
anthy_init_splitter(void)
{
/* デバッグプリントの設定 */
char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT");
char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT");
splitter_debug_flags = SPLITTER_DEBUG_NONE;
if (!dis && en && strlen(en)) {
char *fs = getenv("ANTHY_SPLITTER_PRINT");
if (fs) {
if (strchr(fs, 'w')) {
splitter_debug_flags |= SPLITTER_DEBUG_WL;
}
if (strchr(fs, 'm')) {
splitter_debug_flags |= SPLITTER_DEBUG_MW;
}
if (strchr(fs, 'l')) {
splitter_debug_flags |= SPLITTER_DEBUG_LN;
}
if (strchr(fs, 'i')) {
splitter_debug_flags |= SPLITTER_DEBUG_ID;
}
if (strchr(fs, 'c')) {
splitter_debug_flags |= SPLITTER_DEBUG_CAND;
}
}
}
/* 付属語グラフの初期化 */
if (anthy_init_depword_tab()) {
anthy_log(0, "Failed to init dependent word table.\n");
return -1;
}
/**/
anthy_wtype_noun = anthy_init_wtype_by_name("名詞35");
anthy_wtype_name_noun = anthy_init_wtype_by_name("人名");
anthy_wtype_num_noun = anthy_init_wtype_by_name("数詞");
anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("形容詞化接尾語");
anthy_wtype_v_renyou = anthy_init_wtype_by_name("動詞連用形");
anthy_wtype_noun_tail = anthy_init_wtype_by_name("名詞化接尾語");
anthy_wtype_prefix = anthy_init_wtype_by_name("名詞接頭辞");
anthy_wtype_num_prefix = anthy_init_wtype_by_name("数接頭辞");
anthy_wtype_num_postfix = anthy_init_wtype_by_name("数接尾辞");
anthy_wtype_name_postfix = anthy_init_wtype_by_name("人名接尾辞");
anthy_wtype_sv_postfix = anthy_init_wtype_by_name("サ変接尾辞");
anthy_wtype_n1 = anthy_init_wtype_by_name("数詞1");
anthy_wtype_n10 = anthy_init_wtype_by_name("数詞10");
return 0;
}
void
anthy_quit_splitter(void)
{
anthy_quit_depword_tab();
}
syntax highlighted by Code2HTML, v. 0.9.1