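/*
 * Builds word_lists, the smallest units from which a segment (bunsetsu)
 * is composed.
 *
 * init_word_seq_tab()
 *   Initializes the pointers to the nodes in the dependent-word table.
 * release_word_seq_tab()
 *   Releases the dependent-word table.
 * anthy_make_word_list_all()
 *   Enumerates the substrings that satisfy the form of a segment.
 *   The word_lists enumerated through the various paths are added to
 *   the splitter_context by anthy_commit_word_list.
 *
 * Funded by IPA Exploratory Software Project 2002 2/27
 * Copyright (C) 2000-2003 TABATA Yusuke, UGAWA Tomoharu
 *
 * $Id: wordlist.c,v 1.50 2002/11/17 14:45:47 yusuke Exp $
 *
 */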

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <alloc.h>
#include <record.h>
#include <xstr.h>
#include <wtype.h>
#include <conf.h>
#include <ruleparser.h>
#include <dic.h>
#include <splitter.h>
#include "wordborder.h"

/* Allocator from which parse_line() takes struct wordseq_rule entries;
 * it is created in init_word_seq_tab(). */
static allocator wordseq_rule_ator;

/**
 * Score by length of the independent (core) word.
 * The table holds the cube of the index (index 0 holds 1).
 * calc_score_by_len() reads entries 0..SCORE_BY_CORE_LEN_TAB_MAX directly
 * and, for longer words, extrapolates from the entry at
 * SCORE_BY_CORE_LEN_TAB_MAX; the entries beyond that index are not read
 * by the code in this file.
 */
#define SCORE_BY_CORE_LEN_TAB_MAX 5
static const int score_by_core_len[] =
  {1,1,8,27,64,125,216,343,512,729,1000};

/**
 * One word connection rule: ties the part of speech of an independent
 * word to the node that is scanned for what follows it.
 * parse_line() creates one entry per rule line.
 */
struct wordseq_rule {
  wtype_t wt; /* part of speech of the independent word */
  int ratio; /* ratio copied into part[PART_CORE].ratio of matching word_lists */
  char *wt_name; /* interned part-of-speech name (printed in debug output) */
  int node_id; /* node id looked up by name in parse_line(); passed to anthy_scan_node() */
  struct wordseq_rule *next; /* next rule in the gRules list */
};

/* Head of the singly linked list of word connection rules;
 * parse_line() prepends each newly parsed rule. */
static struct wordseq_rule *gRules;

/**
 * Debug helper: writes one word_list to stdout as
 *   prefix.core.postfix-depword wtname score ratio
 * followed by a newline.  When wl is NULL only "--" is printed.
 *
 * sc supplies the character array the offsets in wl refer to.
 */
void
anthy_print_word_list(struct splitter_context *sc,
                      struct word_list *wl)
{
  const char *wt_label;
  int core_from, core_end, postfix_end;
  xstr piece;

  if (wl == NULL) {
    printf("--\n");
    return;
  }

  /* offsets of the successive parts inside the character array */
  core_from = wl->part[PART_CORE].from;
  core_end = core_from + wl->part[PART_CORE].len;
  postfix_end = core_end + wl->part[PART_POSTFIX].len;

  /* prefix: everything between the start of the word_list and the core */
  piece.str = sc->ce[wl->from].c;
  piece.len = core_from - wl->from;
  anthy_putxstr(&piece);
  printf(".");

  /* independent (core) word */
  piece.str = sc->ce[core_from].c;
  piece.len = wl->part[PART_CORE].len;
  anthy_putxstr(&piece);
  printf(".");

  /* postfix */
  piece.str = sc->ce[core_end].c;
  piece.len = wl->part[PART_POSTFIX].len;
  anthy_putxstr(&piece);
  printf("-");

  /* dependent word */
  piece.str = sc->ce[postfix_end].c;
  piece.len = wl->part[PART_DEPWORD].len;
  anthy_putxstr(&piece);

  /* fall back to a placeholder when no part-of-speech name is recorded */
  wt_label = wl->core_wt_name ? wl->core_wt_name : "---";
  printf(" %s %d %d\n", wt_label, wl->score, wl->part[PART_DEPWORD].ratio);
}

/**
 * Returns the score for an independent word of the given length.
 * Lengths up to SCORE_BY_CORE_LEN_TAB_MAX are looked up in
 * score_by_core_len[]; longer lengths grow quadratically from the table
 * entry at SCORE_BY_CORE_LEN_TAB_MAX.
 */
static int
calc_score_by_len(int len)
{
  int base;

  if (len > SCORE_BY_CORE_LEN_TAB_MAX) {
    /* beyond the table: scale the last used entry by len * (len - MAX + 1) */
    base = score_by_core_len[SCORE_BY_CORE_LEN_TAB_MAX];
    return base * len * (len + 1 - SCORE_BY_CORE_LEN_TAB_MAX) /
      SCORE_BY_CORE_LEN_TAB_MAX;
  }
  return score_by_core_len[len];
}

/**
 * Coarse integer logarithm used to compress a word frequency.
 * This is a step function, not an exact base-2 logarithm:
 *   x < 2 -> 1, x < 4 -> 2, x < 16 -> 3, x < 32 -> 4, x < 64 -> 5,
 * and each whole factor of 64 in x contributes a further 5.
 *
 * NOTE(review): the name coincides with log2() from <math.h>; this file
 * must not include <math.h> while the function keeps this name.
 */
static int
log2(int x)
{
  int steps = 0;

  /* peel off factors of 64, each one is worth 5 */
  while (x >= 64) {
    steps += 5;
    x /= 64;
  }

  if (x >= 32) {
    return steps + 5;
  }
  if (x >= 16) {
    return steps + 4;
  }
  if (x >= 4) {
    return steps + 3;
  }
  if (x >= 2) {
    return steps + 2;
  }
  return steps + 1;
}

/**
 * Computes the score of a word_list and accumulates it into wl->score.
 *
 * The score is built from, in this order:
 *  - the core word: calc_score_by_len(core length) times a frequency
 *    factor in the range up to 30 derived from log2(freq) + 25;
 *  - the dependent word: min(length, 5) * ratio * 800 / RATIO_BASE;
 *  - scaling by the postfix, prefix and dependent-word ratios, each
 *    expressed relative to RATIO_BASE;
 *  - SCORE_PER_LEN for every character of the whole word_list.
 */
static void
anthy_eval_word_list(struct word_list *wl)
{
  /* parts whose ratio scales the score, in application order */
  static const int scaling_parts[] = {
    PART_POSTFIX, PART_PREFIX, PART_DEPWORD
  };
  int freq_factor;
  int dep_len;
  unsigned int k;

  /* frequency factor, capped at 30 */
  freq_factor = log2(wl->part[PART_CORE].freq) + 25;
  if (freq_factor > 30) {
    freq_factor = 30;
  }

  /* contribution of the independent word */
  wl->score += calc_score_by_len(wl->part[PART_CORE].len) * freq_factor;

  /* contribution of the dependent word, length capped at 5 */
  dep_len = wl->part[PART_DEPWORD].len;
  if (dep_len) {
    int dep_score;
    if (dep_len > 5) {
      dep_len = 5;
    }
    dep_score = dep_len * wl->part[PART_DEPWORD].ratio * 800;
    dep_score /= RATIO_BASE;
    wl->score += dep_score;
  }

  /* apply each ratio in turn; multiply first, then divide, as integers */
  for (k = 0; k < sizeof(scaling_parts) / sizeof(scaling_parts[0]); k++) {
    wl->score *= wl->part[scaling_parts[k]].ratio;
    wl->score /= RATIO_BASE;
  }

  /* bonus proportional to the total length */
  wl->score += SCORE_PER_LEN * wl->len;
}

/**
 * Compares two word_lists for the purpose of pruning duplicates.
 * Only node_id, score, from, len, core_wt_name (by pointer) and the
 * tail_ct / tail_type of seg_class are compared, so this is not a
 * strict equality test.
 *
 * Returns 1 when all compared fields match, 0 otherwise.
 */
static int
word_list_same(struct word_list *wl1, struct word_list *wl2)
{
  return wl1->node_id == wl2->node_id &&
    wl1->score == wl2->score &&
    wl1->from == wl2->from &&
    wl1->len == wl2->len &&
    wl1->core_wt_name == wl2->core_wt_name &&
    wl1->seg_class.tail_ct == wl2->seg_class.tail_ct &&
    wl1->seg_class.tail_type == wl2->seg_class.tail_type;
}

/**
 * Scores a finished word_list and links it into the list kept for its
 * starting position (cnode[wl->from].wl) in sc->word_split_info.
 *
 * The word_list is dropped (not linked) when word_list_same() reports
 * that an equivalent entry is already present.  When the
 * SPLITTER_DEBUG_WL debug flag is set the newly linked entry is printed.
 */
void
anthy_commit_word_list(struct splitter_context *sc,
                       struct word_list *wl)
{
  struct word_list **slot;
  struct word_list *cur;

  wl->last_part = PART_DEPWORD;

  /* compute the score before comparing, since the score is compared too */
  anthy_eval_word_list(wl);

  /* look for an equivalent word_list starting at the same position */
  slot = &sc->word_split_info->cnode[wl->from].wl;
  cur = *slot;
  while (cur) {
    if (word_list_same(cur, wl)) {
      return;
    }
    cur = cur->next;
  }

  /* push onto the front of the list */
  wl->next = *slot;
  *slot = wl;

  /* debug print */
  if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_WL) {
    anthy_print_word_list(sc, wl);
  }
}

/**
 * Obtains storage for one word_list from the WlAllocator of
 * sc->word_split_info.  The contents are not initialised here.
 */
struct word_list *
anthy_alloc_word_list(struct splitter_context *sc)
{
  struct word_list *wl;

  wl = anthy_smalloc(sc->word_split_info->WlAllocator);
  return wl;
}

/* ³γѸ졢ưդ */
static void
make_following_word_list(struct splitter_context *sc,
			 struct word_list *tmpl)
{
  /* xsϼΩθ³ʸ */
  xstr xs;
  xs.str = sc->ce[tmpl->from+tmpl->len].c;
  xs.len = sc->char_count - tmpl->from - tmpl->len;
  tmpl->part[PART_DEPWORD].from =
    tmpl->part[PART_PREFIX].from + tmpl->part[PART_PREFIX].len;
  
  if (tmpl->node_id >= 0) {
    /* ̤word_list */
    anthy_scan_node(sc, tmpl, &xs, tmpl->node_id);
  } else {
    /* Ω줬ʤword_list */
    struct wordseq_rule *r;
    struct word_list new_tmpl;
    new_tmpl = *tmpl;
    /* ƥ롼Ф */
    for (r = gRules; r; r = r->next) {
      new_tmpl.part[PART_CORE].wt = r->wt;
      new_tmpl.core_wt_name = r->wt_name;
      new_tmpl.node_id = r->node_id;
      anthy_scan_node(sc, &new_tmpl, &xs, new_tmpl.node_id);
    }
  }
}

/**
 * Extends the template by a postfix of len characters.
 * Records the postfix's sequence entry and word type, sets its ratio to
 * 3/2 of RATIO_BASE, grows both the postfix length and the total length,
 * and marks the postfix as the last part.
 */
static void
push_part_back(struct word_list *tmpl, int len,
               seq_ent_t se, wtype_t wt)
{
  /* describe the postfix itself */
  tmpl->part[PART_POSTFIX].seq = se;
  tmpl->part[PART_POSTFIX].wt = wt;
  tmpl->part[PART_POSTFIX].ratio = RATIO_BASE * 3 / 2;
  tmpl->part[PART_POSTFIX].len += len;

  /* the whole word_list grows by the same amount */
  tmpl->len += len;
  tmpl->last_part = PART_POSTFIX;
}

/**
 * Helper of make_suc_words(): when the candidate suc can act as a
 * postfix of type postfix_wt, extends a copy of tmpl with it and
 * generates the following word_lists.
 */
static void
try_push_postfix(struct splitter_context *sc, struct word_list *tmpl,
                 int len, seq_ent_t suc, wtype_t postfix_wt)
{
  struct word_list extended;

  if (!anthy_get_seq_ent_wtype_freq(suc, postfix_wt)) {
    return;
  }
  extended = *tmpl;
  push_part_back(&extended, len, suc, postfix_wt);
  make_following_word_list(sc, &extended);
}

/**
 * Attaches postfixes to the independent word of tmpl.
 *
 * Postfixes are tried only when the core is a numeral, a name or a
 * noun for which anthy_wtype_get_sv() is true.  Every string of length
 * 1..seq_len[right] to the right of the core that has the POS_SUC
 * position is tested against the postfix types matching the core.
 */
static void
make_suc_words(struct splitter_context *sc,
               struct word_list *tmpl)
{
  wtype_t core_wt = tmpl->part[PART_CORE].wt;
  int is_num, is_name, is_sv;
  int right, n;

  /* which kinds of postfix may be attached to this core? */
  is_num = anthy_wtypecmp(anthy_wtype_num_noun, core_wt) ? 1 : 0;
  is_name = anthy_wtypecmp(anthy_wtype_name_noun, core_wt) ? 1 : 0;
  is_sv = anthy_wtype_get_sv(core_wt) ? 1 : 0;
  if (!(is_num || is_name || is_sv)) {
    return;
  }

  /* position just after the independent word */
  right = tmpl->part[PART_CORE].from + tmpl->part[PART_CORE].len;

  for (n = 1; n <= sc->word_split_info->seq_len[right]; n++) {
    xstr tail;
    seq_ent_t suc;

    tail.str = sc->ce[right].c;
    tail.len = n;
    suc = anthy_get_seq_ent_from_xstr(&tail);
    if (!anthy_get_seq_ent_pos(suc, POS_SUC)) {
      continue;
    }
    /* the string is a postfix; check it against the core's word type */
    if (is_num) {
      try_push_postfix(sc, tmpl, n, suc, anthy_wtype_num_postfix);
    }
    if (is_name) {
      try_push_postfix(sc, tmpl, n, suc, anthy_wtype_name_postfix);
    }
    if (is_sv) {
      try_push_postfix(sc, tmpl, n, suc, anthy_wtype_sv_postfix);
    }
  }
}

/**
 * Extends the template to the left by a prefix of len characters.
 * Moves the start of the word_list back by len, grows the total and
 * prefix lengths, records the prefix's sequence entry and word type,
 * and sets the prefix ratio to 2/3 of RATIO_BASE.
 */
static void
push_part_front(struct word_list *tmpl, int len,
                seq_ent_t se, wtype_t wt)
{
  /* the word_list now starts len characters earlier */
  tmpl->from -= len;
  tmpl->len += len;

  /* describe the prefix, which begins at the new start */
  tmpl->part[PART_PREFIX].seq = se;
  tmpl->part[PART_PREFIX].wt = wt;
  tmpl->part[PART_PREFIX].ratio = RATIO_BASE * 2 / 3;
  tmpl->part[PART_PREFIX].len += len;
  tmpl->part[PART_PREFIX].from = tmpl->from;
}

/* Ƭ򤯤äĤƤ򤯤äĤ */
static void
make_pre_words(struct splitter_context *sc,
	       struct word_list *tmpl)
{
  int i;
  wtype_t core_wt = tmpl->part[PART_CORE].wt;
  /* ΩϿ줫 */
  if (!anthy_wtypecmp(anthy_wtype_num_noun, core_wt)) {
    return ;
  }
  /* Ƭ󤹤 */
  for (i = 1; 
       i <= sc->word_split_info->rev_seq_len[tmpl->part[PART_CORE].from];
       i++) {
    seq_ent_t pre;
    /* xsϼΩʸ */
    xstr xs;
    xs.str = sc->ce[tmpl->part[PART_CORE].from - i].c;
    xs.len = i;
    pre = anthy_get_seq_ent_from_xstr(&xs);
    if (anthy_get_seq_ent_pos(pre, POS_PRE)) {
      struct word_list new_tmpl;
      if (anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_num_prefix)) {
	new_tmpl = *tmpl;
	push_part_front(&new_tmpl, i, pre, anthy_wtype_num_prefix);
	make_following_word_list(sc, &new_tmpl);
	/* ξ⤯äĤ */
	make_suc_words(sc, &new_tmpl);
      }
    }
  }
}

/**
 * Initialises a word_list template covering len characters from from.
 *
 * Every part is reset to empty defaults (frequency 1, ratio RATIO_BASE,
 * no word type, POS_NONE, CT_NONE); the core part is then made to span
 * the whole range.  The score is 0, node_id is -1 (no node), the last
 * part is the core, and no part-of-speech name is set.
 */
static void
setup_word_list(struct word_list *wl, int from, int len)
{
  int idx = 0;

  /* overall range and bookkeeping */
  wl->from = from;
  wl->len = len;
  wl->score = 0;
  wl->node_id = -1;
  wl->last_part = PART_CORE;
  wl->core_wt_name = NULL;

  /* reset every part */
  while (idx < NR_PARTS) {
    wl->part[idx].from = 0;
    wl->part[idx].len = 0;
    wl->part[idx].wt = anthy_wt_none;
    wl->part[idx].seq = 0;
    wl->part[idx].freq = 1; /* treated as a low-frequency word */
    wl->part[idx].ratio = RATIO_BASE;
    wl->part[idx].pos = POS_NONE;
    wl->part[idx].ct = CT_NONE;
    idx++;
  }

  /* the independent word occupies the whole range for now */
  wl->part[PART_CORE].from = from;
  wl->part[PART_CORE].len = len;

  anthy_init_segclass(&wl->seg_class);
}

/**
 * For one independent word (sequence entry se spanning len characters
 * from from), generates the word_lists obtained by attaching prefixes,
 * postfixes and what follows them.
 *
 * Each rule in gRules whose word type has a non-zero frequency for se
 * is applied: the rule's type, ratio, name and node id are copied into
 * the template before the candidates are generated.  Prefixes and
 * postfixes are attached only when the rule's type is a noun.
 */
static void
make_word_list(struct splitter_context *sc,
               seq_ent_t se,
               int from, int len)
{
  struct wordseq_rule *rule;
  struct word_list tmpl;

  /* initialise the template */
  setup_word_list(&tmpl, from, len);
  tmpl.part[PART_CORE].seq = se;

  for (rule = gRules; rule; rule = rule->next) {
    int freq = anthy_get_seq_ent_wtype_freq(se, rule->wt);

    /* the rule does not match this word */
    if (!freq) {
      continue;
    }

    if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_ID) {
      /* debug print of the matched part of speech */
      xstr core;
      core.str = sc->ce[tmpl.part[PART_CORE].from].c;
      core.len = tmpl.part[PART_CORE].len;
      anthy_putxstr(&core);
      printf(" name=%s freq=%d node_id=%d\n",
             rule->wt_name, freq, rule->node_id);
    }

    /* transfer the information of the matched rule */
    tmpl.part[PART_CORE].wt = rule->wt;
    tmpl.part[PART_CORE].ratio = rule->ratio;
    tmpl.part[PART_CORE].freq = freq;
    tmpl.seg_class.head_pos = anthy_wtype_get_pos(rule->wt);
    tmpl.core_wt_name = rule->wt_name;
    tmpl.node_id = rule->node_id;

    /* a postfix, if any, would start right after the core */
    tmpl.part[PART_POSTFIX].from =
      tmpl.part[PART_CORE].from + tmpl.part[PART_CORE].len;

    if (anthy_wtype_get_pos(rule->wt) == POS_NOUN) {
      /* prefixes and postfixes are attached to nouns only */
      make_pre_words(sc, &tmpl);
      make_suc_words(sc, &tmpl);
    }

    /* the variant without prefix or postfix */
    make_following_word_list(sc, &tmpl);
  }
}

static void
make_dummy_head(struct splitter_context *sc)
{
  struct word_list tmpl;
  setup_word_list(&tmpl, 0, 0);
  tmpl.part[PART_CORE].seq = 0;
  tmpl.part[PART_CORE].wt = anthy_wtype_noun;
  tmpl.score = 0;
  make_suc_words(sc, &tmpl);
}

/**
 * Enumerates all word_lists for the string held in the context.
 *
 * Pass 1 looks up every substring of up to 30 characters, longest
 * first for each start position.  For each substring found it updates
 * the longest postfix length starting at that position (seq_len) and
 * the longest prefix length ending at its end (rev_seq_len), and
 * remembers the substring when it is an independent word.
 * Pass 2 calls make_word_list() for every remembered independent word,
 * after which make_dummy_head() handles the zero-length head.
 */
void
anthy_make_word_list_all(struct splitter_context *sc)
{
  struct depword_ent {
    struct depword_ent *next;
    int from, len;
    seq_ent_t se;
  } *found, *ent;
  struct word_split_info_cache *info;
  allocator ent_ator;
  seq_ent_t se;
  xstr sub;
  int start, span;

  info = sc->word_split_info;
  found = NULL;
  ent_ator = anthy_create_allocator(sizeof(struct depword_ent), NULL);

  /* pass 1: every start position ... */
  for (start = 0; start < sc->char_count; start++) {
    int longest = sc->char_count - start;
    if (longest > 30) {
      longest = 30;
    }
    /* ... and every length, longest first */
    for (span = longest; span > 0; span--) {
      sub.len = span;
      sub.str = sc->ce[start].c;

      se = anthy_get_seq_ent_from_xstr(&sub);
      if (!se) {
        continue;
      }
      /* the substring is a known word */

      /* track the longest postfix starting here */
      if (span > info->seq_len[start] &&
          anthy_get_seq_ent_pos(se, POS_SUC)) {
        info->seq_len[start] = span;
      }
      /* track the longest prefix ending at start + span */
      if (span > info->rev_seq_len[start + span] &&
          anthy_get_seq_ent_pos(se, POS_PRE)) {
        info->rev_seq_len[start + span] = span;
      }
      /* remember independent words for pass 2 */
      if (anthy_get_seq_ent_indep(se)) {
        ent = anthy_smalloc(ent_ator);
        ent->from = start;
        ent->len = span;
        ent->se = se;
        ent->next = found;
        found = ent;
      }
    }
  }

  /* pass 2: build the word_lists of every independent word found */
  ent = found;
  while (ent) {
    make_word_list(sc, ent->se, ent->from, ent->len);
    ent = ent->next;
  }

  /* the zero-length independent word at the head */
  make_dummy_head(sc);

  anthy_free_allocator(ent_ator);
}

/**
 * Parses one line of the independent-word definition file and prepends
 * the resulting rule to gRules.
 *
 * tokens: the fields of the line; nr: the number of fields.
 *   tokens[0]  part-of-speech name
 *   tokens[1]  integer from which the ratio is derived (0 or a
 *              non-numeric field is treated as 1)
 *   tokens[2]  name of the node
 * A line with fewer than three fields is reported and ignored.
 */
static
void parse_line(char **tokens, int nr)
{
  struct wordseq_rule *r;
  int tmp;
  /* tokens[0..2] are all read below, so three fields are required */
  if (nr < 3) {
    printf("Syntex error in indepword defs"
	   " :%d.\n", anthy_get_line_number());
    return ;
  }
  /* the line starts with the part-of-speech name */
  r = anthy_smalloc(wordseq_rule_ator);
  r->wt_name = anthy_name_intern(tokens[0]);
  anthy_name_to_wtype(tokens[0], &r->wt);

  /* ratio: RATIO_BASE reduced by tmp sixteenths */
  tmp = atoi(tokens[1]);
  if (tmp == 0) {
    tmp = 1;
  }
  r->ratio = RATIO_BASE - tmp*(RATIO_BASE/16);

  /* next comes the node name */
  r->node_id = anthy_get_node_id_by_name(tokens[2]);

  /* add the rule to the front of the list */
  r->next = gRules;
  gRules = r;
}

/**
 * Loads the word connection rules.
 *
 * Creates the rule allocator, opens the file named by the INDEPWORD
 * configuration entry, and feeds each line to parse_line(), which
 * rebuilds gRules.
 *
 * Returns 0 on success, -1 when the configuration entry is missing or
 * the file cannot be opened (a message is printed to stdout).
 */
static int
init_word_seq_tab(void)
{
  const char *path;
  char **fields;
  int nr_fields;

  wordseq_rule_ator = anthy_create_allocator(sizeof(struct wordseq_rule),
                                             NULL);

  path = anthy_conf_get_str("INDEPWORD");
  if (path == NULL) {
    printf("independent word dict unspecified.\n");
    return -1;
  }
  if (anthy_open_file(path) == -1) {
    printf("Failed to open indep word dict (%s).\n", path);
    return -1;
  }

  /* read the file one line at a time */
  gRules = NULL;
  for (;;) {
    if (anthy_read_line(&fields, &nr_fields)) {
      break;
    }
    parse_line(fields, nr_fields);
    anthy_free_line();
  }
  anthy_close_file();

  return 0;
}

/**
 * Public initialiser of this module: loads the word connection rules.
 * Returns the result of init_word_seq_tab() (0 on success, -1 on failure).
 */
int
anthy_init_wordlist(void)
{
  int status;

  status = init_word_seq_tab();
  return status;
}
