/*
 * Copyright (c) 1991-2003 Kyoto University
 * Copyright (c) 2000-2003 NAIST
 * All rights reserved
 */

/* ngram2.h --- N-gram structure */

/* $Id: ngram2.h,v 1.7 2003/09/29 06:01:22 ri Exp $ */


/***** revision 3 *****/
/* ngram2.h .. n-gram language model for speech recognition */
/* Third revision: sequential allocation */
/* for disk-LM and rapid read-in */

#ifndef __SENT_NGRAM2_H__
#define __SENT_NGRAM2_H__

#include <sent/stddefs.h>
#include <sent/ptree.h>


/* MACROs */
/* Highest N-gram order handled (N-grams are read up to 3-gram).
   Also the length of the per-order count arrays ngram_num[] and
   total_ngram_num[] in NGRAM_INFO. */
#define MAX_N 3			/* read until 3-gram */

/* data structures */

/* N-gram entry ID: a signed integer index into the N-gram entry arrays. */
typedef int NNID;
/* Sentinel NNID value that is never a valid entry index (valid IDs start
   at 0).  Parenthesized so the expansion stays a single operand wherever
   the macro is used. */
#define NNID_INVALID (-1)

/*
 * N-gram language model data (up to MAX_N-gram).
 *
 * Entries of each order are held in parallel arrays indexed by an ID:
 *   nid : 1-gram (word) ID, 0 .. ngram_num[0]-1
 *   n2  : 2-gram entry ID,  0 .. ngram_num[1]-1
 *   n3  : 3-gram entry ID,  0 .. ngram_num[2]-1
 * A lower-order entry points to its higher-order entries through a
 * (begin, count) pair: n2_bgn/n2_num for 1-gram -> 2-gram and
 * n3_bgn/n3_num for 2-gram -> 3-gram.
 *
 * "LR" and "RL" in the field comments denote the two directions of
 * N-gram held here; the RL 3-gram is built on the RL 2-gram entries.
 */
typedef struct ngram_data {
  boolean from_bin;		/* TRUE if source is bingram, otherwise ARPA */
  WORD_ID max_word_num;		/* total size of N-gram vocabulary */
  NNID ngram_num[MAX_N];	/* number of N-gram entries (tuples) per order: [0]=1-gram, [1]=2-gram, [2]=3-gram */
  NNID total_ngram_num[MAX_N];	/* total count of each N-gram (NOTE(review): exact difference from ngram_num[] is not visible in this header -- confirm in the reader code) */
  WORD_ID unk_id;		/* unknown word ID */
  int unk_num;			/* number of unknown words in the tree lexicon */
  LOGPROB unk_num_log;		/* log scale value of unk_num */
  boolean isopen;		/* TRUE if open vocabulary, i.e. a word unknown to the N-gram appears in the lexicon (dictionary) */

  /* basic data (nid: 0 - max_word_num-1) */
  char **wname;			/* word name string [nid] */
  PATNODE *root;		/* root node of the word string -> ID search index */
  
  /* 1-gram ( nid: 0 - ngram_num[0]-1 ) */
  LOGPROB *p;			/* 1-gram log probability [nid] */
  LOGPROB *bo_wt_lr;		/* back-off weight for LR 2-gram [nid] */
  LOGPROB *bo_wt_rl;		/* back-off weight for RL 2-gram [nid] */
  NNID *n2_bgn;			/* 2-gram entries that have the same left context [nid] begin at n2_bgn[nid] and run till n2_bgn[nid]+n2_num[nid]-1 */
  WORD_ID *n2_num;		/* number of 2-gram entries for left context [nid] (see n2_bgn) */
  
  /* 2-gram (n2: 0 - ngram_num[1] - 1) */
  WORD_ID *n2tonid;		/* index [n2] -> nid */
  LOGPROB *p_lr;		/* LR 2-gram log probability [n2] */
  LOGPROB *p_rl;		/* RL 2-gram log probability [n2] */
  LOGPROB *bo_wt_rrl;		/* back-off weight for RL 3-gram [n2] */
  NNID *n3_bgn;			/* 3-gram entries that have the same right context [n2] begin at n3_bgn[n2] and run till n3_bgn[n2]+n3_num[n2]-1 */
  WORD_ID *n3_num;		/* number of 3-gram entries for right context [n2] (see n3_bgn) */
  
  /* 3-gram (n3: 0 - ngram_num[2] - 1) */
  WORD_ID *n3tonid;		/* index [n3] -> nid */
  LOGPROB *p_rrl;		/* RL 3-gram log probability [n3] */
} NGRAM_INFO;



/* Constants for the binary N-gram ("bingram") file format.
   NOTE(review): how these are laid out in the file is defined by the
   reader/writer code, not visible here -- descriptions below are inferred
   from the names and should be confirmed there. */

/* Format identifier string (version 3); presumably stored in the file header. */
#define BINGRAM_IDSTR "julius_bingram_v3"
/* Header size; presumably in bytes -- confirm. */
#define BINGRAM_HDSIZE 512
/* Word-ID size descriptor: BINGRAM_SIZESTR_HEAD followed by a body that
   depends on whether WORDS_INT is defined at compile time. */
#define BINGRAM_SIZESTR_HEAD "word="
#ifdef WORDS_INT
#define BINGRAM_SIZESTR_BODY "4byte(int)"
#else
#define BINGRAM_SIZESTR_BODY "2byte(unsigned short)"
#endif
/* NOTE(review): defined with an empty expansion and not used in this
   header; possibly a leftover or a feature flag tested elsewhere. */
#define BINGRAM
/* Default word strings for sentence begin and sentence end. */
#define BEGIN_WORD_DEFAULT "<s>"
#define END_WORD_DEFAULT "</s>"



/* function declarations */

/* N-gram entry search.
   search_bigram takes a word pair (w_l, w_r) and search_trigram takes a
   2-gram entry ID n2 plus a word key; both return an N-gram entry ID.
   NOTE(review): presumably NNID_INVALID is returned when no entry exists --
   confirm against the implementation. */
NNID search_bigram(NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r);
NNID search_trigram(NGRAM_INFO *ndata,  NNID n2, WORD_ID wkey);
/* N-gram probability lookup, returning a LOGPROB for 1, 2 or 3 words.
   The _lr / _rl suffixes presumably select the LR or RL model stored in
   NGRAM_INFO; argument order conventions and back-off handling are defined
   by the implementation -- confirm there. */
LOGPROB uni_prob(NGRAM_INFO *ndata, WORD_ID w);
LOGPROB bi_prob_lr(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
LOGPROB bi_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2);
LOGPROB tri_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3);

/* N-gram file input/output on an already opened stream.
   Each reader/writer returns a boolean (presumably TRUE on success --
   confirm).  'direction' in ngram_read_arpa presumably selects whether the
   ARPA file holds the LR or the RL model; valid values are defined by the
   implementation.  'header_str' is presumably extra text stored in the
   binary file header -- confirm. */
boolean ngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, int direction);
/* Presumably determines and stores the unknown-word ID (ndata->unk_id) --
   confirm against the implementation. */
void set_unknown_id(NGRAM_INFO *ndata);
boolean ngram_read_bin(FILE *fp, NGRAM_INFO *ndata);
boolean ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *header_str);

/* Word string -> word ID lookup.
   ngram_make_lookup_tree presumably builds the search index held in
   ndata->root from ndata->wname, and ngram_lookup_word presumably queries
   it; the value returned for a word that is not found is defined by the
   implementation -- confirm. */
void ngram_make_lookup_tree(NGRAM_INFO *ndata);
WORD_ID ngram_lookup_word(NGRAM_INFO *ndata, char *wordstr);
/* NOTE(review): parameters are unnamed here; presumably (ndata, word
   string), returning the N-gram word ID to reference for that word. */
WORD_ID make_ngram_ref(NGRAM_INFO *, char *);

/* Allocation and release of an NGRAM_INFO.
   ngram_info_new takes no arguments; it is declared with (void) so that
   the compiler rejects calls that pass arguments (an empty parameter list
   would leave the arguments unchecked).
   NOTE(review): presumably the returned object must be released with
   ngram_info_free -- confirm ownership rules in the implementation. */
NGRAM_INFO *ngram_info_new(void);
void ngram_info_free(NGRAM_INFO *ngram);
/* Load an N-gram model into ndata from file(s) given by path:
   init_ngram_bin from one binary N-gram file, init_ngram_arpa from a pair
   of ARPA files (lrfile / rlfile, presumably the LR and RL models).
   Both return void, so error reporting is up to the implementation --
   confirm how failures are handled. */
void init_ngram_bin(NGRAM_INFO *ndata, char *ngram_file);
void init_ngram_arpa(NGRAM_INFO *ndata, char *lrfile, char *rlfile);

/* Presumably outputs a summary of the N-gram data in ndata; the output
   destination is defined by the implementation. */
void print_ngram_info(NGRAM_INFO *ndata);

/* Included here, after NGRAM_INFO is defined; presumably this header
   provides WORD_INFO, which the prototype below needs. */
#include <sent/vocabulary.h>
/* Presumably sets up the mapping between the words of the dictionary
   (winfo) and the N-gram vocabulary (ndata) -- confirm which side is
   modified in the implementation. */
void make_voca_ref(NGRAM_INFO *ndata, WORD_INFO *winfo);

#endif /* __SENT_NGRAM2_H__ */
