/* Copyright (c) 1991-2002 Doshita Lab. Speech Group, Kyoto University */
/* Copyright (c) 2000-2002 Speech and Acoustics Processing Lab., NAIST */
/*   All rights reserved   */

/* ngram_access.c --- access N-gram data and return probability */

/* $Id: ngram_access.c,v 1.3 2002/09/11 22:01:50 ri Exp $ */

#include <sent/stddefs.h>
#include <sent/ngram2.h>

/* search_bigram: look up the bigram entry for the word pair (w_l, w_r). */
/* The entries belonging to w_l start at index ndata->n2_bgn[w_l] and */
/* there are ndata->n2_num[w_l] of them; ndata->n2tonid[] holds the */
/* right-hand word ID of each entry.  The entries of one w_l are assumed */
/* to be ordered by ascending word ID, which is what makes the binary */
/* search valid. */
/* Returns the index of the matching entry, or NNID_INVALID if w_l has */
/* no bigram entries or none of them has w_r as its right-hand word. */
NNID
search_bigram(NGRAM_INFO *ndata, WORD_ID w_l, WORD_ID w_r)
{
  NNID lo, hi, end, mid;

  lo = ndata->n2_bgn[w_l];
  if (lo == NNID_INVALID) {	/* w_l has no bigram */
    return (NNID_INVALID);
  }

  /* Search the half-open range [lo, end): an empty range (count of 0) */
  /* then falls straight through without touching n2tonid[]. */
  end = lo + ndata->n2_num[w_l];
  hi = end;
  while (lo < hi) {
    /* written as lo + (hi - lo) / 2 so that the sum lo + hi is never */
    /* formed and cannot overflow for large indices */
    mid = lo + (hi - lo) / 2;
    if (ndata->n2tonid[mid] < w_r) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }

  /* lo is now the first entry whose word ID is >= w_r, or end if none */
  if (lo < end && ndata->n2tonid[lo] == w_r) {
    return (lo);
  }
  return (NNID_INVALID);
}

/* search_trigram: look up the trigram entry that extends bigram entry */
/* n2 with the word wkey. */
/* The trigram entries belonging to n2 start at index ndata->n3_bgn[n2] */
/* and there are ndata->n3_num[n2] of them; ndata->n3tonid[] holds the */
/* word ID of each entry.  The entries of one n2 are assumed to be */
/* ordered by ascending word ID, which is what makes the binary search */
/* valid. */
/* Returns the index of the matching entry, or NNID_INVALID if n2 has */
/* no trigram entries or none of them carries wkey. */
NNID
search_trigram(NGRAM_INFO *ndata, NNID n2, WORD_ID wkey)
{
  /* Index variables use NNID, the same type as the stored begin index, */
  /* the NNID_INVALID sentinel and the return value (and the same type */
  /* search_bigram() uses), so no int <-> NNID conversion is involved. */
  NNID lo, hi, end, mid;

  lo = ndata->n3_bgn[n2];
  if (lo == NNID_INVALID) {	/* n2 has no trigram */
    return (NNID_INVALID);
  }

  /* Search the half-open range [lo, end): an empty range (count of 0) */
  /* then falls straight through without touching n3tonid[]. */
  end = lo + ndata->n3_num[n2];
  hi = end;
  while (lo < hi) {
    /* written as lo + (hi - lo) / 2 so that the sum lo + hi is never */
    /* formed and cannot overflow for large indices */
    mid = lo + (hi - lo) / 2;
    if (ndata->n3tonid[mid] < wkey) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }

  /* lo is now the first entry whose word ID is >= wkey, or end if none */
  if (lo < end && ndata->n3tonid[lo] == wkey) {
    return (lo);
  }
  return (NNID_INVALID);
}


/* ---------------------------------------------------------------------- */
/* for 1-gram */
/* uni_prob: return the unigram score p(w) stored in ndata->p[w]. */
/* When w is the unknown-word ID (ndata->unk_id), ndata->unk_num_log is */
/* subtracted from the stored value before it is returned. */
/* NOTE(review): unk_num_log is presumably the log of the number of */
/* words sharing the unknown-word entry -- confirm against the loader. */
LOGPROB
uni_prob(NGRAM_INFO *ndata, WORD_ID w)
{
  if (w == ndata->unk_id) {
    /* unknown word: apply the unknown-word discount */
    return (ndata->p[w] - ndata->unk_num_log);
  }
  return (ndata->p[w]);
}

/* for 2-gram */
/* bi_prob_lr: return the bigram score p(w2|w1). */
/* If search_bigram() finds an entry for (w1, w2), its value in */
/* ndata->p_lr[] is used.  Otherwise the back-off score */
/* bo_wt_lr(w1) * p(w2) is used; the product is written as a sum */
/* (presumably because the stored values are logarithms). */
/* When w2 is the unknown-word ID, ndata->unk_num_log is subtracted */
/* from the result in either case. */
LOGPROB
bi_prob_lr(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
{
  LOGPROB score;
  NNID entry;

  entry = search_bigram(ndata, w1, w2);
  if (entry == NNID_INVALID) {
    /* no such bigram: back off to the unigram of w2 */
    score = ndata->bo_wt_lr[w1] + ndata->p[w2];
  } else {
    /* bigram exists */
    score = ndata->p_lr[entry];
  }

  if (w2 == ndata->unk_id) {
    /* predicted word is the unknown word: apply its discount */
    return (score - ndata->unk_num_log);
  }
  return (score);
}

/* bi_prob_rl: return the bigram score p(w1|w2). */
/* The entry is looked up with search_bigram() under the pair (w1, w2), */
/* the same key bi_prob_lr() uses, but the value is taken from */
/* ndata->p_rl[].  If no entry exists, the back-off score */
/* bo_wt_rl(w2) * p(w1) is used; the product is written as a sum */
/* (presumably because the stored values are logarithms). */
/* When w1 is the unknown-word ID, ndata->unk_num_log is subtracted */
/* from the result in either case. */
LOGPROB
bi_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2)
{
  LOGPROB score;
  NNID entry;

  entry = search_bigram(ndata, w1, w2);
  if (entry == NNID_INVALID) {
    /* no such bigram: back off to the unigram of w1 */
    score = ndata->bo_wt_rl[w2] + ndata->p[w1];
  } else {
    /* bigram exists */
    score = ndata->p_rl[entry];
  }

  if (w1 == ndata->unk_id) {
    /* predicted word is the unknown word: apply its discount */
    return (score - ndata->unk_num_log);
  }
  return (score);
}

/* for 3-gram */
/* tri_prob_rl: return the trigram score p(w1|w2,w3). */
/* The context (w2, w3) is looked up with search_bigram(), then w1 is */
/* looked up among that context's trigram entries with */
/* search_trigram().  Three outcomes: */
/*   - context missing:  the plain bigram score bi_prob_rl(w1, w2); */
/*   - trigram missing:  back-off, bo_wt_rrl(w2,w3) * p(w1|w2), written */
/*                       as a sum; */
/*   - trigram found:    the value in ndata->p_rrl[], minus */
/*                       ndata->unk_num_log when w1 is the unknown word. */
/* In the first two cases the unknown-word discount is already applied */
/* inside bi_prob_rl(), so it is not applied again here. */
LOGPROB
tri_prob_rl(NGRAM_INFO *ndata, WORD_ID w1, WORD_ID w2, WORD_ID w3)
{
  NNID context, entry;

  context = search_bigram(ndata, w2, w3);
  if (context == NNID_INVALID) {
    /* context does not exist, so return the bigram score */
    return (bi_prob_rl(ndata, w1, w2));
  }

  entry = search_trigram(ndata, context, w1);
  if (entry == NNID_INVALID) {
    /* trigram does not exist: back-off weight of the context plus the */
    /* bigram score (unknown word already discounted at the bigram) */
    return (ndata->bo_wt_rrl[context] + bi_prob_rl(ndata, w1, w2));
  }

  /* trigram exists */
  if (w1 == ndata->unk_id) {
    return (ndata->p_rrl[entry] - ndata->unk_num_log);
  }
  return (ndata->p_rrl[entry]);
}
