/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer

  $Id: dictionary.cpp,v 1.15 2006/07/09 13:34:22 taku-ku Exp $;

  Copyright (C) 2001-2006 Taku Kudo <taku@chasen.org>
  Copyright (C) 2004-2006 Nippon Telegraph and Telephone Corporation

*/
#include <fstream>
#include "param.h"
#include "dictionary.h"
#include "common.h"
#include "utils.h"
#include "mempool.h"
#include "dictionary_rewriter.h"
#include "connector.h"
#include "context_id.h"
#include "utils.h"

namespace MeCab {

  // Magic constant used as an integrity check on compiled dictionaries.
  // Dictionary::compile() stores (file size XOR DictionaryMagicID) in the
  // first unsigned int of the output; Dictionary::open() XORs the stored
  // value with this constant again and compares it with the mapped size.
  static const unsigned int DictionaryMagicID = 0xef718f77u;

  // Progress callback passed to the double-array builder in
  // Dictionary::compile(); forwards to progress_bar() with a fixed label.
  int progress_bar_darts(size_t current, size_t total)
  {
    const int status = progress_bar("emitting double-array", current, total);
    return status;
  }

  // Maps a compiled dictionary file and points the in-memory views at it.
  //
  // Fields are read in the order Dictionary::compile() writes them:
  //   magic, version, type, index_type, lexsize, lsize, rsize,
  //   dsize, tsize, fsize, charset[32], double-array, tokens, features.
  //
  // file: path of the binary dictionary.
  // Returns true when the file passes all consistency checks. A failed
  // CHECK_CLOSE_FALSE streams the given message and leaves the function
  // early (presumably after closing the mapping and returning false, as the
  // macro name suggests -- confirm against the macro definition).
  bool Dictionary::open(const std::string &file)
  {
    MMAP_OPEN(char, dmmap_, file);

    // Reject files that are obviously too small before reading the header.
    CHECK_CLOSE_FALSE(dmmap_->size() >= 100)
      << "dictionary file is broken: " << file;

    char *ptr = dmmap_->begin();

    unsigned short index_type;
    unsigned int dsize;
    unsigned int tsize;
    unsigned int fsize;
    unsigned int magic;

    // The stored magic is (file size XOR DictionaryMagicID); undoing the XOR
    // must give back the size of the mapping.
    read_static<unsigned int>(&ptr, magic);
    CHECK_CLOSE_FALSE((magic ^ DictionaryMagicID) == dmmap_->size())
      << "dictionary file is broken: " << file;

    read_static<unsigned short>(&ptr, version_);
    CHECK_CLOSE_FALSE(version_ == DIC_VERSION)
      << "incompatible version: " << version_;

    read_static<unsigned short>(&ptr, type_);
    read_static<unsigned short>(&ptr, index_type);  // read to advance ptr only
    read_static<unsigned int>(&ptr, lexsize_);
    read_static<unsigned int>(&ptr, lsize_);
    read_static<unsigned int>(&ptr, rsize_);
    read_static<unsigned int>(&ptr, dsize);
    read_static<unsigned int>(&ptr, tsize);
    read_static<unsigned int>(&ptr, fsize);

    // Fixed-width, NUL-padded charset name.
    char *charset_begin = ptr;
    ptr += 32;

    // Validate the three section sizes against the bytes that are actually
    // mapped BEFORE doing any pointer arithmetic with them. The sizes come
    // from the file; adding them to ptr unchecked would form out-of-bounds
    // pointers, and a wrapped pointer could even compare equal to end().
    size_t remaining = static_cast<size_t>(dmmap_->end() - ptr);

    CHECK_CLOSE_FALSE(dsize <= remaining)
      << "dictionary file is broken: " << file;
    remaining -= dsize;

    CHECK_CLOSE_FALSE(tsize <= remaining)
      << "dictionary file is broken: " << file;
    remaining -= tsize;

    // The feature section must account for exactly the rest of the file.
    CHECK_CLOSE_FALSE(fsize == remaining)
      << "dictionary file is broken: " << file;

    // All sections are known to lie inside the mapping; publish the views.
    charset_ = charset_begin;

    da_.set_array(ptr);
    ptr += dsize;

    token_ = reinterpret_cast<Token *>(ptr);
    ptr += tsize;

    feature_ = ptr;

    return true;
  }

  // Hands dmmap_ to MMAP_CLOSE. Presumably this releases the mapping that
  // open() created via MMAP_OPEN -- confirm against the macro definition.
  void Dictionary::close()
  {
    MMAP_CLOSE(char, dmmap_);
  }

  // Compiles CSV lexicon files into one binary dictionary file.
  //
  // param:           supplies the profile values "dcharset" (input charset),
  //                  "charset" (output charset), "wakati", "type" and "posid".
  // dics:            CSV files to read; each line must tokenize into the five
  //                  columns surface, left id, right id, cost, feature.
  // matrix_file:     connection matrix in text form, tried first.
  // matrix_bin_file: binary connection matrix, tried if the text form fails.
  //                  If neither opens, left/right sizes are set to 1.
  // left_id_file,
  // right_id_file:   opened through ContextID the first time an entry has a
  //                  negative left or right id.
  // rewrite_file:    opened through DictionaryRewriter on the same occasion.
  // pos_id_file:     opened through POSIDGenerator when "posid" is set.
  // output:          path of the binary dictionary to write.
  //
  // Returns true. Failures are reported through CHECK_DIE (presumably fatal,
  // as the name suggests -- confirm against the macro definition).
  bool Dictionary::compile(Param &param,
                           const std::vector<std::string> &dics, // inputs
                           const char *matrix_file,
                           const char *matrix_bin_file,
                           const char *left_id_file,
                           const char *right_id_file,
                           const char *rewrite_file,
                           const char *pos_id_file,
                           const char *output)
  {
    Connector matrix;
    DictionaryRewriter *rewrite = 0;
    POSIDGenerator *posid = 0;
    ContextID *cid = 0;

    // (surface form, token) pairs; sorted by surface form further below.
    std::vector<std::pair<std::string, Token*> > dic;

    size_t offset  = 0;        // byte offset of the next feature in fbuf
    unsigned int lexsize = 0;  // number of accepted entries over all files
    std::string w, feature, ufeature, lfeature, rfeature, fbuf, key;
    int lid, rid, cost;

    std::string from = param.getProfileString("dcharset");
    std::string to = param.getProfileString("charset");
    bool wakati = param.getProfileInt("wakati");
    int  type = param.getProfileInt("type");
    bool make_posid = param.getProfileInt("posid");

    CHECK_DIE(! from.empty()) << "input dictionary charset is empty";
    CHECK_DIE(! to.empty())   << "output dictionary charset is empty";

    Iconv iconv;
    CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
      << "iconv_open() failed with from=" << from << " to=" << to;

    // Fall back to a 1x1 matrix when no connection matrix can be opened.
    if (! matrix.openText(matrix_file) && ! matrix.open(matrix_bin_file)) {
      matrix.set_left_size(1);
      matrix.set_right_size(1);
    }

    if (make_posid) {
      posid = new POSIDGenerator;
      CHECK_DIE(posid) << "allocation error";
      posid->open(pos_id_file);
    }

    for (size_t i = 0; i < dics.size(); ++i) {
      std::cout << "reading " << dics[i] << " ... ";

      std::ifstream ifs(dics[i].c_str());
      CHECK_DIE(ifs) << "no such file or directory: " << dics[i];

      char line[BUF_SIZE];

      size_t num = 0;  // entries accepted from this file

      // NOTE(review): getline() on a fixed buffer sets failbit when a line
      // does not fit, which ends this loop and silently drops the rest of
      // the file. Left unchanged here; worth a dedicated check.
      while (ifs.getline(line, sizeof(line))) {

        char *col[8];
        size_t n = tokenizeCSV(line, col, 5);
        CHECK_DIE(n == 5) << "format error: " << line;

        w = col[0];
        lid = std::atoi(col[1]);
        rid = std::atoi(col[2]);
        cost = std::atoi(col[3]);
        feature = col[4];
        int pid = make_posid ? posid->id(feature.c_str()) : 0;

        // Negative ids mean "derive the context ids from the feature".
        if (lid < 0  || rid < 0) {

          if (! rewrite) {
            rewrite = new DictionaryRewriter;
            rewrite->open(rewrite_file);
          }

          CHECK_DIE(rewrite->rewrite(feature, ufeature, lfeature, rfeature))
            << "rewrite failed: " << feature;

          if (! cid) {
            cid = new ContextID;
            cid->open(left_id_file, right_id_file);
            CHECK_DIE( cid->left_size()  == matrix.left_size() &&
                       cid->right_size() == matrix.right_size() )
                         << "Context ID files ("
                         << left_id_file
                         << " or "
                         << right_id_file << " may be broken";
          }

          lid = cid->lid(lfeature.c_str());
          rid = cid->rid(rfeature.c_str());
        }

        CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
          << "invalid ids are found lid=" << lid << " rid=" << rid;

        if (w.empty()) {
          std::cerr << "empty word is found, discard this line" << std::endl;
          continue;
        }

        if (! iconv.convert(w) || ! iconv.convert(feature)) {
          std::cerr << "iconv conversion failed. skip this entry" << std::endl;
          continue;
        }

        // In wakati mode no per-entry feature is stored, so key stays empty
        // and every token keeps feature offset 0.
        key = "";
        if (!wakati) key = feature + '\0';

        Token* token  = new Token;
        token->lcAttr = lid;
        token->rcAttr = rid;
        token->posid  = pid;
        token->wcost = cost;
        token->feature = offset;
        token->compound = 0;
        // Let make_pair deduce its types: explicit template arguments make
        // the parameters rvalue references from C++11 on, and the lvalue
        // arguments here would no longer bind.
        dic.push_back(std::make_pair(w, token));

        // append to output buffer
        if (!wakati) fbuf.append(key.data(), key.size());
        offset += key.size();

        ++num;
        ++lexsize;
      }

      std::cout << num << std::endl;
    }

    delete cid;
    delete rewrite;
    delete posid;

    // The grouping code below indexes dic[0] and takes &str[0]/&val[0];
    // with no entries at all that would be undefined behaviour.
    CHECK_DIE(! dic.empty()) << "no dictionary entries found";

    // Wakati mode: a single NUL serves as the feature for all tokens.
    if (wakati) fbuf.append("\0", 1);

    std::sort(dic.begin(), dic.end());

    // Collapse runs of identical surface forms into one double-array key.
    // The stored value packs the run length into the low 8 bits and the
    // index of the run's first token into the bits above.
    size_t bsize = 0;  // length of the current run
    size_t idx = 0;    // index of the first entry of the current run
    std::string prev;
    std::vector <const char *> str;
    std::vector <Darts::DoubleArray::result_type> val;

    for (size_t i = 0; i < dic.size(); ++i) {
      if (i != 0 && prev != dic[i].first) {
        // A run of 256 or more would carry into the index bits.
        CHECK_DIE(bsize < 256)
          << "too many entries share one surface form: " << dic[idx].first;
        str.push_back(dic[idx].first.c_str());
        val.push_back(bsize + (idx << 8));
        bsize = 1;
        idx = i;
      } else {
        ++bsize;
      }
      prev = dic[i].first;
    }
    // Flush the final run.
    CHECK_DIE(bsize < 256)
      << "too many entries share one surface form: " << dic[idx].first;
    str.push_back(dic[idx].first.c_str());
    val.push_back(bsize + (idx << 8));

    Darts::DoubleArray da;
    CHECK_DIE(da.build(str.size(), (char **)&str[0],
                       0, &val[0], &progress_bar_darts) == 0)
                         << "unkown error in building double-array";

    // Serialize the tokens in sorted order and release them.
    std::string tbuf;
    for (size_t i = 0; i < dic.size(); ++i) {
      tbuf.append(reinterpret_cast<char*>(dic[i].second), sizeof(Token));
      delete dic[i].second;
    }
    dic.clear();

    unsigned short dummy = 0;  // placeholder for the index_type field
    unsigned int lsize = matrix.left_size();
    unsigned int rsize = matrix.right_size();
    unsigned int dsize = da.unit_size() * da.size();
    unsigned int tsize = tbuf.size();
    unsigned int fsize = fbuf.size();

    unsigned short version = DIC_VERSION;
    // Write the type from a real unsigned short. Writing the first two
    // bytes of the int would emit the wrong half on big-endian hosts.
    unsigned short type_field = static_cast<unsigned short>(type);

    // Fixed 32-byte, NUL-padded charset name (at most 31 characters kept).
    char charset[32];
    std::fill(charset, charset + sizeof(charset), '\0');
    std::strncpy(charset, to.c_str(), 31);

    std::ofstream bofs(output, std::ios::binary|std::ios::out);
    CHECK_DIE(bofs) << "permission denied: " << output;

    // Placeholder; the real value needs the final file size (see below).
    unsigned int magic = 0;

    bofs.write(reinterpret_cast<char*>(&magic),      sizeof(unsigned int));
    bofs.write(reinterpret_cast<char*>(&version),    sizeof(unsigned short));
    bofs.write(reinterpret_cast<char*>(&type_field), sizeof(unsigned short));
    bofs.write(reinterpret_cast<char*>(&dummy),      sizeof(unsigned short));
    bofs.write(reinterpret_cast<char*>(&lexsize),    sizeof(unsigned int));
    bofs.write(reinterpret_cast<char*>(&lsize),      sizeof(unsigned int));
    bofs.write(reinterpret_cast<char*>(&rsize),      sizeof(unsigned int));
    bofs.write(reinterpret_cast<char*>(&dsize),      sizeof(unsigned int));
    bofs.write(reinterpret_cast<char*>(&tsize),      sizeof(unsigned int));
    bofs.write(reinterpret_cast<char*>(&fsize),      sizeof(unsigned int));
    bofs.write(reinterpret_cast<char*>(charset),     sizeof(charset));
    bofs.write(reinterpret_cast<char*>(da.array()), da.unit_size() * da.size());
    bofs.write(tbuf.data(), tbuf.size());
    bofs.write(fbuf.data(), fbuf.size());

    // save magic id: total file size XOR DictionaryMagicID, written over
    // the placeholder at the start of the file.
    magic = static_cast<unsigned int>(bofs.tellp());
    magic ^= DictionaryMagicID;
    bofs.seekp(0);
    bofs.write(reinterpret_cast<char*>(&magic), sizeof(unsigned int));

    bofs.close();

    return true;
  }
}

// Command-line entry point: opens every dictionary file given as a
// positional argument and prints its header information to stdout.
// Returns -1 when the command line cannot be parsed, 0 otherwise; a file
// that fails to open is reported on stderr and skipped.
int mecab_dict_info(int argc, char **argv)
{
  static const MeCab::Option long_options[] =
    {
      { "help",      'h',  0,   0,   "show this help and exit."  },
      { "version", 'v',  0,   0,  "show the version and exit"  },
      { 0, 0, 0, 0 }
    };

  MeCab::Param param;

  const bool parsed = param.open(argc, argv, long_options);
  if (! parsed) {
    std::cout << param.what() << "\n\n" <<  COPYRIGHT
              << "\ntry '--help' for more information." << std::endl;
    return -1;
  }

  const std::vector<std::string> &paths = param.rest_args();

  // help_version() returning false ends the program successfully.
  if (!param.help_version(long_options)) return 0;

  for (size_t i = 0; i < paths.size(); ++i) {
    const std::string &path = paths[i];

    MeCab::Dictionary dic;
    if (dic.open(path.c_str())) {
      std::cout << "filename:\t" << path << std::endl
                << "version:\t" << dic.version() << std::endl
                << "charset:\t" << dic.charset() << std::endl
                << "type:\t" << dic.type()   << std::endl
                << "size:\t" << dic.size() << std::endl
                << "left size:\t" << dic.lsize() << std::endl
                << "right size:\t" << dic.rsize() << std::endl
                << std::endl;
    } else {
      // Report the failure and move on to the next file.
      std::cerr << dic.what();
    }
  }

  return 0;
}

