/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer

  $Id: tagger.cpp,v 1.30 2006/07/09 15:18:41 taku-ku Exp $;

  Copyright (C) 2001-2006 Taku Kudo <taku@chasen.org>
  Copyright (C) 2004-2006 Nippon Telegraph and Telephone Corporation

*/
#include <cstring>
#include <iostream>

#include "viterbi.h"
#include "common.h"
#include "param.h"
#include "mecab.h"
#include "string_buffer.h"
#include "writer.h"
#include "connector.h"
#include "nbest_generator.h"
#include "stream_wrapper.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

// Global error-message accessors. They are only declared here; no definition
// appears in this part of the file. getTaggerError() returns the value of
// getGlobalError(), and createTagger() passes a failed tagger's what()
// message to setGlobalError().
const char *getGlobalError();
void setGlobalError(const char *str);

namespace {

  // Command-line / configuration options understood by the tagger. The table
  // is handed to Param::open(), Param::help() and Param::version().
  //
  // NOTE(review): the five columns appear to be long name, short flag,
  // default value, argument placeholder and help text -- confirm against the
  // declaration of MeCab::Option. The all-zero row terminates the table.
  const MeCab::Option long_options[] =
    {
      { "rcfile",             'r', 0,      "FILE",  "use FILE as resource file" },
      { "dicdir",             'd', 0,      "DIR",   "set DIR  as a system dicdir" },
      { "userdic",            'u', 0,      "FILE",  "use FILE as a user dictionary" },
      { "lattice-level",      'l', "0",    "INT",   "lattice information level (default 0)" },
      { "all-morphs",         'a', 0,      0,       "output all morphs (default false)" },
      { "output-format-type", 'O', 0,      "TYPE",  "set output format type (wakati,none,...)" },
      { "partial",            'p', 0,      0,       "partial parsing mode" },
      { "node-format",        'F', 0,      "STR",   "use STR as the user-defined node format" },
      { "unk-format",         'U', 0,      "STR",   "use STR as the user-defined unk format" },
      { "bos-format",         'B', 0,      "STR",   "use STR as the user-defined bos format" },
      { "eos-format",         'E', 0,      "STR",   "use STR as the user-defined eos format" },
      { "unk-feature",        'x', 0,      "STR",   "use STR as the feature for unknown word" },
      { "input-buffer-size",  'b', 0,      "INT",   "set input buffer size (default BUF_SIZE)" },
      { "allocate-sentence",  'C', 0,      0,       "allocate new memory for input sentence" },
      { "nbest",              'N', "1",    "INT",   "output N best results  (default 1)" },
      // Help text previously read "temparature".
      { "theta",              't', "0.75", "FLOAT", "set temperature parameter theta (default 0.75)" },
      { "output",             'o', 0,      "FILE",  "set the output file name" },
      { "version",            'v', 0,      0,       "show the version and exit." },
      { "help",               'h', 0,      0,       "show this help and exit." },
      { 0, 0, 0, 0, 0 }
    };
}

namespace MeCab
{
  // Concrete Tagger. open() configures the tokenizer, connector, Viterbi
  // analyzer and writer from a Param; the parse*/next*/formatNode methods
  // then run the analysis and format the result.
  //
  // Methods that return const char* without taking an output buffer return a
  // pointer into the internal buffer ostrs_, which is cleared by the next
  // such call.
  class TaggerImpl: public Tagger
  {
  private:
    Tokenizer tokenizer_;
    Connector connector_;
    Viterbi viterbi_;
    StringBuffer ostrs_;                // shared output buffer for the buffer-less overloads
    Writer writer_;
    scoped_ptr<NBestGenerator> nbest_;  // created on first use by parseNBestInit()/nextNode()
    const char* begin_;                 // input pointer stored by parseNBestInit(); not owned
    bool lattice_level_;                // set from "lattice-level" in open(); tested as a flag by parseNBestInit()
    whatlog what_;                      // message returned by what()

  public:
    bool  open(Param &);
    bool  open(int, char**);
    bool  open(const char*);
    void  close();
    const char* parse(const char*);
    const char* parse(const char*, size_t);
    const char* parse(const char*, size_t, char*, size_t);
    Node*  parseToNode(const char*);
    // No default for the length: a default of 0 made every one-argument call
    // on a TaggerImpl ambiguous with parseToNode(const char*).
    Node*  parseToNode(const char*, size_t);
    const char* parseNBest(size_t, const char*);
    const char* parseNBest(size_t, const char*, size_t);
    const char* parseNBest(size_t, const char*, size_t, char *, size_t);
    bool  parseNBestInit(const char*);
    bool  parseNBestInit(const char*, size_t);
    Node*  nextNode();
    const char* next();
    const char* next(char*, size_t);
    const char *formatNode(Node *);
    const char *formatNode(Node *, char *, size_t);
    const char* what();

    // begin_ is initialised as well: formatNode() and next() read it, and it
    // was previously indeterminate until the first parseNBestInit() call.
    TaggerImpl(): begin_(0), lattice_level_(false) {}
    virtual ~TaggerImpl() { this->close (); }
  };

  // Returns the message currently held by what_ (the text createTagger()
  // publishes through setGlobalError() when open() fails).
  const char *TaggerImpl::what()
  {
    const char *const message = what_.str();
    return message;
  }

  // Builds a Param from command-line style arguments (using long_options)
  // and forwards it to open(Param&).
  bool TaggerImpl::open(int argc, char **argv)
  {
    Param param;
    // NOTE(review): CHECK_CLOSE_FALSE is defined outside this file; judging by
    // its name and use it presumably closes the tagger and returns false when
    // the condition fails, with param.what() appended to the message -- confirm.
    CHECK_CLOSE_FALSE(param.open(argc, argv, long_options)) << param.what();
    return open (param);
  }

  // Builds a Param from a single argument string (using long_options) and
  // forwards it to open(Param&).
  bool TaggerImpl::open(const char *arg)
  {
    Param param;
    // NOTE(review): CHECK_CLOSE_FALSE is defined outside this file; judging by
    // its name and use it presumably closes the tagger and returns false when
    // the condition fails, with param.what() appended to the message -- confirm.
    CHECK_CLOSE_FALSE(param.open(arg, long_options)) << param.what();
    return open(param);
  }

  // Opens the tagger from an already-populated Param.
  //
  // Returns true once the tokenizer, connector, Viterbi analyzer and writer
  // have all been opened. Returns false when "help" or "version" is set (the
  // corresponding text is streamed into WHAT first) and, presumably via
  // CHECK_CLOSE_FALSE, when any component fails to open.
  // May modify param: the "dump" output format overrides two profile values.
  bool TaggerImpl::open(Param &param)
  {
    // Start from a closed state in case the tagger was opened before.
    close ();

    // Help request: nothing is opened; the help text becomes the message.
    // NOTE(review): WHAT is a macro defined outside this file; presumably it
    // streams into what_ -- confirm.
    if (param.getProfileInt("help")) {
      WHAT << param.help(long_options);
      close();
      return false;
    }

    // Version request: handled the same way as help.
    if (param.getProfileInt("version")) {
      WHAT << param.version(long_options);
      close();
      return false;
    }

    CHECK_CLOSE_FALSE(load_dictionary_resource(param)) << param.what();

    // The "dump" format forces lattice-level 3 and all-morphs on.
    // NOTE(review): the third setProfile argument presumably means
    // "overwrite an existing value" -- confirm against Param.
    std::string output_format = param.getProfileString("output-format-type");
    if (output_format == "dump") {
      param.setProfile("lattice-level", "3", true);
      param.setProfile("all-morphs",    "1", true);
    }

    // lattice_level_ is a bool, so only "zero or non-zero" is kept here.
    lattice_level_ = param.getProfileInt("lattice-level");

    // The Viterbi analyzer is given pointers to the tokenizer and connector,
    // so those two are opened first.
    CHECK_CLOSE_FALSE(tokenizer_.open(param)) << tokenizer_.what();
    CHECK_CLOSE_FALSE(connector_.open(param)) << connector_.what();
    CHECK_CLOSE_FALSE(viterbi_.open(param, &tokenizer_, &connector_)) << viterbi_.what();
    CHECK_CLOSE_FALSE(writer_.open(param)) << writer_.what();

    return true;
  }

  // Clears the lattice-level flag. No other member is touched here.
  void TaggerImpl::close()
  {
    lattice_level_ = false;
  }

  // Parses the NUL-terminated string str; see parse(const char*, size_t).
  // Returns 0 on failure, including when str is NULL.
  const char *TaggerImpl::parse(const char *str)
  {
    // std::strlen(NULL) is undefined behaviour. Passing length 0 lets the
    // NULL check in parseToNode(str, len) report "NULL pointer is given"
    // instead of crashing here.
    return parse(str, str ? std::strlen(str) : 0);
  }

  // Analyzes str (length len) and writes the result into the internal
  // buffer ostrs_. Returns a pointer into that buffer, or 0 when the
  // analysis or the writer fails. The buffer is cleared by the next call
  // that uses ostrs_.
  const char *TaggerImpl::parse(const char *str, size_t len)
  {
    Node *n = parseToNode(str, len);
    if (n == 0) {
      return 0;
    }

    ostrs_.clear();
    CHECK_0(writer_.write(ostrs_, str, n)) << writer_.what();
    ostrs_ << '\0';
    return ostrs_.str();
  }

  // Analyzes str (length len) and writes the result into a StringBuffer
  // built over the caller's buffer out of size len2. Returns os.str(), or 0
  // when the analysis or the writer fails or os.str() is null (reported as
  // "output buffer overflow").
  const char *TaggerImpl::parse(const char *str, size_t len, char *out, size_t len2)
  {
    Node *n = parseToNode(str, len);
    if (n == 0) {
      return 0;
    }

    StringBuffer os(out, len2);
    CHECK_0(writer_.write(os, str, n)) << writer_.what();
    os << '\0';
    CHECK_0(os.str()) << "output buffer overflow";
    return os.str();
  }

  // Formats a single node with writer_.writeNode() into the internal buffer
  // ostrs_ and returns a pointer into that buffer.
  // The sentence pointer passed to the writer is begin_, which is only
  // assigned by parseNBestInit().
  const char* TaggerImpl::formatNode(Node* node)
  {
    ostrs_.clear();
    // NOTE(review): CHECK_0 is defined outside this file; presumably it
    // returns 0 from this function when the condition is false -- confirm.
    CHECK_0(writer_.writeNode(ostrs_,(const char *)begin_, node)) <<  writer_.what();
    ostrs_ << '\0';  // terminate the C string explicitly
    return ostrs_.str();
  }

  // Formats a single node with writer_.writeNode() into a StringBuffer built
  // over the caller's buffer out of size len, and returns os.str().
  // The sentence pointer passed to the writer is begin_, which is only
  // assigned by parseNBestInit().
  const char* TaggerImpl::formatNode(Node* node, char *out, size_t len)
  {
    StringBuffer os(out, len);
    // NOTE(review): CHECK_0 is defined outside this file; presumably it
    // returns 0 from this function when the condition is false -- confirm.
    CHECK_0(writer_.writeNode(os, (const char *)begin_, node)) <<  writer_.what();
    os << '\0';  // terminate the C string explicitly
    // A null os.str() is reported as an overflow of the caller's buffer.
    CHECK_0(os.str()) << "output buffer overflow";
    return os.str();
  }

  // Analyzes the NUL-terminated string str; see
  // parseToNode(const char*, size_t). Returns 0 on failure, including when
  // str is NULL.
  Node *TaggerImpl::parseToNode(const char *str)
  {
    // std::strlen(NULL) is undefined behaviour. Passing length 0 lets the
    // two-argument overload's own NULL check report the error.
    return parseToNode(str, str ? std::strlen(str) : 0);
  }

  // Runs viterbi_.analyze() on str (length len) and returns the node it
  // yields. A NULL str is rejected with "NULL pointer is given"; a null
  // result from the analyzer is reported with viterbi_.what().
  // NOTE(review): CHECK_0 is defined outside this file; presumably it returns
  // 0 from this function when the condition is false -- confirm.
  Node *TaggerImpl::parseToNode(const char *str, size_t len)
  {
    CHECK_0(str) << "NULL pointer is given";
    Node *bosNode = viterbi_.analyze(str, len);
    CHECK_0(bosNode) << viterbi_.what();
    return bosNode;
  }

  // Prepares N-best enumeration for the NUL-terminated string str; see
  // parseNBestInit(const char*, size_t). Returns false on failure, including
  // when str is NULL.
  bool TaggerImpl::parseNBestInit(const char *str)
  {
    // std::strlen(NULL) is undefined behaviour. With length 0 the call chain
    // reaches the NULL check in parseToNode(str, len) and fails cleanly.
    return parseNBestInit(str, str ? std::strlen(str) : 0);
  }

  // Analyzes str (length len) and hands the result to the N-best generator,
  // creating the generator on first use. Requires a non-zero lattice level.
  // str is stored in begin_ and later read by next() and formatNode(), so
  // the caller must keep it alive while those are used.
  // Returns false when the lattice-level check or the analysis fails.
  bool TaggerImpl::parseNBestInit(const char *str, size_t len)
  {
    CHECK_FALSE(lattice_level_) << "use -l option to obtain N-Best results. e.g., mecab -N10 -l1";

    // begin_ is recorded even when the analysis below fails.
    begin_ = str;
    Node *n = parseToNode(str, len);
    if (n == 0) {
      return false;
    }

    if (nbest_.get() == 0) {
      nbest_.reset(new NBestGenerator);
    }
    nbest_->set(n);
    return true;
  }

  // Returns the next result from the N-best generator, creating the
  // generator if it does not exist yet. A null result is reported as
  // "no more results".
  Node* TaggerImpl::nextNode()
  {
    if (nbest_.get() == 0) {
      nbest_.reset(new NBestGenerator);
    }

    Node *n = nbest_->next();
    CHECK_0(n) << "no more results";
    return n;
  }

  // Formats the next N-best result into the internal buffer ostrs_ and
  // returns a pointer into it, or 0 when nextNode() yields nothing or the
  // writer fails. The sentence pointer passed to the writer is begin_.
  const char* TaggerImpl::next()
  {
    Node *n = nextNode();
    if (n == 0) {
      return 0;
    }

    ostrs_.clear();
    CHECK_0(writer_.write(ostrs_, (const char *)begin_, n)) << writer_.what();
    ostrs_ << '\0';
    return ostrs_.str();
  }

  // Formats the next N-best result into a StringBuffer built over the
  // caller's buffer out of size len2 and returns os.str(). Returns 0 when
  // nextNode() yields nothing, the writer fails, or os.str() is null
  // (reported as "output buffer overflow").
  const char* TaggerImpl::next(char *out, size_t len2)
  {
    Node *n = nextNode();
    if (n == 0) {
      return 0;
    }

    StringBuffer os(out, len2);
    CHECK_0(writer_.write(os, (const char *)begin_, n)) << writer_.what();
    os << '\0';
    CHECK_0(os.str()) << "output buffer overflow";
    return os.str();
  }

  // Produces up to N results for the NUL-terminated string str; see
  // parseNBest(size_t, const char*, size_t). Returns 0 on failure, including
  // when str is NULL.
  const char* TaggerImpl::parseNBest(size_t N, const char* str)
  {
    // std::strlen(NULL) is undefined behaviour. With length 0 the call chain
    // reaches the NULL check in parseToNode(str, len) and fails cleanly.
    return parseNBest(N, str, str ? std::strlen(str) : 0);
  }

  // Writes up to N results for str (length len) back to back into the
  // internal buffer ostrs_ and returns a pointer into it.
  // N == 1 is delegated to parse(str, len). Enumeration stops early when
  // nextNode() has no further result. Returns 0 when initialisation or the
  // writer fails.
  const char* TaggerImpl::parseNBest(size_t N, const char* str, size_t len)
  {
    if (N == 1) {
      return parse(str, len);
    }
    if (! parseNBestInit(str, len)) {
      return 0;
    }

    ostrs_.clear();

    size_t remaining = N;
    while (remaining > 0) {
      --remaining;
      Node *n = nextNode();
      if (n == 0) {
        break;
      }
      CHECK_0(writer_.write(ostrs_, str, n)) << writer_.what();
    }

    ostrs_ << '\0';
    return ostrs_.str();
  }

  // Writes up to N results for str (length len) back to back into a
  // StringBuffer built over the caller's buffer out of size len2, and
  // returns os.str().
  // N == 1 is delegated to parse(str, len, out, len2). Enumeration stops
  // early when nextNode() has no further result. Returns 0 when
  // initialisation or the writer fails, or when os.str() is null (reported
  // as "output buffer overflow").
  const char* TaggerImpl::parseNBest(size_t N, const char* str, size_t len,
                                     char *out, size_t len2)
  {
    if (N == 1) {
      return parse(str, len, out, len2);
    }
    if (! parseNBestInit(str, len)) {
      return 0;
    }

    StringBuffer os(out, len2);

    size_t remaining = N;
    while (remaining > 0) {
      --remaining;
      Node *n = nextNode();
      if (n == 0) {
        break;
      }
      CHECK_0(writer_.write(os, str, n)) << writer_.what();
    }

    os << '\0';
    CHECK_0(os.str()) << "output buffer overflow";
    return os.str();
  }

  // Forwards to createTagger(int, char**) and returns its result.
  Tagger *Tagger::create(int argc, char **argv)
  {
    Tagger *const tagger = createTagger(argc, argv);
    return tagger;
  }

  // Forwards to createTagger(const char*) and returns its result.
  Tagger *Tagger::create(const char *arg)
  {
    Tagger *const tagger = createTagger(arg);
    return tagger;
  }

  // Returns the VERSION macro as a C string.
  // NOTE(review): VERSION presumably comes from config.h -- confirm.
  const char *Tagger::version()
  {
    const char *const version_string = VERSION;
    return version_string;
  }

  // Allocates a TaggerImpl and opens it with command-line style arguments.
  // Returns the new tagger, which the caller must delete, or 0 on failure.
  // On failure the tagger's message is stored with setGlobalError().
  Tagger *createTagger(int argc, char **argv)
  {
    TaggerImpl *tagger = new TaggerImpl();
    if (tagger->open(argc, argv)) {
      return tagger;
    }

    // Publish the message before deleting the tagger it comes from.
    setGlobalError(tagger->what());
    delete tagger;
    return 0;
  }

  // Allocates a TaggerImpl and opens it with a single argument string.
  // Returns the new tagger, which the caller must delete, or 0 on failure.
  // On failure the tagger's message is stored with setGlobalError().
  Tagger *createTagger(const char *argv)
  {
    TaggerImpl *tagger = new TaggerImpl();
    if (tagger->open(argv)) {
      return tagger;
    }

    // Publish the message before deleting the tagger it comes from.
    setGlobalError(tagger->what());
    delete tagger;
    return 0;
  }

  // Returns the message reported by getGlobalError().
  const char *getTaggerError()
  {
    const char *const message = getGlobalError();
    return message;
  }
}

// Command-line driver. Parses the options, opens a tagger, then reads every
// input named in the remaining arguments ("-" when there are none) and
// writes the analysis of each line or sentence to the file named by
// "output" ("-" when unset).
//
// Returns EXIT_SUCCESS after all inputs have been consumed, and also when
// option parsing fails (the Param message is printed to stdout).
// Returns EXIT_FAILURE for an out-of-range N, a tagger that cannot be
// opened, an input or output that cannot be opened, or a failed parse.
int mecab_do(int argc, char **argv)
{
// Prints msg to stdout and leaves mecab_do() with a failure status.
// There is no trailing semicolon; the call site supplies it.
#define WHAT_ERROR(msg) do { std::cout << msg << std::endl; \
                             return EXIT_FAILURE; } while (0)

  MeCab::TaggerImpl tagger;
  MeCab::Param param;

  if (! param.open(argc, argv, long_options)) {
    std::cout << param.what() << std::endl;
    return EXIT_SUCCESS;
  }

  std::string ofilename = param.getProfileString("output");
  if (ofilename.empty()) ofilename = "-";

  const int nbest = param.getProfileInt("nbest");
  if (nbest <= 0 || nbest > NBEST_MAX)
    WHAT_ERROR("invalid N value");

  // parseNBestInit() refuses to run with lattice level 0, so N-best output
  // forces the level to 1.
  if (nbest >= 2)
    param.setProfile("lattice-level", "1", true);

  if (! tagger.open(param)) {
    std::cout << tagger.what() << std::endl;
    // Return instead of std::exit() so that the local objects are destroyed,
    // matching every other failure path in this function.
    return EXIT_FAILURE;
  }

  MeCab::ostream_wrapper ofs(ofilename.c_str());
  if (! *ofs)
    WHAT_ERROR("no such file or directory: " << ofilename);

  std::vector<std::string> rest = param.rest_args();
  if (rest.empty()) rest.push_back("-");

  // Clamp the requested buffer size to
  // [MIN_INPUT_BUFFER_SIZE, MAX_INPUT_BUFFER_SIZE].
  size_t ibufsize = MeCab::_min(MAX_INPUT_BUFFER_SIZE,
                                MeCab::_max(param.getProfileInt("input-buffer-size"),
                                            MIN_INPUT_BUFFER_SIZE));

  const bool partial = param.getProfileInt("partial");
  // Partial mode feeds a whole multi-line sentence at once, so it gets a
  // larger buffer.
  if (partial) ibufsize *= 8;

  MeCab::scoped_array<char> ibuf_(new char [ibufsize]);
  char *ibuf = ibuf_.get();

  for (size_t i = 0; i < rest.size(); ++i) {

    MeCab::istream_wrapper ifs(rest[i].c_str());
    if (! *ifs) WHAT_ERROR("no such file or directory: " << rest[i]);

    while (true) {
      if (! partial) {
        ifs->getline(ibuf, ibufsize);
      } else {
        // Collect lines up to and including an "EOS" line or an empty line.
        std::string sentence;
        char line[BUF_SIZE];
        for (;;) {
          if (! ifs->getline(line, sizeof(line))) {
            ifs->clear(std::ios::eofbit|std::ios::badbit);
            break;
          }
          sentence += line;
          sentence += '\n';
          if (std::strcmp(line, "EOS") == 0 || line[0] == '\0')
            break;
        }
        if (sentence.size() >= ibufsize) {
          std::cerr << "input-buffer overflow. The sentence is truncated. use -b #SIZE option." << std::endl;
        }
        // strncpy() does not terminate the destination when the source is
        // ibufsize bytes or longer, so copy one byte less and terminate
        // explicitly. Assumes ibufsize > 0, which holds if
        // MIN_INPUT_BUFFER_SIZE is positive.
        std::strncpy(ibuf, sentence.c_str(), ibufsize - 1);
        ibuf[ibufsize - 1] = '\0';
      }

      // End of this input: continue with the next one. The former
      // "return false" left the function here, so any further input files
      // were silently ignored.
      if (ifs->eof() && !ibuf[0]) break;

      if (ifs->fail()) {
        std::cerr << "input-buffer overflow. The line is split. use -b #SIZE option." << std::endl;
        ifs->clear();
      }

      const char *r = (nbest >= 2) ? tagger.parseNBest(nbest, ibuf) : tagger.parse(ibuf);
      if (! r)  WHAT_ERROR(tagger.what());
      // Flush after every result so that output follows input immediately.
      *ofs << r << std::flush;
    }
  }

  return EXIT_SUCCESS;

#undef WHAT_ERROR
}
