/*
  MeCab -- Yet Another Part-of-Speech and Morphological Analyzer

  $Id: writer.cpp,v 1.14 2006/07/09 11:26:27 taku-ku Exp $;

  Copyright (C) 2001-2006 Taku Kudo <taku@chasen.org>
  Copyright (C) 2004-2006 Nippon Telegraph and Telephone Corporation

*/
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include "common.h"
#include "utils.h"
#include "string_buffer.h"
#include "writer.h"
#include "param.h"

namespace MeCab {

  // Constructs a writer that defaults to lattice output.
  // _write is initialised here so that calling write() before open() does not
  // dispatch through an indeterminate member-function pointer; this is the
  // same value close() assigns.
  Writer::Writer() : _write(&Writer::writeLattice) {}
  // Tear-down delegates to close().
  Writer::~Writer()
  {
    close();
  }

  void Writer::close()
  {
    _write = &Writer::writeLattice;
  }

  // Configures the writer from the profile values held in `param`.
  //
  // The value of "output-format-type" selects the output routine:
  //   "wakati" -> writeWakati (surface-only node/unk formats, empty bos/eos)
  //   "none"   -> writeNone
  //   "dump"   -> writeDump
  //   other    -> formats are read from "node-format", "bos-format",
  //               "eos-format" and "unk-format"; when the type is non-empty
  //               each key gets a "-<type>" suffix and the node format for
  //               that type must be non-empty. If the resulting node format
  //               is non-empty writeUser is used, otherwise writeLattice.
  //
  // Returns true on success.
  // NOTE(review): CHECK_FALSE is defined outside this file; it is presumably
  // what reports "unknown format type" and makes open() return false --
  // confirm against its definition.
  bool Writer::open(Param &param)
  {
    std::string ostyle = param.getProfileString("output-format-type");
    _write = &Writer::writeLattice;

    // Default formats: surface, TAB, feature string.
    node_format_.reset_string("%m\t%H");
    unk_format_.reset_string("%m\t%H");
    bos_format_.reset_string("%m\t%H");
    // Fix: this statement previously reset bos_format_ a second time
    // (copy-paste slip), so eos_format_ never received its default.
    eos_format_.reset_string("%m\t%H");

    if (ostyle == "wakati") {
      _write = &Writer::writeWakati;
      node_format_.reset_string("%m");
      unk_format_.reset_string("%m");
      bos_format_.reset_string("");
      eos_format_.reset_string("");
    } else if (ostyle == "none") {
      _write = &Writer::writeNone;
    } else if (ostyle == "dump") {
      _write = &Writer::writeDump;
    } else {
      // Profile keys for the user-defined formats.
      std::string nfk = "node-format";
      std::string bfk = "bos-format";
      std::string efk = "eos-format";
      std::string ufk = "unk-format";

      if (! ostyle.empty()) {
        // A named format type looks up "<key>-<type>" instead of "<key>".
        nfk += "-"; nfk += ostyle;
        bfk += "-"; bfk += ostyle;
        efk += "-"; efk += ostyle;
        ufk += "-"; ufk += ostyle;

        // A named type without a node format is treated as unknown.
        CHECK_FALSE(! std::string(param.getProfileString(nfk.c_str())).empty())
          << "unknown format type [" << ostyle << "]";
      }

      node_format_.reset_string(param.getProfileString(nfk.c_str()));
      bos_format_.reset_string(param.getProfileString(bfk.c_str()));
      eos_format_.reset_string(param.getProfileString(efk.c_str()));

      // Unknown-word nodes fall back to the ordinary node format.
      std::string uf = param.getProfileString(ufk.c_str());
      if (uf.empty()) uf = param.getProfileString(nfk.c_str());
      unk_format_.reset_string(uf);

      // Only switch to the user-format writer when a node format exists.
      if (*node_format_ != '\0') _write = &Writer::writeUser;
    }

    return true;
  }

  // Forwards to whichever output routine _write currently points at and
  // returns that routine's result.
  bool Writer::write(StringBuffer &os, const char* str, Node *bosNode)
  {
    Writer &self = *this;
    return (self.*_write)(os, str, bosNode);
  }

  // Emits one "surface<TAB>feature" line for every node between the node
  // after bosNode and the last node of the chain (the node whose `next` is
  // null is not printed), then a closing "EOS" line. `str` is not used.
  bool Writer::writeLattice(StringBuffer &os, const char* str, Node *bosNode)
  {
    Node *cur = bosNode->next;
    while (cur->next) {
      os.write(cur->surface, cur->length);
      os << '\t' << cur->feature << '\n';
      cur = cur->next;
    }
    os << "EOS\n";
    return true;
  }

  // Emits the surface of every node between the node after bosNode and the
  // last node of the chain, each followed by a single space, and finishes
  // the line with a newline.
  bool Writer::writeWakati(StringBuffer &os, const char*, Node *bosNode)
  {
    Node *cur = bosNode->next;
    while (cur->next) {
      os.write(cur->surface, cur->length);
      os << ' ';
      cur = cur->next;
    }
    os << '\n';
    return true;
  }

  // Output routine for the "none" format type: produces no output and
  // always reports success.
  bool Writer::writeNone(StringBuffer &, const char*, Node *)
  {
    const bool ok = true;
    return ok;
  }

  // Writes a sentence using the user-supplied formats: bos_format_ for
  // bosNode, unk_format_ or node_format_ for each following node that still
  // has a successor, and eos_format_ for the final node of the chain.
  // Stops and returns false as soon as any writeNode call fails.
  bool Writer::writeUser(StringBuffer &os, const char* str, Node *bosNode)
  {
    if (! writeNode(os, bos_format_.get(), str, bosNode)) return false;

    Node *cur = bosNode->next;
    while (cur->next) {
      char *format = 0;
      if (cur->stat == MECAB_UNK_NODE) {
        format = unk_format_.get();
      } else {
        format = node_format_.get();
      }
      if (! writeNode(os, format, str, cur)) return false;
      cur = cur->next;
    }

    // cur now refers to the last node of the chain.
    return writeNode(os, eos_format_.get(), str, cur) ? true : false;
  }

  // Dumps every node of the chain starting at bosNode, one line per node.
  // Each line holds, separated by single spaces: id, surface (or the literal
  // "BOS"/"EOS"), feature, start and end offsets of the surface relative to
  // `str`, rcAttr, lcAttr, posid, char_type, stat, isbest, alpha, beta, prob,
  // cost, followed by one "lnode-id:cost:prob" entry per path in the node's
  // lpath list.
  bool Writer::writeDump(StringBuffer &os, const char* str, Node *bosNode)
  {
    Node *cur = bosNode;
    while (cur) {
      os << cur->id << ' ';
      switch (cur->stat) {
      case MECAB_BOS_NODE:
        os << "BOS";
        break;
      case MECAB_EOS_NODE:
        os << "EOS";
        break;
      default:
        os.write(cur->surface, cur->length);
        break;
      }

      os << ' ' << cur->feature;
      os << ' ' << static_cast<int>(cur->surface - str);
      os << ' ' << static_cast<int>(cur->surface - str + cur->length);
      os << ' ' << cur->rcAttr;
      os << ' ' << cur->lcAttr;
      os << ' ' << cur->posid;
      os << ' ' << static_cast<int>(cur->char_type);
      os << ' ' << static_cast<int>(cur->stat);
      os << ' ' << static_cast<int>(cur->isbest);
      os << ' ' << cur->alpha;
      os << ' ' << cur->beta;
      os << ' ' << cur->prob;
      os << ' ' << cur->cost;

      Path *link = cur->lpath;
      while (link) {
        os << ' ' << link->lnode->id << ':' << link->cost << ':' << link->prob;
        link = link->lnext;
      }
      os << '\n';

      cur = cur->next;
    }
    return true;
  }

  // Writes a single node with the format that matches its `stat`:
  // bos_format_, eos_format_, unk_format_ or node_format_. A node whose stat
  // is none of the four known values produces no output and yields true.
  bool Writer::writeNode(StringBuffer&os, const char *sentence, Node *node)
  {
    if (node->stat == MECAB_BOS_NODE) {
      return writeNode(os, bos_format_.get(), sentence, node);
    }
    if (node->stat == MECAB_EOS_NODE) {
      return writeNode(os, eos_format_.get(), sentence, node);
    }
    if (node->stat == MECAB_UNK_NODE) {
      return writeNode(os, unk_format_.get(), sentence, node);
    }
    if (node->stat == MECAB_NOR_NODE) {
      return writeNode(os, node_format_.get(), sentence, node);
    }
    return true;
  }

  // Expands the format string `p` for `node` and appends the result to `os`.
  //
  // Ordinary characters are copied verbatim; "\x" is replaced by
  // getEscapedChar('x'); "%..." introduces a macro:
  //   %S  input sentence              %L  sentence length
  //   %m  surface                     %M  surface including leading space
  //   %h  posid                       %%  literal '%'
  //   %c  word cost (as int)          %H  feature string
  //   %t  char_type                   %s  stat
  //   %P  prob
  //   %pi id   %pS leading space   %ps start offset   %pe end offset
  //   %pC connection cost   %pw word cost   %pc cost   %pn node cost
  //   %pb '*' if isbest else ' '   %pP prob   %pA alpha   %pB beta
  //   %pl length   %pL rlength   %phl lcAttr   %phr rcAttr
  //   %pp{i,c,P}<sep>  id / cost / prob of every lpath entry, joined by <sep>
  //   %f[i,j,...]      selected CSV fields of the feature, joined by TAB,
  //                    skipping fields that start with '*'
  //   %F<sep>[i,j,...] same as %f with an explicit separator
  //
  // Returns true when the whole format was expanded.
  // NOTE(review): CHECK_FALSE is defined outside this file; the code relies
  // on it to record the streamed message and make this function return
  // false when its condition does not hold -- confirm against its definition.
  //
  // Fixes relative to the previous revision:
  //   * tokenizeCSV is given the element count of `ptr` (64) rather than its
  //     size in bytes, which allowed writes past the end of the array;
  //   * the feature copy in `buf` is always NUL-terminated;
  //   * a format ending in the middle of "\", "%pp.." or "%F.." no longer
  //     advances `p` past the terminating NUL.
  bool Writer::writeNode(StringBuffer&os, const char *p, const char *sentence, Node *node)
  {
    char buf[BUF_SIZE];   // writable copy of node->feature for tokenizeCSV
    char *ptr[64];        // pointers to the CSV fields inside buf
    size_t psize = 0;     // number of valid entries in ptr (0 = not parsed yet)

    for (; *p; p++) {
      switch (*p) {

      default: os << *p; break;

      case '\\':
        // A trailing backslash has nothing to escape; stepping over the
        // terminator would make the loop read beyond the format string.
        CHECK_FALSE(p[1] != '\0') << "no character follows the escape '\\'";
        os << getEscapedChar(*++p);
        break;

      case '%': { // macros
        switch (*++p) {
        default: CHECK_FALSE(false) << "unkonwn meta char " << *p;
        case 'S': os.write(sentence, std::strlen(sentence)); break;   // input sentence
        case 'L': os << std::strlen(sentence); break;  // sentence length
        case 'm': os.write(node->surface, node->length); break; // morph
        // morph together with the (rlength - length) bytes that precede it
        case 'M': os.write(node->surface - node->rlength + node->length, node->rlength); break;
        case 'h': os << node->posid; break; // Part-Of-Speech ID
        case '%': os << '%'; break;        // %
        case 'c': os << static_cast<int>(node->wcost); break; // word cost
        case 'H': os << node->feature; break; // feature
        case 't': os << static_cast<unsigned int>(node->char_type); break; // character type
        case 's': os << static_cast<unsigned int>(node->stat); break; // node status
        case 'P': os << node->prob; break;
        case 'p': {
          switch (*++p) {
          default: CHECK_FALSE(false) << "[iseSCwcnblLh] is required after %p";
          case 'i': os << node->id; break; // node id
          case 'S': os.write(node->surface - node->rlength + node->length, node->rlength - node->length); break; // space
          case 's': os << static_cast<int>(node->surface - sentence); break; // start position
          case 'e': os << static_cast<int>(node->surface - sentence + node->length); break; // end position
          case 'C': os << node->cost - node->prev->cost - node->wcost; break; // connection cost
          case 'w': os << node->wcost; break; // word cost
          case 'c': os << node->cost; break; // best cost
          case 'n': os << (node->cost - node->prev->cost); break; // node cost
          case 'b': os << (node->isbest ? '*' : ' '); break; // * if best path, ' 'otherwise
          case 'P': os << node->prob; break;
          case 'A': os << node->alpha; break;
          case 'B': os << node->beta; break;
          case 'l': os << node->length; break; // length of morph
          case 'L': os << node->rlength;    break; // length of morph including the spaces
          case 'h': { // Hidden Layer ID
            switch (*++p) {
            default: CHECK_FALSE(false) << "lr is required after %ph";
            case 'l': os << node->lcAttr; break;  // current
            case 'r': os << node->rcAttr; break;  // prev
            }
          } break;

          case 'p': {
            char mode = *++p;
            // Stop before reading the separator if the format ends here.
            CHECK_FALSE(mode != '\0') << "[icP] is required after %pp";
            char sep = *++p;
            if (sep == '\\') sep = getEscapedChar(*++p);
            // p rests on the last separator character; if that is the
            // terminator the loop increment would step past the string.
            CHECK_FALSE(*p != '\0') << "separator is required after %pp[icP]";
            CHECK_FALSE(node->lpath) << "no path information, use -l option";
            for (Path *path = node->lpath; path; path = path->lnext) {
              if (path != node->lpath) os << sep;
              switch (mode) {
              case 'i': os << path->lnode->id; break;
              case 'c': os << path->cost; break;
              case 'P': os << path->prob; break;
              default: CHECK_FALSE(false) << "[icP] is required after %pp";
              }
            }
          } break;

          }
        } break;

        case 'F':
        case 'f': {
          CHECK_FALSE(node->feature[0] != '\0') << "no feature information available";

          // Tokenize the feature only once per call; later %f/%F macros
          // reuse the cached field pointers.
          if (! psize) {
            std::strncpy(buf, node->feature, sizeof(buf) - 1);
            buf[sizeof(buf) - 1] = '\0';  // strncpy does not terminate on truncation
            psize = tokenizeCSV(buf, ptr, sizeof(ptr) / sizeof(ptr[0]));
          }

          // separator
          char separator = '\t'; // default separator
          if (*p == 'F') { // change separator
            if (*++p == '\\') {
              separator = getEscapedChar(*++p);
            } else {
              separator = *p;
            }
            // The separator must be a real character; otherwise the '['
            // lookup below would read past the end of the format.
            CHECK_FALSE(*p != '\0') << "cannot find '['";
          }

          CHECK_FALSE(*++p =='[') << "cannot find '['";
          size_t n = 0;        // index currently being parsed
          bool sep = false;    // true when a separator is due before the next field
          bool isfil = false;  // true when the current field is not a '*' placeholder
          p++;

          for (;; ++p) {
            switch (*p) {
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
              n = 10 * n + (*p - '0');
              break;
            case ',': case ']':
              CHECK_FALSE(n < psize) << "given index is out of range";
              isfil = (ptr[n][0] != '*');
              if (isfil) { if (sep) os << separator; os << ptr[n]; }
              if (*p == ']') goto last;
              sep = isfil;
              n = 0;
              break;
            default:
              // Also reached on the terminating NUL, so the scan cannot
              // run off the end of the format.
              CHECK_FALSE(false) << "cannot find ']'";
            }
          }
        } last: break;
        } // end switch
      } break; // end case '%'
      } // end switch
    }

    return true;
  }
}
