#include "uniconv.h"

namespace aka2 {
  // Lookup table defined in another translation unit.  This file indexes it
  // with the first byte of a UTF-8 character (as an unsigned char) to obtain
  // the length of the encoded character in bytes.  The decoder below treats
  // 1-3 as decodable lengths, 4-6 as unsupported, and expects 0 for any
  // byte that cannot start a character.
  // NOTE(review): presumably 256 entries, one per byte value - confirm
  // against the definition.
  extern const int utf8_char_length_table[];
}

/** utf8_char_length_table comes from babel.  Thanks, Doukeshi-san. */

using namespace aka2;

namespace {
  inline size_t _utf8_to_ucs2(const char *utf8str, 
			      aka2::uchar_t *uch) {
    
    int length = utf8_char_length_table[*reinterpret_cast<const unsigned char *>(utf8str)];
    char first_ch = *utf8str;
    
    switch (length) {
    case 1:
      *uch = aka2::uchar_t(first_ch);
      return 1;
    case 2:
      *uch = (aka2::uchar_t(first_ch & 0x1f) << 6) | (utf8str[1] & 0x3f);
      return 2;
    case 3:
      *uch = (aka2::uchar_t(first_ch & 0x0f) << 12) 
	| (aka2::uchar_t(utf8str[1] & 0x3f) << 6) 
	| (aka2::uchar_t(utf8str[2] & 0x3f));
      return 3;
    case 4:
    case 5:
    case 6:
      throw error("UTF-16 sarrogate is not supported.", __FILE__, __LINE__);
    }
    
    assert(length == 0);
    throw error("Wrong UTF-8 character sequence.", __FILE__, __LINE__);
    return 0;
  }
}



/**
 * Append the UTF-8 encoding of a run of UCS-2 code units to @a buffer.
 *
 * @param buffer  destination buffer.
 * @param source  first code unit to convert.
 * @param length  number of code units, or -1 when @a source is terminated
 *                by a 0 code unit.
 * @return buffer.get_content_length() after the conversion.
 */
int uniconv::ucs2_to_utf8(string_buffer &buffer,
                          const aka2::uchar_t *source, int length) {

  // Locate the end of the input, scanning for the terminator if needed.
  const aka2::uchar_t *last = source;
  if (length != -1)
    last += length;
  else
    for (; *last != 0; ++last) {}

  while (source < last) {
    // Ask the buffer for room, then fill it until we come within 6 bytes
    // of the end it reports; the outer loop requests more if input remains.
    char *out = buffer.get_end_ptr(1024);
    char * const limit = buffer.get_buffer_end() - 6;

    for (; (out < limit) && (source < last); ++source)
      out += ucs2_to_utf8(*source, out);

    buffer.commit_current_end(out);
  }
  return buffer.get_content_length();
}

/**
 * Convert a run of UCS-2 code units to a UTF-8 std::string.
 *
 * @param source  first code unit to convert.
 * @param length  number of code units, or -1 for 0-terminated input.
 * @return the bytes produced by the string_buffer overload.
 */
std::string uniconv::ucs2_to_utf8(const aka2::uchar_t *source, int length) {
  string_buffer utf8;
  ucs2_to_utf8(utf8, source, length);
  std::string result(utf8.get_ptr(), utf8.get_content_length());
  return result;
}


/**
 * Decode UTF-8 text and append the resulting UCS-2 code units, as raw
 * bytes, to @a buffer.
 *
 * @param buffer   byte buffer that receives the code units.
 * @param utf8str  start of the UTF-8 input.
 * @param length   number of input bytes, or -1 when @a utf8str is
 *                 0-terminated.
 * @return number of input bytes consumed.  This is smaller than the input
 *         length when the input ends in the middle of a multi-byte
 *         character; that incomplete tail is left unconverted.
 * @throws error   raised by the single-character decoder for a byte that
 *                 cannot start a character or for a 4- to 6-byte sequence.
 */
int uniconv::utf8_to_ucs2(string_buffer &buffer, 
			  const char *utf8str, int length) {
  const char *source = utf8str; 
  const char *sourceend = source;
  if (length == -1) {
    while (*sourceend != 0)
      ++sourceend;
  }
  else {
    sourceend += length;
  }

  // Reserve output space only now that the -1 sentinel has been resolved to
  // a real byte count.  Every input byte yields at most one code unit.
  const int max_units = static_cast<int>(sourceend - source);
  aka2::uchar_t *outbuf = reinterpret_cast<uchar_t*>(
    buffer.get_end_ptr(max_units * static_cast<int>(sizeof(aka2::uchar_t))));

  while (source < sourceend) {
    // The bounds-checked overload returns 0 instead of reading past
    // sourceend when the last character is truncated.
    const size_t consumed = uniconv::utf8_to_ucs2(source, sourceend, outbuf);
    if (consumed == 0)
      break;
    source += consumed;
    ++outbuf; // next uchar_t.
  }
  buffer.commit_current_end(reinterpret_cast<char*>(outbuf));
  return static_cast<int>(source - utf8str);
}


/**
 * Decode UTF-8 text and append the resulting UCS-2 code units to @a buffer.
 *
 * @param buffer   code-unit buffer that receives the output.
 * @param utf8str  start of the UTF-8 input.
 * @param length   number of input bytes, or -1 when @a utf8str is
 *                 0-terminated.
 * @return number of input bytes consumed.  This is smaller than the input
 *         length when the input ends in the middle of a multi-byte
 *         character; that incomplete tail is left unconverted.
 * @throws error   raised by the single-character decoder for a byte that
 *                 cannot start a character or for a 4- to 6-byte sequence.
 */
int uniconv::utf8_to_ucs2(ustring_buffer &buffer, 
			  const char *utf8str, int length) {
  const char *source = utf8str; 
  const char *sourceend = source;
  if (length == -1) {
    while (*sourceend != 0)
      ++sourceend;
  }
  else {
    sourceend += length;
  }

  // Reserve output space only now that the -1 sentinel has been resolved to
  // a real byte count.  Every input byte yields at most one code unit.
  const int max_units = static_cast<int>(sourceend - source);
  aka2::uchar_t *outbuf = buffer.get_end_ptr(max_units);

  while (source < sourceend) {
    // The bounds-checked overload returns 0 instead of reading past
    // sourceend when the last character is truncated.
    const size_t consumed = uniconv::utf8_to_ucs2(source, sourceend, outbuf);
    if (consumed == 0)
      break;
    source += consumed;
    ++outbuf;
  }
  buffer.commit_current_end(outbuf);
  return static_cast<int>(source - utf8str);
}



/**
 * Decode a UTF-8 std::string into a string of UCS-2 code units.
 *
 * Conversion stops in front of a multi-byte character that is cut off by
 * the end of @a utf8str; the characters decoded up to that point are
 * returned.
 *
 * @param utf8str  UTF-8 input.
 * @return the decoded code units.
 * @throws error   raised by the single-character decoder for a byte that
 *                 cannot start a character or for a 4- to 6-byte sequence.
 */
ustring uniconv::utf8_to_ucs2(const std::string &utf8str) {
  ustring converted;
  // Every input byte yields at most one code unit.
  converted.reserve(utf8str.size());

  const char *source = utf8str.data(); 
  const char * const sourceend = source + utf8str.size();

  while (source < sourceend) {
    aka2::uchar_t uch;
    // The bounds-checked overload returns 0 instead of reading beyond the
    // string's storage when the last character is truncated.
    const size_t consumed = uniconv::utf8_to_ucs2(source, sourceend, &uch);
    if (consumed == 0)
      break;
    source += consumed;
    converted += uch;
  }
  return converted;
}



/**
 * Decode one UTF-8 character from the range [utf8str, utf8strend).
 *
 * @param utf8str     first byte of the character.
 * @param utf8strend  one past the last readable input byte.
 * @param uch         receives the decoded code unit.
 * @return number of bytes consumed, or 0 when the range is empty or ends
 *         before the character is complete (@a uch is then left untouched).
 * @throws error      raised by the single-character decoder for a byte that
 *                    cannot start a character or for a 4- to 6-byte
 *                    sequence.
 */
size_t uniconv::utf8_to_ucs2(const char *utf8str, const char *utf8strend,
			     aka2::uchar_t *uch) {
  // An empty range holds no lead byte; do not dereference utf8str.
  if (utf8str >= utf8strend)
    return 0;

  const int length = utf8_char_length_table[*reinterpret_cast<const unsigned char *>(utf8str)];
  // The lead byte announces more bytes than the range still holds.
  if (length > (utf8strend - utf8str))
    return 0;
  return _utf8_to_ucs2(utf8str, uch);
}

/**
 * Encode one UCS-2 code unit as UTF-8.
 *
 * Values in the surrogate range are not treated specially; they take the
 * three-byte branch like any other value with one of the top five bits set.
 *
 * @param wc         code unit to encode.
 * @param converted  output; 1 to 3 bytes are written, no terminator.
 * @return number of bytes written.
 */
size_t uniconv::ucs2_to_utf8(const aka2::uchar_t wc, char *converted) {
  // No bit above bit 6 set: a single byte.
  if ((wc & 0xff80) == 0) {
    converted[0] = static_cast<char>(wc);
    return 1;
  }

  // No bit above bit 10 set: 110xxxxx 10xxxxxx.
  if ((wc & 0xf800) == 0) {
    converted[0] = static_cast<char>(0xc0 | ((wc >> 6) & 0x1f));
    converted[1] = static_cast<char>(0x80 | (wc & 0x3f));
    return 2;
  }

  // Everything else: 1110xxxx 10xxxxxx 10xxxxxx.
  converted[0] = static_cast<char>(0xe0 | ((wc >> 12) & 0x0f));
  converted[1] = static_cast<char>(0x80 | ((wc >> 6) & 0x3f));
  converted[2] = static_cast<char>(0x80 | (wc & 0x3f));
  return 3;
}
