/* 
 * Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
 *
 * This file is part of Rast.
 * See the file COPYING for redistribution information.
 *
 */

#include <ctype.h>
#include <string.h>

#include <apr_strings.h>

#include <rast/config.h>
#include <rast/encoding.h>
#include <rast/string.h>

#include <mecab.h>

/*
 * Byte length of a character, indexed by the value of its first byte
 * (256 entries).  Lead byte 0x8E maps to 2, 0x8F maps to 3, 0xA1..0xFE
 * map to 2, and every other value (including 0xA0 and 0xFF) maps to 1.
 */
static const unsigned char euc_jp_char_size_table[] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};

/*
 * Replacement strings for half-width kana that are NOT followed by a
 * voicing mark.  The table has 64 entries and is indexed by
 * (second byte of the 0x8E-prefixed sequence) - 0xA0, so it covers
 * second bytes 0xA0..0xDF only.  Each entry is one two-byte character.
 */
static const char *x0201kana_to_x0208[] = {
    "\xA1\xA1", "\xA1\xA3", "\xA1\xD6", "\xA1\xD7", "\xA1\xA2", "\xA1\xA6",
    "\xA5\xF2", "\xA5\xA1", "\xA5\xA3", "\xA5\xA5", "\xA5\xA7", "\xA5\xA9",
    "\xA5\xE3", "\xA5\xE5", "\xA5\xE7", "\xA5\xC3", "\xA1\xBC", "\xA5\xA2",
    "\xA5\xA4", "\xA5\xA6", "\xA5\xA8", "\xA5\xAA", "\xA5\xAB", "\xA5\xAD",
    "\xA5\xAF", "\xA5\xB1", "\xA5\xB3", "\xA5\xB5", "\xA5\xB7", "\xA5\xB9",
    "\xA5\xBB", "\xA5\xBD", "\xA5\xBF", "\xA5\xC1", "\xA5\xC4", "\xA5\xC6",
    "\xA5\xC8", "\xA5\xCA", "\xA5\xCB", "\xA5\xCC", "\xA5\xCD", "\xA5\xCE",
    "\xA5\xCF", "\xA5\xD2", "\xA5\xD5", "\xA5\xD8", "\xA5\xDB", "\xA5\xDE",
    "\xA5\xDF", "\xA5\xE0", "\xA5\xE1", "\xA5\xE2", "\xA5\xE4", "\xA5\xE6",
    "\xA5\xE8", "\xA5\xE9", "\xA5\xEA", "\xA5\xEB", "\xA5\xEC", "\xA5\xED",
    "\xA5\xEF", "\xA5\xF3", "\xA1\xAB", "\xA1\xAC",
};

/*
 * Replacement strings for a half-width kana immediately followed by the
 * half-width voiced mark (0x8E 0xDE).  Same indexing as
 * x0201kana_to_x0208: 64 entries, index = second byte - 0xA0.
 * An entry is either a single two-byte character (used where the table
 * supplies a combined form) or the unvoiced replacement followed by the
 * stand-alone mark "\xA1\xAB".
 */
static const char *voiced_x0201kana_to_x0208[] = {
    "\xA1\xA1\xA1\xAB", "\xA1\xA3\xA1\xAB", "\xA1\xD6\xA1\xAB",
    "\xA1\xD7\xA1\xAB", "\xA1\xA2\xA1\xAB", "\xA1\xA6\xA1\xAB",
    "\xA5\xF2\xA1\xAB", "\xA5\xA1\xA1\xAB", "\xA5\xA3\xA1\xAB",
    "\xA5\xA5\xA1\xAB", "\xA5\xA7\xA1\xAB", "\xA5\xA9\xA1\xAB",
    "\xA5\xE3\xA1\xAB", "\xA5\xE5\xA1\xAB", "\xA5\xE7\xA1\xAB",
    "\xA5\xC3\xA1\xAB", "\xA1\xBC\xA1\xAB", "\xA5\xA2\xA1\xAB",
    "\xA5\xA4\xA1\xAB", "\xA5\xF4", "\xA5\xA8\xA1\xAB",
    "\xA5\xAA\xA1\xAB", "\xA5\xAC", "\xA5\xAE",
    "\xA5\xB0", "\xA5\xB2", "\xA5\xB4",
    "\xA5\xB6", "\xA5\xB8", "\xA5\xBA",
    "\xA5\xBC", "\xA5\xBE", "\xA5\xC0",
    "\xA5\xC2", "\xA5\xC5", "\xA5\xC7",
    "\xA5\xC9", "\xA5\xCA\xA1\xAB", "\xA5\xCB\xA1\xAB",
    "\xA5\xCC\xA1\xAB", "\xA5\xCD\xA1\xAB", "\xA5\xCE\xA1\xAB",
    "\xA5\xD0", "\xA5\xD3", "\xA5\xD6",
    "\xA5\xD9", "\xA5\xDC", "\xA5\xDE\xA1\xAB",
    "\xA5\xDF\xA1\xAB", "\xA5\xE0\xA1\xAB", "\xA5\xE1\xA1\xAB",
    "\xA5\xE2\xA1\xAB", "\xA5\xE4\xA1\xAB", "\xA5\xE6\xA1\xAB",
    "\xA5\xE8\xA1\xAB", "\xA5\xE9\xA1\xAB", "\xA5\xEA\xA1\xAB",
    "\xA5\xEB\xA1\xAB", "\xA5\xEC\xA1\xAB", "\xA5\xED\xA1\xAB",
    "\xA5\xEF\xA1\xAB", "\xA5\xF3\xA1\xAB", "\xA1\xAB\xA1\xAB",
    "\xA1\xAC\xA1\xAB",
};

/*
 * Replacement strings for a half-width kana immediately followed by the
 * half-width semi-voiced mark (0x8E 0xDF).  Same indexing as
 * x0201kana_to_x0208: 64 entries, index = second byte - 0xA0.
 * Five entries are a single combined two-byte character; all others are
 * the unmarked replacement followed by the stand-alone mark "\xA1\xAC".
 */
static const char *semi_voiced_x0201kana_to_x0208[] = {
    "\xA1\xA1\xA1\xAC", "\xA1\xA3\xA1\xAC", "\xA1\xD6\xA1\xAC",
    "\xA1\xD7\xA1\xAC", "\xA1\xA2\xA1\xAC", "\xA1\xA6\xA1\xAC",
    "\xA5\xF2\xA1\xAC", "\xA5\xA1\xA1\xAC", "\xA5\xA3\xA1\xAC",
    "\xA5\xA5\xA1\xAC", "\xA5\xA7\xA1\xAC", "\xA5\xA9\xA1\xAC",
    "\xA5\xE3\xA1\xAC", "\xA5\xE5\xA1\xAC", "\xA5\xE7\xA1\xAC",
    "\xA5\xC3\xA1\xAC", "\xA1\xBC\xA1\xAC", "\xA5\xA2\xA1\xAC",
    "\xA5\xA4\xA1\xAC", "\xA5\xA6\xA1\xAC", "\xA5\xA8\xA1\xAC",
    "\xA5\xAA\xA1\xAC", "\xA5\xAB\xA1\xAC", "\xA5\xAD\xA1\xAC",
    "\xA5\xAF\xA1\xAC", "\xA5\xB1\xA1\xAC", "\xA5\xB3\xA1\xAC",
    "\xA5\xB5\xA1\xAC", "\xA5\xB7\xA1\xAC", "\xA5\xB9\xA1\xAC",
    "\xA5\xBB\xA1\xAC", "\xA5\xBD\xA1\xAC", "\xA5\xBF\xA1\xAC",
    "\xA5\xC1\xA1\xAC", "\xA5\xC4\xA1\xAC", "\xA5\xC6\xA1\xAC",
    "\xA5\xC8\xA1\xAC", "\xA5\xCA\xA1\xAC", "\xA5\xCB\xA1\xAC",
    "\xA5\xCC\xA1\xAC", "\xA5\xCD\xA1\xAC", "\xA5\xCE\xA1\xAC",
    "\xA5\xD1", "\xA5\xD4", "\xA5\xD7",
    "\xA5\xDA", "\xA5\xDD", "\xA5\xDE\xA1\xAC",
    "\xA5\xDF\xA1\xAC", "\xA5\xE0\xA1\xAC", "\xA5\xE1\xA1\xAC",
    "\xA5\xE2\xA1\xAC", "\xA5\xE4\xA1\xAC", "\xA5\xE6\xA1\xAC",
    "\xA5\xE8\xA1\xAC", "\xA5\xE9\xA1\xAC", "\xA5\xEA\xA1\xAC",
    "\xA5\xEB\xA1\xAC", "\xA5\xEC\xA1\xAC", "\xA5\xED\xA1\xAC",
    "\xA5\xEF\xA1\xAC", "\xA5\xF3\xA1\xAC", "\xA1\xAB\xA1\xAC",
    "\xA1\xAC\xA1\xAC",
};

/*
 * Per-tokenizer state stored in tokenizer->context.  It tracks a cursor
 * into a pool-allocated copy of the string MeCab returned for the input,
 * plus the result of the most recent get_token call.
 */
typedef struct {
    /* Cursor into the copied MeCab output; advanced to next_ptr by
     * mecab_euc_jp_get_next_offset. */
    char *ptr;
    /* Where ptr should move next, as computed by mecab_euc_jp_get_token;
     * NULL means "not computed yet". */
    char *next_ptr;
    /* Byte length of the last token returned by get_token; -1 when unset. */
    int byte_offset;
    /* Character count of the last token returned by get_token; -1 when
     * unset. */
    int char_offset;
} mecab_euc_jp_context_t;

/*
 * Return the byte length of the character starting at ptr, as given by
 * euc_jp_char_size_table for its first byte.  If that length would run
 * past ptr_end, the number of bytes remaining before ptr_end is returned
 * instead.  The byte at ptr is read unconditionally.
 */
static int
get_char_len(const char *ptr, const char *ptr_end)
{
    const unsigned char lead = (unsigned char) *ptr;
    int width = euc_jp_char_size_table[lead];

    /* Truncated multi-byte sequence: report only what is available. */
    if (ptr_end - ptr < width) {
        width = ptr_end - ptr;
    }
    return width;
}

/*
 * Encoding-module callback: store in *char_len the byte length of the
 * character at the tokenizer's current position (see get_char_len).
 * Always returns RAST_OK.
 */
static rast_error_t *
mecab_euc_jp_get_char_len(rast_tokenizer_t *tokenizer, rast_size_t *char_len)
{
    int nbytes;

    nbytes = get_char_len(tokenizer->ptr, tokenizer->ptr_end);
    *char_len = nbytes;
    return RAST_OK;
}

/*
 * Count the characters in [ptr, ptr_end), stepping by the width that
 * euc_jp_char_size_table gives for each lead byte.  A character whose
 * width runs past ptr_end is still counted once.
 */
static int
count_chars(const char *ptr, const char *ptr_end)
{
    const char *cursor;
    int total = 0;

    for (cursor = ptr; cursor < ptr_end; total++) {
        cursor += euc_jp_char_size_table[(unsigned char) *cursor];
    }
    return total;
}

/*
 * Build the per-tokenizer context: run MeCab over the tokenizer's
 * [ptr, ptr_end) range, copy the resulting string into the tokenizer's
 * pool, and store the new context in tokenizer->context.
 *
 * Returns RAST_OK on success.  If MeCab cannot be instantiated or the
 * parse fails, returns a RAST_ERROR_GENERAL error carrying MeCab's
 * message and leaves tokenizer->context untouched.
 */
static rast_error_t *
create_context(rast_tokenizer_t *tokenizer)
{
    mecab_euc_jp_context_t *ctx;
    mecab_t *tagger;
    const char *parsed;
    rast_error_t *failure;

    ctx = (mecab_euc_jp_context_t *)
        apr_palloc(tokenizer->pool, sizeof(mecab_euc_jp_context_t));

    tagger = mecab_new(0, NULL);
    if (tagger == NULL) {
        return rast_error(RAST_ERROR_GENERAL, "mecab error: %s",
                          mecab_strerror(NULL));
    }

    parsed = mecab_sparse_tostr2(tagger, (char *) tokenizer->ptr,
                                 tokenizer->ptr_end - tokenizer->ptr);
    if (parsed != NULL) {
        /* Copy before destroying the tagger that produced the string. */
        ctx->ptr = apr_pstrdup(tokenizer->pool, parsed);
        mecab_destroy(tagger);

        ctx->next_ptr = NULL;
        ctx->byte_offset = -1;
        ctx->char_offset = -1;
        tokenizer->context = ctx;
        return RAST_OK;
    }

    /* Build the error while the tagger (and its message) still exists. */
    failure = rast_error(RAST_ERROR_GENERAL, "mecab error: %s",
                         mecab_strerror(tagger));
    mecab_destroy(tagger);
    return failure;
}

/*
 * Encoding-module callback: describe the token at the tokenizer's current
 * position without advancing the tokenizer.
 *
 * The context is created on first use.  If the current input byte is NUL
 * or whitespace, the token is that single byte and the cursor into the
 * MeCab output is left where it is.  Otherwise the token's byte length is
 * the length of the text before the first tab at the MeCab cursor (or the
 * rest of the input when there is no tab), its character count is taken
 * from the input, and the pending cursor is set to the start of the next
 * line of MeCab output.
 *
 * The token size and pending cursor are cached in the context for
 * mecab_euc_jp_get_next_offset.  Returns RAST_OK, or the error from
 * create_context.
 */
static rast_error_t *
mecab_euc_jp_get_token(rast_tokenizer_t *tokenizer, rast_token_t *token)
{
    const char *input = tokenizer->ptr;
    const char *input_end = tokenizer->ptr_end;
    mecab_euc_jp_context_t *ctx;
    char *upcoming;

    if (tokenizer->context == NULL) {
        rast_error_t *error = create_context(tokenizer);

        if (error != RAST_OK) {
            return error;
        }
    }
    ctx = (mecab_euc_jp_context_t *) tokenizer->context;

    if (*input == '\0' || isspace((unsigned char) *input)) {
        /* Single-byte separator token; the MeCab cursor does not move. */
        token->nchars = 1;
        token->nbytes = 1;
        upcoming = ctx->ptr;
    }
    else {
        char *tab = strchr(ctx->ptr, '\t');
        char *newline = strchr(ctx->ptr, '\n');

        if (tab != NULL) {
            token->nbytes = tab - ctx->ptr;
        }
        else {
            token->nbytes = input_end - input;
        }
        token->nchars = count_chars(input, input + token->nbytes);

        /* Skip to just past the end of the current output line, or to
         * the terminating NUL when this is the last line. */
        if (newline != NULL) {
            upcoming = newline + 1;
        }
        else {
            upcoming = ctx->ptr + strlen(ctx->ptr);
        }
    }

    token->is_complete = 1;
    ctx->next_ptr = upcoming;
    ctx->char_offset = token->nchars;
    ctx->byte_offset = token->nbytes;
    return RAST_OK;
}

/*
 * Encoding-module callback: report how far the tokenizer should advance
 * past the current token, and move the MeCab cursor to the next entry.
 *
 * *byte_offset and *char_offset receive the byte length and character
 * count cached by the last mecab_euc_jp_get_token call.  If no token has
 * been fetched for the current position (or no context exists yet),
 * get_token is invoked first.  The cached values are then reset so the
 * next call recomputes them.
 *
 * Returns RAST_OK, or the error from mecab_euc_jp_get_token.
 */
static rast_error_t *
mecab_euc_jp_get_next_offset(rast_tokenizer_t *tokenizer,
                             rast_size_t *byte_offset,
                             rast_size_t *char_offset)
{
    mecab_euc_jp_context_t *c = (mecab_euc_jp_context_t *) tokenizer->context;

    /* The context is created lazily by get_token, so it may not exist yet;
     * never dereference it before checking. */
    if (c == NULL || c->next_ptr == NULL) {
        rast_token_t token;
        rast_error_t *error;

        error = mecab_euc_jp_get_token(tokenizer, &token);
        if (error != RAST_OK) {
            return error;
        }
        /* get_token may have just created the context; reload it. */
        c = (mecab_euc_jp_context_t *) tokenizer->context;
    }
    *char_offset = c->char_offset;
    *byte_offset = c->byte_offset;
    c->ptr = c->next_ptr;
    c->next_ptr = NULL;
    c->char_offset = -1;
    c->byte_offset = -1;
    return RAST_OK;
}

/*
 * Append to dst the full-width replacement for the half-width kana at sp
 * and return the number of source bytes consumed.
 *
 * Precondition: sp[0] is 0x8E and sp + 2 <= sp_end (the caller has already
 * established a complete two-byte sequence).
 *
 * If the kana is immediately followed by the half-width voiced mark
 * (0x8E 0xDE) or semi-voiced mark (0x8E 0xDF), both are consumed and the
 * combined replacement from the corresponding table is appended
 * (returns 4).  Otherwise the plain replacement is appended (returns 2).
 *
 * The lookup tables only cover second bytes 0xA0..0xDF.  A second byte
 * outside that range is not a table entry, so the two source bytes are
 * copied through unchanged (returns 2) rather than indexing past the end
 * of the tables.
 */
static int
convert_x0201kana_to_x0208(const unsigned char *sp,
                           const unsigned char *sp_end, rast_string_t *dst)
{
    enum { KANA_FIRST = 0xA0, KANA_LAST = 0xDF };
    const char *s;
    unsigned int index;

    if (sp[1] < KANA_FIRST || sp[1] > KANA_LAST) {
        /* Malformed or unsupported sequence: keep the bytes as they are. */
        rast_string_append(dst, (const char *) sp, 2);
        return 2;
    }
    index = sp[1] - KANA_FIRST;

    /* A following 0x8E xx pair may be a voicing mark to fold in. */
    if (sp + 4 <= sp_end && sp[2] == 0x8E) {
        if (sp[3] == 0xDE) {
            s = voiced_x0201kana_to_x0208[index];
            rast_string_append(dst, s, strlen(s));
            return 4;
        }
        else if (sp[3] == 0xDF) {
            s = semi_voiced_x0201kana_to_x0208[index];
            rast_string_append(dst, s, strlen(s));
            return 4;
        }
    }
    s = x0201kana_to_x0208[index];
    rast_string_append(dst, s, strlen(s));
    return 2;
}

/*
 * Append to dst a single byte formed by clearing the high bit of the
 * second byte of the two-byte character at sp.  Always consumes (and
 * returns) 2 source bytes.  sp_end is accepted for signature symmetry
 * with convert_x0201kana_to_x0208 but is not consulted.
 */
static int
convert_x0208alnum_to_ascii(const unsigned char *sp,
                            const unsigned char *sp_end, rast_string_t *dst)
{
    unsigned char ascii = sp[1] & 0x7F;

    rast_string_append(dst, &ascii, 1);
    return 2;
}

/*
 * Encoding-module callback: produce a normalized copy of src.
 *
 *  - Every run of bytes for which isspace() is true becomes one " ".
 *  - A two-byte character with lead byte 0x8E and second byte >= 0xA0 is
 *    rewritten by convert_x0201kana_to_x0208.
 *  - A two-byte character with lead byte 0xA3 and second byte >= 0xA0 is
 *    rewritten by convert_x0208alnum_to_ascii.
 *  - Everything else is copied unchanged.
 *
 * The result is built in a rast_string_t allocated from pool; *dst and
 * *dst_len receive its buffer and length.
 */
static void
mecab_euc_jp_normalize_text(apr_pool_t *pool,
                            const char *src, rast_size_t src_len,
                            char **dst, rast_size_t *dst_len)
{
    const unsigned char *cur = (unsigned char *) src;
    const unsigned char *end = cur + src_len;
    rast_string_t *out = rast_string_create(pool, "", 0, src_len + 1);

    while (cur < end) {
        int nbytes;

        if (isspace(*cur)) {
            /* Collapse the whole whitespace run into a single space. */
            rast_string_append(out, " ", 1);
            do {
                cur++;
            } while (cur < end && isspace(*cur));
            continue;
        }

        nbytes = get_char_len(cur, end);
        /* nbytes == 2 guarantees cur[1] lies inside the input. */
        if (nbytes == 2 && cur[1] >= 0xA0) {
            if (cur[0] == 0x8E) {
                cur += convert_x0201kana_to_x0208(cur, end, out);
                continue;
            }
            if (cur[0] == 0xA3) {
                cur += convert_x0208alnum_to_ascii(cur, end, out);
                continue;
            }
        }

        rast_string_append(out, cur, nbytes);
        cur += nbytes;
    }

    *dst = out->ptr;
    *dst_len = out->len;
}

/*
 * Encoding-module callback: copy src into a new pool-allocated,
 * NUL-terminated buffer, lower-casing every byte for which isupper() is
 * true and copying all other characters (whole multi-byte sequences at a
 * time) unchanged.  *dst receives the buffer and *dst_len its length,
 * not counting the terminator.
 */
static void
mecab_euc_jp_normalize_chars(apr_pool_t *pool,
                             const char *src, rast_size_t src_len,
                             char **dst, rast_size_t *dst_len)
{
    const unsigned char *in = (unsigned char *) src;
    const unsigned char *in_end = in + src_len;
    unsigned char *out;

    /* The output is never longer than the input; +1 for the NUL. */
    *dst = (char *) apr_palloc(pool, src_len + 1);
    out = (unsigned char *) *dst;

    while (in < in_end) {
        int nbytes;

        if (isupper(*in)) {
            *out++ = tolower(*in++);
            continue;
        }
        /* Copy the whole character so trail bytes are never case-folded. */
        nbytes = get_char_len(in, in_end);
        memcpy(out, in, nbytes);
        in += nbytes;
        out += nbytes;
    }

    *out = '\0';
    *dst_len = out - (unsigned char *) *dst;
}

/*
 * Encoding-module callback: return non-zero if ch is a space character.
 * A two-byte character counts as space only when it is 0xA1 0xA1; any
 * other character is judged by isspace() on its first byte.
 */
static int
mecab_euc_jp_char_is_space(rast_char_t *ch)
{
    const unsigned char *bytes = (const unsigned char *) ch->ptr;

    if (ch->nbytes != 2) {
        return isspace(bytes[0]);
    }
    return bytes[0] == 0xA1 && bytes[1] == 0xA1;
}

/*
 * Exported encoding module descriptor wiring this file's callbacks
 * together.  The initializer is positional, so the order below must match
 * the member order of rast_encoding_module_t (declared outside this
 * file; presumably in <rast/encoding.h>).
 */
rast_encoding_module_t rast_encoding_mecab_euc_jp = {
    "EUC-JP",                       /* presumably the encoding name */
    mecab_euc_jp_get_char_len,
    mecab_euc_jp_get_token,
    mecab_euc_jp_get_next_offset,
    mecab_euc_jp_normalize_text,
    mecab_euc_jp_normalize_chars,
    mecab_euc_jp_char_is_space,
};

/* vim: set filetype=c sw=4 expandtab : */
