package zephyr.kenkyusya.lajp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import zephyr.util.MultiHashMap;
import zephyr.util.ZephyrUtil;

/**
 * Command-line tool that writes an HTML morphology listing to standard output.
 *
 * <p>It takes three input files (see the usage message in {@link #main}):
 * <ol>
 *   <li>a "body" HTML file whose {@code <dt id="N">headword</dt>} entries are
 *       indexed by a normalized headword key,</li>
 *   <li>a dictionary HTML file whose {@code <dt id="N">headword*</dt>} entries
 *       are indexed by headword, and</li>
 *   <li>a morphology text file with one {@code form,lemma description} record
 *       per line.</li>
 * </ol>
 * For every morphology record a {@code <dt>/<dd>} pair is printed containing
 * links into the two HTML files plus the description with its grammatical
 * keywords replaced by Japanese abbreviations.
 *
 * <p>Progress and diagnostics go to standard error.
 */
public class MakeMorphHtml {

    /** Dictionary headword -> ids of the matching dictionary entries. */
    private final MultiHashMap<String, String> dictIdMap = new MultiHashMap<String, String>();

    /** Dictionary headwords whose entry line contains the marker {@code DEP"}. */
    private final HashSet<String> depSet = new HashSet<String>();

    /** Normalized body headword key -> ids of the matching body entries. */
    private final MultiHashMap<String, String> bodyIdMap = new MultiHashMap<String, String>();

    private static final String DT_ID_REGEXP = "<dt id=\"([0-9]+)\">(.+?)</dt>";
    private static final Pattern DT_ID = Pattern.compile(DT_ID_REGEXP);

    /**
     * Reads the body HTML file and fills {@link #bodyIdMap}.
     *
     * <p>Only the first {@code <dt id="...">} match on each line is used. The
     * headword is passed through {@code ZephyrUtil.hex2uni} and
     * {@code ZephyrUtil.makeKey(..., true)} before being used as the key
     * (NOTE(review): exact normalization is defined in ZephyrUtil, not here).
     *
     * @param bodyFile path of the body HTML file
     * @throws Exception if the file cannot be opened or read
     */
    private void loadLajpBody(String bodyFile) throws Exception {
        InputStream in = new FileInputStream(new File(bodyFile));
        try {
            BufferedReader br =
                    new BufferedReader(new InputStreamReader(in, ZephyrUtil.SHIFT_JIS_CODE));
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = DT_ID.matcher(line);
                if (m.find()) {
                    String dt = ZephyrUtil.hex2uni(m.group(2));
                    String key = ZephyrUtil.makeKey(dt, true);
                    bodyIdMap.put(key, m.group(1));
                }
            }
        } finally {
            // Closing the underlying stream releases the file handle even when
            // reader construction or reading fails.
            in.close();
        }
        System.err.println("loaded " + bodyIdMap.size() + " entries from " + bodyFile);
    }

    private static final String DICT_ENT_REGEXP = "<dt id=\"([0-9]+)\">(.+?)\\*</dt>";
    private static final Pattern DICT_ENT = Pattern.compile(DICT_ENT_REGEXP);

    /**
     * Reads the dictionary HTML file and fills {@link #dictIdMap} and
     * {@link #depSet}.
     *
     * <p>An entry is a line containing {@code <dt id="N">headword*</dt>}; the
     * headword (without the trailing {@code *}) is the key. If the same line
     * also contains the text {@code DEP"} the headword is added to
     * {@link #depSet}.
     *
     * @param dictFile path of the dictionary HTML file
     * @throws Exception if the file cannot be opened or read
     */
    private void loadWhitackerDict(String dictFile) throws Exception {
        InputStream in = new FileInputStream(new File(dictFile));
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "ASCII"));
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = DICT_ENT.matcher(line);
                if (m.find()) {
                    String dt = m.group(2);
                    dictIdMap.put(dt, m.group(1));
                    if (line.contains("DEP\"")) {
                        depSet.add(dt);
                    }
                }
            }
        } finally {
            in.close();
        }
        System.err.println("loaded " + dictIdMap.size() + " entries from " + dictFile);
    }

    /** Matches either a keyword token (group 1) or a complete anchor element (group 2). */
    private static final String MORPHKEY_REGEXP = "([a-z123]+)|(<a .+?</a>)";
    private static final Pattern MORPHKEY = Pattern.compile(MORPHKEY_REGEXP);

    /** Keyword used in morphology descriptions to introduce a participle. */
    private static final String PARTICIPLE_KEY = "part";

    /** Morphology keyword -> Japanese abbreviation. */
    private static final HashMap<String, String> morphKeyMap = new HashMap<String, String>();

    static {
        // parts of speech
        morphKeyMap.put("noun", "名");
        morphKeyMap.put("pron", "代");
        morphKeyMap.put("adj", "形");
        morphKeyMap.put("num", "数");
        morphKeyMap.put("adv", "副");
        morphKeyMap.put("verb", "動");
        morphKeyMap.put("part", "分詞");
        morphKeyMap.put("prep", "前");
        morphKeyMap.put("conj", "接");
        morphKeyMap.put("inter", "感");
        // gender
        morphKeyMap.put("m", "男");
        morphKeyMap.put("f", "女");
        morphKeyMap.put("n", "中");
        // case
        morphKeyMap.put("nom", "主");
        morphKeyMap.put("voc", "呼");
        morphKeyMap.put("gen", "属");
        morphKeyMap.put("dat", "与");
        morphKeyMap.put("acc", "対");
        morphKeyMap.put("abl", "奪");
        morphKeyMap.put("loc", "地");
        // number
        morphKeyMap.put("sg", "単");
        morphKeyMap.put("pl", "複");
        // person
        morphKeyMap.put("1st", "１");
        morphKeyMap.put("2nd", "２");
        morphKeyMap.put("3rd", "３");
        // degree
        morphKeyMap.put("comp", "比");
        morphKeyMap.put("super", "最");
        // numeral kind
        morphKeyMap.put("card", "基数詞");
        morphKeyMap.put("ord", "序数詞");
        morphKeyMap.put("dist", "配分詞");
        morphKeyMap.put("adverb", "数副詞");
        // tense
        morphKeyMap.put("pres", "現");
        morphKeyMap.put("impf", "未完");
        morphKeyMap.put("fut", "未来");
        morphKeyMap.put("perf", "完");
        morphKeyMap.put("plup", "過完");
        morphKeyMap.put("futp", "未来完");
        // voice
        morphKeyMap.put("act", "能");
        morphKeyMap.put("pass", "受");
        // mood ("part" is already registered above)
        morphKeyMap.put("ind", "直");
        morphKeyMap.put("sub", "接");
        morphKeyMap.put("imp", "命");
        morphKeyMap.put("inf", "不");
    }

    /**
     * Rewrites a morphology description, replacing known keywords with their
     * Japanese abbreviations from {@link #morphKeyMap}.
     *
     * <p>Text between tokens is copied unchanged, unknown keywords are kept as
     * they are, and anchor elements ({@code <a ...</a>}) are copied verbatim.
     * The keyword {@code part} is not emitted directly: it is merged into the
     * next keyword, which gets the suffix {@code 分}, and whitespace-only text
     * in between is dropped. A {@code part} that is never followed by another
     * keyword is emitted as its stand-alone mapping so that it is not lost.
     *
     * @param desc description text taken from a morphology record
     * @return the rewritten description
     */
    private String formatMorphDesc(String desc) {
        StringBuilder sb = new StringBuilder();
        int i = 0;
        Matcher m = MORPHKEY.matcher(desc);
        boolean isPart = false;
        while (m.find()) {
            if (i < m.start()) {
                String sub = desc.substring(i, m.start());
                // While a participle marker is pending, swallow pure whitespace
                // so the suffix attaches directly to the next keyword.
                if (!isPart || !sub.trim().isEmpty()) {
                    sb.append(sub);
                }
            }
            String key = m.group(1);
            if (key != null) {
                if (key.equals(PARTICIPLE_KEY)) {
                    isPart = true;
                } else {
                    String newKey = morphKeyMap.get(key);
                    if (newKey != null) {
                        sb.append(newKey);
                    } else {
                        sb.append(key);
                    }
                    if (isPart) {
                        sb.append("分");
                        isPart = false;
                    }
                }
            } else {
                sb.append(m.group(2));
            }
            i = m.end();
        }
        if (isPart) {
            // No keyword followed the participle marker; emit it on its own
            // instead of silently dropping it.
            sb.append(morphKeyMap.get(PARTICIPLE_KEY));
        }
        if (i < desc.length()) {
            sb.append(desc.substring(i));
        }
        return sb.toString();
    }

    /**
     * Prints one link into {@code body-lajp.html} for every body entry
     * registered under {@code lemma}; prints nothing when there is none.
     *
     * @param lemma key to look up in {@link #bodyIdMap}; also used as link text
     */
    private void outBodyRef(String lemma) {
        LinkedList<String> bodyIds = bodyIdMap.getList(lemma);
        if (bodyIds != null) {
            for (String id : bodyIds) {
                System.out.print(String.format("<a href=\"body-lajp.html#%s\">%s</a> ", id, lemma));
            }
        }
    }

    /**
     * Prints body links for the spelling variant of {@code lemma} obtained by
     * replacing every {@code c1} with {@code c2}.
     *
     * <p>Nothing is printed when the lemma does not contain {@code c1}, because
     * the unchanged lemma is already handled by the caller.
     *
     * @param lemma original lemma
     * @param c1    character to replace
     * @param c2    replacement character
     */
    private void findFromBody(String lemma, char c1, char c2) {
        String lemma2 = lemma.replace(c1, c2);
        // Compare against the ORIGINAL lemma; comparing lemma2 with itself
        // would make this branch unreachable.
        if (!lemma2.equals(lemma)) {
            outBodyRef(lemma2);
        }
    }

    /**
     * Prints all body links for a lemma: the lemma itself, the lemma with an
     * appended {@code r} when it is listed in {@link #depSet}, and the i/j and
     * u/v spelling variants.
     *
     * @param lemma lemma taken from a morphology record
     */
    private void findFromBody(String lemma) {
        outBodyRef(lemma);
        if (depSet.contains(lemma)) {
            outBodyRef(lemma + "r");
        }
        findFromBody(lemma, 'i', 'j');
        findFromBody(lemma, 'j', 'i');
        findFromBody(lemma, 'u', 'v');
        findFromBody(lemma, 'v', 'u');
    }

    /**
     * Reads the morphology file and writes the complete HTML document to
     * standard output.
     *
     * <p>Each record has the shape {@code form,lemma description}: the form is
     * everything before the first comma, the lemma runs up to the first space
     * after that comma, and the rest is the description. Lines that do not have
     * this shape are skipped and counted. A digit is printed to standard error
     * every 100000 records and a dot every 10000.
     *
     * @param morphFile path of the morphology text file
     * @throws Exception if the file cannot be opened or read
     */
    private void loadMorph(String morphFile) throws Exception {
        InputStream in = new FileInputStream(new File(morphFile));
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "ASCII"));
            String line;
            int entnum = 0;
            int skipped = 0;
            System.err.print("creating morphology file ");
            System.out.println("<html><body>");
            while ((line = br.readLine()) != null) {
                int idx = line.indexOf(',');
                int idx2 = (idx < 0) ? -1 : line.indexOf(' ', idx + 1);
                if (idx2 < 0) {
                    // Blank or malformed record: skip it rather than abort and
                    // leave a truncated HTML document behind.
                    skipped++;
                    continue;
                }
                String morph = line.substring(0, idx);
                String lemma = line.substring(idx + 1, idx2);
                String desc = line.substring(idx2 + 1);

                System.out.print("<dt>");
                System.out.print(morph);
                System.out.print("</dt><dd>");

                findFromBody(lemma);
                LinkedList<String> dictIds = dictIdMap.getList(lemma);
                if (dictIds != null) {
                    for (String id : dictIds) {
                        System.out.print(String.format("<a href=\"whitaker-dict.html#%s\">%s*</a> ",
                                id, lemma));
                    }
                }
                System.out.print(formatMorphDesc(desc));
                System.out.println("</dd>");
                entnum++;
                if ((entnum % 100000) == 0) {
                    System.err.print(entnum / 100000);
                } else if ((entnum % 10000) == 0) {
                    System.err.print(".");
                }
            }
            System.out.println("</body></html>");
            System.err.println();
            System.err.println("done. " + entnum + " entries");
            if (skipped > 0) {
                System.err.println("skipped " + skipped + " malformed lines");
            }
        } finally {
            in.close();
        }
    }

    /**
     * Runs the three stages in order: load the body index, load the
     * dictionary index, then generate the HTML from the morphology file.
     *
     * @param args {@code args[0]} body file, {@code args[1]} dictionary file,
     *             {@code args[2]} morphology file
     * @throws Exception if any of the files cannot be read
     */
    private void makeMorph(String[] args) throws Exception {
        loadLajpBody(args[0]);
        loadWhitackerDict(args[1]);
        loadMorph(args[2]);
    }

    /**
     * Program entry point. Requires at least three arguments; otherwise a
     * usage message is printed to standard error. Any exception raised during
     * processing is reported with its stack trace.
     *
     * @param args command-line arguments (body file, dictionary file, morphology file)
     */
    public static void main(String[] args) {
        if (args.length >= 3) {
            MakeMorphHtml app = new MakeMorphHtml();
            try {
                ZephyrUtil.setShiftJisOuput();
                app.makeMorph(args);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else {
            System.err
                    .println("Usage: java MakeMorphHtml body-lajp.html whitaker-dict.html whitaker.morph.txt");
        }
    }

}
