package jp.ac.osaka_u.ist.sel.similarity.hash;

import java.sql.SQLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import jp.ac.osaka_u.ist.sel.similarity.commons.MD5Digest;
import jp.ac.osaka_u.ist.sel.similarity.commons.Settings;
import jp.ac.osaka_u.ist.sel.similarity.commons.SupportedLanguage;
import jp.ac.osaka_u.ist.sel.similarity.database.ColumnName;
import jp.ac.osaka_u.ist.sel.similarity.hash.database.HashRegisterDAOManager;
import jp.ac.osaka_u.ist.sel.similarity.hash.database.InsertionHashInfoDAO;
import jp.ac.osaka_u.ist.sel.similarity.hash.database.InsertionTokenInfoDAO;
import jp.ac.osaka_u.ist.sel.similarity.hash.database.SelectionFileDAO;
import jp.ac.osaka_u.ist.sel.similarity.hash.tokenizer.CppTokenInfoRegister;
import jp.ac.osaka_u.ist.sel.similarity.hash.tokenizer.JavaTokenInfoRegister;
import jp.ac.osaka_u.ist.sel.similarity.hash.tokenizer.TokenInfoRegister;

import org.apache.log4j.Logger;

/**
 * 
 * Checks the files registered in the FileInfo table
 * and registers an MD5 hash value for each file.
 * 
 * @author ysk-ssk
 *
 */
public class HashInfoRegister {
    // One logger for the whole class. The name is kept as the plain string
    // "HashInfoRegister" so the logger identity is unchanged.
    private static final Logger _log = Logger.getLogger("HashInfoRegister");
    private final SelectionFileDAO _fileDAO;
    private final InsertionHashInfoDAO _hashDAO;
    private final InsertionTokenInfoDAO _tokenDAO;

    /**
     * Creates a register that uses the DAOs supplied by the given manager.
     *
     * @param daoManager provider of the file, hash and token DAOs
     */
    public HashInfoRegister(HashRegisterDAOManager daoManager) {
        _fileDAO = daoManager.getFileDAO();
        _hashDAO = daoManager.getHashDAO();
        _tokenDAO = daoManager.getTokenDAO();
    }

    /**
     * Groups every file id known to the file DAO by file extension and
     * registers a digest for the files of each group.
     * <p>
     * When a token register is available for the language of an extension,
     * every file of that extension is hashed through it. Otherwise only the
     * files that share their size with at least one other file of the same
     * extension are hashed, using the digest of the file itself.
     */
    public void clusterFile() {
        // cluster by ext
        Map<String,Set<Integer>> extMap = new HashMap<String,Set<Integer>>();
        for (int fileId : _fileDAO.getFileIdSet()) {
            String ext = _fileDAO.getFileExt(fileId);
            addToCluster(extMap, ext, fileId);
        }

        for (Map.Entry<String,Set<Integer>> cluster : extMap.entrySet()) {
            String lang = SupportedLanguage.getTargetLanguage(cluster.getKey());
            TokenInfoRegister register = getTokenInfoRegister(lang);
            if (register != null) {
                calculateHash(cluster.getValue(), register);
            } else {
                calculateHash(collectSizeCluster(cluster.getValue()));
            }
        }
    }

    /**
     * Adds a file id to the cluster stored under the given key, creating the
     * cluster on first use.
     *
     * @param clusters map from cluster key to the file ids of that cluster
     * @param key      key of the cluster the file belongs to
     * @param fileId   id of the file to add
     */
    private static <K> void addToCluster(Map<K,Set<Integer>> clusters, K key, int fileId) {
        Set<Integer> members = clusters.get(key);
        if (members == null) {
            members = new HashSet<Integer>();
            clusters.put(key, members);
        }
        members.add(fileId);
    }

    /**
     * Clusters the given files by size and keeps only the files whose size
     * is shared with at least one other file of the given set.
     *
     * @param fileIdSet ids of the files to cluster
     * @return ids of all files that belong to a size cluster with more than
     *         one member; empty when no two files have the same size
     */
    private Set<Integer> collectSizeCluster(Set<Integer> fileIdSet) {
        Map<Long,Set<Integer>> sizeMap = new HashMap<Long,Set<Integer>>();
        for (int fileId : fileIdSet) {
            long size = _fileDAO.getFileSize(fileId);
            addToCluster(sizeMap, Long.valueOf(size), fileId);
        }

        // remove clusters constructed by single component.
        Set<Integer> sharedSizeFileIds = new HashSet<Integer>();
        for (Set<Integer> sameSizeFileIds : sizeMap.values()) {
            if (sameSizeFileIds.size() > 1) {
                sharedSizeFileIds.addAll(sameSizeFileIds);
            }
        }

        return sharedSizeFileIds;
    }

    /**
     * Selects the token register for a file language, taking the configured
     * target languages into account.
     *
     * @param langoffile language key of the file; may be {@code null}
     * @return a C/C++ register for a C file when C or C++ is a target
     *         language, a C/C++ register for a C++ file when C++ is a target
     *         language, a Java register for a Java file when Java is a target
     *         language, and {@code null} in every other case
     */
    private TokenInfoRegister getTokenInfoRegister(String langoffile) {
        TokenInfoRegister register = null;

        Set<String> languages = Settings.getInstance().getTargetLanguages();
        if (SupportedLanguage._KEY_LANG_C.equals(langoffile)
                && (languages.contains(SupportedLanguage._KEY_LANG_C)
                        || languages.contains(SupportedLanguage._KEY_LANG_CPP))) {
            register = new CppTokenInfoRegister(_tokenDAO);
        } else if (SupportedLanguage._KEY_LANG_CPP.equals(langoffile)
                && languages.contains(SupportedLanguage._KEY_LANG_CPP)) {
            register = new CppTokenInfoRegister(_tokenDAO);
        } else if (SupportedLanguage._KEY_LANG_JAVA.equals(langoffile)
                && languages.contains(SupportedLanguage._KEY_LANG_JAVA)) {
            register = new JavaTokenInfoRegister(_tokenDAO);
        }

        return register;
    }

    /**
     * Registers the digest of the file itself for every given file.
     *
     * @param fileIdSet ids of the files to hash
     */
    private void calculateHash(Set<Integer> fileIdSet) {
        calculateHash(fileIdSet, null);
    }

    /**
     * Computes and registers a digest for every given file.
     * <p>
     * A failure on one file is logged and the loop continues with the next
     * file.
     *
     * @param fileIdSet ids of the files to hash
     * @param register  token register used to compute the digest, or
     *                  {@code null} to use the digest of the file itself
     */
    private void calculateHash(Set<Integer> fileIdSet, TokenInfoRegister register) {
        for (int fileId : fileIdSet) {
            try {
                String filePath = _fileDAO.getFilePath(fileId);
                // make a md5 digest
                byte[] digest;
                if (register != null) {
                    // the digest is delegated to the language specific register
                    digest = register.calculateHash(fileId, filePath);
                } else {
                    digest = MD5Digest.getFileDigest(filePath);
                }
                if (digest != null) {
                    registerHash(fileId, digest);
                } else {
                    _log.error("Can not get hash:\t" + filePath);
                }
            } catch (Throwable e) {
                // Throwable is caught on purpose: whatever goes wrong for a
                // single file must not stop the remaining files.
                _log.error("hash detection error :\t" + fileId);
                _log.error(e.getMessage(), e);
            }
        }
    }

    /**
     * Stores the digest of a file through the hash DAO and logs the outcome.
     *
     * @param fileId    id of the file the digest belongs to
     * @param md5digest digest to store; when assertions are enabled its
     *                  length is checked against the length of the MD5 hash
     *                  column
     * @throws SQLException declared for the insertion through the hash DAO
     */
    private void registerHash(int fileId, byte[] md5digest) throws SQLException {
        assert md5digest.length <= ColumnName.getColumnLength(ColumnName.MD5_HASH);
        if (!_hashDAO.insert(fileId, md5digest)) {
            _log.error("Can not register digest:\t" + fileId);
            return;
        }
        if (_log.isDebugEnabled()) {
            _log.debug("register digest:\t" + fileId);
        }
    }

}
