/*
 * Copyright (C) 2014 kgto.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301  USA
 */
/*
 * $Id: HtmlParser.java 77 2014-08-19 11:58:38Z tuna_p $
 */

package WebScraping;

import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Downloads an HTML page and extracts text from it by search key.
 * <p>
 * A page is loaded through one of the constructors or {@code seturl} methods.
 * {@link #search(SearchData)} then parses the page with an
 * {@code HtmlParserCallback} built from the search key and returns the
 * collected strings, optionally reduced by a regular expression.
 * <p>
 * The result list of the most recent parse is cached together with the
 * tag / id / class that produced it, so repeated searches with the same key
 * do not re-parse the page. The cache is discarded whenever a new page is
 * downloaded.
 * <p>
 * Instances hold mutable state and perform no synchronization.
 *
 * @author kgto
 */
public class HtmlParser {

    private static final Logger LOGGER = Logger.getLogger(HtmlParser.class.getName());

    /** URL of the page to load; {@code null} until one has been set. */
    URL url;
    /** Text of the downloaded page; {@code null} when nothing is loaded or the download failed. */
    String pageData;

    /**
     * Result list of the most recent parse, as returned by
     * {@code HtmlParserCallback.getrtnData()}; {@link #search(SearchData)}
     * reads its elements as {@code String}s. The raw type is kept because the
     * callback's declaration is not part of this file.
     */
    ArrayList sData;

    // Work area: the tag / id / class of the search key that produced sData.
    String htmltag;
    String htmlid;
    String htmlclass;

    /**
     * Creates a parser with no page loaded.
     */
    public HtmlParser() {
        url = null;
    }

    /**
     * Creates a parser and downloads the given page.
     * @param UrlAdress address of the page; when {@code null} nothing is loaded
     */
    public HtmlParser(URL UrlAdress) {
        this.url = UrlAdress;
        getpageData();
    }

    /**
     * Creates a parser and downloads the given page.
     * A malformed address is logged and leaves the parser without a page.
     * @param UrlAdress address of the page as text
     */
    public HtmlParser(String UrlAdress) {
        loadFromString(UrlAdress);
    }

    /**
     * Replaces the current page with the one at the given address.
     * @param UrlAdress address of the page; when {@code null} nothing is loaded
     */
    public void seturl(URL UrlAdress) {
        this.url = UrlAdress;
        getpageData();
    }

    /**
     * Replaces the current page with the one at the given address.
     * A malformed address is logged and the previously loaded URL and page
     * are left untouched.
     * @param UrlAdress address of the page as text
     */
    public void seturl(String UrlAdress) {
        loadFromString(UrlAdress);
    }

    /**
     * Parses the textual address and, when it is well formed, downloads the page.
     * @param address address of the page as text
     */
    private void loadFromString(String address) {
        try {
            // Assign only after successful parsing so a bad address keeps the old state.
            URL parsed = new URL(address);
            url = parsed;
            getpageData();
        } catch (MalformedURLException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Searches the loaded HTML page.
     * <p>
     * The tag, id and class of the search key select the data to collect.
     * If the key equals the one used by the previous search, the cached
     * result list is reused; otherwise the page is parsed again.
     * <ul>
     * <li>{@code around} given: only the result at that zero-based position is
     *     returned. A position that is not an integer, is negative, or is
     *     beyond the last result yields {@code null}.</li>
     * <li>{@code around} {@code null} or blank: all results are returned,
     *     separated by a tab character. A result for which the regular
     *     expression yields nothing contributes an empty field.</li>
     * </ul>
     * Each returned string is first passed through
     * {@link #RegularExpression(String, String)} with the key's regexp.
     *
     * @param skey search key data (SearchData); must not be {@code null}
     * @return the matching text, or {@code null} when the requested position
     *         does not exist or is invalid
     */
    public String search(SearchData skey) {

        // Re-parse when the key changed, or when no result list exists yet
        // (possible if isHtmlkeyEq was called directly before the first search).
        if (!isHtmlkeyEq(skey) || sData == null) {
            serchpageData(skey);
        }
        ArrayList results = sData;
        if (results == null) {
            return null;
        }

        String regexp = skey.getregexp();
        String around = skey.getaround();
        String position = (around == null) ? "" : around.trim();

        // No position given: return every collected string.
        if (position.isEmpty()) {
            return joinAll(results, regexp);
        }

        int index;
        try {
            index = Integer.parseInt(position);  // convert the search position to a number
        } catch (NumberFormatException ex) {
            LOGGER.log(Level.WARNING, "Search position is not an integer: " + around, ex);
            return null;
        }
        if (index < 0 || index >= results.size()) {
            return null;
        }
        return RegularExpression((String) results.get(index), regexp);
    }

    /**
     * Applies the regular expression to every result and joins the outcomes with tabs.
     * @param results strings collected by the parse
     * @param regexp regular expression to apply to each string
     * @return the tab-separated outcomes; empty when there are no results
     */
    private String joinAll(ArrayList results, String regexp) {
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (Object item : results) {
            // Separate every field, including empty ones, so positions stay aligned.
            if (!first) {
                joined.append("\t");
            }
            first = false;
            String shaped = RegularExpression((String) item, regexp);
            if (shaped != null) {
                joined.append(shaped);
            }
        }
        return joined.toString();
    }

    /**
     * Compares the HTML tag / id / class of the previous search key with the given one.
     * <p>
     * A remembered value of {@code null} never matches. When the keys differ,
     * the given key's values are remembered for the next comparison.
     * @param skey search key holding the HTML tag / id / class
     * @return {@code true} when tag, id and class all equal the remembered values
     */
    public boolean isHtmlkeyEq(SearchData skey) {

        String stag   = skey.getHtmltag();
        String sid    = skey.getHtmlid();
        String sclass = skey.getHtmlclass();

        boolean same = htmltag != null && htmltag.equals(stag)
                && htmlid != null && htmlid.equals(sid)
                && htmlclass != null && htmlclass.equals(sclass);

        if (!same) {
            htmltag   = stag;
            htmlid    = sid;
            htmlclass = sclass;
        }

        return same;
    }

    /**
     * Extracts the first capture group of a regular expression from a string.
     * @param strdata text to examine
     * @param regexp regular expression; when {@code null} or empty the text is
     *        returned unchanged
     * @return capture group 1 of the first match; {@code null} when
     *         {@code strdata} is {@code null}, nothing matches, or the
     *         expression defines no capture group
     * @throws java.util.regex.PatternSyntaxException if {@code regexp} is not
     *         a valid regular expression
     */
    public String RegularExpression(String strdata, String regexp) {
        // No expression: pass the text through untouched.
        if (regexp == null || regexp.isEmpty()) {
            return strdata;
        }
        if (strdata == null) {
            return null;
        }

        Matcher matchdata = Pattern.compile(regexp).matcher(strdata);
        if (matchdata.find() && matchdata.groupCount() >= 1) {
            return matchdata.group(1);
        }
        return null;
    }

    /**
     * Downloads the page at {@link #url} into {@link #pageData}.
     * <p>
     * The cached search result and its key are discarded first because they
     * belong to the previous page. The response is decoded as UTF-8 and line
     * terminators are normalized to {@code "\n"}. When no URL is set or the
     * download fails, the problem is logged and {@code pageData} stays
     * {@code null}.
     */
    private void getpageData() {
        // Everything cached so far describes the previous page.
        pageData  = null;
        sData     = null;
        htmltag   = null;
        htmlid    = null;
        htmlclass = null;

        if (url == null) {
            LOGGER.log(Level.WARNING, "No URL set; nothing to download");
            return;
        }

        URLConnection con = null;
        BufferedReader reader = null;
        try {
            con = url.openConnection();
            // Only HTTP(S) connections have a request method; other schemes are read as-is.
            if (con instanceof HttpURLConnection) {
                ((HttpURLConnection) con).setRequestMethod("GET");
            }
            reader = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), "utf-8"));
            String wkline;
            StringBuilder sb = new StringBuilder();
            while ((wkline = reader.readLine()) != null) {
                sb.append(wkline).append("\n");
            }
            pageData = sb.toString();
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "Failed to download " + url, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // The page has already been read or the failure was logged above.
                }
            }
            if (con instanceof HttpURLConnection) {
                ((HttpURLConnection) con).disconnect();
            }
        }
    }

    /**
     * Parses the loaded page and stores the collected result list in {@link #sData}.
     * <p>
     * The page is fed through a {@code ParserDelegator} into an
     * {@code HtmlParserCallback} created from the search key; the list returned
     * by its {@code getrtnData()} becomes the new result. When no page is
     * loaded or parsing fails, the result is an empty list.
     * @param skey search key passed to the callback
     */
    public void serchpageData(SearchData skey) {
        // Start from an empty result so a failure never exposes a previous search's data.
        sData = new ArrayList();
        if (pageData == null) {
            return;
        }

        Reader reader = new BufferedReader(new StringReader(pageData));
        try {
            HtmlParserCallback cb = new HtmlParserCallback(skey);
            ParserDelegator pd = new ParserDelegator();
            pd.parse(reader, cb, true);

            ArrayList found = cb.getrtnData();
            if (found != null) {
                sData = found;
            }
        } catch (IOException e) {
            LOGGER.log(Level.SEVERE, null, e);
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // In-memory reader; nothing to recover.
            }
        }
    }

}
