/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.parser.html;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlHandler;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.XHTMLDowngradeHandler;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.utils.CharsetUtils;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Parser;
import org.ccil.cowan.tagsoup.Schema;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class HtmlParser
extends AbstractParser {
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.text((String)"html"), MediaType.application((String)"xhtml+xml"), MediaType.application((String)"vnd.wap.xhtml+xml"), MediaType.application((String)"x-asp"))));
    private static final String DEFAULT_CHARSET = "windows-1252";
    private static final int META_TAG_BUFFER_SIZE = 8192;
    private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile("(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]([^'\\\"]+)['\\\"]");
    private static final Schema HTML_SCHEMA = new HTMLSchema();

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
        String encoding;
        String charset;
        MediaType mt;
        String metaString;
        Matcher m;
        stream.mark(8192);
        char[] buffer = new char[8192];
        InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
        int bufferSize = isr.read(buffer);
        stream.reset();
        if (bufferSize != -1 && (m = HTTP_EQUIV_PATTERN.matcher(metaString = new String(buffer, 0, bufferSize))).find()) {
            String[] attrs = m.group(1).split(";");
            for (String attr : attrs) {
                String charset2;
                String[] keyValue = attr.trim().split("=");
                if (keyValue.length != 2 || !keyValue[0].equalsIgnoreCase("charset") || !CharsetUtils.isSupported((String)(charset2 = CharsetUtils.clean((String)keyValue[1])))) continue;
                metadata.set("Content-Encoding", charset2);
                return charset2;
            }
        }
        CharsetDetector detector = new CharsetDetector();
        String incomingCharset = metadata.get("Content-Encoding");
        String incomingType = metadata.get("Content-Type");
        if (incomingCharset == null && incomingType != null && (mt = MediaType.parse((String)incomingType)) != null && (charset = (String)mt.getParameters().get("charset")) != null && Charset.isSupported(charset)) {
            incomingCharset = charset;
        }
        if (incomingCharset != null) {
            detector.setDeclaredEncoding(incomingCharset);
        }
        detector.enableInputFilter(true);
        detector.setText(stream);
        for (CharsetMatch match : detector.detectAll()) {
            if (!Charset.isSupported(match.getName())) continue;
            metadata.set("Content-Encoding", match.getName());
            break;
        }
        if ((encoding = metadata.get("Content-Encoding")) == null) {
            encoding = Charset.isSupported(DEFAULT_CHARSET) ? DEFAULT_CHARSET : Charset.defaultCharset().name();
            metadata.set("Content-Encoding", encoding);
        }
        return encoding;
    }

    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
        if (!stream.markSupported()) {
            stream = new BufferedInputStream(stream);
        }
        stream = new CloseShieldInputStream(stream);
        InputSource source = new InputSource(stream);
        source.setEncoding(this.getEncoding(stream, metadata));
        HtmlMapper mapper = (HtmlMapper)context.get(HtmlMapper.class, (Object)new HtmlParserMapper());
        Parser parser = new Parser();
        parser.setProperty("http://www.ccil.org/~cowan/tagsoup/properties/schema", (Object)HTML_SCHEMA);
        parser.setFeature("http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons", true);
        parser.setContentHandler((ContentHandler)((Object)new XHTMLDowngradeHandler((ContentHandler)((Object)new HtmlHandler(mapper, handler, metadata)))));
        parser.parse(source);
    }

    protected String mapSafeElement(String name) {
        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
    }

    protected boolean isDiscardElement(String name) {
        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
    }

    public String mapSafeAttribute(String elementName, String attributeName) {
        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
    }

    private class HtmlParserMapper
    implements HtmlMapper {
        private HtmlParserMapper() {
        }

        public String mapSafeElement(String name) {
            return HtmlParser.this.mapSafeElement(name);
        }

        public boolean isDiscardElement(String name) {
            return HtmlParser.this.isDiscardElement(name);
        }

        public String mapSafeAttribute(String elementName, String attributeName) {
            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
        }
    }
}

