# HTMLParserEx.py
# This is a derivative work of HTMLParser.py from Python,
# under the terms of the Python Software Foundation License.

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
#
# Copyright (c) 2000 BeOpen.com.
# All rights reserved.
#
# Copyright (c) 1995-2001 Corporation for National Research Initiatives.
# All rights reserved.
#
# Copyright (c) 1991-1995 Stichting Mathematisch Centrum.
# All rights reserved.


# ChangeLog
#
# feed is always complete.
# character and entity references must be strict.
# support handling a uri (starting with http://).
#
# Copyright (C) 2006 by Aiwota Programmer
# aiwotaprog@tetteke.tk

from HTMLParser import starttagopen
import HTMLParser
import re

# Characters at which the scanner stops: '<' (markup), '&' (references)
# and 'h' (possible start of an "http://" URI).
interesting_normal = re.compile("[<&h]")

# Strict references: only well-formed ones terminated by ';' match.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+);')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*);')

# Bare http:// URI.  The letter range is deliberately 'A-Z' plus an
# explicit '_': a range written 'A-z' would also match the ASCII
# punctuation between 'Z' and 'a' ([ \ ] ^ and backquote), so a URI
# followed by a closing bracket would swallow the bracket.
urireg = re.compile(r"http://[a-zA-Z0-9_/.&=%\-?~]*")


class HTMLParserEx(HTMLParser.HTMLParser):
    """HTMLParser variant that always consumes its whole buffer.

    Differences from the base class, as implemented in goahead():

    * all buffered data is handled on every call; a construct that
      cannot be parsed is emitted as literal text instead of being
      left in the buffer;
    * character and entity references are recognised only when they
      are well-formed and terminated by ';', otherwise the leading
      "&#" or "&" is emitted as text;
    * a bare "http://..." URI in text is reported as a start tag "a"
      with an href attribute, the URI as data, and an end tag "a".
    """

    def reset(self):
        """Reset the base parser, then install this module's scan pattern."""
        HTMLParser.HTMLParser.reset(self)
        self.interesting = interesting_normal

    def clear_cdata_mode(self):
        """Run the base implementation, then re-install this module's
        scan pattern so that 'h' is searched for again."""
        HTMLParser.HTMLParser.clear_cdata_mode(self)
        self.interesting = interesting_normal

    # Internal -- handle all buffered data.  The 'end' argument is
    # accepted for compatibility with the base class but is not
    # consulted: input is always treated as complete.
    def goahead(self, end):
        """Parse everything currently in self.rawdata.

        Scans for the next position matched by self.interesting, passes
        the text before it to handle_data(), then dispatches on what
        starts at that position (markup, character reference, entity
        reference or http:// URI).  Anything that cannot be parsed is
        emitted as literal text one token at a time, so the scan always
        advances.
        """
        rawdata = self.rawdata
        # rawdata is never rebound below, so the bound method can be
        # looked up once.
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or & or h
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break

            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                else:
                    # A '<' that opens none of the constructs above
                    # (e.g. "a < b").  Without this branch k would be
                    # unbound, or still hold the value from an earlier
                    # iteration.  Treat it like an unterminated
                    # construct so the '<' is emitted as text.
                    k = -1

                if k < 0: # not terminated, out simply.
                    self.handle_data("<")
                    k = i + 1

                i = self.updatepos(i, k)

            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading "&#" and the trailing ";".
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    i = self.updatepos(i, k)
                else:
                    # Not a strict character reference: literal text.
                    self.handle_data("&#")
                    i = self.updatepos(i, i + 2)

            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    i = self.updatepos(i, k)
                else:
                    # Not a strict entity reference: literal text.
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)

            elif startswith("h", i):
                match = urireg.match(rawdata, i)
                if match:
                    # Report the bare URI as <a href="URI">URI</a>.
                    k = match.end()
                    attr = [("href", match.group())]
                    self.handle_starttag("a", attr)
                    self.handle_data(match.group())
                    self.handle_endtag("a")
                    i = self.updatepos(i, k)
                else:
                    # An ordinary 'h' that does not start a URI.
                    self.handle_data("h")
                    i = self.updatepos(i, i + 1)

            else:
                assert 0, "interesting.search() lied"
        # end while
        # Defensive: the loop above only exits once i has reached n, so
        # this is not expected to run; it guarantees nothing is left over.
        if i < n: # always complete
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
