=begin

XMLParser and HTMLParser Mini

=end

# Small non-validating XML parser that turns a string into a DOM tree.
#
# The DOM classes used here (Document, DOMImplementation, Element) are not
# defined in this class and must be provided by the surrounding program.
# The RE_*, LONG_MARKUP and UNESCAPES* constants are always looked up through
# the runtime class (@my), so a subclass can override any of them.
class XMLParser
	def initialize
		# Remember the concrete class so constant lookups honour subclass overrides.
		@my = self.class
	end

	# Parses +str+ and returns the Document that was built.
	# Raises RuntimeError when markup cannot be tokenized or recognised, or when
	# an end tag does not match the currently open element.
	def parse(str)
		@doc = Document::new(DOMImplementation::instance)
		@cur = @doc
		# parse_elment returns the unparsed remainder, or nil when input is exhausted.
		str = parse_elment(str) while str
		return @doc
	end
private
	# Character class based pattern for element and attribute names.
	TX_NAME = "[A-Za-z\\-_\\.:][A-Za-z0-9\\-_\\.:]*"
	# Start tag: $1 = name, $2 = raw attribute text, $3 = "/" for an empty element.
	RE_STAG = /^<(#{TX_NAME})(\s+[^>]+[^>\/])?\s*(\/)?>/m
	# End tag: $1 = name.
	RE_ETAG = /^<\/(#{TX_NAME})\s*>/m
	# One attribute: $1 = name, $2 / $3 = double / single quoted value.
	RE_ATTR = /^\s+(#{TX_NAME})\s*=\s*(?:\"([^\"]*)\"|\'([^\']*)\')/m
	# Comment: $1 = comment text.
	RE_COMM = /^<!--((?!--).*)-->/m
	# CDATA section: $1 = literal character data.
	RE_CDATA = /\A<!\[CDATA\[(.*)\]\]>\z/m
	# Any other "<?...>" or "<!...>" markup; recognised and ignored.
	RE_OTHER = /^<[?!][^>]+>/m
	# Markup whose body may legally contain ">" and therefore needs its own
	# terminator instead of "the next >": [opening delimiter, closing delimiter].
	LONG_MARKUP = [['<!--', '-->'], ['<![CDATA[', ']]>']]

	# Consumes the text in front of the next "<" and the piece of markup that
	# starts there, updating the tree under construction.
	# Returns the remaining unparsed string, or nil when nothing is left.
	def parse_elment(str)
		sidx = str.index('<')
		if sidx then
			eidx = markup_end(str, sidx)
			raise "parenthesis mismatch" unless eidx
			text = str[0..sidx-1] if sidx > 0
			tag = str[sidx..eidx]
			back = str[eidx+1, str.size]
			back = nil if back.empty?
		else
			text = str
			tag = nil
			back = nil
		end
		if text then
			# Text is kept only inside an element, and only if it is not pure whitespace.
			if Element === @cur and not /\A\s*\Z/m === text then
				node = @doc.createTextNode(unescape(text))
				@cur.appendChild(node)
			end
		end
		# Comments and CDATA are handled before the tag patterns: their body may
		# span lines, and the "^" anchored tag patterns could otherwise match a
		# tag that merely appears at the start of a line inside them.
		if tag and tag[0, 4] == '<!--' and (m = @my::RE_COMM.match(tag)) then
			append_comment(m[1])
		elsif tag and tag[0, 9] == '<![CDATA[' and (m = @my::RE_CDATA.match(tag)) then
			@kind = 'cdata'
			@text = m[1]
			# CDATA content is literal character data, so it is not unescaped.
			if Element === @cur and not @text.empty? then
				@cur.appendChild(@doc.createTextNode(@text))
			end
		else
			case tag
			when nil, "" then
			when @my::RE_STAG then
				@kind = 'start'
				@tag, attrstr = $1, $2
				@kind = 'empty' unless $3.to_s.empty?
				@attr = parse_attr(attrstr)
				append_node()
			when @my::RE_ETAG then
				@kind = 'end'
				@tag = $1
				close_node()
			when @my::RE_COMM then
				append_comment($1)
			when @my::RE_OTHER then
				@kind = 'other'
			else
				raise "parse error #{str[0,80]}"
			end
		end
		return back
	end

	# Returns the index of the last character of the markup that starts at
	# +sidx+, or nil when no terminator exists. Comments and CDATA sections end
	# at their own closing delimiter; if that delimiter is missing, the first
	# ">" is used, as for every other kind of markup.
	def markup_end(str, sidx)
		@my::LONG_MARKUP.each do |opener, closer|
			next unless str[sidx, opener.size] == opener
			idx = str.index(closer, sidx + opener.size)
			return idx + closer.size - 1 if idx
		end
		return str.index('>', sidx+1)
	end

	# Records +text+ as a comment node under the current node.
	def append_comment(text)
		@kind = 'comment'
		@text = text
		@cur.appendChild(@doc.createComment(@text))
	end

	# Creates an element for @tag with the attributes in @attr, appends it to the
	# current node and, unless it is an empty element, makes it the current node.
	def append_node()
		node = @doc.createElement(@tag)
		@attr.each do |attr_name, attr_value|
			node.setAttribute(attr_name, attr_value)
		end
		@cur.appendChild(node)
		@cur = node unless @kind == 'empty'
	end

	# Closes the current element; raises RuntimeError when @tag is not its name.
	def close_node()
		raise "tag name mismatch #{@cur.nodeName} != #{@tag}\n#{@doc}" if @cur.nodeName != @tag
		@cur = @cur.parentNode
	end

	# Parses the raw attribute text of a start tag into a Hash of
	# name => unescaped value. +str+ may be nil (no attributes).
	def parse_attr(str)
		attr = Hash::new
		while @my::RE_ATTR === str do
			# $4 only exists when a subclass supplies an RE_ATTR with a fourth group.
			attr[$1] = unescape($2||$3||$4)
			str = $'
		end
		return attr
	end

	# Entity references replaced by unescape.
	UNESCAPES = {'&lt;'=>'<','&gt;'=>'>','&amp;'=>'&','&quot;'=>'"','&apos;'=>'\''}
	UNESCAPES_KEYS = Regexp::new(UNESCAPES.keys.sort.reverse.join('|'))

	# Returns +str+ with every entity reference listed in UNESCAPES replaced,
	# in a single pass (so "&amp;lt;" becomes "&lt;", not "<").
	def unescape(str)
		str.gsub(@my::UNESCAPES_KEYS){@my::UNESCAPES[$&]}
	end
end

# Lenient HTML variant of XMLParser: tag and attribute names are lower-cased,
# unquoted attribute values are accepted, the content of <script> and <style>
# is taken as one text node, unmatched end tags are ignored, and some end tags
# may be omitted (see DEPEND and EMPTY).
class HTMLParser < XMLParser
	# Like XMLParser::RE_ATTR, plus $4 = unquoted attribute value.
	RE_ATTR = /^\s+(#{TX_NAME})\s*=\s*(?:\"([^\"]*)\"|\'([^\']*)\'|(\S+))/m
	RE_COMM = /^<!--((?!--).*)-->/m
	# HTML entity table: no &apos;, and &nbsp; is mapped to a plain space.
	UNESCAPES = {'&lt;'=>'<','&gt;'=>'>','&amp;'=>'&','&quot;'=>'"','&nbsp;'=>' '}
	UNESCAPES_KEYS = Regexp::new(UNESCAPES.keys.sort.reverse.join('|'))

	# Same contract as XMLParser#parse_elment. In addition, right after a
	# <script> or <style> start tag everything up to the matching end tag is
	# appended as a single text node instead of being tokenized.
	# Raises RuntimeError when that end tag is missing.
	def parse_elment(str)
		str = super(str)
		if @kind == 'start' and (@tag == 'style' || @tag == 'script') then
			# @tag was lower-cased by append_node, but the end tag in the input may
			# use any case, so search case-insensitively. str is nil when the input
			# ends directly after the start tag.
			idx = str.index(/<\/#{@tag}/i) if str
			raise "</#{@tag}> not found" unless idx
			text = str[0, idx]
			str = str[idx, str.size]
			# NOTE(review): the raw content is passed through unescape here; confirm
			# that entity replacement inside script/style content is wanted.
			node = @doc.createTextNode(unescape(text))
			@cur.appendChild(node)
		end
		return str
	end

	# Appends an element for @tag (lower-cased, as are its attribute names).
	# Before appending, the current node may be moved up one level according to
	# the DEPEND table; elements listed in EMPTY never become the current node.
	def append_node()
		@tag = @tag.downcase
		node = @doc.createElement(@tag)
		@attr.each do |attr_name, attr_value|
			node.setAttribute(attr_name.downcase, attr_value)
		end
		if @cur.parentNode then
			# NOTE(review): this tests the name of the PARENT of the current node,
			# not of the current node itself, so for input like "<li>a<li>b" the
			# second li appears to end up nested in the first. Confirm whether
			# @cur.nodeName was intended before changing it.
			pare = @cur.parentNode.nodeName
			@cur = @cur.parentNode if pare == 'p' or (@my::DEPEND.key?(pare) and @my::DEPEND[pare].include?(@tag))
		end
		@cur.appendChild(node)
		@cur = node unless @kind == 'empty' or @my::EMPTY.include?(@tag)
	end

	# Closes the nearest open element named @tag (lower-cased), searching from
	# the current node upwards. An end tag with no matching open element is
	# silently ignored.
	def close_node()
		@tag = @tag.downcase
		cur = @cur
		while cur.parentNode do
			if cur.nodeName == @tag then
				@cur = cur.parentNode
				return
			end
			cur = cur.parentNode
		end
	end

	# Element name => start tags that make append_node move the current node up
	# one level when the parent of the current node has that name.
	DEPEND = {
		'dt' => ['dt','dd'],
		'dd' => ['dt','dd'],
		'li' => ['li'],
		'option' => ['option'],
		'rp' => ['rt','rp'],
		'rt' => ['rt','rp'],
		'tr' => ['tr'],
		'th' => ['tr','th','td'],
		'td' => ['tr','th','td'],
	}

	# Elements that never become the current node, i.e. are treated as having
	# no content even without a trailing "/".
	# NOTE(review): button, frameset and nobr can have content in HTML; they are
	# kept here because existing behaviour may rely on it.
	EMPTY = [
		'base',
		'basefont',
		'br',
		'col',
		'hr',
		'img',
		'input',
		'link',
		'meta',
		'param',
		'button',
		'frameset',
		'nobr',
		# Remaining void elements of HTML 4 / the HTML Standard.
		'area',
		'frame',
		'isindex',
		'embed',
		'source',
		'track',
		'wbr',
	]

end
