; $Id: www-parse.l,v 1.3 2005/11/24 15:12:27 torihat Exp $
;
; www-parse.l
;
; by HATTORI Masashi

(provide "www/www-parse")

(eval-when (:compile-toplevel :load-toplevel :execute)
  (require "www/www-tag")
  (require "www/www-http"))
 
(in-package "www")

;; T䂳ɂK\
(defvar *www-tag-options-regexp*
  (compile-regexp
   "\\([A-Za-z][A-Za-z0-9.-]*\\)\\([ \t\n]*=[ \t\n]*\\([^ \t\n\"'>]*\\|\"\\([^\"]*\\)\"\\|'\\([^']*\\)'\\)\\)?")
)

(defvar *www-special-chars-alist* '(("&nbsp;" . " ")
				    ("&lt;" . "<")
				    ("&gt;" . ">")
				    ("&quot;" . "\"")
				    ("&amp;" . "&")
;				    ("&aacute;" . "a'")
;				    ("&eacute;" . "e'")
;				    ("&iacute;" . "i'")
;				    ("&oacute;" . "o'")
;				    ("&uacute;" . "u'")
;				    ("&Aacute;" . "A'")
;				    ("&Eacute;" . "E'")
;				    ("&Iacute;" . "I'")
;				    ("&Oacute;" . "O'")
;				    ("&Uacute;" . "U'")
;				    ("&atilde;" . "a~")
;				    ("&Atilde;" . "A~")
;				    ("&ntilde;" . "n~")
;				    ("&Ntilde;" . "N~")
;				    ("&nbsp;" . " ")
;				    ("&mdash;" . "--")
;				    ("&ndash;" . "-")
;				    ("&copy;" . "(c)")
;				    ("&trade;" . "(TM)")
;				    ("&reg;" . "(R)")
				   ))

(defvar *www-character-entity-alist*
  '(
    ("&nbsp;" . 160)
    ("&iexcl;" . 161)
    ("&cent;" . 162)
    ("&pound;" . 163)
    ("&curren;" . 164)
    ("&yen;" . 165)
    ("&brvbar;" . 166)
    ("&sect;" . 167)
    ("&uml;" . 168)
    ("&copy;" . 169)
    ("&ordf;" . 170)
    ("&laquo;" . 171)
    ("&not;" . 172)
    ("&shy;" . 173)
    ("&reg;" . 174)
    ("&macr;" . 175)
    ("&deg;" . 176)
    ("&plusmn;" . 177)
    ("&sup2;" . 178)
    ("&sup3;" . 179)
    ("&acute;" . 180)
    ("&micro;" . 181)
    ("&para;" . 182)
    ("&middot;" . 183)
    ("&cedil;" . 184)
    ("&sup1;" . 185)
    ("&ordm;" . 186)
    ("&raquo;" . 187)
    ("&frac14;" . 188)
    ("&frac12;" . 189)
    ("&frac34;" . 190)
    ("&iquest;" . 191)
    ("&Agrave;" . 192)
    ("&Aacute;" . 193)
    ("&Acirc;" . 194)
    ("&Atilde;" . 195)
    ("&Auml;" . 196)
    ("&Aring;" . 197)
    ("&AElig;" . 198)
    ("&Ccedil;" . 199)
    ("&Egrave;" . 200)
    ("&Eacute;" . 201)
    ("&Ecirc;" . 202)
    ("&Euml;" . 203)
    ("&Igrave;" . 204)
    ("&Iacute;" . 205)
    ("&Icirc;" . 206)
    ("&Iuml;" . 207)
    ("&ETH;" . 208)
    ("&Ntilde;" . 209)
    ("&Ograve;" . 210)
    ("&Oacute;" . 211)
    ("&Ocirc;" . 212)
    ("&Otilde;" . 213)
    ("&Ouml;" . 214)
    ("&times;" . 215)
    ("&Oslash;" . 216)
    ("&Ugrave;" . 217)
    ("&Uacute;" . 218)
    ("&Ucirc;" . 219)
    ("&Uuml;" . 220)
    ("&Yacute;" . 221)
    ("&THORN;" . 222)
    ("&szlig;" . 223)
    ("&agrave;" . 224)
    ("&aacute;" . 225)
    ("&acirc;" . 226)
    ("&atilde;" . 227)
    ("&auml;" . 228)
    ("&aring;" . 229)
    ("&aelig;" . 230)
    ("&ccedil;" . 231)
    ("&egrave;" . 232)
    ("&eacute;" . 233)
    ("&ecirc;" . 234)
    ("&euml;" . 235)
    ("&igrave;" . 236)
    ("&iacute;" . 237)
    ("&icirc;" . 238)
    ("&iuml;" . 239)
    ("&eth;" . 240)
    ("&ntilde;" . 241)
    ("&ograve;" . 242)
    ("&oacute;" . 243)
    ("&ocirc;" . 244)
    ("&otilde;" . 245)
    ("&ouml;" . 246)
    ("&divide;" . 247)
    ("&oslash;" . 248)
    ("&ugrave;" . 249)
    ("&uacute;" . 250)
    ("&ucirc;" . 251)
    ("&uuml;" . 252)
    ("&yacute;" . 253)
    ("&thorn;" . 254)
    ("&yuml;" . 255)
    ("&OElig;" . 338)
    ("&oelig;" . 339)
    ("&Scaron;" . 352)
    ("&scaron;" . 353)
    ("&Yuml;" . 376)
    ("&circ;" . 710)
    ("&tilde;" . 732)
    ("&fnof;" . 402)
    ("&Alpha;" . 913)
    ("&Beta;" . 914)
    ("&Gamma;" . 915)
    ("&Delta;" . 916)
    ("&Epsilon;" . 917)
    ("&Zeta;" . 918)
    ("&Eta;" . 919)
    ("&Theta;" . 920)
    ("&Iota;" . 921)
    ("&Kappa;" . 922)
    ("&Lambda;" . 923)
    ("&Mu;" . 924)
    ("&Nu;" . 925)
    ("&Xi;" . 926)
    ("&Omicron;" . 927)
    ("&Pi;" . 928)
    ("&Rho;" . 929)
    ("&Sigma;" . 931)
    ("&Tau;" . 932)
    ("&Upsilon;" . 933)
    ("&Phi;" . 934)
    ("&Chi;" . 935)
    ("&Psi;" . 936)
    ("&Omega;" . 937)
    ("&alpha;" . 945)
    ("&beta;" . 946)
    ("&gamma;" . 947)
    ("&delta;" . 948)
    ("&epsilon;" . 949)
    ("&zeta;" . 950)
    ("&eta;" . 951)
    ("&theta;" . 952)
    ("&iota;" . 953)
    ("&kappa;" . 954)
    ("&lambda;" . 955)
    ("&mu;" . 956)
    ("&nu;" . 957)
    ("&xi;" . 958)
    ("&omicron;" . 959)
    ("&pi;" . 960)
    ("&rho;" . 961)
    ("&sigmaf;" . 962)
    ("&sigma;" . 963)
    ("&tau;" . 964)
    ("&upsilon;" . 965)
    ("&phi;" . 966)
    ("&chi;" . 967)
    ("&psi;" . 968)
    ("&omega;" . 969)
    ("&thetasym;" . 977)
    ("&upsih;" . 978)
    ("&piv;" . 982)
    ("&ensp;" . 8194)
    ("&emsp;" . 8195)
    ("&thinsp;" . 8201)
    ("&zwnj;" . 8204)
    ("&zwj;" . 8205)
    ("&lrm;" . 8206)
    ("&rlm;" . 8207)
    ("&ndash;" . 8211)
    ("&mdash;" . 8212)
    ("&lsquo;" . 8216)
    ("&rsquo;" . 8217)
    ("&sbquo;" . 8218)
    ("&ldquo;" . 8220)
    ("&rdquo;" . 8221)
    ("&bdquo;" . 8222)
    ("&dagger;" . 8224)
    ("&Dagger;" . 8225)
    ("&bull;" . 8226)
    ("&hellip;" . 8230)
    ("&permil;" . 8240)
    ("&prime;" . 8242)
    ("&Prime;" . 8243)
    ("&lsaquo;" . 8249)
    ("&rsaquo;" . 8250)
    ("&oline;" . 8254)
    ("&frasl;" . 8260)
    ("&euro;" . 8364)
    ("&image;" . 8465)
    ("&ewierp;" . 8472)
    ("&real;" . 8476)
    ("&trade;" . 8482)
    ("&alefsym;" . 8501)
    ("&larr;" . 8592)
    ("&uarr;" . 8593)
    ("&rarr;" . 8594)
    ("&darr;" . 8595)
    ("&harr;" . 8596)
    ("&crarr;" . 8629)
    ("&lArr;" . 8656)
    ("&uArr;" . 8657)
    ("&rArr;" . 8658)
    ("&dArr;" . 8659)
    ("&hArr;" . 8660)
    ("&forall;" . 8704)
    ("&part;" . 8706)
    ("&exist;" . 8707)
    ("&empty;" . 8709)
    ("&nabla;" . 8711)
    ("&isin;" . 8712)
    ("&notin;" . 8713)
    ("&ni;" . 8715)
    ("&prod;" . 8719)
    ("&sum;" . 8721)
    ("&minus;" . 8722)
    ("&lowast;" . 8727)
    ("&radic;" . 8730)
    ("&prop;" . 8733)
    ("&infin;" . 8734)
    ("&ang;" . 8736)
    ("&and;" . 8743)
    ("&or;" . 8744)
    ("&cap;" . 8745)
    ("&cup;" . 8746)
    ("&int;" . 8747)
    ("&there4;" . 8756)
    ("&sim;" . 8764)
    ("&cong;" . 8773)
    ("&asymp;" . 8776)
    ("&ne;" . 8800)
    ("&equiv;" . 8801)
    ("&le;" . 8804)
    ("&ge;" . 8805)
    ("&sub;" . 8834)
    ("&sup;" . 8835)
    ("&nsub;" . 8836)
    ("&sube;" . 8838)
    ("&supe;" . 8839)
    ("&oplus;" . 8853)
    ("&otimes;" . 8855)
    ("&perp;" . 8869)
    ("&sdot;" . 8901)
    ("&lceil;" . 8968)
    ("&rceil;" . 8969)
    ("&lfloor;" . 8970)
    ("&rfloor;" . 8971)
    ("&lang;" . 9001)
    ("&rang;" . 9002)
    ("&loz;" . 9674)
    ("&spades;" . 9824)
    ("&clubs;" . 9827)
    ("&hearts;" . 9829)
    ("&diams;" . 9830)
    ))

(defun www-delete-comment ()
  (save-excursion
    (goto-char (point-min))
    (while (scan-buffer "<!--" :tail t)
      (let ((beg (match-beginning 0)))
	(when (scan-buffer "-->" :tail t)
	  (delete-region beg (point)))
      )
    )
    (goto-char (point-min))
    (while (scan-buffer "<!doctype[^>]+>" :regexp t :tail t :case-fold t)
      (delete-region (match-beginning 0) (match-end 0)))
    ;(replace-buffer "<!DOCTYPE[^>]+>" "" :regexp t)
    (goto-char (point-min))
    (replace-buffer "\r" "")
  ))

(defun www-parse-tag (beg end)
  (let (name
	opts
	close
       )
    (save-excursion
      (goto-char (1+ beg))
      (setq beg (point))
      (cond ((looking-at "!")
	     (progn
	       (scan-buffer "-->" :tail t)
	       (return-from www-parse-tag)
	     ))
	    ((looking-at "/")
	     (progn
	       (push (list 'close t) opts)
	       (forward-char 1)
	     ))
      )
      (setq beg (point))
      (skip-chars-forward "^ \t\n>")
      (when (= (point) beg)
	(message "WWW: could not find tag name")
	(return-from www-parse-tag))
      (setq name (nstring-downcase (buffer-substring beg (point))))
      (setq beg (point))
      (save-restriction
	(narrow-to-region beg end)
	(while (scan-buffer *www-tag-options-regexp* :regexp t :tail t)
	  (let ((key (match-string 1))
		val
	       )
	    (when (match-beginning 2)
	      (setq val (or (match-string 5)
			    (match-string 4)
			    (match-string 3)))
	    )
	    ;(push (list (intern (nstring-downcase key)) (or val t)) opts)
	    (push (list (intern (nstring-downcase key) *www-package*) (or val t)) opts)
	  ))
      ))
    (list (intern name *www-package*) opts)
  ))

(defun www-parse-html ()
  (let (parsed
	beg
	end
	temp
	in
	(last-percent nil)
       )
    (save-excursion
      (goto-char (point-min))
      (setq beg (point))
      (message "WWW: parsing ... ")
      (www-interval-message (300)
	(while (scan-buffer "<[^<>]+>" :regexp t :tail t);;;͉ƂȂ
	  (let ((start (match-beginning 0))
		(stop (match-end 0))
		str)
	    (setq temp (www-parse-tag start stop))
	    (let ((tname (www-tag-name temp))
		  (topt (www-tag-opt temp)))
	      (when (eq tname 'meta)
		(let ((fname (www-tag-get-option 'http-equiv topt))
		      (ctype (www-tag-get-option 'content topt))
		      type)
		  (when (and fname
			     (equalp fname "content-type")
			     ctype)
		    (when (and (setq type (www-get-encode ctype))
			       (not (eq type www-charset)))
		      ;(message-box (format nil "~S\n~S" type www-charset))
		      (delete-buffer (selected-buffer))
		      (setq www-charset type)
		      (throw 'reload t))))))
;	  (unless (= beg start)
;	    (push (www-view-convert-special-chars (buffer-substring beg start)) parsed)
;	  )
	    (unless (= beg start)
	      (save-excursion
		(goto-char start)
		(when *www-view-debug*
		  (refresh-screen)
		  (msgbox "~S" (buffer-substring beg (point))))
		(save-excursion
		  (www-view-convert-special-chars beg (point))
		)
		(push (buffer-substring beg (point)) parsed)
;	      (when (< beg (point))
;		(if (www-view-nochange-p in)
;		    (push (buffer-substring beg (point)) parsed)
;		  (push (www-view-remove-spaces beg (point)) parsed)
;		)
;	      )
	      )
	    )
	    (when temp
	      (push temp parsed)
	      (setq in (www-view-tag-out temp in))
	      (when (and (not (www-tag-close-p temp))
			 (www-tag-nochange-p temp))
		(push temp in))
	    )
	    (setq beg (point))
	    (when *www-show-status-message*
	      (let ((percent (truncate (* 100 beg) (point-max))))
		(unless (eql percent last-percent)
		  (setq last-percent percent)
		  (message "WWW: parsing ... ~3D%" percent)))
	    )
	    ;(message "WWW: parsing ... ~3D%" (truncate (/ (* 100 beg) (point-max))))
	  )
	)
      (unless (= beg (point-max))
	(push (buffer-substring beg (point-max)) parsed))
      );www-interval-message
      (message "WWW: parsing ... done")
    );save-excursion
    parsed
  ))

(defun www-study-html (html)
  (let (parsed add)
    (while html
      (multiple-value-setq (html add)
	(www-study-tag html))
      (when add
	(push add parsed))
      (let ((tag (car html)))
	(when (and (listp tag)
		   (www-tag-close-p tag))
	  (setq html (cdr html))
	)
      )
    )
    (nreverse parsed)
  ))

(defun www-study-tag (html)
  (let (parsed element otag name)
    (while html
      (setq element (car html))
      ;(message-box (format nil "~S:~S" otag element))
      (if otag
	  (cond ((stringp element)
		 (progn
		   (push element otag)
		   ;(message-box (format nil "pushed string: ~A" element))
		   (setq html (cdr html))
		 ))
		((listp element)
		 (cond ((www-tag-close-p element)
			(setq otag (nreverse otag))
			(if (www-tag-is-close-for-p otag element)
			    (return-from www-study-tag (values (cdr html) otag))
			  (return-from www-study-tag (values html otag))
;			(if (www-tag-is-close-for-p (reverse otag) element)
;			    (return-from www-study-tag (values (cdr html) (reverse otag)))
;			  (return-from www-study-tag (values html (reverse otag)))
			))
		       (t
			(let (add)
			  (multiple-value-setq (html add)
			    (www-study-tag html))
			  (when add
			    (push add otag)
			  )))
		 ))
	  );cond
	(cond ((stringp element)
	       (progn
		 ;(message-box (format nil "pushed string: ~A" element))
		 (return-from www-study-tag (values (cdr html) element))
	       ))
	      ((listp element)
	       (cond ((www-tag-close-p element)
		      (return-from www-study-tag html))
		     (t
		      (progn
			(setq name (www-tag-name element))
			(setq otag (nreverse element))
			(setq html (cdr html))
		      )
		     )
	       ))
	);cond
      );if
    );while
    html
  ))

;;
;; for test ...
;;
;;
(defun www-parse-test ()
  (interactive)
  (let ((buffer (selected-buffer))
	temp)
    (set-buffer (get-buffer-create *www-buffer-temp*))
    (insert-buffer buffer)
    (www-delete-comment)
    (setq temp (www-parse-html))
    (erase-buffer *www-buffer-temp*)
    (with-output-to-selected-buffer
      (dolist (x (reverse temp))
	(cond ((stringp x)
	       (format t "~A~%" x)
	      )
	      (t
	       (format t "~S~%" x)
	      )
	)
      )
      (format t "~S~%" (www-study-html (reverse temp)))
    )
  ))
