/*
 * Copyright (c) 2003 The Ochusha Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $Id: htmlutils.c,v 1.7 2003/11/26 04:13:51 fuyu Exp $
 */

#include "config.h"

#include "ochusha_private.h"

#include "htmlutils.h"
#include "utils.h"

#include <glib.h>

#include <ctype.h>
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/* Forward declarations of the file-local helpers defined further below. */
static const char *mempbrk(const char *string, const char *charset,
			   size_t len);
static const char *process_tag(ElementHandler *handler, void *context,
			       const char *head, int len);
static const char *process_entity_ref(ElementHandler *handler, void *context,
				      const char *head, int len);
#if ENABLE_FAKE_ANCHOR
static const char *fake_anchor_tag(ElementHandler *handler, void *context,
				   const char *head, int len);
#endif
static gboolean default_entity_ref_handler(ElementHandler *handler,
					   void *context, char *entity);
static gchar *encode_unicode_to_utf8(unsigned int character, gchar *buffer);


/*
 * Lookup tables built by initialize_default_entity_dictionary():
 * entity name -> UTF-8 string, and UTF-8 string -> entity name.
 * Both stay NULL until that function has been called.
 */
static GHashTable *default_entity_dictionary = NULL;
static GHashTable *reverse_entity_dictionary = NULL;

/*
 * Flat list of (entity name, UTF-8 encoded replacement) string pairs,
 * terminated by a single NULL.  initialize_default_entity_dictionary()
 * walks it two elements at a time, so entries must always be added in
 * pairs.  The bit patterns in some comments show how the code point is
 * split across the UTF-8 bytes.
 */
static gchar *default_entity[] =
  {
    /* <!ENTITY % HTMLlat1 PUBLIC "-//W3C//ENTITIES// Latin 1//EN//HTML"> */

    "nbsp", "\302\240",	/* U+00A0 */
    "iexcl", "\302\241",
    "cent", "\302\242",
    "pound", "\302\243",
    "curren", "\302\244",
    "yen", "\302\245",
    "brvbar", "\302\246",
    "sect", "\302\247",
    "uml", "\302\250",
    "copy", "\302\251",
    "ordf", "\302\252",
    "laquo", "\302\253",
    "not", "\302\254",
    "shy", "\302\255",
    "reg", "\302\256",
    "macr", "\302\257",
    "deg", "\302\260",
    "plusmn", "\302\261",
    "sup2", "\302\262",
    "sup3", "\302\263",
    "acute", "\302\264",
    "micro", "\302\265",
    "para", "\302\266",
    "middot", "\302\267",
    "cedil", "\302\270",
    "sup1", "\302\271",
    "ordm", "\302\272",
    "raquo", "\302\273",
    "frac14", "\302\274",
    "frac12", "\302\275",
    "frac34", "\302\276",
    "iquest", "\302\277",
    "Agrave", "\303\200",
    "Aacute", "\303\201",
    "Acirc", "\303\202",
    "Atilde", "\303\203",
    "Auml", "\303\204",
    "Aring", "\303\205",
    "AElig", "\303\206",
    "Ccedil", "\303\207",
    "Egrave", "\303\210",
    "Eacute", "\303\211",
    "Ecirc", "\303\212",
    "Euml", "\303\213",
    "Igrave", "\303\214",
    "Iacute", "\303\215",
    "Icirc", "\303\216",
    "Iuml", "\303\217",
    "ETH", "\303\220",
    "Ntilde", "\303\221",
    "Ograve", "\303\222",
    "Oacute", "\303\223",
    "Ocirc", "\303\224",
    "Otilde", "\303\225",
    "Ouml", "\303\226",
    "times", "\303\227",
    "Oslash", "\303\230",
    "Ugrave", "\303\231",
    "Uacute", "\303\232",
    "Ucirc", "\303\233",
    "Uuml", "\303\234",
    "Yacute", "\303\235",
    "THORN", "\303\236",
    "szlig", "\303\237",
    "agrave", "\303\240",
    "aacute", "\303\241",
    "acirc", "\303\242",
    "atilde", "\303\243",
    "auml", "\303\244",
    "aring", "\303\245",
    "aelig", "\303\246",
    "ccedil", "\303\247",
    "egrave", "\303\250",
    "eacute", "\303\251",
    "ecirc", "\303\252",
    "euml", "\303\253",
    "igrave", "\303\254",
    "iacute", "\303\255",
    "icirc", "\303\256",
    "iuml", "\303\257",
    "eth", "\303\260",
    "ntilde", "\303\261",
    "ograve", "\303\262",
    "oacute", "\303\263",
    "ocirc", "\303\264",
    "otilde", "\303\265",
    "ouml", "\303\266",
    "divide", "\303\267",
    "oslash", "\303\270",
    "ugrave", "\303\271",
    "uacute", "\303\272",
    "ucirc", "\303\273",
    "uuml", "\303\274",
    "yacute", "\303\275",
    "thorn", "\303\276",
    "yuml", "\303\277",


    /* <!ENTITY % HTMLsymbol PUBLIC "-//W3C//ENTITIES Symbols//EN//HTML"> */

    "fnof", "\306\222",	/* U+0192 000 110 - 010 010 */

    "Alpha", "\316\221",	/* U+0391 001 110 - 010 001 */
    "Beta", "\316\222",
    "Gamma", "\316\223",
    "Delta", "\316\224",
    "Epsilon", "\316\225",
    "Zeta", "\316\226",
    "Eta", "\316\227",
    "Theta", "\316\230",
    "Iota", "\316\231",
    "Kappa", "\316\232",
    "Lambda", "\316\233",
    "Mu", "\316\234",
    "Nu", "\316\235",
    "Xi", "\316\236",
    "Omicron", "\316\237",
    "Pi", "\316\240",
    "Rho", "\316\241",
    /* there is no Sigmaf, and no U+03A2 character either */
    "Sigma", "\316\243",
    "Tau", "\316\244",
    "Upsilon", "\316\245",
    "Phi", "\316\246",
    "Chi", "\316\247",
    "Psi", "\316\250",
    "Omega", "\316\251",

    "alpha", "\316\261",	/* U+03B1 001 110 - 110 001 */
    "beta", "\316\262",
    "gamma", "\316\263",
    "delta", "\316\264",
    "epsilon", "\316\265",
    "zeta", "\316\266",
    "eta", "\316\267",
    "theta", "\316\270",
    "iota", "\316\271",
    "kappa", "\316\272",
    "lambda", "\316\273",
    "mu", "\316\274",
    "nu", "\316\275",
    "xi", "\316\276",
    "omicron", "\316\277",
    "pi", "\317\200",
    "rho", "\317\201",
    "sigmaf", "\317\202",
    "sigma", "\317\203",
    "tau", "\317\204",
    "upsilon", "\317\205",
    "phi", "\317\206",
    "chi", "\317\207",
    "psi", "\317\210",
    "omega", "\317\211",

    "thetasym", "\317\221",	/* U+03D1 001 111 - 010 001 */
    "upsih", "\317\222",

    "piv", "\317\226",	/* U+03D6 001 111 - 010 110 */

    "bull", "\342\200\242",	/* U+2022 0 010 - 000 000 - 100 010 */

    "hellip", "\342\200\246", /* U+2026 */

    "prime", "\342\200\262",	/* U+2032 */
    "Prime", "\342\200\263",	/* U+2033 */

    "oline", "\342\200\276",	/* U+203E 0 010 - 000 000 - 111 110 */

    "frasl", "\342\201\204",	/* U+2044 0 010 - 000 001 - 000 100 */

    "image", "\342\204\221",	/* U+2111 0 010 - 000 100 - 010 001 */
    "weierp", "\342\204\230",	/* U+2118 */
    "real", "\342\204\234",	/* U+211C */
    "trade", "\342\204\242",	/* U+2122 */
    "alefsym", "\342\204\265",/* U+2135 */

    "larr", "\342\206\220",	/* U+2190 0 010 - 000 110 - 010 000 */
    "uarr", "\342\206\221",
    "rarr", "\342\206\222",
    "darr", "\342\206\223",
    "harr", "\342\206\224",

    "crarr", "\342\206\265",	/* U+21B5 0 010 - 000 110 - 110 101 */

    "lArr", "\342\207\220",	/* U+21D0 0 010 - 000 111 - 010 000 */
    "uArr", "\342\207\221",
    "rArr", "\342\207\222",
    "dArr", "\342\207\223",
    "hArr", "\342\207\224",

    "forall", "\342\210\200",	/* U+2200 0 010 - 001 000 - 000 000 */
    "part", "\342\210\202",	/* U+2202 */
    "exist", "\342\210\203",	/* U+2203 */
    "empty", "\342\210\205",	/* U+2205 */
    "nabla", "\342\210\207",	/* U+2207 */
    "isin", "\342\210\210",	/* U+2208 */
    "notin", "\342\210\211",	/* U+2209 */
    "ni", "\342\210\213",	/* U+220B */
    "prod", "\342\210\217",	/* U+220F */
    "sum", "\342\210\221",	/* U+2211 */
    "minus", "\342\210\222",	/* U+2212 */
    "lowast", "\342\210\227",	/* U+2217 */
    "radic", "\342\210\232",	/* U+221A */
    "prop", "\342\210\235",	/* U+221D */
    "infin", "\342\210\236",	/* U+221E */
    "ang", "\342\210\240",	/* U+2220 */
    "and", "\342\210\247",	/* U+2227 */
    "or", "\342\210\250",	/* U+2228 */
    "cap", "\342\210\251",	/* U+2229 */
    "cup", "\342\210\252",	/* U+222A */
    "int", "\342\210\253",	/* U+222B */

    "there4", "\342\210\264",	/* U+2234 */
    "sim", "\342\210\274",	/* U+223C */
    "cong", "\342\211\205",	/* U+2245 */
    "asymp", "\342\211\210",	/* U+2248 */
    "ne", "\342\211\240",	/* U+2260 */
    "equiv", "\342\211\241",	/* U+2261 */
    "le", "\342\211\244",	/* U+2264 */
    "ge", "\342\211\245",	/* U+2265 0 010 - 001 001 - 100 101 */
    "sub", "\342\212\202",	/* U+2282 0 010 - 001 010 - 000 010 */
    "sup", "\342\212\203",	/* U+2283 */
    "nsub", "\342\212\204",	/* U+2284 */
    "sube", "\342\212\206",	/* U+2286 */
    "supe", "\342\212\207",	/* U+2287 */
    "oplus", "\342\212\225",	/* U+2295 */
    "otimes", "\342\212\227",	/* U+2297 */
    "perp", "\342\212\245",	/* U+22A5 */
    "sdot", "\342\213\205",	/* U+22C5 0 010 - 001 011 - 000 101 */

    "lceil", "\342\214\210",	/* U+2308 0 010 - 001 100 - 001 000 */
    "rceil", "\342\214\211",	/* U+2309 */
    "lfloor", "\342\214\212",	/* U+230A */
    "rfloor", "\342\214\213",	/* U+230B */
    "lang", "\342\214\251",	/* U+2329 */
    "rang", "\342\214\252",	/* U+232A */

    "loz", "\342\227\212",	/* U+25CA 0 010 - 010 111 - 001 010 */

    "spades", "\342\231\240",	/* U+2660 0 010 - 011 001 - 100 000 */
    "clubs", "\342\231\243",	/* U+2663 */
    "hearts", "\342\231\245",	/* U+2665 0 010 - 011 001 - 100 101 */
    "diams", "\342\231\246",	/* U+2666 */


    /* <!ENTITY % HTMLspecial PUBLIC "-//W3C//ENTITIES Special//EN//HTML"> */

    "quot",	"\042",
    "amp", "&",
    "lt", "<",
    "gt", ">",

    "OElig", "\305\222",	/* U+0152 000 101 - 010 010 */
    "oelig", "\305\223",	/* U+0153 */
    "Scaron", "\305\240",	/* U+0160 */
    "scaron", "\305\241",	/* U+0161 */
    "Yuml", "\305\270",		/* U+0178 */

    "circ", "\313\206",		/* U+02C6 001 011 - 000 110 */
    "tilde", "\313\234",	/* U+02DC */

    "ensp", "\342\200\202",	/* U+2002 0 010 - 000 000 - 000 010 */
    "emsp", "\342\200\203",	/* U+2003 */
    "thinsp", "\342\200\211",	/* U+2009 */
    "zwnj", "\342\200\214",	/* U+200C */
    "zwj", "\342\200\215",	/* U+200D */
    "lrm", "\342\200\216",	/* U+200E */
    "rlm", "\342\200\217",	/* U+200F */
    "ndash", "\342\200\223",	/* U+2013 */
    "mdash", "\342\200\224",	/* U+2014 */
    "lsquo", "\342\200\230",	/* U+2018 */
    "rsquo", "\342\200\231",	/* U+2019 */
    "sbquo", "\342\200\232",	/* U+201A */
    "ldquo", "\342\200\234",	/* U+201C */
    "rdquo", "\342\200\235",	/* U+201D */
    "bdquo", "\342\200\236",	/* U+201E */
    "dagger", "\342\200\240",	/* U+2020 */
    "Dagger", "\342\200\241",	/* U+2021 */
    "permil", "\342\200\260",	/* U+2030 */
    "lsaquo", "\342\200\271",	/* U+2039 */
    "rsaquo", "\342\200\272",	/* U+203A */
    "euro", "\342\202\254",	/* U+20AC 0 010 - 000 010 - 101 100 */
    NULL
  };


/*
 * Populate the two entity lookup tables from default_entity[].
 *
 * The forward table maps an entity name to its UTF-8 text; the reverse
 * table maps the UTF-8 text back to the entity name.  The strings of the
 * static table are inserted directly (nothing is copied).  Calling this
 * function again once the forward table exists is a no-op.
 */
void
initialize_default_entity_dictionary()
{
  int i;

  if (default_entity_dictionary != NULL)
    return;	/* already built */

  default_entity_dictionary = g_hash_table_new(g_str_hash, g_str_equal);
  reverse_entity_dictionary = g_hash_table_new(g_str_hash, g_str_equal);

  /* The table is a NULL-terminated sequence of (name, utf8) pairs. */
  for (i = 0; default_entity[i] != NULL; i += 2)
    {
      gchar *entity_name = default_entity[i];
      gchar *utf8_text = default_entity[i + 1];

      g_hash_table_insert(default_entity_dictionary, entity_name, utf8_text);
      g_hash_table_insert(reverse_entity_dictionary, utf8_text, entity_name);
    }
}


/*
 * Return the entity-name -> UTF-8 table.  The result is NULL if
 * initialize_default_entity_dictionary() has not been called yet.
 */
GHashTable *
get_default_entity_dictionary()
{
  GHashTable *dictionary = default_entity_dictionary;

  return dictionary;
}


/*
 * Return the UTF-8 -> entity-name table.  The result is NULL if
 * initialize_default_entity_dictionary() has not been called yet.
 */
GHashTable *
get_reverse_entity_dictionary()
{
  GHashTable *dictionary = reverse_entity_dictionary;

  return dictionary;
}


#define UTF8_BUFFER_SIZE	4
/*
 * Default handling of an entity reference: translate it to UTF-8 text and
 * hand that text to handler->characters.
 *
 * entity is the reference without the leading '&' and without the trailing
 * ';' (e.g. "amp", "#65" or "#x6f22").  Numeric references are decoded with
 * encode_unicode_to_utf8(); named ones are looked up in
 * default_entity_dictionary.
 *
 * Returns TRUE when the reference was handled (or when there is no
 * characters handler, in which case nothing needs to be done), FALSE when
 * the reference could not be translated.
 */
static gboolean
default_entity_ref_handler(ElementHandler *handler, void *context,
			   char *entity)
{
  char default_buffer[UTF8_BUFFER_SIZE];
  char *buffer = NULL;

  if (handler->characters == NULL)
    return TRUE;

  if (entity[0] == '#' && entity[1] != '\0')
    {
      /* Numeric character reference: "#NNN" or "#xHHH". */
      const char *digits;
      int base;
      gboolean starts_with_digit;

      if (entity[1] == 'x' || entity[1] == 'X')
	{
	  digits = entity + 2;
	  base = 16;
	  starts_with_digit = isxdigit((unsigned char)*digits) != 0;
	}
      else
	{
	  digits = entity + 1;
	  base = 10;
	  starts_with_digit = isdigit((unsigned char)*digits) != 0;
	}

      /* Insist on a leading digit so that strtoul() cannot accept a sign
       * or leading white space. */
      if (starts_with_digit)
	{
	  unsigned long value;

	  errno = 0;
	  value = strtoul(digits, NULL, base);
	  /* Values above U+FFFD cannot be encoded by
	   * encode_unicode_to_utf8(); rejecting them here also avoids
	   * truncation when narrowing to unsigned int. */
	  if (errno == 0 && value <= 0xfffdUL)
	    buffer = encode_unicode_to_utf8((unsigned int)value,
					    default_buffer);
	}
    }
  else
    buffer = g_hash_table_lookup(default_entity_dictionary, entity);

  if (buffer == NULL)
    return FALSE;

#if DEBUG_PARSER_MOST
  fprintf(stderr, "entity: %s -> %s\n", entity, buffer);
#endif

  (*handler->characters)(context, buffer, strlen(buffer));

  return TRUE;
}


#define DEFAULT_BUFFER_SIZE	4096
/*
 * Minimal SAX-style HTML parser.
 *
 * Treats text as len bytes of HTML (len < 0 means "NUL-terminated") and
 * walks through it, dispatching on what it finds:
 *   - '<' ... is handed to process_tag(), which calls handler->startElement
 *     or handler->endElement when they are non-NULL;
 *   - '&' ... is handed to process_entity_ref(), which calls
 *     handler->entityReference or, when that is NULL, falls back to the
 *     built-in entity table and handler->characters;
 *   - any other run of bytes is passed to handler->characters (skipped when
 *     that callback is NULL).
 *
 * When handler->converter is non-NULL, plain text runs are converted with
 * iconv() before handler->characters is called; otherwise they are passed
 * through unchanged.
 *
 * Return value: when the tag or entity handler reports failure (for
 * example a '<' without a matching '>'), parsing stops and a pointer to
 * that '<' or '&' is returned.  In every other case, including an iconv
 * failure, NULL is returned.
 */
const char *
parse_text(ElementHandler *handler, void *context, const char *text, int len)
{
  gchar default_buffer[DEFAULT_BUFFER_SIZE];
  size_t buffer_size = DEFAULT_BUFFER_SIZE;
  gchar *buffer = default_buffer;	/* moves to the heap once it grows */

  const char *cur_pos = text;
  const char *tmp_pos = text;
  const char *result = NULL;
  int rest_of_text = len;

  if (len < 0)
    rest_of_text = strlen(text);

#if DEBUG_PARSER_MOST
  fprintf(stderr, "text: %s\n", text);
#endif

  while (rest_of_text > 0)
    {
      int chr_len;
      tmp_pos = mempbrk(cur_pos, "<&", rest_of_text);
      chr_len = (tmp_pos == NULL) ? rest_of_text : tmp_pos - cur_pos;

#if ENABLE_FAKE_ANCHOR
      if (chr_len > 0)
	{
	  const char *raw_url;
	  /*
	   * Look for "ttp://" in the plain text between cur_pos and
	   * tmp_pos.  If it is there, treat it as a URL and pretend that
	   * an <a> element surrounded it.
	   *
	   * To do so, tmp_pos is moved to the leading 't' (or 'h') of the
	   * URL.  The text before it is delivered as ordinary characters,
	   * and the dispatch below calls fake_anchor_tag() when *tmp_pos
	   * is 't' or 'h'.
	   *
	   * Note that the URL itself may extend beyond the original
	   * tmp_pos (for instance when it contains '&').
	   */
	  raw_url = g_strstr_len(cur_pos, chr_len, "ttp://");
	  if (raw_url != NULL)
	    {
	      if (raw_url > cur_pos && raw_url[-1] == 'h')
		raw_url--;	/* step back onto the 'h' of "http://" */

	      tmp_pos = raw_url;
	      chr_len = tmp_pos - cur_pos;
	    }
	}
#endif

      if (cur_pos != tmp_pos && handler->characters != NULL)
	{
	  if (handler->converter != NULL)
	    {
	      const char *inbuf = cur_pos;
	      gchar *outbuf = buffer;
	      size_t outbytesleft = buffer_size;
	      size_t inbytesleft = chr_len;
	      size_t size;

#if DEBUG_PARSER_MOST
	      fprintf(stderr, "Conversion start.\n");
#endif
	      /* Convert the chr_len bytes starting at cur_pos with iconv
	       * and pass the result to the characters handler. */
	      while (1)
		{
#if ENABLE_MS_IZONMOJI
		  /* Let the helper deal with sequences iconv rejects, then
		   * resume the conversion. */
		  do
		    {
		      size = iconv(handler->converter, &inbuf, &inbytesleft,
				   &outbuf, &outbytesleft);
		      if (size == (size_t)-1 && errno == EILSEQ
			  && handler->helper != NULL)
			size = (*handler->helper)(&inbuf, &inbytesleft,
						  &outbuf, &outbytesleft);
		      else
			break;
		    } while (size != (size_t)-1);

		  if (size != (size_t)-1)
		    break;	/* conversion finished normally */

		  if (errno == E2BIG)
		    {
		      size_t old_size = buffer_size;
		      /* allocate 2 times larger buffer in the heap. */
		      buffer_size *= 2;
		      if (buffer != default_buffer)
			buffer = (gchar *)G_REALLOC(buffer, buffer_size);
		      else
			{
			  gchar *old_buffer = buffer;
			  buffer = (gchar *)G_MALLOC(buffer_size);
			  if (buffer != NULL)
			    memcpy(buffer, old_buffer,
				   old_size - outbytesleft);
			}
		      /* The buffer may have moved: recompute the output
		       * cursor from the number of bytes already written. */
		      outbytesleft += old_size;
		      outbuf = buffer + buffer_size - outbytesleft;
		    }
		  else
		    {
#if DEBUG_ICONV
		      /*
		       * TODO: a more persistent conversion attempt could go
		       *       here; for now the chunk is simply abandoned.
		       *       (The original note suggests the converter is
		       *       opened with //IGNORE, so this path is not
		       *       expected to be reached.)
		       */
		      int i;
		      int count = (inbytesleft > 8) ? 8 : inbytesleft;
		      fprintf(stderr, "iconv failed: %s(%d).\n",
			      strerror(errno), errno);
		      fprintf(stderr, "failed sequence:");
		      for (i = 0; i < count; i++)
			fprintf(stderr, " 0x%02x\n", inbuf[i] & 0xff);
		      fprintf(stderr, "\n");
#endif
#if DEBUG_ICONV_MOST
		      abort();
#else
		      result = NULL;
		      goto done;
#endif
		    }
#else
		  size = iconv(handler->converter,
			       &inbuf, &inbytesleft, &outbuf, &outbytesleft);
		  if (size == (size_t)-1)
		    {
		      if (errno == E2BIG)
			{
			  size_t old_size = buffer_size;
			  /* allocate 2 times larger buffer in the heap. */
			  buffer_size *= 2;
			  if (buffer != default_buffer)
			    buffer = (gchar *)G_REALLOC(buffer, buffer_size);
			  else
			    {
			      gchar *old_buffer = buffer;
			      buffer = (gchar *)G_MALLOC(buffer_size);
			      if (buffer != NULL)
				memcpy(buffer, old_buffer,
				       old_size - outbytesleft);
			    }
			  /* The buffer may have moved: recompute the output
			   * cursor from the bytes already written. */
			  outbytesleft += old_size;
			  outbuf = buffer + buffer_size - outbytesleft;
			}
		      else
			{
#if DEBUG_ICONV
			  /*
			   * TODO: a more persistent conversion attempt could
			   *       go here; for now the chunk is simply
			   *       abandoned.  (The original note suggests the
			   *       converter is opened with //IGNORE, so this
			   *       path is not expected to be reached.)
			   */
			  int i;
			  int count = (inbytesleft > 8) ? 8 : inbytesleft;
			  fprintf(stderr, "iconv failed: %s(%d).\n",
				  strerror(errno), errno);
			  fprintf(stderr, "failed sequence:");
			  for (i = 0; i < count; i++)
			    fprintf(stderr, " 0x%02x\n", inbuf[i] & 0xff);
			  fprintf(stderr, "\n");
#endif
#if DEBUG_ICONV_MOST
			  abort();
#else
			  result = NULL;
			  goto done;
#endif
			}
		    }
		  else
		    break;
#endif	/* ENABLE_MS_IZONMOJI */
		}
	      (*handler->characters)(context, buffer,
				     buffer_size - outbytesleft);
	    }
	  else
	    {
	      (*handler->characters)(context, cur_pos, chr_len);
	    }
	}

      if (tmp_pos == NULL)
	break;	/* the whole text has been consumed; result stays NULL */

      rest_of_text -= chr_len;

#if ENABLE_FAKE_ANCHOR
      switch (*tmp_pos)
	{
	case '<':
	  cur_pos = process_tag(handler, context, tmp_pos, rest_of_text);
	  break;

	case '&':
	  cur_pos = process_entity_ref(handler, context, tmp_pos,
				       rest_of_text);
	  break;

	case 'h':
	case 't':
	  cur_pos = fake_anchor_tag(handler, context, tmp_pos, rest_of_text);
	  break;
	}
#else
      if (*tmp_pos == '<')
	cur_pos = process_tag(handler, context, tmp_pos, rest_of_text);
      else if (*tmp_pos == '&')
	cur_pos = process_entity_ref(handler, context, tmp_pos, rest_of_text);
#endif

      if (cur_pos == NULL)
	{
	  /* Incomplete tag or reference: report where parsing stopped. */
	  result = tmp_pos;
	  goto done;
	}

      rest_of_text -= (cur_pos - tmp_pos);
    }

 done:
  /* Release the conversion buffer if it had to be moved to the heap. */
  if (buffer != default_buffer)
    G_FREE(buffer);

  return result;
}


#define MAX_ATTRIBUTES	8
/*
 * Parse the tag that starts at head (which points at '<') and spans at
 * most len bytes, then call handler->startElement or handler->endElement
 * (when non-NULL) with the tag name and, for start tags, a NULL-terminated
 * array of alternating attribute names and values.  An attribute without a
 * value contributes a name followed by a NULL value, so consumers see the
 * array end there.  At most MAX_ATTRIBUTES attributes are recorded; any
 * further ones are parsed but dropped.
 *
 * Returns a pointer to the byte following the closing '>', or NULL when
 * the tag is not complete within len bytes.
 */
static const char *
process_tag(ElementHandler *handler, void *context, const char *head, int len)
{
  /* (*head == '<') */
  int n_attrs = 0;
  char *attrs[MAX_ATTRIBUTES * 2 + 1];
  char *name;
  const char *cur_pos;
  const char *tmp_pos;
  gboolean end_tag;
  gboolean buffer_full = FALSE;

  if (len < 1)
    return NULL;

  /* Step over the '<'. */
  cur_pos = ++head;
  len--;

  tmp_pos = mempbrk(cur_pos, " \t\n>", len);
  if (tmp_pos == NULL)
    return NULL;

  if (*cur_pos == '/')
    {
      cur_pos++;
      len--;
      end_tag = TRUE;
    }
  else
    end_tag = FALSE;

  name = G_STRNDUP(cur_pos, tmp_pos - cur_pos);
  len -= tmp_pos - cur_pos;	/* account for the tag name */
  cur_pos = tmp_pos;

  while (len > 0)
    {
      /* Skip white space between attributes. */
      while (*cur_pos == ' ' || *cur_pos == '\t' || *cur_pos == '\n')
	{
	  cur_pos++;
	  len--;
	  if (len == 0)
	    {
	      cur_pos = NULL;
	      goto finish;
	    }
	}

      if (*cur_pos == '>')
	break;

      tmp_pos = mempbrk(cur_pos + 1, "= \t\n>", len - 1);
      if (tmp_pos == NULL)
	{
	  cur_pos = NULL;
	  goto finish;
	}
      /* At this point cur_pos is at the start of an attribute name. */
      if (!buffer_full)
	{
	  attrs[n_attrs * 2] = G_STRNDUP(cur_pos, tmp_pos - cur_pos);
	  n_attrs++;
	}

      len -= tmp_pos - cur_pos;
      cur_pos = tmp_pos;
      if (*cur_pos == '=')
	{
	  char quot;

	  cur_pos++;
	  len--;
	  if (len == 0)
	    {
	      cur_pos = NULL;
	      if (!buffer_full)
		attrs[n_attrs * 2 - 1] = NULL;
	      goto finish;
	    }

	  /* Only inspect the byte after '=' once it is known to exist. */
	  quot = *cur_pos;
	  if (quot == '\'' || quot == '"')
	    {
	      /* Quoted value: runs up to the matching quote. */
	      tmp_pos = memchr(++cur_pos, quot, --len);
	      if (tmp_pos == NULL)
		{
		  cur_pos = NULL;
		  if (!buffer_full)
		    attrs[n_attrs * 2 - 1] = NULL;
		  goto finish;
		}
	      if (!buffer_full)
		attrs[n_attrs * 2 - 1] = G_STRNDUP(cur_pos, tmp_pos - cur_pos);
	      len -= (tmp_pos - cur_pos + 1);
	      cur_pos = tmp_pos + 1;
	    }
	  else
	    {
	      /* Unquoted value: runs up to white space or '>'. */
	      tmp_pos = mempbrk(cur_pos, " \t\n>", len);
	      if (tmp_pos == NULL)
		{
		  cur_pos = NULL;
		  if (!buffer_full)
		    attrs[n_attrs * 2 - 1] = NULL;
		  goto finish;
		}
	      if (!buffer_full)
		attrs[n_attrs * 2 - 1] = G_STRNDUP(cur_pos, tmp_pos - cur_pos);
	      len -= (tmp_pos - cur_pos);
	      cur_pos = tmp_pos;
	    }
	}
      else if (!buffer_full)
	attrs[n_attrs * 2 - 1] = NULL;	/* attribute without a value */

      if (n_attrs >= MAX_ATTRIBUTES)
	buffer_full = TRUE;
    }

  /* The loop may also end because the input ran out; in that case there
   * is no byte left to examine. */
  if (len <= 0 || *cur_pos != '>')
    {
      cur_pos = NULL;
      goto finish;
    }

  attrs[n_attrs * 2] = NULL;

  if (end_tag)
    {
      if (handler->endElement != NULL)
	(*handler->endElement)(context, name);
    }
  else
    {
      if (handler->startElement != NULL)
	(*handler->startElement)(context, name, (const char *const *)attrs);
    }

 finish:
  G_FREE(name);

  while (--n_attrs >= 0)
    {
      G_FREE(attrs[n_attrs * 2]);
      if (attrs[n_attrs * 2 + 1] != NULL)
	G_FREE(attrs[n_attrs * 2 + 1]);
    }

  return (cur_pos != NULL) ? cur_pos + 1 : NULL;
}


#define DEFAULT_ENTITY_SIZE	32
/*
 * Parse the entity or character reference that starts at head (which
 * points at '&') and spans at most len bytes.
 *
 * The reference body (without '&' and ';') is passed to
 * handler->entityReference when that callback is set.  Otherwise
 * default_entity_ref_handler() is tried, and if it cannot translate the
 * reference the original text ("&name" plus ';' if present) is passed to
 * handler->characters.
 *
 * Returns a pointer to the first byte after the reference (after the ';'
 * when there is one), or NULL when fewer than two bytes are available or
 * the input is exactly "&#".
 */
static const char *
process_entity_ref(ElementHandler *handler, void *context, const char *head,
		   int len)
{
  /* (*head == '&') */
  char default_buffer[DEFAULT_ENTITY_SIZE];
  char *entity;
  const char *ref_tail;
  const char *terminator;
  int size;	/* number of bytes of the reference body after '&' */

  /* Work out how long the reference body is. */
  if (len < 2)
    return NULL;	/* probably truncated */

  if (head[1] != '#')
    {
      /* character entity references */
      size = 0;
      while (size + 1 < len && isalnum((unsigned char)head[size + 1]))
	size++;
    }
  else if (len >= 4 && (head[2] == 'x' || head[2] == 'X'))
    {
      /* numeric character references (hexadecimal); "#x" counts as 2 */
      size = 2;
      while (size + 1 < len && isxdigit((unsigned char)head[size + 1]))
	size++;
    }
  else if (len >= 3)
    {
      /* numeric character references (decimal); "#" counts as 1 */
      size = 1;
      while (size + 1 < len && isdigit((unsigned char)head[size + 1]))
	size++;
    }
  else
    return NULL;	/* probably truncated */

  /* ref_tail is the first byte after the body; it may be one past the
   * end of the input, so only look at it when it is in range. */
  ref_tail = head + 1 + size;
  terminator = (size + 1 < len && *ref_tail == ';') ? ";" : "";

  if (size >= DEFAULT_ENTITY_SIZE)
    entity = G_STRNDUP(head + 1, size);
  else
    {
      entity = default_buffer;
      memcpy(default_buffer, head + 1, size);
      entity[size] = '\0';
    }

#if DEBUG_PARSER_MOST
  fprintf(stderr, "size = %d, entity = '%s'\n", size, entity);
#endif

  if (handler->entityReference != NULL)
    (*handler->entityReference)(context, entity);
  else
    if (!default_entity_ref_handler(handler, context, entity)
	&& handler->characters != NULL)
      {
	/* Unknown reference: emit it verbatim as ordinary text. */
	gchar *utf8_entity = NULL;
	gchar broken_ref_buffer[DEFAULT_BUFFER_SIZE];
	utf8_entity = convert_string(handler->converter, handler->helper,
				     entity, -1);
	if (utf8_entity != NULL)
	  snprintf(broken_ref_buffer, DEFAULT_BUFFER_SIZE, "&%s%s",
		   utf8_entity, terminator);
	else
	  snprintf(broken_ref_buffer, DEFAULT_BUFFER_SIZE, "&%s%s",
		   entity, terminator);
	(*handler->characters)(context, broken_ref_buffer,
			       strlen(broken_ref_buffer));
	if (utf8_entity != NULL)
	  G_FREE(utf8_entity);
#if DEBUG_PARSER_MOST
	fprintf(stderr, "broken entity ref: %s -> %s\n", entity, broken_ref_buffer);
#endif
      }

  if (entity != default_buffer)
    G_FREE(entity);

  return ref_tail + (*terminator != '\0' ? 1 : 0);
}


#if ENABLE_FAKE_ANCHOR
/*
 * Treat the text at head as a bare URL ("http://..." or "ttp://...") and
 * call the handlers as if it had been written as an <a> element whose href
 * attribute is that URL: startElement("a", {"href", url}), characters with
 * the original text, then endElement("a").  The href always starts with
 * "http://", whichever spelling the text used.
 *
 * Returns a pointer to the first byte after the URL text.  A URL too long
 * for the local buffer is not linked; in that case head + 2 is returned.
 */
static const char *
fake_anchor_tag(ElementHandler *handler, void *context, const char *head,
		int len)
{
  static const char *delimiter = " <>\"\'()\n\t";	/* good enough for now */
  const char *cur_pos = head;
  const char *tail;
  char *attrs[] =
    {
      "href",
      NULL,
      NULL
    };
  char url_buffer[PATH_MAX] = "http://";
  int tmp_len;
  int url_len;

  if (*head == 'h')
    {
      cur_pos += 7;	/* skip "http://" */
      len -= 7;
    }
  else
    {
      cur_pos += 6;	/* skip "ttp://" */
      len -= 6;
    }

#if ENABLE_SKIP_IME_NU
  /*
   * Drop any leading "ime.nu/" and "pinktower.com/" prefixes so that the
   * link points at what follows them.
   */
  while (TRUE)
    {
      if (len > 7 && g_ascii_strncasecmp(cur_pos, "ime.nu/", 7) == 0)
	{
	  cur_pos += 7;
	  len -= 7;
	  continue;
	}
      if (len > 14 && g_ascii_strncasecmp(cur_pos, "pinktower.com/", 14) == 0)
	{
	  cur_pos += 14;
	  len -= 14;
	  continue;
	}
      break;
    }
#endif

  tail = cur_pos;
  tmp_len = len;

  /* Find the end of the URL: a delimiter, a non-ASCII byte, or the end of
   * the input. */
  while (tmp_len > 0)
    {
      const char *c = delimiter;
      char s = *tail;
      if (s & 0x80)
	goto tail_found;	/* looks like a multi-byte character: stop here */

      while (*c)
	{
	  if (*c == s)
	    goto tail_found;
	  c++;
	}
      tail++;
      tmp_len--;
    }
 tail_found:

  url_len = tail - cur_pos;
  /* url_buffer has to hold "http://" (7 bytes), the URL and the NUL. */
  if (url_len >= PATH_MAX - 7)
    return head + 2;	/* too long: give up (2 bytes are discarded so that
			 * the caller cannot loop on the same position) */

  memcpy(url_buffer + 7, cur_pos, url_len);
  url_buffer[url_len + 7] = '\0';

#if DEBUG_PARSER_MOST
  fprintf(stderr, "URL: \"%s\"\n", url_buffer);
#endif
  attrs[1] = url_buffer;

  if (handler->startElement != NULL)
    (*handler->startElement)(context, "a", (const char *const *)attrs);

  if (handler->characters != NULL)
    (*handler->characters)(context, head, tail - head);

  if (handler->endElement != NULL)
    (*handler->endElement)(context, "a");

  return tail;
}
#endif


/*
 * Length-limited counterpart of strpbrk().
 *
 * Examines the first len bytes of string and returns a pointer to the
 * first one that also occurs in the NUL-terminated charset.  Returns NULL
 * when none of them does.  A NUL byte inside string never matches.
 */
static const char *
mempbrk(const char *string, const char *charset, size_t len)
{
  size_t offset;

  for (offset = 0; offset < len; offset++)
    {
      const char current = string[offset];
      const char *candidate;

      for (candidate = charset; *candidate != '\0'; candidate++)
	{
	  if (current == *candidate)
	    return string + offset;
	}
    }

  return NULL;
}


/*
 * Encode one Unicode code point as a NUL-terminated UTF-8 string.
 *
 * For example the two characters of the word "kanji" are written
 * &#x6f22;&#x5b57; in HTML.
 *
 * character: the code point to encode.
 * buffer:    output area; at most 4 bytes including the terminating NUL
 *            are written, so callers are expected to provide
 *            UTF8_BUFFER_SIZE bytes.
 *
 * Returns buffer on success.  Returns NULL, without touching buffer, for
 * code points above U+FFFD (which would need more room than the buffer
 * offers) and for the surrogate range U+D800..U+DFFF (which has no valid
 * UTF-8 representation).
 *
 * U+0000 is emitted as the two-byte sequence 0xC0 0x80 so that the result
 * remains a non-empty C string.
 */
static gchar *
encode_unicode_to_utf8(unsigned int character, gchar *buffer)
{
  if (character > 0xfffd)
    return NULL;	/* outside the range this encoder supports */

  if (character >= 0xd800 && character <= 0xdfff)
    return NULL;	/* surrogates are not characters */

  if (character >= 1 && character <= 0x7f)
    {
      /* one byte: 0xxxxxxx */
      buffer[0] = (char)character;
      buffer[1] = '\0';
      return buffer;
    }

  if (character <= 0x7ff)
    {
      /* two bytes: 110xxxxx 10xxxxxx (also used for U+0000, see above) */
      buffer[0] = (char)(0xc0 | ((character >> 6) & 0x1f));
      buffer[1] = (char)(0x80 | (character & 0x3f));
      buffer[2] = '\0';
      return buffer;
    }

  /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
  buffer[0] = (char)(0xe0 | ((character >> 12) & 0x0f));
  buffer[1] = (char)(0x80 | ((character >> 6) & 0x3f));
  buffer[2] = (char)(0x80 | (character & 0x3f));
  buffer[3] = '\0';
  return buffer;
}


/*
 * Accumulator used by append_characters() / simple_string_canon().
 *
 * MEMO: when buf_len > DEFAULT_BUFFER_SIZE the buffer is regarded as living
 * on the heap; append_characters() relies on buf_len == DEFAULT_BUFFER_SIZE
 * to recognise the caller-supplied initial buffer.
 */
typedef struct _CanonContext
{
  gchar *buffer;		/* NUL-terminated text collected so far */
  int buf_len;			/* capacity of buffer in bytes */
  int number_of_chars;		/* bytes stored, excluding the NUL */
  gboolean give_up;		/* when TRUE, append_characters() does nothing */
} CanonContext;


/*
 * characters callback that appends len bytes of ch to the CanonContext
 * passed as ctx, keeping the collected text NUL-terminated.
 *
 * The buffer is doubled until the new bytes plus the terminator fit.  The
 * first time it grows (capacity still DEFAULT_BUFFER_SIZE) a fresh block
 * is allocated and the old contents are copied into it; after that the
 * block is reallocated in place.  Nothing happens when give_up is set.
 */
static void
append_characters(void *ctx, const gchar *ch, int len)
{
  CanonContext *canon = (CanonContext *)ctx;
  const int required = len + 1;	/* payload plus the terminating NUL */

  if (canon->give_up)
    return;

  while (canon->buf_len - canon->number_of_chars < required)
    {
      gchar *grown;

      if (canon->buf_len != DEFAULT_BUFFER_SIZE)
	grown = (gchar *)G_REALLOC(canon->buffer, canon->buf_len * 2);
      else
	{
	  /* Still on the initial buffer: move to a newly allocated one. */
	  grown = (gchar *)G_MALLOC(canon->buf_len * 2);
	  memcpy(grown, canon->buffer, canon->number_of_chars);
	}

      canon->buffer = grown;
      canon->buf_len *= 2;
    }

  memcpy(canon->buffer + canon->number_of_chars, ch, len);
  canon->number_of_chars += len;
  canon->buffer[canon->number_of_chars] = '\0';
}


/*
 * Reduce an HTML fragment to plain text.
 *
 * src is run through parse_text() with a handler that only collects
 * character data: tags are dropped, entity references are replaced by
 * their text, and the rest is converted with converter/helper when a
 * converter is given.  A '<' or '&' at which parse_text() stops is kept
 * as a literal character and parsing resumes right after it.
 *
 * src:       text to process.
 * len:       number of bytes of src to use, or -1 if src is NUL-terminated.
 * converter: iconv descriptor passed on to parse_text().
 * helper:    conversion helper passed on to parse_text().
 *
 * Returns a newly allocated string that the caller owns.
 */
gchar *
simple_string_canon(const char *src, int len,
		    iconv_t converter, iconv_helper *helper)
{
  gchar buffer[DEFAULT_BUFFER_SIZE];
  const char *cur_pos;
  int chars_left = (len == -1) ? strlen(src) : len;

  ElementHandler handler =
    {
      converter,
      helper,
      NULL,
      NULL,
      NULL,
      append_characters
    };

  CanonContext context =
    {
      buffer,
      DEFAULT_BUFFER_SIZE,
      0,
      FALSE
    };

  buffer[0] = '\0';
  cur_pos = src;
  while (cur_pos != NULL && chars_left > 0)
    {
      const char *tmp_pos = parse_text(&handler, &context,
				       cur_pos, chars_left);
      if (tmp_pos == NULL)
	break;	/* everything was consumed */

      /* parse_text() stopped at tmp_pos; account for what it consumed. */
      chars_left -= tmp_pos - cur_pos;
      cur_pos = tmp_pos;

      /* Emit the offending character literally via its entity form. */
      if (*cur_pos == '&')
	parse_text(&handler, &context, "&amp;", 5);
      else if (*cur_pos == '<')
	parse_text(&handler, &context, "&lt;", 4);
      else
	break;	/* not expected to happen */

      /* Step over that character and keep the byte count in sync. */
      cur_pos++;
      chars_left--;
    }

  /* Text still in the stack buffer must be copied before returning it. */
  if (context.buffer == buffer)
    return G_STRDUP(context.buffer);

  return context.buffer;
}
