<?php

require_once dirname(dirname(__FILE__)) . '/Enum.class.php';

class Xoonips_Search {

	/**
	 * n-gram window size used when building fulltext search data
	 * (passed to ngram() as a character count, so presumably an integer)
	 * @var int
	 * @access private
	 */
	private $WINDOW_SIZE = Xoonips_Enum::XOONIPS_WINDOW_SIZE;

	/**
	 * regex patterns, keyed by 'noprint', 'sbword', 'mbword' and 'casearc'
	 * @var array
	 * @access private
	 */
	private $patterns;

	/**
	 * constructor
	 *
	 * @access public
	 */
	public function __construct() {
		$this->initializePatterns();
	}

	/**
	 * legacy PHP4-style constructor name
	 *
	 * PHP 8 no longer treats a method named after the class as a
	 * constructor, so the real work lives in __construct(). This method is
	 * kept only for callers that invoke it explicitly.
	 *
	 * @access public
	 */
	public function Xoonips_Search() {
		$this->__construct();
	}

	/**
	 * get fulltext search sql by query string
	 *
	 * @access public
	 * @param string $field column name of table
	 * @param string $query search query
	 * @param string $encoding text encoding of search query
	 * @return string fulltext search sql
	 */
	public function getFulltextSearchSql($field, $query, $encoding) {
		// convert query encoding to 'UTF-8'
		if ($encoding != 'UTF-8') {
			$query = mb_convert_encoding($query, 'UTF-8', $encoding);
		}

		// set multi byte regex encoding
		mb_regex_encoding('UTF-8');

		// normalize string for fulltext search
		$query = $this->normalizeString($query);

		// create fulltext search part of SQL
		return $this->makeFulltextSearchSql($field, $query);
	}

	/**
	 * get search sql by query string
	 *
	 * The encoding of the query is auto-detected; the generated SQL is
	 * converted back to that encoding before it is returned.
	 *
	 * @access public
	 * @param string $field column name of table
	 * @param string $query search query
	 * @param object $dataType DataType class
	 * @param bool $isExact true:exact search
	 * @return string search sql
	 */
	public function getSearchSql($field, $query, $dataType, $isExact) {
		// detect query encoding
		$encoding = mb_detect_encoding($query);
		if ($encoding === false) {
			// detection failed: passing false on to mb_convert_encoding()
			// is an error, so fall back to the configured internal encoding
			$encoding = mb_internal_encoding();
		}

		// convert query encoding to 'UTF-8'
		if ($encoding != 'UTF-8') {
			$query = mb_convert_encoding($query, 'UTF-8', $encoding);
		}

		// set multi byte regex encoding
		mb_regex_encoding('UTF-8');

		// normalize string for search
		$query = $this->normalizeString($query);

		// create search part of SQL and hand it back in the caller's encoding
		$sql = $this->makeSearchSql($field, $query, $dataType, $isExact);
		return mb_convert_encoding($sql, $encoding, 'UTF-8');
	}

	/**
	 * get fulltext data for storing into database
	 *
	 * @access public
	 * @param string $text UTF-8 encoded text
	 * @return string UTF-8 encoded fulltext data
	 */
	public function getFulltextData($text) {
		// set multi byte regex encoding
		mb_regex_encoding('UTF-8');

		// normalize string for fulltext search
		$text = $this->normalizeString($text);

		// split text to search tokens
		$tokens = $this->splitIntoTokens($text);

		// get fulltext search data (not quoted, with trailing n-grams)
		return $this->makeFulltextSearchData($tokens, false);
	}

	/**
	 * normalize string for fulltext search
	 * @access private
	 * @param string $text 'UTF-8' encoded input text
	 * @return string normalized string
	 */
	private function normalizeString($text) {

		// convert html character entities to numeric entities
		$text = Xoonips_Utils::htmlNumericEntities($text);

		// convert html numeric entities in the range U+0000..U+FFFF to UTF-8 characters
		$text = mb_decode_numericentity($text, array(0x0, 0xffff, 0, 0xffff), 'UTF-8');

		// replace every run of non printable characters with a single space
		$pattern = sprintf('%s+', $this->patterns['noprint']);
		$text = mb_ereg_replace($pattern, ' ', $text);

		// normalize Japanese characters ('s': full-width space to half-width space)
		$text = mb_convert_kana($text, 's', 'UTF-8');

		// convert latin1 supplement characters (U+0080..U+00FF) to html numeric entities
		$text = mb_encode_numericentity($text, array(0x0080, 0x00ff, 0, 0xffff), 'UTF-8');

		// trim string
		return trim($text);
	}

	/**
	 * make fulltext search sql
	 * @access private
	 * @param string $field column name of table
	 * @param string $query search query
	 * @return string fulltext search sql
	 */
	private function makeFulltextSearchSql($field, $query) {
		$search_query = new Xoonips_Search_Query($query);

		if (!$search_query->parse()) {
			// unparsable query: condition that never matches
			return "( '1' = '0' )"; // TODO: Is this reasonable?
		}

		// renderFulltext() is the one-argument renderer of the query tree;
		// render() requires a DataType that is not available here
		return $search_query->stack->renderFulltext($field);
	}

	/**
	 * make search sql
	 *
	 * Declared without a visibility keyword originally, i.e. public; kept
	 * public so existing external callers continue to work.
	 *
	 * @access public
	 * @param string $field column name of table
	 * @param string $query search query
	 * @param object $dataType DataType class
	 * @param bool $isExact true:exact search
	 * @return string search sql
	 */
	public function makeSearchSql($field, $query, $dataType, $isExact) {
		if ($isExact) {
			$v = $dataType->convertSQLStr($query);
			// "t1".<field> : the dot between table alias and column is required
			return '("t1".' . $field . "='" . $v . "')";
		}

		$search_query = new Xoonips_Search_Query($query);

		if (!$search_query->parse()) {
			// unparsable query: condition that never matches
			return "( '1' = '0' )"; // TODO: Is this reasonable?
		}

		return $search_query->stack->render($field, $dataType);
	}

	/**
	 * split text into words separated by the space character (0x20)
	 *
	 * @access private
	 * @param string $text UTF-8 encoded text
	 * @return array array of words not including spaces
	 */
	private function splitTextBySpace($text) {
		return $this->collectMatches($text, '[^\\x20]+');
	}

	/**
	 * split text into tokens
	 *
	 * A token is a run matching either the single byte word pattern or the
	 * multi byte word pattern.
	 *
	 * @access private
	 * @param string $text UTF-8 encoded text
	 * @return array array of token
	 */
	private function splitIntoTokens($text) {
		$pattern = sprintf('%s|%s', $this->patterns['sbword'], $this->patterns['mbword']);
		return $this->collectMatches($text, $pattern);
	}

	/**
	 * collect every match of a multi byte regex in a text, in order
	 *
	 * @access private
	 * @param string $text UTF-8 encoded text
	 * @param string $pattern mb_ereg pattern
	 * @return array array of matched strings
	 */
	private function collectMatches($text, $pattern) {
		mb_ereg_search_init($text, $pattern);

		$matches = array();
		// mb_ereg_search_getpos() reports byte offsets, hence strlen()
		$len = strlen($text);
		for ($i = 0; $i < $len; $i = mb_ereg_search_getpos()) {
			mb_ereg_search_setpos($i);
			$regs = mb_ereg_search_regs();
			if ($regs === false) {
				break;
			}
			$matches[] = $regs[0];
		}

		return $matches;
	}

	/**
	 * make fulltext search data
	 *
	 * Multi byte tokens are expanded to n-grams and hex encoded; other
	 * tokens are emitted unchanged. All pieces are joined by single spaces.
	 *
	 * @access private
	 * @param array $tokens UTF-8 encoded fulltext search tokens
	 * @param bool $isSql if this flag is true enclose search data in double quotation
	 * @return string UTF-8 encoded fulltext search data
	 */
	private function makeFulltextSearchData($tokens, $isSql) {
		$ngram = array();
		foreach ($tokens as $token) {
			if (!$this->isMultibyteWord($token)) {
				$ngram[] = $token;
				continue;
			}
			// trailing n-grams are generated only for stored data, not for SQL
			foreach ($this->ngram($token, $this->WINDOW_SIZE, !$isSql) as $ngramtoken) {
				$ngram[] = bin2hex($ngramtoken);
			}
		}

		$data = implode(' ', $ngram);
		return $isSql ? '"' . $data . '"' : $data;
	}

	/**
	 * get array of N-gram applied string
	 *
	 * First every window of $n characters is emitted. With $trailing set,
	 * the tail starting at character max($len - $n + 1, 0) is appended
	 * repeatedly, one character shorter each time, down to one character.
	 * NOTE(review): all trailing pieces start at the same offset (prefixes
	 * of the tail, not suffixes) - behaviour kept as is; confirm intent.
	 *
	 * @access private
	 * @param string $text input string
	 * @param int $n window size
	 * @param bool $trailing flag for output trailing
	 * @return array array of N-gram applied string
	 */
	private function ngram($text, $n, $trailing) {
		$tokens = array();
		$len = mb_strlen($text, 'UTF-8');
		for ($i = 0; $i + $n <= $len; $i++) {
			$tokens[] = mb_substr($text, $i, $n, 'UTF-8');
		}
		if ($trailing) {
			$pos = ($len - $n + 1) > 0 ? ($len - $n + 1) : 0;
			for ($i = $pos; $i < $len; $i++) {
				$tokens[] = mb_substr($text, $pos, $len - $i, 'UTF-8');
			}
		}
		return $tokens;
	}

	/**
	 * return true if multibyte word
	 * @access private
	 * @param string $token 'UTF-8' encoded word
	 * @return bool true if the multi byte word pattern matches inside the token
	 */
	private function isMultibyteWord($token) {
		// mb_ereg() returns false when there is no match
		return mb_ereg($this->patterns['mbword'], $token) !== false;
	}

	/**
	 * initialize regex patterns
	 *
	 * The character ranges are produced by Xoonips_Utils::getCodeToLatin1()
	 * and Xoonips_Utils::getCodeToUtf8(), which are defined outside this
	 * file; presumably they return the UTF-8 text for the given code
	 * range / byte sequences.
	 *
	 * @access private
	 */
	private function initializePatterns() {
		// UTF-8 byte sequences of multi byte delimiters
		$mb_delimiter = array(
			array(0xe3, 0x80, 0x81),	// U+3001 ideographic comma
			array(0xe3, 0x80, 0x82),	// U+3002 ideographic full stop
			array(0xe2, 0x80, 0x99),	// U+2019 right single quotation mark
			array(0xe2, 0x80, 0x9d),	// U+201D right double quotation mark
			array(0xe3, 0x83, 0xbb),	// U+30FB centered dot
			array(0xe3, 0x80, 0x8a),	// U+300A bracket
			array(0xe3, 0x80, 0x8b),	// U+300B bracket
			array(0xe3, 0x80, 0x8c),	// U+300C bracket
			array(0xe3, 0x80, 0x8d),	// U+300D bracket
			array(0xe3, 0x80, 0x8e),	// U+300E bracket
			array(0xe3, 0x80, 0x8f),	// U+300F bracket
			array(0xe3, 0x80, 0x90),	// U+3010 bracket
			array(0xe3, 0x80, 0x91),	// U+3011 bracket
			array(0xe3, 0x80, 0x94),	// U+3014 bracket
			array(0xe3, 0x80, 0x95)		// U+3015 bracket
		);

		$patterns = array();

		// non printable characters
		$patterns['noprint'] = sprintf('[\\x00-\\x1f\\x7f%s]',
			Xoonips_Utils::getCodeToLatin1(0x80, 0x9f));
		// single byte word
		$patterns['sbword'] = sprintf('[0-9a-zA-Z\\x27%s%s%s]+',
			Xoonips_Utils::getCodeToLatin1(0xc0, 0xd6),
			Xoonips_Utils::getCodeToLatin1(0xd8, 0xf6),
			Xoonips_Utils::getCodeToLatin1(0xf8, 0xff));
		// multi byte word
		$patterns['mbword'] = sprintf('[^\\x00-\\x7f%s%s]+',
			Xoonips_Utils::getCodeToLatin1(0x80, 0xff),
			Xoonips_Utils::getCodeToUtf8($mb_delimiter));
		// double quoted or parenthesized span
		$patterns['casearc'] = '\\x22[^\\x22]+\\x22|\\x28[^\\x28\\x29]+\\x29';

		$this->patterns = $patterns;
	}

}

/**
 * Common base of the search query tree nodes.
 *
 * Both renderers are empty here (they return null); subclasses override
 * them to produce SQL fragments.
 */
class Xoonips_Search_Query_Element_Base {
	/**
	 * constructor (nothing to initialize)
	 */
	public function __construct() {}

	/**
	 * legacy PHP4-style constructor name, kept for explicit callers only;
	 * declared after __construct() so it is an ordinary method
	 */
	public function Xoonips_Search_Query_Element_Base() {}

	/**
	 * render this node as a search condition
	 * @param string $field column name of table
	 * @param object $dataType DataType class
	 */
	public function render($field, $dataType){}

	/**
	 * render this node as a fulltext search condition
	 * @param string $field column name of table
	 */
	public function renderFulltext($field){}
}

/**
 * Leaf (or wrapper) node of the search query tree.
 *
 * Holds either a search word (string) or, when $is_compo is true, another
 * node whose rendering is delegated to.
 */
class Xoonips_Search_Query_Element extends Xoonips_Search_Query_Element_Base {
	/** @var mixed search word, or a nested node when $is_compo is true */
	public $value;
	/** @var bool true if $value is a nested node */
	public $is_compo;

	/**
	 * constructor
	 * @param mixed $value search word or nested node
	 * @param bool $is_compo true if $value is a nested node
	 */
	public function __construct( $value, $is_compo=false )
	{
		$this->value = $value;
		$this->is_compo = $is_compo;
	}

	/**
	 * legacy PHP4-style constructor name
	 *
	 * PHP 8 does not call a method named after the class on "new", which
	 * left $value and $is_compo unset; kept for explicit callers only.
	 *
	 * @param mixed $value search word or nested node
	 * @param bool $is_compo true if $value is a nested node
	 */
	public function Xoonips_Search_Query_Element( $value, $is_compo=false )
	{
		$this->__construct( $value, $is_compo );
	}

	/**
	 * render this element as a condition on "t1".<field>
	 *
	 * LIKE search, numeric equality or string equality is chosen by asking
	 * the DataType, in that order.
	 *
	 * @param string $field column name of table
	 * @param object $dataType DataType class
	 * @return string sql condition
	 */
	public function render($field, $dataType) {
		if ( $this->is_compo ) {
			return $this->value->render($field, $dataType);
		}
		if ( $dataType->isLikeSearch() ) {
			return ' ("t1".' . $field . " like '%" . $dataType->convertSQLStrLike($this->value) . "%') ";
		}
		if ( $dataType->isNumericSearch() ) {
			return ' ("t1".' . $field . "='" . $dataType->convertSQLNum($this->value) . "') ";
		}
		return ' ("t1".' . $field . "='" . $dataType->convertSQLStr($this->value) . "') ";
	}

	/**
	 * render this element as a MATCH ... AGAINST fulltext condition
	 *
	 * The original body read an undefined $dataType variable, which is a
	 * fatal error. $dataType is now an optional argument: when supplied its
	 * convertSQLStr() escapes the word; otherwise addslashes() is used as a
	 * fallback.
	 * NOTE(review): addslashes() is not connection/charset aware - callers
	 * that have a DataType should pass it.
	 *
	 * @param string $field column name of table
	 * @param object $dataType optional DataType class used for escaping
	 * @return string sql condition
	 */
	public function renderFulltext($field, $dataType = null) {
		if ( $this->is_compo ) {
			// nested node: delegate to its fulltext renderer
			return $this->value->renderFulltext($field, $dataType);
		}
		if ( is_object( $dataType ) ) {
			$escaped = $dataType->convertSQLStr($this->value);
		} else {
			$escaped = addslashes((string) $this->value);
		}
		return ' MATCH ( ' . $field . " ) AGAINST ( '" . $escaped . "' IN BOOLEAN MODE) ";
	}
}

/**
 * Inner node of the search query tree: an ordered list of child nodes,
 * each paired with the operator ('AND' / 'OR') that joins it to the
 * preceding child. The operator stored for the first child is never used
 * when rendering.
 */
class Xoonips_Search_Query_Component extends Xoonips_Search_Query_Element_Base {
	/** @var array child nodes */
	public $elements = array();
	/** @var array operator for each child, same indexes as $elements */
	public $conditions = array();

	/**
	 * constructor
	 * @param object $ele optional first child node
	 * @param string $condition operator stored for the first child
	 */
	public function __construct( $ele=null, $condition='AND' ){
		if ( isset( $ele ) && is_object( $ele ) ) {
			$this->add( $ele, $condition );
		}
	}

	/**
	 * legacy PHP4-style constructor name
	 *
	 * PHP 8 does not call a method named after the class on "new"; kept
	 * for explicit callers only.
	 *
	 * @param object $ele optional first child node
	 * @param string $condition operator stored for the first child
	 */
	public function Xoonips_Search_Query_Component( $ele=null, $condition='AND' ){
		$this->__construct( $ele, $condition );
	}

	/**
	 * append a child node
	 *
	 * The parameter stays by-reference for signature compatibility, but the
	 * node is stored by value (object handle): storing a reference would
	 * let a later reassignment of the caller's variable silently replace
	 * the stored child.
	 *
	 * @param object $ele child node
	 * @param string $condition operator joining it to the previous child
	 */
	public function add( &$ele, $condition ) {
		$this->elements[] = $ele;
		$this->conditions[] = $condition;
	}

	/**
	 * render all children joined by their operators, wrapped in parentheses
	 * @param string $field column name of table
	 * @param object $dataType DataType class
	 * @return string sql condition, '' when there are no children
	 */
	public function render($field, $dataType) {
		$cnt = count( $this->elements );
		if ( $cnt == 0 ) {
			return '';
		}
		$ret = '( '.$this->elements[0]->render($field, $dataType);
		for ( $i = 1; $i < $cnt; $i++ ) {
			$ret .= ' '.$this->conditions[$i].' '.$this->elements[$i]->render($field, $dataType);
		}
		return $ret.' )';
	}

	/**
	 * render all children as fulltext conditions joined by their operators
	 *
	 * @param string $field column name of table
	 * @param object $dataType optional DataType class, handed on to the
	 *                         children (children declaring only $field
	 *                         simply ignore the extra argument)
	 * @return string sql condition, '' when there are no children
	 */
	public function renderFulltext($field, $dataType = null) {
		$cnt = count( $this->elements );
		if ( $cnt == 0 ) {
			return '';
		}
		$ret = '( '.$this->elements[0]->renderFulltext($field, $dataType);
		for ( $i = 1; $i < $cnt; $i++ ) {
			$ret .= ' '.$this->conditions[$i].' '.$this->elements[$i]->renderFulltext($field, $dataType);
		}
		return $ret.' )';
	}
}


/**
 * Lexer and parser for a search query string.
 *
 * lex() splits the query into words, quoted phrases, the operators
 * AND / OR and parentheses; parse() builds a tree of
 * Xoonips_Search_Query_Component / Xoonips_Search_Query_Element nodes
 * from those tokens and stores its root in $stack.
 */
class Xoonips_Search_Query {
	/** @var object root node of the parsed query, set by a successful parse() */
	public $stack;
	/** @var string query text being scanned */
	public $lex_str = '';
	/** @var int length of $lex_str in bytes */
	public $lex_strlen = 0;
	/** @var int byte offset where the next lex() call starts */
	public $lex_pos = 0;
	/** @var string kind of the last token: WORD, AND, OR, RIGHTBR, LEFTBR or EOF */
	public $lex_retmean = 'EOF';
	/** @var string text of the last token (null at EOF) */
	public $lex_retval = '';

	/**
	 * constructor
	 * @param string $text query text
	 */
	public function __construct( $text ){
		$this->lex_str = $text;
		$this->lex_strlen = strlen( $text );
	}

	/**
	 * legacy PHP4-style constructor name
	 *
	 * PHP 8 does not call a method named after the class on "new", which
	 * left the lexer with an empty input; kept for explicit callers only.
	 *
	 * @param string $text query text
	 */
	public function Xoonips_Search_Query( $text ){
		$this->__construct( $text );
	}

	/**
	 * parse the query into a tree
	 *
	 * $brstack is the stack of currently open components; index 1 is the
	 * root and each '(' pushes a new nested component. Words without an
	 * explicit operator are joined with 'AND'.
	 *
	 * NOTE(review): the error paths echo a message to the output stream;
	 * that behaviour is kept unchanged here.
	 *
	 * @return bool true on success ($stack holds the root), false on an
	 *              empty or malformed query
	 */
	public function parse()
	{
		$op = null;
		$brstack = array();
		$brstack_pos = 0;
		while ( $this->lex() ) {
			switch ( $this->lex_retmean ) {
				case 'WORD':
					$val = new Xoonips_Search_Query_Element( $this->lex_retval, false );
					if ( $brstack_pos == 0 ) {
						// first token: it becomes the first child of the root
						$brstack_pos++;
						$brstack[$brstack_pos] = new Xoonips_Search_Query_Component( $val, 'AND' );
					} else {
						if ( is_null( $op ) ) { $op = 'AND'; }
						$brstack[$brstack_pos]->add( $val, $op );
						$op = null;
					}
					// add() takes its element by reference; drop the local so
					// the next token cannot alias the one just stored
					unset( $val );
					break;
				case 'OR':
					$op = 'OR';
					break;
				case 'AND':
					$op = 'AND';
					break;
				case 'RIGHTBR':
					// '(' : open a nested component
					$tmp_stack = new Xoonips_Search_Query_Component();
					if ( $brstack_pos == 0 ) {
						// query starts with '(' : create an implicit root so
						// the matching ')' has a parent level to return to
						// (previously such queries always failed)
						$brstack_pos++;
						$brstack[$brstack_pos] = new Xoonips_Search_Query_Component();
					}
					if ( is_null( $op ) ) { $op = 'AND'; }
					$brstack[$brstack_pos]->add( $tmp_stack, $op );
					$brstack_pos++;
					$brstack[$brstack_pos] = $tmp_stack;
					unset( $tmp_stack );
					break;
				case 'LEFTBR':
					// ')' : close the current nested component
					if ( ! is_null( $op ) ) { echo 'error LEFTBR 1'; return false; }
					if ( $brstack_pos < 2 ) { echo 'error LEFTBR 2'; return false; }
					unset( $brstack[$brstack_pos] );
					$brstack_pos--;
					break;
				default:
					echo 'error illegal mean';
					return false;
			}
		}
		// exactly the root must remain: no tokens at all, or an unclosed
		// '(', is a failure
		if ( $brstack_pos != 1 )
			return false;
		$this->stack = $brstack[1];
		return true;
	}

	/**
	 * debugging aid: print the query and every remaining token
	 *
	 * Consumes the lexer input (advances $lex_pos to the end).
	 */
	public function dump()
	{
		echo "target : ".$this->lex_str."\n";
		while ( $this->lex() ) {
			echo "key : " .$this->lex_retmean.'['.$this->lex_retval.']'."\n";
		}
	}

	/**
	 * read the next token
	 *
	 * Scans byte-wise from $lex_pos and stores the result in $lex_retmean /
	 * $lex_retval. Note the naming used by this class: '(' is reported as
	 * RIGHTBR and ')' as LEFTBR. A double quoted phrase is reported as
	 * WORD; inside it \" and \\ stand for a literal quote / backslash and
	 * any other backslash sequence is kept as written.
	 *
	 * @return bool false when the end of the query has been reached
	 */
	public function lex() {
		$mean = 'EOF';
		$ret = null;
		$in_quote = false;
		$in_escape = false;
		$continue = true;
		$pop_require = false;
		for( $pos = $this->lex_pos; $continue && $pos < $this->lex_strlen; $pos++ ) {
			// [] offset: the {} string offset syntax is a parse error on PHP 8
			$c = $this->lex_str[$pos];
			if ( $in_quote ) {
				if ( $in_escape ) {
					if ( $c != '"' && $c != '\\' ) {
						// not a recognised escape: keep the backslash literally
						$ret .= '\\';
					}
					$ret .= $c;
					// always leave escape mode; otherwise the rest of the
					// phrase would be swallowed
					$in_escape = false;
				} else if ( $c == '"' ) {
					$in_quote = false;
				} else if ( $c == '\\' ) {
					$in_escape = true;
				} else {
					$ret .= $c;
				}
			} else if ( $c == ')' || $c == '(' ) {
				if ( $ret == null ) {
					$ret = $c;
					$mean = ( $c == ')' ) ? 'LEFTBR' : 'RIGHTBR';
				} else {
					// parenthesis ends the current token; re-read it next time
					$pop_require = true;
				}
				$continue = false;
			} else if ( $c == '"' ) {
				if ( $ret == null ) {
					$in_quote = true;
					$mean = 'PHRASE';
					$ret = '';
				} else {
					// quote ends the current token; re-read it next time
					$continue = false;
					$pop_require = true;
				}
			} else if ( $c == ' ' ) {
				// a space ends a token; leading spaces are skipped
				if ( $ret != null ) { $continue = false; }
			} else {
				if ( $ret == null ) { $ret = $c; } else { $ret .= $c; }
				$mean = 'WORD';
			}
		}
		// the loop already stepped past the delimiter; step back onto it
		if ( $pop_require ) $pos--;
		if ( $mean == 'WORD' ) {
			// unquoted AND / OR (any letter case) are operators
			$upper = strtoupper( $ret );
			if ( $upper == 'AND' || $upper == 'OR' ) {
				$mean = $upper;
			}
		} else if ( $mean == 'PHRASE' ) {
			$mean = 'WORD';
		}
		$this->lex_pos = $pos;
		$this->lex_retmean = $mean;
		$this->lex_retval = $ret;
		return ($mean != 'EOF');
	}
}
