# prime/engine/engine-userdict.rb
# $Id: engine-userdict.rb,v 1.8.2.8 2004/01/15 02:38:36 komatsu Exp $
#
# Copyright (C) 2002 Hiroyuki Komatsu <komatsu@taiyaki.org>
#     All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
# You can redistribute it and/or modify it under the terms of 
# the GNU General Public License version 2.
#

require 'sary'
require 'prime/prime-dict-config.rb'
require 'prime/engine/engine-japanese'

# Base file name of the user dictionary.  PrimeEngineUserdict joins it with
# PRIME_USER_DIR and derives the related file names from it by appending
# suffixes ('-pos', '-co', '-ts', '_index_no').
USERDICT = 'userdict'
# Name of the engine class defined in this file.
# NOTE(review): presumably read by the code that loads this file in order to
# instantiate the engine -- confirm against the loader.
$engine_class_name = 'PrimeEngineUserdict'

# A Sary::Searcher extended with an in-memory overlay of entries that have
# been registered through set_dict.
#
# A search consults both the overlay and the superclass.  Afterwards
# get_next_context_line hands out the overlay hits first and then delegates
# to the superclass for its own hits.
class PrimeUserDict < Sary::Searcher
  # Overlay lines shared by every instance opened on the same file:
  # file name => Array of lines indexed by index_no.
  @@global_dict_data = {}

  # file::         path of the dictionary text file.
  # index_suffix:: inserted between the file name and ".ary" to build the
  #                name of the index file.
  #
  # Both files are passed through File::ensure with mode 0600 before the
  # superclass opens them.
  def initialize(file, index_suffix = "")
    index_file = file + index_suffix + ".ary"
    File::ensure(file,       0600)
    File::ensure(index_file, 0600)
    super(file, index_file)
    @tmp_dict = {}            # overlay: query key => Array of index_no
    @tmp_result = nil         # overlay hits of the most recent search
    @tmp_result_counter = 0   # next position to serve from @tmp_result
    @parent_result = nil      # result of the superclass search

    @dict_data = (@@global_dict_data[file] ||= [])
  end

  # Registers +line+ in the overlay under +index_no+.
  #
  # A String key is suffixed with a tab and every string yielded by
  # String#increase is registered as a lookup key (NOTE(review): presumably
  # the successive prefixes of the key -- confirm against String#increase).
  # Any other key is formatted as a decimal number followed by a tab.
  def set_dict(key, index_no, line)
    if key.is_a?(String)
      (key + "\t").increase { |chars| set_dict_internal(chars, index_no) }
    else
      set_dict_internal(format("%d\t", key), index_no)
    end
    @dict_data[index_no] = line
  end

  # Adds +index_no+ to the overlay list of +key+ unless it is already there.
  def set_dict_internal(key, index_no)
    entries = (@tmp_dict[key] ||= [])
    entries << index_no unless entries.member?(index_no)
  end

  # Searches the overlay and the superclass for +query+.
  # Returns the superclass result if it is truthy, otherwise the overlay
  # hits (nil when the overlay has no entry for +query+).
  def search(query)
    @tmp_result = @tmp_dict[query]
    @tmp_result_counter = 0
    @parent_result = super
    return (@parent_result or @tmp_result)
  end

  # Searches the overlay and the superclass for every query in +queries+.
  #
  # Overlay hits are merged without duplicates, so an entry matched by
  # several queries is served only once.  Returns the superclass result if
  # it is truthy, otherwise the overlay hits, or nil when nothing matched
  # (an empty Array would be truthy and look like a hit to callers).
  def multi_search(queries)
    @tmp_result = []
    @tmp_result_counter = 0
    queries.each { |query|
      hits = @tmp_dict[query]
      @tmp_result |= hits unless hits.nil?
    }
    @parent_result = super
    return @parent_result if @parent_result
    return (@tmp_result.empty? ? nil : @tmp_result)
  end

  # Returns the next matching line of the last search: overlay lines first,
  # then the lines of the superclass (only if its search succeeded).
  # Returns nil when both are exhausted.
  def get_next_context_line
    index_no = @tmp_result && @tmp_result[@tmp_result_counter]
    if index_no.nil?
      return (@parent_result ? super : nil)
    end

    @tmp_result_counter += 1
    return @dict_data[index_no]
  end

  # Delegates to the superclass only when its search produced a result;
  # overlay hits are left in insertion order.
  def sort_occurrences
    super if @parent_result
  end
end


# Engine that learns the words committed by the user and proposes them as
# candidates, ranked by how recently each one was used.
#
# Files used (all below PRIME_USER_DIR, names derived from USERDICT):
# * the dictionary, part-of-speech and co-occurrence files searched through
#   PrimeUserDict,
# * a timestamp file holding one packed 32-bit big-endian integer per
#   index number,
# * an index-number counter file,
# * three append-only buffer files ("userdict_diff", "userdict_diff-co",
#   "userdict_diff.log") that receive newly learned data.
class PrimeEngineUserdict < PrimeEngineJapanese
  def initialize
    super

    @name = "Userdict engine"
    @id   = "userdict"
    @description = "Userdict engine"

    @dict_file        = File::join2(PRIME_USER_DIR, USERDICT)
    @partdict_file    = @dict_file + '-pos'
    @contextdict_file = @dict_file + '-co'

    initialize_userdict()
    initialize_timestamp()
    initialize_priority()
    initialize_dictbuffer()

    @file_index_no = File::join2(PRIME_USER_DIR, USERDICT + '_index_no')
    File::ensure(@file_index_no, 0600)

    @index_no_alt_counter = 0 # This is used when $PRIME_NO_SAVE is true.
    @dict_size = @timestamps.length

    @max_candidates = 10
    @last_context_literal = ""
    @last_context_index   = nil
  end

  # Opens the searchers.  @dict, @dict_literal and @dict_index share the
  # same dictionary file and differ only in the index file they use.
  def initialize_userdict
    @dict         = PrimeUserDict.new(@dict_file, '_pron')
    @dict_literal = PrimeUserDict.new(@dict_file, '_literal')
    @dict_index   = PrimeUserDict.new(@dict_file, '_index')
    @partdict     = PrimeUserDict.new(@partdict_file)
    @contextdict  = PrimeUserDict.new(@contextdict_file, '_pron')
  end

  # Loads @timestamps from the timestamp file.
  #
  # The file is a sequence of 32-bit big-endian integers, so it is read in
  # binary mode and decoded byte-wise; trailing bytes that do not form a
  # complete integer are ignored.
  def initialize_timestamp
    @timestamps = []
    @filename_timestamps = File::join2(PRIME_USER_DIR, USERDICT + '-ts')
    File::ensure(@filename_timestamps)
    File::open(@filename_timestamps, "rb") { |io|
      @timestamps = io.read.to_s.unpack('N*')
    }
  end

  # Prepares the constants of the recency score: @step_number scales
  # get_priority_func so that an age of @longest_time (one week, in
  # seconds) consumes the whole @base_score.
  def initialize_priority
    @longest_time = 3600 * 24 * 7
    @base_score = 1000
    @step_number = @base_score / get_priority_func(@longest_time)
  end

  # Resets the in-memory buffers and opens the three buffer files.
  def initialize_dictbuffer
    @logbuffer = ""
    @dictbuffer_learning     = ""
    @dictbuffer_cooccurrence = ""

    filename = File::join2(PRIME_USER_DIR, "userdict_diff")
    @file_dictbuffer_learning     = open_dictbuffer(filename)
    @file_dictbuffer_cooccurrence = open_dictbuffer(filename + "-co")
    @file_logbuffer = open_dictbuffer(filename + ".log")
  end

  # Opens +filename+ for appending (creating it if necessary) and restricts
  # it to mode 0600 when the current user owns it.  Returns the IO object.
  def open_dictbuffer(filename)
    file_option = (File::CREAT|File::WRONLY|File::APPEND)
    file_io = Kernel::open(filename, file_option)
    file_io.chmod(0600) if file_io.stat.owned?
    return file_io
  end

  # ---------------------------------------- End of initialization.

  # Reopens the searchers and reloads the timestamps.  Always returns true.
  def refresh
    initialize_userdict()
    initialize_timestamp()
    return true
  end

  # Learns the word (+pron+, +literal+, +pos+) that followed +context+,
  # plus the optional +suffix+ and +rest+ strings, each chained to the
  # previously learned entry as its context.  Always returns true.
  #
  # NOTE(review): the three non-ASCII string literals below look like
  # mis-decoded part-of-speech names (the file was probably written in a
  # legacy Japanese encoding).  They are kept byte-for-byte -- confirm the
  # intended values and the source encoding.
  def learn_word(pron, literal, pos, context, suffix, rest)
    # This special part of speech only requests a flush of the buffers.
    if pos == "ü"
      dictbuffer_flush()
      return true
    end

    pos = pos.empty? ? "̤θ" : pos
    context_index = get_context_index(context)

    ## Flush dictbuffer if context is empty or is not last_context
    if context != @last_context_literal or context == ""
      dictbuffer_flush()
    end

    index_no = learn_word_internal(pron, literal, pos, context_index)

    if suffix.length > 0
      index_no = learn_word_internal(suffix, suffix, "::#{pos}", index_no)
    end
    if rest.length > 0
      index_no = learn_word_internal(rest, rest, "", index_no)
    end

    # NOTE(review): the combined word is learned only when BOTH suffix and
    # rest are non-empty -- confirm that `or` (rather than `and`) is meant.
    unless suffix.empty? or rest.empty?
      pron    += (suffix + rest)
      literal += (suffix + rest)
      index_no = learn_word_internal(pron, literal, "ʸ", context_index,
                                     :nologging)
    end

    @last_context_index   = index_no
    @last_context_literal = literal

    # write_log(...)
    return true
  end

  # Learns a single entry: reuses the index number of an existing identical
  # entry or registers a new one, records the co-occurrence with
  # +context_index+, updates the timestamp and (unless +logging+ differs
  # from :logging) appends a line to the log buffer.
  # Returns the index number of the entry.
  def learn_word_internal(pron, literal, pos, context_index,
                          logging = :logging)
    timestamp = Time.new().to_i()
    indexes = get_indexes(pron, pos, literal)
    if indexes.empty?
      index_no = learn_new_word(pron, literal, pos, timestamp)
    else
      index_no = indexes[0] # The number of indexes must be one.
    end
    learn_cooccurrence(pron, literal, index_no, context_index)

    set_timestamp(index_no, timestamp)

    if logging == :logging
      @logbuffer += [index_no, pron, pos, literal, timestamp].join("\t") + "\n"
    end

    return index_no
  end

  # Stores +timestamp+ for +index_no+ in memory and, unless $PRIME_NO_SAVE
  # is set, at byte offset index_no * 4 of the timestamp file.
  def set_timestamp(index_no, timestamp)
    @timestamps[index_no] = timestamp
    unless $PRIME_NO_SAVE
      File::open(@filename_timestamps, "r+b") { |io|
        io.seek(index_no * 4)
        io.write([timestamp].pack('N'))
      }
    end
  end

  # Returns a fresh index number.
  #
  # Normally the counter file is read and rewritten with the incremented
  # value.  With $PRIME_NO_SAVE the file is left untouched and an in-memory
  # counter is added to the stored value instead.
  def get_index_no
    index_no = nil
    if $PRIME_NO_SAVE
      File::open(@file_index_no, "r") { |io|
        index_no = io.read().to_i + @index_no_alt_counter
        @index_no_alt_counter += 1
      }
    else
      File::open(@file_index_no, "r+") { |io|
        index_no = io.read().to_i
        io.rewind()
        io.print((index_no + 1).to_s)
      }
    end
    return index_no
  end

  # Registers a new entry: appends its line to the learning buffer and makes
  # it searchable at once through the in-memory overlays of @dict,
  # @dict_literal and @dict_index.  Returns the new index number.
  def learn_new_word(pron, literal, pos, timestamp)
    index_no = get_index_no()

    new_line = format("%d\t%s\t%s\t%s\t%d\n",
                      index_no, pron, pos, literal, timestamp)
    @dictbuffer_learning += new_line

    pron_key = [pron, pos, literal].join("\t")
    @dict.set_dict(pron_key, index_no, new_line)
    @dict_literal.set_dict(literal, index_no, new_line)
    @dict_index.set_dict(index_no, index_no, new_line)

    @dict_size += 1
    return index_no
  end

  # Records that entry +index_no+ followed the entry +context_index+, both
  # in the co-occurrence buffer and in the overlay of @contextdict.
  # Does nothing when +context_index+ is nil.
  def learn_cooccurrence(pron, literal, index_no, context_index)
    return if context_index.nil?

    key = format("%d:%s\t", context_index, pron)
    new_line = format("%s%d:%s\t%d", key, context_index, literal, index_no)
    dictbuffer_line = [context_index, pron, literal, index_no].join("\t")
    @dictbuffer_cooccurrence += (dictbuffer_line + "\n")

    @contextdict.set_dict(key, index_no, new_line)
  end

  # Returns the index number of +context+: the cached index when +context+
  # is the literal learned last, otherwise the first field of the first
  # line found in @dict_literal (a String in that case), or nil when there
  # is no such line.
  def get_context_index(context)
    return @last_context_index if context == @last_context_literal

    @dict_literal.search(context)
    line = @dict_literal.get_next_context_line()
    return nil if line.nil? or line.empty?
    return line.chomp.split(/\t/)[0]
  end

  # Returns the index numbers (Integers) of all lines in @dict matching the
  # tab-joined query of +pron+, +pos+ and +literal+.
  def get_indexes(pron, pos = nil, literal = nil)
    query = [pron, pos, literal].join("\t")
    @dict.search(query)
    indexes = []
    while (result_line = @dict.get_next_context_line())
      indexes << result_line.chomp.split(/\t/)[0].to_i
    end
    return indexes
  end

  # ---------------------------------------- End of learning module

  # Appends the pending learning, co-occurrence and log data to their
  # files and empties the buffers.  Does nothing when $PRIME_NO_SAVE is set.
  def dictbuffer_flush
    return if $PRIME_NO_SAVE

    unless @dictbuffer_learning.empty?
      append_locked(@file_dictbuffer_learning, @dictbuffer_learning)
      @dictbuffer_learning = ""
    end
    unless @dictbuffer_cooccurrence.empty?
      append_locked(@file_dictbuffer_cooccurrence, @dictbuffer_cooccurrence)
      @dictbuffer_cooccurrence = ""
    end
    unless @logbuffer.empty?
      append_locked(@file_logbuffer, @logbuffer + "\n")
      @logbuffer = ""
    end
  end

  # Flushes the buffers and closes all three buffer files, including the
  # log file.  Always returns true.
  def close
    dictbuffer_flush()
    @file_dictbuffer_learning.close()
    @file_dictbuffer_cooccurrence.close()
    @file_logbuffer.close()
    return true
  end

  # ---------------------------------------- End of writing dictionaries.

  private

  # Writes +data+ to +io+ between a non-blocking exclusive lock request and
  # the matching unlock.  The data is flushed before the lock is released
  # so that it reaches the file while the lock is still held.  As before,
  # the write goes ahead even when the lock could not be obtained.
  def append_locked(io, data)
    io.flock(File::LOCK_EX|File::LOCK_NB)
    begin
      io.print(data)
      io.flush
    ensure
      io.flock(File::LOCK_UN|File::LOCK_NB)
    end
  end

  # Builds a PrimeCandidate from a tab-separated dictionary line.  The
  # priority is +base_priority+ plus the recency score of the entry, minus
  # 1000 for every character of +rest+.
  def build_candidate(base, result_line, rest, base_priority)
    index_no, pattern, part, word = result_line.split(/\t/)
    suffix    = get_suffix(base, pattern, rest)
    timestamp = lookup_timestamp(index_no.to_i)
    priority  = base_priority + get_priority(timestamp) - (rest.length * 1000)
    return PrimeCandidate.new(pattern, word, priority, part, suffix, rest)
  end

  # Collects candidates for +input+ into a PrimeResult.
  #
  # With method == :prefix the queries are extended by input.expands and at
  # most about @max_candidates candidates are gathered; otherwise only
  # input.original is looked up, with a budget of 100.  For every rest
  # string, entries that co-occurred with input.context come first (base
  # priority 18000), followed by the remaining matches (15000).  The search
  # stops as soon as the number of collected candidates reaches the budget.
  def lookup(input, method = :prefix)
    results = PrimeResult.new
    queries = make_queries(input.base)
    if method == :prefix
      input.expands.each { |string| queries.add(string) }
      rests = queries.rests
      limit = @max_candidates
    else
      queries.add(input.original, nil, "", "", false)
      rests = [""]
      limit = 100
    end

    rests.each { |rest|
      query_lines = queries.query_lines(rest)

      # The remaining budget is always derived from the fixed limit, so
      # candidates collected earlier are counted exactly once.
      (indexes, result_lines) =
        lookup_dict_with_context(query_lines, input.context,
                                 limit - results.length)
      result_lines.each { |result_line|
        results << build_candidate(input.base, result_line, rest, 18000)
      }
      (limit > results.length) or break

      result_lines = lookup_dict(query_lines, limit - results.length)
      result_lines.each { |result_line|
        index_no = result_line.split(/\t/)[0]
        # Skip entries already added with the context bonus.
        next if indexes.member?(index_no)
        results << build_candidate(input.base, result_line, rest, 15000)
      }
      (limit > results.length) or break
    }
    return results
  end

  # Looks up the entries matching +query_lines+ that were learned right
  # after +context+.
  #
  # Returns [indexes, results]: the index numbers (Strings) of the entries
  # found and their dictionary lines, in the same order.  Both are empty
  # when +context+ is nil or +max+ is 0; a nil +max+ means "no limit".
  def lookup_dict_with_context(query_lines, context, max = nil)
    ## FIXME: Refact the code.
    ## FIXME: <komatsu@taiyaki.org> (2003-12-03)
    results = []
    indexes = []
    return [indexes, results] if context.nil?

    # Prefix every query with the index number of each entry whose literal
    # is the context.
    query_lines_with_context = []
    @dict_literal.search(context + "\t")
    while (line = @dict_literal.get_next_context_line)
      context_index = line.chomp.split(/\t/)[0]
      query_lines.each { |query_line|
        query_lines_with_context << format("%d:%s", context_index, query_line)
      }
    end

    return [indexes, results] if max == 0

    if !(query_lines_with_context.empty?) and
        @contextdict.multi_search(query_lines_with_context)
      @contextdict.sort_occurrences()
      while (line = @contextdict.get_next_context_line)
        index_no = line.chomp.split(/\t/)[2]
        ## FIXME: Consider the frequency of each indexes
        ## FIXME: <komatsu@taiyaki.org> (2003-12-04)
        next if indexes.member?(index_no)

        @dict_index.search(index_no + "\t")
        entry = @dict_index.get_next_context_line
        # An index number without a dictionary line cannot become a
        # candidate; leave it for the plain dictionary lookup.
        next if entry.nil?

        results << entry
        indexes << index_no

        ## FIXME: The candidates are not ordered by their priorities.
        ## FIXME: <komatsu@taiyaki.org> (2003-12-03)
        # if results.length == max then
        #   break
        # end
      end
    end
    return [indexes, results]
  end

  # Returns the chomped lines of @dict matching any of +query_lines+.
  # Returns an empty Array when +max+ is 0; a nil +max+ means "no limit".
  def lookup_dict(query_lines, max = nil)
    results = []
    return results if max == 0

    if !(query_lines.empty?) and @dict.multi_search(query_lines)
      @dict.sort_occurrences()
      while (line = @dict.get_next_context_line)
        results << line.chomp
        ## FIXME: The candidates are not ordered by their priorities.
        ## FIXME: <komatsu@taiyaki.org> (2003-12-03)
        # if results.length == max then
        #   break
        # end
      end
    end
    return results
  end

  # Returns the stored timestamp of +index_no+ (nil if there is none).
  def lookup_timestamp(index_no)
    return @timestamps[index_no]
  end

  # Returns the fields that follow the first one on the first line of
  # @partdict matching +base+, or an empty Array when the search fails.
  def lookup_part(base)
    if @partdict.search(base + "\t")
      (_yomi, *parts) = @partdict.get_next_context_line.chomp.split(/\t/)
      return parts
    else
      return []
    end
  end

  # Returns the recency score of +timestamp+: @base_score for "just now",
  # decreasing logarithmically with age and reaching 0 at @longest_time.
  def get_priority(timestamp)
    current_time = Time.new().to_i
    time_diff = current_time - timestamp
    priority = (@base_score - get_priority_func(time_diff) * @step_number).to_i
    return priority
  end

  # Logarithmic ageing curve: log(number / @base_score + 1), or 0 for a
  # negative +number+ (a timestamp lying in the future).
  def get_priority_func(number)
    if number < 0
      return 0
    else
      return Math::log(number * (1.0 / @base_score) + 1)
    end
  end
end

