# == Simple Crawler 
# :title: SimpleCrawler - a generic web crawler library in Ruby
# Author::    Peter Krantz (http://www.peterkrantz.com)
# License::   LGPL (See LICENSE file)
#
# The SimpleCrawler module is a library for crawling web sites. The crawler provides comprehensive data from the page crawled which can be used for page analysis, indexing, accessibility checks etc. Restrictions can be specified to limit crawling of binary files.
#
# == Output
# The SimpleCrawler::Crawler class yields a SimpleCrawler::Document instance for each URI it crawls. The document carries information about that URI, such as HTTP headers and response data.
#
# == Contributions
# None yet :-) Why don't you go ahead and be first?
#
# == Example usage
# See the "Simple Crawler wiki"[http://www.peterkrantz.com/simplecrawler/wiki/].

module SimpleCrawler

  require 'uri'
  require 'rubygems'
  require 'hpricot'
  require 'open-uri'
  require File.dirname(__FILE__) + '/document'

  # Content types regarded as markup/text. Crawler#get_doc reads the response
  # body for these types; other types are read only when
  # Crawler#load_binary_data is enabled.
  MARKUP_MIME_TYPES = %w[
    text/html text/xml text/plain
    application/xml application/xhtml+xml
  ]

  # Version string of this (modified) SimpleCrawler build.
  VERSION = "0.1.7 + modified by katoy"

  # Crawls a web site starting from the URL given to the constructor.
  # URIs are processed in first-in, first-out order and a Document is yielded
  # to the caller's block for every URI taken from the queue.
  #
  # Optional settings (all unset by default):
  # user_agent::       sent as the User-Agent request header when set.
  # load_binary_data:: when true, response bodies are read even if the content
  #                    type is not listed in MARKUP_MIME_TYPES (default false).
  # skip_patterns::    list of patterns; a URI whose path matches any is ignored.
  # include_patterns:: list of patterns; a URI's path must match at least one.
  # maxcount::         no more URIs are queued once current_count reaches it.
  # maxnest::          URIs found deeper than this link depth are ignored
  #                    (the start URL has depth 0).
  class Crawler

    attr_accessor :user_agent, :queue, :links, :site_uri, :current_count
    attr_accessor :load_binary_data, :skip_patterns, :include_patterns, :maxcount, :maxnest

    # Creates a crawler for the site identified by +url+ and queues that URL
    # as the first URI to fetch (depth 0).
    def initialize(url)
      @load_binary_data = false # default, skip loading of pagedata for binary files into Document.data
      @site_uri = URI.parse(url)
      @site_uri.path = "/" if @site_uri.path == ""
      @site_uri.fragment = nil
      @links = {}
      @queue = []
      @current_count = 0
      add_uri(@site_uri, 0)
    end

    # Override this method for your own logging needs.
    def log(message)
      puts message
    end

    # Hook called with the raw href of every link before it is parsed.
    # Override to rewrite hrefs; the default returns +href+ unchanged.
    def href_filter(href)
      href
    end

    # Hook consulted for absolute URIs whose host differs from the start
    # site's host. Override and return true to allow crawling such a URI;
    # the default rejects them all.
    def enable_uri?(uri)
      false
    end

    # Check if a URI should be ignored because a limit is reached, it is
    # off-site, it matches a skip pattern, or it is already known.
    #
    # uri::  a URI object.
    # nest:: link depth at which the URI was found.
    #
    # Returns true when the URI must not be queued, false otherwise.
    def skip_uri?(uri, nest = 0)
      # Limits on total queued URIs and on link depth.
      return true if @maxcount && @current_count >= @maxcount
      return true if @maxnest && nest > @maxnest

      # Check if the URI belongs to the site (or is explicitly enabled).
      unless uri.relative? || uri.host == @site_uri.host || enable_uri?(uri)
        return true
      end

      # Pure fragment identifier (e.g. #content). Path and fragment may both
      # be nil, so they are normalised to strings before being inspected.
      path = uri.path.to_s
      return true if path.empty? && !uri.fragment.to_s.empty?

      # Already visited in this crawl, or waiting in the queue. Queue entries
      # are hashes, so they are compared by their :uri value.
      key = uri.to_s
      return true if @links.has_key?(key)
      return true if @queue.any? { |entry| entry[:uri] == key }

      if @skip_patterns
        return true if @skip_patterns.any? { |pattern| Regexp.new(pattern) =~ path }
      end

      if @include_patterns
        return true unless @include_patterns.any? { |pattern| Regexp.new(pattern) =~ path }
      end

      false
    end

    # Queues +uri+ for crawling unless skip_uri? rejects it.
    #
    # uri::  a URI object, or a String which is stripped, parsed and has its
    #        fragment removed.
    # nest:: link depth to record for the URI.
    def add_uri(uri, nest = 0)
      if uri.is_a?(String)
        uri = URI.parse(uri.strip)
        uri.fragment = nil
      end

      return if skip_uri?(uri, nest)

      key = uri.to_s
      @queue << {:uri => key, :path => uri.path, :nest => nest}
      @current_count += 1
      @links[key] = {:visited => false, :nest => nest}
      log("   Added #{uri}")
    end

    # Fetches +path+ (resolved against the site URI) and returns a Document
    # describing the response.
    #
    # The body is stored in Document#data only for markup content types, or
    # for every type when load_binary_data is true; otherwise data is nil.
    # An error whose message starts with three digits (as HTTP errors from
    # open-uri do) is turned into a Document carrying only that status; any
    # other error is re-raised.
    def get_doc(path, nest)
      doc = Document.new
      uri = nil

      begin
        uri = @site_uri + path
        doc.uri = uri
        doc.fetched_at = Time.now
        doc.nest = nest

        log("Opening #{uri}")

        request_headers = {}
        request_headers["User-Agent"] = @user_agent if @user_agent

        # URI#open (from open-uri) is used instead of Kernel#open, which no
        # longer accepts URIs on Ruby 3.0 and later.
        uri.open(request_headers) do |f|
          if @load_binary_data || markup_content_type?(f.meta["content-type"])
            log("Loading data from #{uri}")
            doc.data = f.read
          else
            log("Skipping data for #{uri}")
            doc.data = nil
          end

          doc.headers = f.meta
          doc.http_status = f.status
          content_location = f.meta["content-location"]
          doc.location = content_location.nil? ? uri : uri + content_location
        end
      rescue => error
        log("Error fetching #{uri}: #{error.message}")
        raise unless error.message[0..2] =~ /\d\d\d/

        doc.http_status = [error.message[0..2], error.message[3..-1]]
        return doc
      end

      doc
    end

    # Parses the markup in +doc_src+ and queues every <a href> target, one
    # nesting level below the document itself. Links that cannot be parsed
    # or queued are logged and skipped.
    def queue_local_links(doc_src)
      return if doc_src.data.nil?
      log("Queuing links for #{doc_src.uri}")

      Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes...
      doc = Hpricot(doc_src.data)
      refs = doc.search("a[@href]")
      return if refs.nil?

      refs.each do |link|
        href = link.attributes["href"]
        next if href.to_s.empty?

        begin
          uri = URI.parse(href_filter(href))
          uri = doc_src.uri + uri unless doc_src.uri.nil?
          uri.fragment = nil
          add_uri(uri, doc_src.nest + 1)
        rescue => error
          # Skip this link. The message is interpolated because
          # String + Exception raises TypeError.
          log("queue_local_links: #{error.message}")
        end
      end

      nil
    end

    # Initiate crawling. Yields a Document for each queued URI, then queues
    # the links found in it. An error raised while handling one URI
    # (including errors from the caller's block) is logged and the crawl
    # continues with the next URI.
    def crawl
      until @queue.empty?
        begin
          path_info = @queue.shift
          current_doc = get_doc(path_info[:uri], path_info[:nest])

          yield current_doc

          queue_local_links(current_doc)
          @links[path_info[:uri]][:visited] = true
        rescue => error
          # Skip this URI. The message is interpolated because
          # String + Exception raises TypeError.
          log("crawl: #{error.message}")
        end
      end
    end

    private

    # True when +content_type+ (a raw Content-Type header value, possibly nil
    # or carrying parameters such as a charset) names a MARKUP_MIME_TYPES entry.
    def markup_content_type?(content_type)
      return false if content_type.nil?

      mime_type = content_type.split(";")[0].to_s.strip.downcase
      MARKUP_MIME_TYPES.include?(mime_type)
    end
  end

end
