
# http://www.sec.gov/info/edgar/ednews/xbrlrss.htm
#   SEC RSS Feed についての情報がある。
#   - http://www.sec.gov/Archives/edgar/xbrlrss.xml
#     An hourly update of the most recent 100 XBRL documents
#   - http://www.sec.gov/Archives/edgar/2009-03-23.rss.xml
#     All XBRL documents distributed from April 4, 2005 through March 23, 2009 inclusive.
#   - http://www.sec.gov/Archives/edgar/xbrldata.zip
#     and http://www.sec.gov/Archives/edgar/xbrldata.tar.gz (78MB).
#     compressed archives of XBRL documents through March 23, 2009 inclusive
# このプログラムでは、
#    検索画面での XBRL による検索結果全体から xbrl データリンクを抜き出し、ローカルファイルへ
#    保存している

require 'rubygems'
require 'benchmark'
require 'fileutils'
require 'pp'
require 'time'
require 'hpricot'

require File.dirname(__FILE__) + '/../lib/simplecrawler'
# require 'simplecrawler'

# Builds a crawler pointed at one page of the EDGAR search results for "XBRL".
#
# page - result page number; startDoc advances by 25 per page (page 1 starts
#        at document 1) and numResults is sent as page * 25.
#
# Returns the SimpleCrawler::Crawler instance with its per-instance overrides
# and settings applied.
def getSC(page)
  page_size = 25
  start_doc = (page - 1) * page_size + 1
  num_results = page * page_size
  query = "startDoc=#{start_doc}&queryString=XBRL&queryForm=1&isAdv=1&numResults=#{num_results}"
  crawler = SimpleCrawler::Crawler.new("http://searchwww.sec.gov/EDGARFSClient/jsp/EDGAR_Query_Result.jsp?#{query}")

  # Overrides defined on this one crawler instance only.
  class << crawler
    # Replaces logging with a no-op.
    def log(message); end

    # Result links are wrapped as javascript:opennew('<url>','..','..');
    # return the first quoted argument, or the href untouched otherwise.
    def href_filter(href)
      matched = /javascript:opennew\('(.*)','(.*)','(.*)'\);/.match(href)
      matched ? matched[1] : href
    end

    # Names the URIs that are not under SITE_DIR but should still be crawled.
    def enable_uri?(uri)
      uri.to_s.start_with?('http://www.sec.gov/Archives/edgar/data')
    end
  end

  # crawler.maxcount = 100
  crawler.maxnest = 1
  crawler.load_binary_data = false
  crawler
end

# Creates the directory +mpath+ together with any missing parent directories.
#
# mpath - '/'-separated directory path, relative or absolute. An empty string
#         is a no-op.
#
# Returns the '/'-separated components of +mpath+ (kept for compatibility with
# the previous hand-rolled implementation, which returned the same array).
# Raises a SystemCallError (e.g. Errno::EEXIST) when a component already
# exists but is not a directory; the earlier version silently skipped such
# components.
def multi_mkdir(mpath)
  # FileUtils.mkdir_p is idempotent and tolerates directories that appear
  # between the existence check and the mkdir call.
  FileUtils.mkdir_p(mpath) unless mpath.empty?
  mpath.split('/')
end

# Crawls the XBRL search result pages and saves every visited document whose
# URI ends in .xml, .xbrl or .xsd below SAVE_DIR, then prints the Benchmark
# timings for the whole run.
puts Benchmark.measure {

  # Running count of saved documents; numbers the progress output.
  num = 0

  SAVE_DIR = './data'
  first_page = 1
  last_page = 41 # 41
  data_prefix = 'http://www.sec.gov/Archives/edgar/data/'
  wanted_suffixes = %w[.xml .xbrl .xsd]
  save_root = File.expand_path(SAVE_DIR)

  first_page.upto(last_page) do |page|
    sc = getSC(page)
    # The crawler yields a Document object for each visited page.
    sc.crawl { |document|
      uri_text = document.uri.to_s
      next unless uri_text.downcase.end_with?(*wanted_suffixes)

      size = document.data.nil? ? -1 : document.data.size
      # Prefer the server-reported modification time; fall back to fetch time.
      time = if document.headers && document.headers["last-modified"]
               Time.parse(document.headers["last-modified"])
             else
               document.fetched_at
             end

      num += 1
      # The URI is passed as an argument rather than interpolated into the
      # format string: a percent-encoded URI (e.g. "%20") would otherwise be
      # parsed as a format directive.
      printf("%5d: %s\n", num, uri_text)
      puts "\tnest:#{document.nest} #{time}\tsize:#{size}"

      # NOTE(review): URI.decode was removed in Ruby 3.0; replace it if this
      # script is ever moved to a current Ruby.
      save_path = URI.decode("#{SAVE_DIR}/#{uri_text.sub(data_prefix, '')}")
      save_path += 'index.html' if save_path.end_with?('/')

      # The path comes from a remote URI; refuse anything that would resolve
      # outside SAVE_DIR after decoding (e.g. via "..").
      unless File.expand_path(save_path).start_with?("#{save_root}/")
        warn "\tskipped: #{save_path} resolves outside #{SAVE_DIR}"
        next
      end

      save_dir = File.dirname(save_path)
      multi_mkdir(save_dir) unless File.exist?(save_dir)
      File.open(save_path, "w") { |dest| dest.write(document.data) }
    }
  end
}
