#!/usr/bin/ruby
# -*- coding:utf-8 -*-

require 'fileutils'


# ==============================================================================
# check_filesize
# ==============================================================================

# Computes the target size, in bytes, of one output chunk and stores it in
# the global $filesize.
#
# Globals read:    $targetfiles - path(s)/pattern(s) passed to Dir.glob
# Globals written: $filesize    - combined byte size of all matched files,
#                                 integer-divided by 1,000,000
# Output: prints "filesize = N" to stdout.
check_filesize = Proc.new do
	wikifiles = Dir.glob($targetfiles)

	# File.size asks the file system for the byte count, so the inputs are
	# never loaded into memory and no file handle has to be opened or closed.
	total_bytes = wikifiles.inject(0) { |sum, path| sum + File.size(path) }

	# Size of a single file if the whole text were split into one million files.
	$filesize = total_bytes / 1_000_000
	puts "filesize = " + $filesize.to_s
end


# ==============================================================================
# split_files
# ==============================================================================

# Splits the text of the file named by $filename into chunk files of at least
# $filesize bytes each, written as "jawiki_splitted/NNN/NNNNNN.txt" relative
# to the current working directory.
#
# Globals read:    $filename - path of the input file
#                  $filesize - minimum byte size of a chunk before it is flushed
# Globals updated: $c        - running chunk number (initialised to 0 when nil,
#                              otherwise continued from the previous call)
#                  $dicname  - path of the most recently created chunk file
#
# Lines starting with '<doc id="' or '</doc>' and lines consisting only of
# "\n" are dropped; every other line is accumulated into the current chunk.
split_files = Proc.new do
	# File.readlines opens, reads and closes the input in one step.
	lines = File.readlines($filename)

	$c ||= 0

	# Creates the next numbered chunk file containing +text+ and advances $c.
	write_chunk = lambda do |text|
		# A new directory is started for every 1000 chunk files.
		dirname = "jawiki_splitted/" + sprintf("%03d", $c / 1000)
		FileUtils.mkdir_p(dirname)

		$dicname = dirname + "/" + sprintf("%06d", $c) + ".txt"
		File.open($dicname, "w") { |dicfile| dicfile.puts text }

		$c += 1
	end

	t = ""

	lines.each do |s|
		skipped = s.start_with?('<doc id="') || s.start_with?('</doc>') || s == "\n"
		t += s unless skipped

		# Flush once the buffer has reached the requested size. The emptiness
		# check stops a $filesize of 0 from producing a file that contains only
		# a newline for every skipped line.
		if !t.empty? && t.bytesize >= $filesize
			write_chunk.call(t)
			t = ""
		end
	end

	# Whatever is left at the end of the input is written even though it is
	# smaller than the requested size.
	unless t.empty?
		if $dicname
			# Append to the most recently created chunk file.
			File.open($dicname, "a") { |dicfile| dicfile.puts t }
		else
			# No chunk file exists yet (the input never reached $filesize), so
			# there is nothing to append to: start a new chunk instead.
			write_chunk.call(t)
		end
	end
end


# ==============================================================================
# main
# ==============================================================================

# Entry point: every command-line argument is treated as an input file.
$targetfiles = ARGV

# Without any argument there is nothing to do; show the usage line and stop.
if $targetfiles.empty?
	puts "Usage: ruby script.rb [FILE]"
	exit
end

# Determine the chunk size from all inputs first, then split them one by one.
check_filesize.call

$targetfiles.each do |path|
	$filename = path
	split_files.call
end
