#!/usr/bin/perl
# mkdata.pl: training-data preparation script (raw audio, labels, F0)
#
#						sako shinji
#
# $Id: mkdata.pl,v 1.3 2006/10/20 01:15:10 sako Exp $
# 
use File::Basename;
use File::Copy;
use File::Path;

# Load the configuration file that lives next to this script.
$script_base = dirname($0);
require "${script_base}/config.pl";

# Input label directories inside the database/dataset tree.
$labeldir     = join( '/', $database, $dataset, 'label');
$monolabeldir = $labeldir . "/monophone";
$fulllabeldir = $labeldir . "/fullcontext";
$genlabeldir  = $labeldir . "/gen";

# Output locations below $datadir.
$out_rawdir     = $datadir . "/raw";
$out_labeldir   = $datadir . "/labels";
$out_seglabdir  = $out_labeldir . "/mono";
$out_fulllabdir = $out_labeldir . "/full";
$out_genlabdir  = $out_labeldir . "/gen";
$out_f0dir      = $datadir . "/f0";

# Seconds subtracted from each detected silence width before trimming, i.e.
# the amount of leading/trailing silence that is kept in the output.
$sil_threshold = 0.25;

# Create the working directory only when it does not exist yet.
mkdir( $workdir, 0755) unless -d $workdir;

# Create the remaining directories; failures (e.g. "already exists") are
# ignored.  $out_labeldir is listed before its sub-directories because mkdir
# does not create parents.
# NOTE(review): $out_data is not assigned in this file; presumably it comes
# from config.pl -- confirm.
foreach my $dir ($datadir, $traindir, $out_data, $out_rawdir, $out_labeldir,
                 $out_seglabdir, $out_fulllabdir, $out_genlabdir, $out_f0dir,
                 $tmpdir){
    mkdir $dir, 0755;
}
# List the recorded waveform files (*.ad) that will be processed.
@rawfiles = glob( "${recdir}/*.ad");

# Stop when no waveform file was found.  The former test "! $#rawfiles" was
# true only when exactly ONE file existed ($#rawfiles is the last index: -1
# for an empty list, 0 for a single element), so it exited in the wrong case.
if( ! @rawfiles) {
    print "ȷǡĤޤ\n";
    exit;
}

# Default F0 search range, kept whenever detection yields nothing usable.
$f0_lower = 50;
$f0_upper = 600;
print "estimate f0 range ... ";

# Run the external range detector; the result is read back from the file
# below.
system( "${detect_f0range} ${recdir} ${getf0_base} ${sptkbindir}");
$detect_f0 = "/tmp/.f0range_$ENV{USER}";
if( -e $detect_f0 ){
    # An unreadable or empty result file leaves the defaults in place
    # instead of producing undefined bounds.
    if( open( my $range_in, '<', $detect_f0)){
        my $line = <$range_in>;
        close( $range_in);

        my ($lower, $upper) = split( " ", defined $line ? $line : "");
        # A negative value means detection of that bound failed.
        $f0_lower = $lower if defined $lower && $lower >= 0;
        $f0_upper = $upper if defined $upper && $upper >= 0;
    }
    else{
        warn "cannot read $detect_f0: $!\n";
    }
}
print "[ $f0_lower , $f0_upper ]" . "\n";

# Pass the range on to the F0 extraction script, then discard the result file.
$getf0_option = "${getf0_option} -U $f0_upper -L $f0_lower ";
unlink( $detect_f0);

# For every recording: run the alignment script, trim the leading/trailing
# silence according to the resulting segment label, extract F0 from the
# trimmed waveform and copy the matching full-context label.
foreach my $rawfile (@rawfiles){
    my $base = basename($rawfile, ".ad");
    my $outbase = ${dataset} . '_' . ${speaker} . '_' . ${base};

    # Input and temporary files
    my $monolabel = "${monolabeldir}/${base}.lab";
    my $tmpraw = "${tmpdir}/${speaker}.raw";
    my $tmpseglabel = "${tmpdir}/${speaker}_${base}.lab";

    # Output files
    my $outraw = "${out_rawdir}/${outbase}.raw";
    my $f0data = "${out_f0dir}/${outbase}.f0";
    my $seglabel = "${out_seglabdir}/${outbase}.lab";
    my $fulllabel = "${out_fulllabdir}/${outbase}.lab";

    print "${base}: ";

    # Run the external alignment script; the code below expects it to leave
    # the segment label in $tmpseglabel.
    system( "${julian_script} ${rawfile} ${monolabel} ${tmpseglabel}");
    print ".";

    # A missing or empty label means alignment failed: skip this recording
    # (checked before the label is parsed, so a missing file no longer
    # aborts the whole run) and do not leave the temporary label behind.
    unless( -s $tmpseglabel){
        print " skip (饤ȼ)\n";
        unlink( $tmpseglabel);
        next;
    }

    # Widths (sec) of the leading and trailing silence
    my ($sil_begin_width, $sil_end_width) = detect_silence( $tmpseglabel);

    # Amount to cut at each end: everything except $sil_threshold seconds.
    # Negative amounts are clamped to zero by the callees.
    my $begin_cut = $sil_begin_width - $sil_threshold;
    my $end_cut = $sil_end_width - $sil_threshold;

    # Byte-swap the samples (big-endian -> little-endian)
    file_byteswap( $rawfile, $tmpraw);
    print ".";

    # Trim the silence from the waveform
    file_cut( $tmpraw, $begin_cut, $end_cut, $outraw);
    print ".";

    # Shift the segment label times to match the trimmed waveform
    modify_segment( $tmpseglabel, $begin_cut, $end_cut, $seglabel);
    print ".";

    # Extract F0 from the trimmed waveform
    system( "${getf0_script} ${getf0_option} -o ${f0data} ${outraw}") == 0
        or warn "F0 extraction failed for ${outraw} (status $?)\n";
    print ".";

    # Copy the full-context label
    copy( "${fulllabeldir}/${base}.lab", $fulllabel) or die "եǤޤ (${fulllabel})";
    print ".";

    # Remove the temporary files
    unlink( $tmpraw);
    unlink( $tmpseglabel);

    # End of the progress line
    print "\n";
}

# Copy every generation label (*.lab) into the output tree, renamed to
# <dataset>_<speaker>_<base>.lab.
@genlabfiles = glob( "${genlabeldir}/*.lab");
for my $labfile (@genlabfiles){
    my $base = basename( $labfile, ".lab");
    my $outbase = join( '_', $dataset, $speaker, $base);
    my $genlabel = $out_genlabdir . "/" . $outbase . ".lab";

    print "${base}:";
    copy( $labfile, $genlabel) or die "եǤޤ(${genlabel})";
    print ".\n";
}

# Run the configure script with the speaker/dataset passed through the
# environment and the tool search paths given as options.
my $configure_command = join( ' ',
    "SPEAKER=${speaker}",
    "DATASET=${dataset}",
    "./configure",
    "--with-sptk-search-path=${sptkbindir}",
    "--with-hts-search-path=${htsbindir}");
system( $configure_command);

# Byte-swap a file of 2-byte (short) samples.
#
# Arguments:
#   $_[0]  path of the input file
#   $_[1]  path of the output file (created or truncated)
#
# Every sample is decoded in the host's native byte order and re-encoded in
# big-endian ("n") order, exactly as the per-sample loop did before; on a
# little-endian host this swaps the two bytes of each sample.  The data is
# processed in large chunks instead of one read/write pair per sample.  A
# trailing odd byte is dropped because it is not a complete sample.
#
# Dies when a file cannot be opened or the output cannot be written.
sub file_byteswap{
    my ($in_file, $out_file) = @_;

    open( my $in, '<', $in_file) or die "file_byteswap: ե򳫤ޤ ($in_file)\n";
    open( my $out, '>', $out_file) or die "file_byteswap: ե򳫤ޤ ($out_file)\n";

    binmode( $in);
    binmode( $out);

    my $pending = '';
    while( 1){
        my $got = read( $in, my $chunk, 65536);
        die "file_byteswap: read error ($in_file): $!\n" unless defined $got;
        last if $got == 0;

        # Only convert whole samples; carry an odd byte over to the next chunk.
        $pending .= $chunk;
        my $usable = length( $pending) - (length( $pending) % 2);
        next if $usable == 0;

        my $samples = substr( $pending, 0, $usable, '');
        print {$out} pack( "n*", unpack( "S*", $samples))
            or die "file_byteswap: write error ($out_file): $!\n";
    }

    close( $out) or die "file_byteswap: write error ($out_file): $!\n";
    close( $in);
}

# Measure the leading and trailing silence of a segment label file.
#
# Arguments:
#   $_[0]  path of a label file with "start end phoneme" on each line
#
# Returns a two-element list (begin_width, end_width): the duration of the
# "sil" segment that starts at time 0 and of the last "sil" segment that
# starts later, each divided by 10,000,000 (label time units -> seconds).
# A width is 0 when no matching segment exists.
#
# Dies when the label file cannot be opened.
sub detect_silence{
    my ($seg_label) = @_;

    open( my $in, '<', $seg_label) or die "detect_silence: ե򳫤ޤ($seg_label)\n";

    # Both widths are lexical and reset on every call, so a value found in
    # one file can no longer leak into the result for the next file.
    my $sil_begin = 0;
    my $sil_end = 0;
    while( my $line = <$in>){
        # Split explicitly into named fields: split in void context has not
        # filled @_ since Perl 5.12.
        my ($start, $end, $phoneme) = split( ' ', $line);
        next unless defined $phoneme && $phoneme eq "sil";

        if( $start == 0){
            $sil_begin = $end - $start;
        }
        else{
            $sil_end = $end - $start;
        }
    }
    close( $in);

    # Convert from label time units to seconds
    return ($sil_begin/10000000, $sil_end/10000000);
}

# Rewrite a segment label so that its times match a waveform from which
# leading and trailing silence has been cut.
#
# Arguments:
#   $_[0]  path of the input label ("start end phoneme" per line)
#   $_[1]  seconds removed from the beginning (negative values count as 0)
#   $_[2]  seconds removed from the end (negative values count as 0)
#   $_[3]  path of the output label (created or truncated)
#
# Times are shifted as follows:
#   - a "sil" segment starting at 0 keeps its start; its end moves back by
#     the begin shift;
#   - any other "sil" segment has its start moved back by the begin shift and
#     its end by the begin shift plus the end shift;
#   - every other segment is moved back by the begin shift.
# Lines with fewer than three fields are skipped.
#
# Dies when either file cannot be opened or the output cannot be written.
sub modify_segment{
    my ($seg_label, $begin_shift, $end_shift, $mod_label) = @_;

    $begin_shift = 0 if $begin_shift < 0;
    $end_shift = 0 if $end_shift < 0;

    # Convert seconds to label time units (x 10,000,000), truncated to integers
    $begin_shift = int( $begin_shift * 10000000);
    $end_shift = int( $end_shift * 10000000);

    open( my $in, '<', $seg_label)
        or die "modify_segment: cannot open $seg_label: $!\n";
    open( my $out, '>', $mod_label)
        or die "modify_segment: cannot open $mod_label for writing: $!\n";

    while( my $line = <$in>){
        # Split explicitly into named fields: split in void context has not
        # filled @_ since Perl 5.12.
        my ($start, $end, $phoneme) = split( ' ', $line);
        next unless defined $phoneme;

        if( $phoneme eq "sil" && $start == 0){
            $end -= $begin_shift;
        }
        elsif( $phoneme eq "sil"){
            $start -= $begin_shift;
            $end -= ($begin_shift + $end_shift);
        }
        else{
            $start -= $begin_shift;
            $end -= $begin_shift;
        }

        printf {$out} "%d %d %s\n", $start, $end, $phoneme;
    }

    close( $out) or die "modify_segment: write error ($mod_label): $!\n";
    close( $in);
}

# Copy a file of 2-byte samples, dropping a span from the beginning and
# from the end.
#
# Arguments:
#   $_[0]  path of the input file
#   $_[1]  seconds to drop at the beginning (negative values count as 0)
#   $_[2]  seconds to drop at the end (negative values count as 0)
#   $_[3]  path of the output file (created or truncated)
#
# Seconds are converted to samples with the global $samplerate (not set in
# this file; presumably defined in config.pl) and truncated to whole samples.
# When the two cuts cover the whole file the output is empty.  An odd
# trailing byte in the input is not counted as a sample.
#
# Dies when a file cannot be opened or the output cannot be written.
sub file_cut{
    my ($in_file, $start_offset, $end_offset, $out_file) = @_;

    $start_offset = 0 if $start_offset < 0;
    $end_offset = 0 if $end_offset < 0;

    # Offsets in samples (lexical, so nothing leaks into package globals)
    my $start_frame = int( $samplerate * $start_offset);
    my $end_frame = int( $samplerate * $end_offset);

    open( my $in, '<', $in_file) or die "file_cut: ե뤬ޤ ($in_file)\n";
    binmode( $in);

    open( my $out, '>', $out_file) or die "file_cut: ե뤬ޤ ($out_file)\n";
    binmode( $out);

    # Number of samples in the input, from its size in bytes
    my $sample_num = int( (-s $in) / 2);

    # Move to the first sample that is kept
    seek( $in, $start_frame * 2, 0);

    # Copy only the kept region, in chunks rather than sample by sample
    my $remaining = ($sample_num - ($start_frame + $end_frame)) * 2;
    while( $remaining > 0){
        my $want = $remaining < 65536 ? $remaining : 65536;
        my $got = read( $in, my $chunk, $want);
        die "file_cut: read error ($in_file): $!\n" unless defined $got;
        last if $got == 0;

        print {$out} $chunk or die "file_cut: write error ($out_file): $!\n";
        $remaining -= $got;
    }

    close( $out) or die "file_cut: write error ($out_file): $!\n";
    close( $in);
}

# Extract a ZIP archive with the external "unzip" command.
#
# Arguments:
#   $_[0]  path of the ZIP file
#   $_[1]  directory to extract into
#
# Prints a message and exits when the ZIP file is missing or empty.
# Returns the value of system().
sub extract_zip{
    my ($zipfile, $outpath) = @_;

    unless( -s $zipfile){
        print "extract_zip: ե뤬Ĥޤ($zipfile)\n";
        exit;
    }

    # An Archive::Zip based variant was disabled in favour of the command:
    #   my $zip = Archive::Zip->new();
    #   $zip->read( $zipfile);
    #   $zip->extractTree( '', "${outpath}/");
    # -o overwrites existing files, -q is quiet, -d sets the target directory.
    my $command = "unzip -o -q -d ${outpath} ${zipfile}";
    system( $command);
}


# Rewrite the settings in the data-preparation Makefile.
#
# Arguments:
#   $_[0]  path of the Makefile to modify in place
#   $_[1]  value for SPTKDIR
#   $_[2]  value for SPEAKER
#   $_[3]  value for DATASET
#   $_[4]  value for MCEPORDER
#
# The original file is first copied to "<makefile>.org"; lines beginning with
# one of the four keys are then replaced by "KEY = value" and every other
# line is written back unchanged.
#
# Prints a message and exits when the Makefile is missing or empty.  Dies
# when the backup cannot be made or a file cannot be opened; the Makefile is
# only truncated after the backup exists and has been opened for reading.
sub modify_Makefile_data{
    my ($makefile, $sptkdir, $speaker, $dataset, $mceporder) = @_;

    if( ! -s $makefile ){
        print "modify_Makefile_data: ե뤬Ĥޤ($makefile)\n";
        exit;
    }

    # Back up the original before touching it
    my $makefile_org = "${makefile}.org";
    copy( $makefile, $makefile_org)
        or die "modify_Makefile_data: cannot back up $makefile to $makefile_org: $!\n";

    open( my $mkin, '<', $makefile_org) or die "modify_Makefile: ե뤬ޤ($makefile_org)\n";
    open( my $mkout, '>', $makefile) or die "modify_Makefile: ե뤬ޤ($makefile)\n";

    # A lexical line variable keeps the caller's $_ intact
    while( my $line = <$mkin>){
        $line =~ s/^SPTKDIR.*/SPTKDIR = ${sptkdir}/;
        $line =~ s/^SPEAKER.*/SPEAKER = ${speaker}/;
        $line =~ s/^DATASET.*/DATASET = ${dataset}/;
        $line =~ s/^MCEPORDER.*/MCEPORDER = ${mceporder}/;
        print {$mkout} $line;
    }
    close( $mkin);
    close( $mkout) or die "modify_Makefile_data: write error ($makefile): $!\n";
}

# Rewrite the settings in the model-training Makefile.
#
# Arguments:
#   $_[0]  path of the Makefile to modify in place
#   $_[1]  value for SPTKBINDIR
#   $_[2]  value for HTSBINDIR
#   $_[3]  value for NAME (the speaker)
#   $_[4]  value for DATASET
#   $_[5]  value for MCEPORDER
#
# DATADIR is taken from the global $datadir (not set in this file;
# presumably defined in config.pl).  The original file is first copied to
# "<makefile>.org".  Lines beginning with SPTKBINDIR or HTSBINDIR are
# replaced, and lines beginning with "#DATADIR", "#NAME", "#DATASET" or
# "#MCEPORDER" are replaced by the uncommented "KEY = value" form; every
# other line is written back unchanged.
#
# Prints a message and exits when the Makefile is missing or empty.  Dies
# when the backup cannot be made or a file cannot be opened; the Makefile is
# only truncated after the backup exists and has been opened for reading.
sub modify_Makefile_train{
    my ($makefile, $sptkdir, $htsdir, $speaker, $dataset, $mceporder) = @_;

    if( ! -s $makefile ){
        print "modify_Makefile_train: ե뤬Ĥޤ($makefile)";
        exit;
    }

    # Back up the original before touching it
    my $makefile_org = "${makefile}.org";
    copy( $makefile, $makefile_org)
        or die "modify_Makefile_train: cannot back up $makefile to $makefile_org: $!\n";

    open( my $mkin, '<', $makefile_org) or die "modify_Makefile: ե뤬ޤ($makefile_org)\n";
    open( my $mkout, '>', $makefile) or die "modify_Makefile: ե뤬ޤ($makefile)\n";

    # A lexical line variable keeps the caller's $_ intact
    while( my $line = <$mkin>){
        $line =~ s/^SPTKBINDIR.*/SPTKBINDIR = ${sptkdir}/;
        $line =~ s/^HTSBINDIR.*/HTSBINDIR = ${htsdir}/;
        $line =~ s/^#DATADIR.*/DATADIR = ${datadir}/;
        $line =~ s/^#NAME.*/NAME = ${speaker}/;
        $line =~ s/^#DATASET.*/DATASET = ${dataset}/;
        $line =~ s/^#MCEPORDER.*/MCEPORDER = ${mceporder}/;
        print {$mkout} $line;
    }
    close( $mkin);
    close( $mkout) or die "modify_Makefile_train: write error ($makefile): $!\n";
}
