#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2009 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Tandem.pm,v 1.1.1.1 2002/04/02 20:25:42 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::Tandem;

use SubOpt;
use G::Messenger;

use GD;

use strict;
use base qw(Exporter);
use SelfLoader;

our @EXPORT = qw(
		 find_tandem
		 foreach_tandem
		 graphical_LTR_search
);


__DATA__

#::::::::::::::::::::::::::::::
#        Perldoc
#::::::::::::::::::::::::::::::

=head1 NAME

  G::Seq::Tandem - Sequence analysis methods related to tandem repeats.

=head1 DESCRIPTION

           This class is a part of G-language Genome Analysis Environment,
           collecting basic sequence analysis methods.

=cut

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::


#find_tandem ver.20010705-01
#scripting by Koya Mori(s98982km@sfc.keio.ac.jp)
#This program finds tandem repeats from DNA sequence.
#(array Repeat)=&find_tandem(pointer GENOME,  int pattern_length,  boolean debug);
sub find_tandem{
    &opt_default(PatLength=>3,output=>"stdout",filename=>"tandem.csv");
    my @args=opt_get(@_);
    
#    my $gb=opt_as_gb(shift @args);
    my $gb=shift @args;
    my $PatLen=opt_val("PatLength");
    my $filename=opt_val("filename");
    my $output=opt_val("output");
    my $SaikiTrup=shift @args;
    my $ref_Genome;
    my $CountArrayLength;
    my $RepeatNumber;
    my $NextPattern;
    my $BeforePattern;
    my $RepeatEndPos;
    my $Half;
    my $pattern;
    my %PatRst;
    my @Repeat;
    my $i;
    

    if($SaikiTrup==''){
	$ref_Genome=\$gb->{SEQ};
	$Half=int($PatLen/2);
    }
    else{
	$ref_Genome=$gb;
	$Half=$PatLen-1;
    }


    if($SaikiTrup==1 && $PatLen<1){
	return 0;
    } 
    else{

	for($CountArrayLength=0;$CountArrayLength<=length($$ref_Genome)-$PatLen;$CountArrayLength++){
	    $RepeatNumber=1;
	    $pattern=substr($$ref_Genome,$CountArrayLength,$PatLen);
	    $NextPattern=substr($$ref_Genome,$CountArrayLength+$PatLen,$PatLen);

	    if($pattern eq $NextPattern || $SaikiTrup==1){
		if($SaikiTrup==''){
		    if($pattern=~/n/){
			$PatRst{$pattern}=1;
		    }
		    elsif($PatRst{$pattern}==''){
			$PatRst{$pattern}=find_tandem(\$pattern,-PatLength=>$Half,1);
		    } 
		}
		else{
		    $PatRst{$pattern}=find_tandem($ref_Genome,-PatLength=>$Half,1);
		} 

		if($PatRst{$pattern}==1 && $SaikiTrup==1){
		    last;
		}
		if($PatRst{$pattern}==0){
		    if($CountArrayLength-$PatLen>=0){
			$BeforePattern=substr($$ref_Genome,$CountArrayLength-$PatLen,$PatLen);
		    }
		    if($pattern ne $BeforePattern){
			while($pattern eq $NextPattern){
			    $RepeatNumber++;
			    $NextPattern=substr($$ref_Genome,$CountArrayLength+$PatLen*$RepeatNumber,$PatLen);
			}
		    }
		    if($RepeatNumber!=1){
			$RepeatEndPos=$CountArrayLength+$PatLen*$RepeatNumber;
			if($SaikiTrup==''){
			    $Repeat[$i]{pattern}=$pattern;
			    $Repeat[$i]{patlength}=$PatLen;
			    $Repeat[$i]{repnumber}=$RepeatNumber;
			    $Repeat[$i]{startpos}=$CountArrayLength+1;
			    $Repeat[$i]{endpos}=$RepeatEndPos;
			    $i++;
			}
			$CountArrayLength=$CountArrayLength+$PatLen*$RepeatNumber-1;
		    }
		}
	    }
	}

	if($SaikiTrup==''){
	    if($output eq "f"){
		_print_tandem(\@Repeat,-print=>"f",-filename=>$filename);
	    }
	    if($output eq "stdout"){
		_print_tandem(\@Repeat);
	    }
	    return \@Repeat;
	}
	elsif($RepeatNumber*$PatLen==length($$ref_Genome) || $PatRst{$pattern}==1){
	    return 1;
	}
	elsif($RepeatNumber*$PatLen!=length($$ref_Genome)){ 
	    return 0;
	}
    }
}


#foreach_tandem ver.20010629-01
#scripting by Koya Mori(mory@g-language.org)
#This program finds each length pattern repeat from sequens.
#(array Repeat)=&foreach_tandem(pointer GENOME,  int max_length,  int min_length,  boolean debug);
sub foreach_tandem{
    &opt_default(MaxLength=>3,MinLength=>2,output=>"stdout",filename=>"tandem.csv");    
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $MaxLen=opt_val("MaxLength");
    my $MinLen=opt_val("MinLength");
    my $output=opt_val("output");
    my $filename=opt_val("filename");
    my $d;
    my $c;
    my $q;
    my $Repeat;
    my @Result;
    my @Result_tmp;
    my $start;
    my $end;
    my $i;
    

    for($d=$MaxLen;$d>$MinLen-1;$d--){
	$Repeat=find_tandem($gb,-PatLength=>$d,-output=>"n");
	$q=0;
	$c=0;
	while(defined($$Repeat[$c])){
	    $start=$$Repeat[$c]{startpos};
	    $end=$$Repeat[$c]{endpos};
	    while(defined($Result[$q])){
		if($Result[$q]{startpos}<=$start && $Result[$q]{endpos}>=$end){
		    $$Repeat[$c]{pattern}='0';
		    last;
		}
		elsif($Result[$q]{startpos}<=$start && $Result[$q]{endpos}>=$start && $Result[$q+1]{startpos}<=$end && $Result[$q+1]{endpos}>=$end){
		    $i=0;
		    while($i*$$Repeat[$c]{patlength}+$start<$Result[$q]{endpos}){
			$i++;
		    }
		    if($$Repeat[$c]{endpos}-$$Repeat[$c]{startpos}-$i*$$Repeat[$c]{patlength}>$$Repeat[$c]{patlength}){
			$$Repeat[$c]{repnumber}=$$Repeat[$c]{repnumber}-$i;
			$$Repeat[$c]{startpos}=$$Repeat[$c]{startpos}+$i*$$Repeat[$c]{patlength};
		    }
		    else{
			$$Repeat[$c]{pattern}='0';
		    }
		    $i=0;
		    while($end-$i*$$Repeat[$c]{patlength}>$Result[$q+1]{startpos}){
			$i++;
		    }
		    if($$Repeat[$c]{endpos}-$$Repeat[$c]{startpos}-$i*$$Repeat[$c]{patlength}>$$Repeat[$c]{patlength}){
			$$Repeat[$c]{repnumber}=$$Repeat[$c]{repnumber}-$i;
			$$Repeat[$c]{endpos}=$$Repeat[$c]{endpos}-$i*$$Repeat[$c]{patlength};
		    }
		    else{
			$$Repeat[$c]{pattern}='0';
		    }
		    last;
		}
		elsif($Result[$q]{startpos}<=$start && $Result[$q]{endpos}>=$start && $Result[$q]{endpos}<$end){
		    $i=0;
		    while($i*$$Repeat[$c]{patlength}+$start<$Result[$q]{endpos}){
			$i++;
		    }
		    if($$Repeat[$c]{endpos}-$$Repeat[$c]{startpos}-$i*$$Repeat[$c]{patlength}>$$Repeat[$c]{patlength}){
			$$Repeat[$c]{repnumber}=$$Repeat[$c]{repnumber}-$i;
			$$Repeat[$c]{startpos}=$$Repeat[$c]{startpos}+$i*$$Repeat[$c]{patlength};
		    }
		    else{
			$$Repeat[$c]{pattern}='0';
		    }
		    last;
		}
		elsif($Result[$q]{startpos}<=$end && $Result[$q]{endpos}>=$end && $Result[$q]{startpos}>$start){
		    $i=0;
		    while($end-$i*$$Repeat[$c]{patlength}>$Result[$q]{startpos}){
			$i++;
		    }
		    if($$Repeat[$c]{endpos}-$$Repeat[$c]{startpos}-$i*$$Repeat[$c]{patlength}>$$Repeat[$c]{patlength}){
			$$Repeat[$c]{repnumber}=$$Repeat[$c]{repnumber}-$i;
			$$Repeat[$c]{endpos}=$$Repeat[$c]{endpos}-$i*$$Repeat[$c]{patlength};
		    }
		    else{
			$$Repeat[$c]{pattern}='0';
		    }
		    last;
		}
		elsif($Result[$q]{startpos}>$end){
		    last;
		} 
		else{
		    $q++;
		}
	    }
	    if($$Repeat[$c]{pattern} ne '0'){
		@Result=(@Result,$$Repeat[$c]);
	    } 
	    $q--;
	    $c++;
	}
        @Result_tmp=sort{$a->{startpos} <=> $b->{startpos}}@Result;
	@Result=();
        @Result=@Result_tmp;
    }

    if($output eq "f"){
	_print_tandem(\@Result,-print=>"f",-filename=>$filename);
    }
    if($output eq "stdout"){
	_print_tandem(\@Result);
    } 

    return \@Result;
}


#print_tandem ver.20010629-01
#scripting by Koya Mori(mory@g-language.org)
#This program prints result of find_tandem.
#&print_tandem(pointer TANDEM,  boolean debug);
sub _print_tandem{
    &opt_default(print=>"n",filename=>"tandem.csv");
    my @args=opt_get(@_);

    my $Repeat=shift @args;
    my $print=opt_val("print");
    my $filename=opt_val("filename");
    my $i;


    if($print eq "f"){
	open(FILE,">>$filename");
	foreach(@$Repeat){
	    print FILE "$$Repeat[$i]{pattern},$$Repeat[$i]{patlength},$$Repeat[$i]{repnumber},$$Repeat[$i]{startpos},$$Repeat[$i]{endpos}\n";
	    $i++;
	}
	print FILE "\n\n";
	close(FILE);
    }
    else{
	foreach(@$Repeat){
	    &msg_send("$$Repeat[$i]{pattern}\t$$Repeat[$i]{patlength}\t$$Repeat[$i]{repnumber}\t$$Repeat[$i]{startpos}..$$Repeat[$i]{endpos}\n");
	    $i++;
	}
    }
}


# graphical_LTR_search ver.20010906-01
# Author: Kazuharu Arakawa
# Usage: NULL = &graphical_LTR_search(pointer string seq); 
# Options:
# -length    length of LTR to search (default:64)
# -filename  output filename (default:'LTR.png')
# -output    "g" for graph, "show" for display 
# Description:
#  Converts a sequence to a png image in order to find LTR.
#  This method is useful because if LTR is present, you can
#  see pixels aligned in several rows at the locus.
#  A is shown in red, T is shown in green, 
#  G is shown in yellow, and C is shown in blue. 
# Requirements:
#  SubOpt.pm
#  GD.pm

sub graphical_LTR_search {
    &opt_default(length=>64, filename=>"LTR.png", output=>"show");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $seq = \$gb->{SEQ};
    my $width = opt_val("length");
    my $height = int((length($$seq)+1)/$width)+1;
    die("\nERROR at graphical_LTR_search: sequence too long!\n\n\n")
	if ($height > 15000);
    my $output = opt_val("output");
    my $filename = opt_val("filename");
    my $topmargin = 50;
    my $sidemargin = 50;
    my $im = new GD::Image($width + $sidemargin * 2, 
			   $height + $topmargin + 20);

    my $white = $im->colorAllocate(255,255,255);
    my $black = $im->colorAllocate(0,0,0);
    my $red = $im->colorAllocate(255,0,0);
    my $yellow = $im->colorAllocate(255,255,0);
    my $green = $im->colorAllocate(0,150,0);
    my $blue = $im->colorAllocate(0,0,255);

    my ($x, $y);
    my $count = 0;
    for ($y = $topmargin; $y <= $height + $topmargin; $y ++){
	for ($x = 0; $x <= $width; $x ++){
	    my $color=$white;
	    $color=$red if (substr($$seq, $count, 1) eq 'a');
	    $color=$yellow if (substr($$seq, $count, 1) eq 'g');
	    $color=$green if (substr($$seq, $count, 1) eq 't');
	    $color=$blue if (substr($$seq, $count, 1) eq 'c');
	    
	    $im->setPixel($x + $sidemargin,$y,$color);
	    last if ($count == length($$seq));
	    $count ++;
	}
    }
    for ($y = $topmargin; $y <= $height + $topmargin; $y += $sidemargin){
	my $num = ($y - $topmargin) * $width;
	$im->string(gdTinyFont, 6, $y, "$num", $black);
    }
    $im->string(gdSmallFont, 5,  5, "LTR search:", $black);
    $im->string(gdSmallFont, 20, 20, "width = $width", $black);
    $im->string(gdSmallFont, $sidemargin + 25 + $width, $topmargin + 10, 
		"A", $red);
    $im->string(gdSmallFont, $sidemargin + 25 + $width, $topmargin + 30,
		"T", $yellow);
    $im->string(gdSmallFont, $sidemargin + 25 + $width, $topmargin + 50,
		"G", $green);
    $im->string(gdSmallFont, $sidemargin + 25 + $width, $topmargin + 70, 
		"C", $blue);
    
    mkdir ("graph", 0777);
    open(OUT, '>graph/' . $filename);
    binmode OUT;
    print OUT $im->png;
    close(OUT);

    msg_gimv("gimv graph/$filename") if ($output eq 'show');
}


1;
