#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2007 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: PatSearch.pm,v 1.3 2002/07/30 17:40:56 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::PatSearch;

use SubOpt;
use G::Messenger;
use G::Tools::Graph;
use G::Seq::Primitive;

use strict;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

use SelfLoader;

require Exporter;

# Inherit from Exporter so that the names listed in @EXPORT below are
# imported into the namespace of any package that uses this module.
@ISA = qw(Exporter);

# Functions exported by default; each one is defined and documented
# (POD) further down in this file.
@EXPORT = qw(
	     palindrome
	     oligomer_counter
	     find_seq
	     find_dnaAbox
	     find_dif
	     baseParingTest
	     nucleotide_periodicity
);

=head1 NAME

  G::Seq::PatSearch - component of G-language Genome Analysis Environment

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment,
    collecting sequence analysis methods related to sequence pattern
    searching (palindromes, oligomers, dnaA boxes, dif sites, and
    nucleotide periodicity).

=head1 AUTHOR
    
    Kazuharu Gaou Arakawa (gaou@g-language.org)

=head1 SYNOPSIS

=cut

#__DATA__

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::

=head2 palindrome

  Description:
    Searches palindrome sequences

 Usage: 
    palindrome(sequence); 

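 Options:
    -shortest shortest palindrome to search (default:4)
    -loop     longest stem loop to allow (default: 0)
    -gtmatch  if 1, allows g-t match (default: 0)
    -output   "stdout" to report each hit as a message,
              "f" for file output (default: stdout)
    -filename output file name used with -output "f"
              (default: palindrome.csv)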
    
  Author: 
    Kazuharu Gaou Arakawa (gaou@g-language.org)

  History:
    20010829-01 initial posting

=cut




# Searches the sequence for palindromic stems (optionally separated by a
# loop of up to -loop bases) and returns a hash reference that maps the
# 0-based start position of each hit to the string
# "left_arm loop right_arm".
#
# Every position $i is tried as the last base of the left arm, with each
# loop length from -loop down to 0.  Two cursors then walk outwards
# ($j to the left, $k to the right) for as long as the bases they point
# at can pair according to baseParingTest().
#
# Hits are additionally reported through msg_send() (-output "stdout")
# or written as CSV rows to -filename (-output "f").
sub palindrome {
    &opt_default(gtmatch=>0, loop=>0, shortest=>4, -output=>"stdout", -filename=>"palindrome.csv");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    # The option values are constant during the scan, so read them once
    # instead of calling opt_val() inside the loops.
    my $shortest = opt_val("shortest");
    my $max_loop = opt_val("loop");
    my $gtmatch  = opt_val("gtmatch");
    my $output   = opt_val("output");
    my $filename = opt_val("filename");

    my $half = int($shortest / 2);         # minimum number of paired bases per arm
    my $last = length($gb->{SEQ}) - 1;     # index of the last base

    my %palindrome;

    # $out stays undefined unless the output file could really be opened,
    # so that nothing is ever printed to an unopened handle.
    my $out;
    if ($output eq "f"){
        if (open($out, '>', $filename)){
            print {$out} "Length, start, end, sequence\n";
        }else{
            msg_error("G::Seq::PatSearch::palindrome() $! $filename");
            undef $out;
        }
    }

    # Never start left of the first base: with -shortest < 2 the old start
    # index was -1, which substr() silently maps to the last base.
    my $first_center = $half > 0 ? $half - 1 : 0;

    # Scan up to the last position where an arm of minimum length still
    # fits with a loop of length 0.  Longer loops that do not fit are
    # rejected by the bounds checks in the expansion loop below.  The upper
    # limit used to subtract the full -loop value, which hid palindromes
    # with a short loop close to the end of the sequence.
    for (my $i = $first_center; $i <= $last - $half; $i ++){
        for (my $stem = $max_loop; $stem >= 0; $stem --){
            my $j = $i;                  # left cursor
            my $k = $i + $stem + 1;      # right cursor
            my $len = 0;                 # number of paired bases found so far

            # Count the pair BEFORE moving the cursors; the bounds are
            # checked up front so that a pair lying exactly on either end
            # of the sequence is still counted.
            while($j >= 0 && $k <= $last &&
                  baseParingTest(substr($gb->{SEQ}, $j, 1),
                                 substr($gb->{SEQ}, $k, 1),
                                 $gtmatch))
            {
                $len += 2;
                $j --;
                $k ++;
            }

            next if ($len < $shortest);

            # $j and $k now point just outside of the paired region, so
            # the hit covers the 0-based positions $j + 1 .. $k - 1.
            my $arm   = $len / 2;
            my $start = $j + 1;
            my $end   = $k - 1;

            my $left_arm  = substr($gb->{SEQ}, $start, $arm);
            my $loop_seq  = substr($gb->{SEQ}, $start + $arm, $stem);
            my $right_arm = substr($gb->{SEQ}, $start + $arm + $stem, $arm);

            if ($output eq 'stdout'){
                msg_send(sprintf("Length: %2d Position: %7d %7d Sequence: %s %s %s\n",
                                 $len, $start, $end,
                                 $left_arm, $loop_seq, $right_arm));
            }

            if ($out){
                printf {$out} "%d,%d,%d,%s %s %s\n",
                    $len, $start, $end,
                    $left_arm, $loop_seq, $right_arm;
            }

            # A later hit with the same start position replaces the
            # earlier one (loop lengths are tried from long to short).
            $palindrome{$start} = sprintf("%s %s %s", $left_arm, $loop_seq, $right_arm);
        }
    }

    close($out) if ($out);

    return \%palindrome;
}





=head2 find_dif

  Description:
    Finds E.coli dif sequence (ggtgcgcataatgtatattatgttaaat) in both strands.
    dif is a 28bp sequence element recognized by XerCD located near the replication
    terminus used for chromosome dimer resolution by recombination.
    
  Usage: 
    (array @position) = find_dif(sequence)
    
  Options:
    none
    
  Author: 
    Kazuharu Gaou Arakawa (gaou@g-language.org)
    
  History:
    20060711-01 initial posting

=cut



# Returns the 0-based start positions of every occurrence of the E. coli
# dif sequence, followed by the positions of every occurrence of
# complement() of that sequence.  Both scans run over the same
# (direct strand) sequence string; the returned list does not mark which
# pattern produced a given position.
sub find_dif{
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my @pos = ();
    my $dif = "ggtgcgcataatgtatattatgttaaat";

    # complement() does not depend on the scan position, so it is called
    # once here (in scalar context, as before) instead of being
    # re-evaluated every time the loop condition is tested.
    my $dif_complement = complement($dif);

    foreach my $pattern ($dif, $dif_complement){
        # index() is asked for $start + 1, so -1 makes the scan begin at
        # the very first base.
        my $start = -1;
        while(0 <= ($start = index($gb->{SEQ}, $pattern, $start + 1))){
            push(@pos, $start);
        }
    }

    return @pos;
}




=head2 find_dnaAbox

  Description:
    Finds dnaA box(TT A/T TNCACA) in both strands.
    
  Usage: 
    (array @position) = find_dnaAbox(sequence)
    
  Options:
    none
    
  Author: 
    Kazuharu Gaou Arakawa (gaou@g-language.org)
    
  History:
    20021125-01 initial posting

=cut



# Slides a 9-base window over the sequence and records the 0-based
# offset of every window that matches the dnaA box consensus or its
# counterpart on the opposite strand.  Each hit is also reported through
# msg_send() as "offset matched_sequence".
sub find_dnaAbox {
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my @pos = ();

    # Offset of the last window that still holds 9 bases; negative (and
    # therefore an empty range below) for sequences shorter than that.
    my $last_offset = length($gb->{SEQ}) - 9;

    foreach my $offset (0 .. $last_offset){
        my $nonamer = substr($gb->{SEQ}, $offset, 9);

        # The direct pattern is tried first; $1 holds whichever pattern
        # matched.
        next unless ($nonamer =~ /(tt[at]t.caca)/ ||
                     $nonamer =~ /(tgtg.a[at]aa)/);

        push (@pos, $offset);
        msg_send(sprintf "%d %s\n", $offset, $1);
    }

    return @pos;
}



=head2 oligomer_counter

  Description:
    Counts the number of oligomers in a sequence (by windows optionally)

  Usage: 
    (array @count || int $count) = oligomer_counter(sequence, string $oligomer);

 Options:
    -window      int window size.
                 If specified, seeks oligomer in specified windows
                 Method returns an array of numbers at each windows
                 If not specified, seeks oligomer in the genome
                 Method returns the number of oligomers
    -output      "f" for file output, "g" for graph output
                 Only available when -window option is specified

  Author: 
    Kazuharu Gaou Arakawa
    -based on atg7.wind + gcwind [rsaito]

  History:
    20010829-01 initial posting

=cut




# Counts the (possibly overlapping) occurrences of an oligomer.
#
# Arguments: the sequence object, then the oligomer string.
#
# Without -window the whole sequence is scanned and a single number is
# returned.  With -window the sequence is cut into consecutive,
# non-overlapping windows of that size (an incomplete window at the end
# is ignored) and the list of per-window counts is returned.  In window
# mode -output "f" writes "window_start,count" rows to oligo_count.csv
# and -output "g" passes the counts to _UniMultiGrapher().
sub oligomer_counter {
    opt_default("window"=>0);
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $seq = shift @args;
    my $window = opt_val("window");
    my $output = opt_val("output");
    $output = '' unless (defined $output);

    # index() always "finds" an empty pattern, which would make the
    # counting loop run forever.
    unless (defined $seq && length($seq)){
        msg_error("G::Seq::PatSearch::oligomer_counter() no oligomer specified\n");
        return;
    }

    # Whole-sequence mode.
    return _count_oligomer(\$gb->{SEQ}, $seq) unless ($window);

    # A negative window size means one window covering everything.
    $window = length($gb->{SEQ}) if ($window <= 0);

    # $out is only defined when the CSV file was requested and opened.
    my $out;
    if ($output eq "f" && !open($out, '>', 'oligo_count.csv')){
        msg_error($!);
        undef $out;
    }

    my @wincount = ();
    my @winnum = ();

    # Only complete windows are counted.
    my $n_windows = $window > 0 ? int(length($gb->{SEQ}) / $window) : 0;

    for my $i (0 .. $n_windows - 1){
        my $offset = $i * $window;
        my $partial = substr($gb->{SEQ}, $offset, $window);
        my $count = _count_oligomer(\$partial, $seq);

        push (@wincount, $count);
        push (@winnum, $offset);

        # printf, not print: the row has to be formatted.
        printf {$out} "%d,%d\n", $offset, $count if ($out);
    }
    close($out) if ($out);

    if ($output eq "g"){
        # NOTE(review): nucleotide_periodicity() in this file passes the
        # output name as -filename, here it is -outfile -- confirm which
        # option _UniMultiGrapher() actually reads.
        _UniMultiGrapher(\@winnum, \@wincount, -x=>'window(bp)',
                         -y=>'number of oligomer',
                         -title=>'oligomer by window',
                         -outfile=>'oligo_count.png'
                         );
    }

    return (@wincount);
}

# Returns how often $oligo occurs in the string referenced by $text_ref,
# counting overlapping occurrences.  The text is taken by reference so
# that a whole genome is not copied for the call.
sub _count_oligomer {
    my ($text_ref, $oligo) = @_;
    my $count = 0;

    # Start at -1 so that the first index() call searches from position 0;
    # starting at 0 would skip a match at the very beginning.
    my $pos = -1;
    while(0 <= ($pos = index($$text_ref, $oligo, $pos + 1))){
        $count ++;
    }

    return $count;
}


=head2 find_seq

  Description:
    Counts an oligomer and its complement.

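  Usage:
    (int $direct, int $comp, int $total) = find_seq(sequence, string $oligo, [string $output]);

 Options:
    none (pass "f" as the optional third argument to append the
    counts to the file oligomer_count.rst)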

  Author:
    Kazuharu Gaou Arakawa (gaou@g-language.org)
   
  History:
    20010326-01 initial posting

=cut


# Counts the (possibly overlapping) occurrences of an oligomer and of
# complement() of that oligomer in the sequence.
#
# Arguments: the sequence object, the oligomer string, and optionally
# "f" to append a short report to the file oligomer_count.rst.
#
# Returns the list (direct count, complement count, sum of both).
sub find_seq {
    my $gb = opt_as_gb(shift);
    my $ref_Genome = \$gb->{SEQ};
    my $sSeq = shift;
    my $printer = shift;

    # index() always "finds" an empty pattern, which would make the
    # counting loops below run forever.
    return (0, 0, 0) unless (defined $sSeq && length($sSeq));

    my $sSeq2 = complement($sSeq);
    my $direct = 0;
    my $comp = 0;

    # Both scans start at -1 so that the first index() call searches from
    # position 0; starting at 0 would skip a match at the very beginning.
    my $iSeqStart = -1;
    while(0 <= ($iSeqStart = index($$ref_Genome, $sSeq, $iSeqStart + 1))){
        $direct ++;
    }

    $iSeqStart = -1;
    while(0 <= ($iSeqStart = index($$ref_Genome, $sSeq2, $iSeqStart + 1))){
        $comp ++;
    }

    my $total = $direct + $comp;

    if (defined $printer && $printer eq "f"){
        if (open(my $fh, '>>', 'oligomer_count.rst')){
            print {$fh} '--- find_sequence_result ---',"\n";
            # Write the computed sum; "$direct+$comp" inside a string
            # only printed the two numbers joined by a plus sign.
            print {$fh} "$sSeq: $direct\n$sSeq2: $comp\nTotal: $total\n\n";
            close($fh);
        }else{
            msg_error("G::Seq::PatSearch::find_seq() $! oligomer_count.rst");
        }
    }

    return ($direct, $comp, $total);
}






=head2 baseParingTest

  Description:
    Base pairing check
    
  Usage: 
    boolean $match = baseParingTest(char $first, char $second, boolean $gtmatch);
    
  Options:
    none
    
  Author: 
    Kazuharu Gaou Arakawa (gaou@g-language.org)
    
  History:
    20010829-01 initial posting

=cut



# Tells whether two bases can pair.
#
# Arguments: two single-character bases (compared case-insensitively)
# and a flag that additionally accepts the g-t / t-g pair when true.
#
# Returns 1 for a-t, t-a, g-c and c-g (plus g-t and t-g when the flag is
# set) and 0 for everything else.  Dies unless both bases are exactly
# one character long.
sub baseParingTest {
    my ($first, $second, $gtmatch) = @_;
    ($first, $second) = (lc($first), lc($second));

    die("First two arguments must be single base (i.e. a, t, g, or c).\n")
        if (length($first) != 1 || length($second) != 1);

    # Partner tables: key = first base, value = the base it pairs with.
    my %canonical = (a => 't', t => 'a', g => 'c', c => 'g');
    my %wobble    = (g => 't', t => 'g');

    return 1 if (exists $canonical{$first} && $canonical{$first} eq $second);
    return 1 if ($gtmatch && exists $wobble{$first} && $wobble{$first} eq $second);
    return 0;
}



=head2 nucleotide_periodicity

  Description:
    Checks the periodicity of certain nucleotide (best known with AA dinucleotide)
    
  Usage: 
    array data = nucleotide_periodicity(sequence);
    
  Options:
    -nucleotide    nucleotide to search (default:aa)
    -window        window size to seek periodicity (default:50)
    -filename      output filename (default:aa_frequency.png)
    -output        "g" for graph file output only,
                   "show" for graph file output and display.
                   (default: show)
    
  ToDo:
    data output

  Author: 
    Kazuharu Gaou Arakawa (gaou@g-language.org)
    
  History:
    20070206-01 initial posting

=cut



# Builds a histogram of the distances at which the query nucleotide
# recurs downstream of itself.
#
# For every occurrence of -nucleotide in the sequence, the -window bases
# that follow it are searched for further occurrences; slot N of the
# histogram counts how often one started N bases after the end of the
# first.  The histogram is handed to _UniMultiGrapher() together with
# -filename, passed to msg_gimv() when -output is "show", and returned
# as a list.
sub nucleotide_periodicity {
    opt_default("nucleotide"=>"aa", "window"=>50, "filename"=>"aa_frequency.png", "output"=>"show");
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);

    my $motif    = opt_val("nucleotide");
    my $span     = opt_val("window");
    my $filename = opt_val("filename");
    my $output   = opt_val("output");

    my $motif_len = length($motif);

    # One zero-initialised slot per position of the window.
    my @histogram = (0) x $span;

    my $hit = index($gb->{SEQ}, $motif);
    while($hit >= 0){
        # The bases immediately downstream of this occurrence.
        my $downstream = substr($gb->{SEQ}, $hit + $motif_len, $span);

        for (my $offset = index($downstream, $motif);
             $offset >= 0;
             $offset = index($downstream, $motif, $offset + 1))
        {
            $histogram[$offset] ++;
        }

        $hit = index($gb->{SEQ}, $motif, $hit + 1);
    }

    _UniMultiGrapher([0..($span - 1)], \@histogram, -filename=>$filename);
    msg_gimv("graph/$filename") if ($output eq 'show');

    return @histogram;
}



1;


