#!/usr/bin/perl
#
# this program is used to parse the sequence fasta file to obtain basic statistics for multiple sequences of coding cDNAs 
# program: atgc.pl
# author: Jack Min, @copyright
#
#the number of argument is 2
if ($#ARGV !=1)
{
	print "usage: atgc.pl sequence_fasta_file output\n";
	exit;
}

$seqFile = $ARGV[0];
$oFile = $ARGV[1];
open (SEQUENCEFILE, $seqFile) || die ("Cannot open file $seqFile\n\n");
print "In progress...\n";

# read the DNA sequence data from the file, ans store it into the array variable @DNA
@DNA = <SEQUENCEFILE>;
close SEQUENCEFILE;

# remove header and join the DNA sequence into one string
$seqNum = 0;
foreach $line (@DNA)
{
	chomp ($line); #remove the new line symbol
	$line =~ s/[\r\n\f\s]//g;
	if ($line =~ /^>/)
	{
  		$seqNum++;
		print $seqNum."\n";
		if ($seqNum >1)
		{
			push (@DNA2, $id."\t".$seq."\n");
			$seq = '';				
		}
		$id = substr($line, 1); # to remove > symbol
	}
	else
	{
		$seq = $seq.$line;   # $seq .= $line
	}
}
push (@DNA2, $id."\t".$seq."\n"); #this is important to push the last sequence into array

$itemNum = 0;
foreach $item (@DNA2)
{
	chomp ($item);
	$g = 0; $c = 0; $a = 0;	$t = 0; $n = 0;
	$g1 = 0; $c1 = 0; $a1 = 0; $t1 = 0; $n1 = 0;
	#### add variable for the 2nd and 3rd positions #######

	##################

	@tmp = split (/\t/, $item);
	$id = $tmp[0];
	$seq = $tmp[1];
	$seq = lc($seq);
	@base = split (//, $seq);
	$bCount = 0;#count each base
	foreach $base (@base)
	{
		$bCount++;
		if ($base eq 'a')
		{
			$a++;
		}
		elsif ($base eq 'c')
		{
			$c++;
		}
		elsif ($base eq 'g')
		{
			$g++;
		}
		elsif ($base eq 't')
		{
			$t++;
		}
		else
		{
			$n++;
		}
			
############# count the 1st position #########
		if ($bCount % 3 == 1)
		{			
			if ($base eq 'a')
			{
				$a1++;
			}
			elsif ($base eq 'c')
			{
				$c1++;
			}
			elsif ($base eq 'g')
			{
				$g1++;
			}
			elsif ($base eq 't')
			{
				$t1++;
			}
			else
			{
				$n1++;
			}
		}

################# count the 2nd codon position #######
		
################# count the 3rd codon position #######

####################################################		
	}
	push (@gcA, $id."\t".$a."\t".$t."\t".$g."\t".$c."\t".$n."\t".$a1."\t".$t1."\t".$g1."\t".$c1."\n");
}
			
#write to a file

open (OUT, ">$oFile");
print OUT (@gcA);
close OUT;
print "the work is done!\n";

exit;






