#!/usr/bin/perl -w
#
#
# Copyright (C) 2008 Milton Nishiyama Jr and Alper Ylmaz - Department of
#               Plant Cell and Molecular Biology - Ohio State University - USA
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
# This script is responsible for identify identical nucleotide or protein
# sequences from a multi-fasta file.
# It takes as inputs the option to generate the output, provide the option
# to print all duplicated sequences with gene_id and sequence or print and
# choose only one of the duplicated sequences already eliminating the other.
#
use Getopt::Long;
use Digest::MD5 qw(md5_hex); 
use strict;

#Separator
$/ = ">";

my %hopt     = ();

$hopt{output} = "";
$hopt{fileseq} = "";
$hopt{verbose} = 0;

GetOptions ("type=s" => \$hopt{type},    
	    "fileseq=s" => \$hopt{fileseq},
	    "output=s" => \$hopt{output},    
	    "verbose"   => \$hopt{verbose}
	    );


if($hopt{verbose} || (!$hopt{fileseq}) || (!$hopt{type}) ) {
    usage();
}

my %hinfo = ();
my $id    = "";
my $aux   = "";
my $seq   = "";
my $md5   = "";

open(FS,"$hopt{fileseq}") or die "Error: '$hopt{fileseq}' $!\n";
while(<FS>) {

    if(/(^\S+)\n([A-Z\n]+)/gi) {
	$id = "$1";
	$seq = "$2";
	$seq =~s/\n//g;
	$md5 = md5_hex($seq);
	$hinfo{$md5}{$id} = "$seq";
    }
}
close(FS);

open(OUT,">$hopt{output}") if(exists $hopt{output});
foreach my $md5 (keys %hinfo) {
    next if( (keys %{$hinfo{$md5}}) < 2);
    
    foreach my $id (keys %{$hinfo{$md5}}) {
	if(exists $hopt{output}) {
	    print OUT "$md5\t$id\t$hinfo{$md5}{$id}\n";
	}
	else {
	    print "$md5\t$id\t$hinfo{$md5}{$id}\n";
	}
	last if($hopt{type} eq 'uniq');
    }
}
close(OUT) if(exists $hopt{output});


#sub-routines

sub usage {
    print
	"Usage  : $0 : \n\t\t -f <fasta sequence> \n".
	"\t\t -t <output type: (all) duplicated ordered / (one) only one of duplicated> \n".
	"\t\t -o <output file> or stdout \n".
	"Example: $0 -f sequence.fasta -t all -o results.out\n";
    exit(0);
}

