#!/usr/bin/perl -w
#
#
# Copyright (C) 2008 Milton Nishiyama Jr and Alper Ylmaz - Department of
#               Plant Cell and Molecular Biology - Ohio State University - USA
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
# This script is responsible for Identification and Classification of 
# Transcription Factors
# It takes as inputs the transcription factor classification rules based
# on Classification from PlnTFDB and AtTFDB and the HMMPFAM output generated from 
# InterProScan software for all available proteins.
# The purpose is to prinnt only significant hits and discarding all proteins 
# with e-value lower than 0.001. The classifier produces as output a list of 
# putative Transcription Factors grouped into families
# Concerns: Can occur some cases of proteins that has two or more domains that
# characterize two or more families and there aren't rules for them, thus we
# recommend look manually the output for genes with more than one valid TF family,
# in the output the genes are ordered


use strict;
use Getopt::Long;

my %hopt          = ();
$hopt{fileOutput} = "";
$hopt{fileRules}  = "";
$hopt{fileIPS}    = "";
$hopt{evalue}     = 0.001;
$hopt{verbose}    = 0;

GetOptions ("out=s"    => \$hopt{fileOutput},    
	    "rules=s"  => \$hopt{fileRules},
	    "ips=s"    => \$hopt{fileIPS},
	    "evalue=f" => \$hopt{evalue},    
	    "verbose"  => \$hopt{verbose}
	    );

if( $hopt{verbose} || (!$hopt{fileRules}) || (!$hopt{evalue}) || (!$hopt{fileIPS}) ) {
    usage();
}

#Read and Get the family domain rules
my %hdomr = ();
open(FDR,"$hopt{fileRules}") or die "Error: $!\n";
while(<FDR>) {
    if(/([^\t]+)\t+([^\t]+)\t+([^\t]+)\r/) {
	$hdomr{"$1"}{"T"} = "$2";
	$hdomr{"$1"}{"F"} = "$3";
    }
}
close(FDR);

#Read the InterProScan output and store in the hash the information
#for each domain into each family for each gene using the specific
#evalue, default:0.001
my %hgfam = ();
open(FIPS,"$hopt{fileIPS}");
while(<FIPS>) {
    if(/([^\t]+)\t+[^\t]+\t+[^\t]+\t+[^\t]+\t+([^\t]+)\t+([^\t]+)\t+[^\t]+\t+[^\t]+\t+([^\t]+)\t+/) {
	next if($4 > $hopt{evalue});
	$hgfam{$1}{$2}{'pfdom'} = "$3";
	$hgfam{$1}{$2}{'ocurrences'}++;

	if(!exists $hgfam{$1}{$2}{'pfevalue'}) {
	    $hgfam{$1}{$2}{'pfevalue'} = $4;
	}
	else {
	    $hgfam{$1}{$2}{'pfevalue'} = ($hgfam{$1}{$2}{'pfevalue'} < $4) ? $hgfam{$1}{$2}{'pfevalue'}:$4;
	}
    }
}
close(FIPS);

my %hdomains = ();
my $pfid = "";

#process the significant hits from InterProScan, use the classification rules
#of permitted and forbidden domains into each families and store it in a hash for
#each gene
foreach my $g (keys %hgfam) {
    foreach my $pfid (keys %{$hgfam{$g}}) {
	if($pfid eq 'PF00010') {
	    $hdomains{$g}{'bHLH'}{"$pfid"}++;
	}
	if($pfid eq 'PF00046') {
	    $hdomains{$g}{'HB'}{"$pfid"}++;
	}
	if($pfid eq 'PF00096') {
	    $hdomains{$g}{'C2H2'}{"$pfid"}++;
	}
	if($pfid eq 'PF02373' || $pfid eq 'PF02375' || $pfid eq 'PF00628') {
	    $hdomains{$g}{'C2H2'}{"$pfid"}--;
	}
	if($pfid eq 'PF00170' || $pfid eq 'PF07716') {
	    $hdomains{$g}{'bZIP'}{"$pfid"}++;
	}
	if($pfid eq 'PF00010') {
	    $hdomains{$g}{'bZIP'}{"$pfid"}--;
	}
	if($pfid eq 'PF00249') {
	    if($hgfam{$g}{$pfid}{'ocurrences'}>=2) {
		$hdomains{$g}{'MYB'}{"$pfid"}++;
	    }
	    else {
		$hdomains{$g}{'MYB-related'}{"$pfid"}++;
	    }
	}
	if($pfid eq 'PF01388' || $pfid eq 'PF00072' || $pfid eq 'PF00176') {
	    $hdomains{$g}{'MYB-related'}{"$pfid"}--;
	    $hdomains{$g}{'MYB'}{"$pfid"}--;
	}
	if($pfid eq 'PF00319') {
	    $hdomains{$g}{'MADS'}{"$pfid"}++;
	}
	if($pfid eq 'PF00320') {
	    $hdomains{$g}{'C2C2-GATA'}{"$pfid"}++;
	}
	if($pfid eq 'PF00447') {
	    $hdomains{$g}{'HSF'}{"$pfid"}++;
	}
	if($pfid eq 'PF00505') {
	    $hdomains{$g}{'HMG'}{"$pfid"}++;
	}
	if($pfid eq 'PF01388' || $pfid eq 'PF04690') {
	    $hdomains{$g}{'HMG'}{"$pfid"}--;
	}
	if($pfid eq 'PF00628') {
	    $hdomains{$g}{'PHD'}{"$pfid"}++;
	}
	if($pfid eq 'PF02791' || $pfid eq 'PF00046' || $pfid eq 'PF02373' || $pfid eq 'PF02375') {
	    $hdomains{$g}{'PHD'}{"$pfid"}--;
	}	
	if($pfid eq 'PF00642') {
	    $hdomains{$g}{'C3H'}{"$pfid"}++;
	}
	if($pfid eq 'PF00628' || $pfid eq 'PF00176' || $pfid eq 'PF00096') {
	    $hdomains{$g}{'C3H'}{"$pfid"}--;
	}
	if($pfid eq 'PF00808') {
	    $hdomains{$g}{'CCAAT-HAP3'}{"$pfid"}++;
	}
	if($pfid eq 'PF02362') {
	    $hdomains{$g}{'ABI3VP1'}{"$pfid"}++;
	}
	if($pfid eq 'PF00847' || $pfid eq 'PF06507') {
	    $hdomains{$g}{'ABI3VP1'}{"$pfid"}--;
	}
	if($pfid eq 'PF00847') {
	    $hdomains{$g}{'AP2-EREBP'}{"$pfid"}++;
	}
	if($pfid eq 'PF01167') {
	    $hdomains{$g}{'TUB'}{"$pfid"}++;
	}
	if($pfid eq 'PF01388') {
	    $hdomains{$g}{'ARID'}{"$pfid"}++;
	}
	if($pfid eq 'PF01698') {
	    $hdomains{$g}{'LFY'}{"$pfid"}++;
	}
	if($pfid eq 'PF02042') {
	    $hdomains{$g}{'NLP'}{"$pfid"}++;
	}
	if($pfid eq 'PF02045') {
	    $hdomains{$g}{'CCAAT-HAP2'}{"$pfid"}++;
	}
	if($pfid eq 'PF02319') {
	    $hdomains{$g}{'E2F/DP'}{"$pfid"}++;
	}
	if($pfid eq 'PF02365') {
	    $hdomains{$g}{'NAC'}{"$pfid"}++;
	}
	if($pfid eq 'PF02373' || $pfid eq 'PF02375') { 
	    $hdomains{$g}{'JUMONJI'}{"$pfid"}++;
	}
	if($pfid eq 'PF02701') {
	    $hdomains{$g}{'C2C2-Dof'}{"$pfid"}++;
	}
	if($pfid eq 'PF03106') {
	    $hdomains{$g}{'WRKY'}{"$pfid"}++;
	}
	if($pfid eq 'PF03110') {
	    $hdomains{$g}{'SBP'}{"$pfid"}++;
	}
	if($pfid eq 'PF03514') {
	    $hdomains{$g}{'GRAS'}{"$pfid"}++;
	}
	if($pfid eq 'PF03634') {
	    $hdomains{$g}{'TCP'}{"$pfid"}++;
	}
	if($pfid eq 'PF03638') {
	    $hdomains{$g}{'CPP'}{"$pfid"}++;
	}
	if($pfid eq 'PF00856') { 
	    $hdomains{$g}{'CPP'}{"$pfid"}--;
	}
	if($pfid eq 'PF03859') { 
	    $hdomains{$g}{'CAMTA'}{"$pfid"}++;
	}
	if($pfid eq 'PF04504') {
	    $hdomains{$g}{'GeBP'}{"$pfid"}++;
	}
	if($pfid eq 'PF04690') {
	    $hdomains{$g}{'C2C2-YABBY'}{"$pfid"}++;
	}
	if($pfid eq 'PF04770') {
	    $hdomains{$g}{'zf-HD'}{"$pfid"}++;
	}
	if($pfid eq 'PF04873') {
	    $hdomains{$g}{'EIL'}{"$pfid"}++;
	}
	if($pfid eq 'PF05687') {
	    $hdomains{$g}{'BZR'}{"$pfid"}++;
	}
	if($pfid eq 'PF06200') {
	    $hdomains{$g}{'ZIM'}{"$pfid"}++;
	}
	if($pfid eq 'PF00320') {
	    $hdomains{$g}{'ZIM'}{"$pfid"}--;
	}
	if($pfid eq 'PF00249' || $pfid eq 'PF00072') { 
	    $hdomains{$g}{'ARR-B'}{"$pfid"}++;
	}
	if($pfid eq 'PF06203') {
	    $hdomains{$g}{'ARR-B'}{"$pfid"}--;
	}
	if($pfid eq 'PF06203' || $pfid eq 'PF00643') {
	    $hdomains{$g}{'C2C2-CO-like'}{"$pfid"}++;
	}
	if($pfid eq 'PF06217') {
	    $hdomains{$g}{'BBR/BPC'}{"$pfid"}++;
	}
	if($pfid eq 'PF06507') { 
	    $hdomains{$g}{'ARF'}{"$pfid"}++;	
	}
	if($pfid eq 'PF08536') {
	    $hdomains{$g}{'Whirly'}{"$pfid"}++;
	}
	if($pfid eq 'trihelix') {
	    $hdomains{$g}{'Trihelix'}{"$pfid"}++;
	}
	if($pfid eq 'VOZ-9') {
	    $hdomains{$g}{'VOZ-9'}{"$pfid"}++;
	}
    }
}

my ($f,$t,$c) = 0;
my (@atdom,@afdom, @acdom) = ();
my $i = 0;

#New Rules for proteins that has more than one domain
#characterizing more than one family
#This cases were analyzed manually to decide the right family
#classification, the criteria:
# - The literature
# - The highest e-value
#Ocurrences:
#       {C2C2-CO-like}   {PHD}
#       {ARID}           {PHD}
#       {ABI3VP1}        {PHD}
#       {bZIP}           {HB}
#       {HB}             {zf-HD}
#       {bHLH}           {TCP}


open(OUT,">$hopt{fileOutput}") if($hopt{fileOutput} ne '');

#Do the Family Classification for each gene, using the rules of permitted
#and forbidden domains into each family 
foreach my $g (keys %hdomains) {
    foreach my $fam (sort {$a cmp $b} keys %{$hdomains{$g}}) {

	($t, $f, $c,) = (0, 0, 0);

	#Get the all permitted domains for the specific family
	@atdom = split(/,/,$hdomr{$fam}{'T'});
	#Get the all forbidden domains for the specific family
	@afdom = split(/,/,$hdomr{$fam}{'F'}) if(exists $hdomr{$fam}{'F'});
	#Get the all domains that the family could posses
	@acdom = split(/,/,$hdomr{$fam}{'C'}) if(exists $hdomr{$fam}{'C'});

	#Count the permitted, forbidden and possible domains ocurrences
	for my $pfid (@atdom) {
	    $t++ if(exists $hdomains{$g}{"$fam"}{"$pfid"});
	}
	for my $pfidf (@afdom) {
	    $f++ if(exists $hdomains{$g}{"$fam"}{"$pfidf"});
	}
	for my $pfidf (@acdom) {
	    $c++ if(exists $hdomains{$g}{"$fam"}{"$pfidf"});
	}

	#eliminate the other families if the correct family was found!
	if($c > 0 && !$f ) {
	    foreach my $fm (keys %{$hdomains{$g}}) {
		next if($fm eq $fam);
		delete $hdomains{$g}{$fm};
	    }
	}

	delete $hdomains{$g}{'C2C2-CO-like'} if(exists $hdomains{$g}{'C2C2-GATA'}{'PF00320'});

	delete $hdomains{$g}{'ARR-B'} if(!exists $hdomains{$g}{'ARR-B'}{'PF00072'});					  

	delete $hdomains{$g}{'NAC'} if(exists $hdomains{$g}{'WRKY'}{'PF03106'});
	
	delete $hdomains{$g}{'MYB'} and delete $hdomains{$g}{'MYB-related'} if(exists $hdomains{$g}{'ARR-B'}{'PF00072'} && (exists $hdomains{$g}{'MYB'}{'PF00249'} || exists $hdomains{$g}{'MYB-related'}{'PF00249'}) );

	#IF doesn't exists Response_reg together with MYB, only MYB then change t=0 for fam='ARR-B'
	$t = 0 if($fam eq 'ARR-B' && !exists $hdomains{$g}{'ARR-B'}{'PF00072'});

	#Delete JUMONJI (JmjC and/or JmJN) if exists ARID
	delete $hdomains{$g}{'JUMONJI'} if(exists $hdomains{$g}{'ARID'}{'PF01388'});

	#Print the gene and your family classification
	if(!$f && $t) {
	    foreach my $id (keys %{$hdomains{$g}{$fam}}) {
		if($hopt{fileOutput} ne '') {
		    print OUT "$g\t$fam\t$hgfam{$g}{$id}{'pfevalue'}\t{@atdom}\t[@afdom]\n";
		}
		else {
		    print "$g\t$fam\t$hgfam{$g}{$id}{'pfevalue'}\t{@atdom}\t[@afdom]\n";
		}
	    }
	}
	undef @atdom;
	undef @afdom;
    }
}
close(OUT) if($hopt{fileOutput} ne '');





#Sub-routines

sub usage {
    print
	"Usage  : $0 : \n".
	"\t\t -r <family domain rules file>\n".
	"\t\t -e <E-value cutoff default:0.001>\n".
	"\t\t -i <InterProScan Format output>\n".
	"\t\t -o <output file> or stdout \n".
	"\t\t -v help\n".
	"Example: $0 -r Family_Rules.txt -i Proteins_InterProScan.output -o Proteins.output\n";
    exit(0);
}
