#! /usr/bin/perl -w

# The script takes exactly one command-line argument, the order file.
# With any other argument count, print the usage text on standard output
# and stop with "exit -1".
unless (scalar(@ARGV) == 1) {
    my $usage = <<"EOF";
$0 - convert statistical data (mscore) output files to a gnuplot input file

Usage: $0 orderfile < scores-file

Orderfile should be a file containing the test filenames in the desired order.

(c) Goetz Schwandtner 2002
EOF
    print $usage;
    exit -1;
}

######################################################################
# read in order file
######################################################################
# The order file lists one test filename per line.  Each non-blank line is
# assigned the next index (starting at 0); that index is later used as the
# x value in the gnuplot output.

# Three-argument open with an explicit check: a missing or unreadable
# order file must stop the script here instead of silently producing an
# empty name table.
open(my $order_fh, '<', $ARGV[0])
    or die "Cannot open order file '$ARGV[0]': $!\n";

# array containing all filenames read (index -> filename)
@numtoname = ();
# hash for back conversion (filename -> index)
%nametonum = ();

while (defined(my $line = <$order_fh>)) {
    # chomp (not chop): only a trailing newline is removed, so a final
    # line without a newline keeps its last character.
    chomp($line);

    # lines consisting entirely of whitespace carry no filename
    next if $line =~ /^\s*$/;

    # the next free index is the current number of stored names
    $nametonum{$line} = scalar(@numtoname);
    push(@numtoname, $line);
}

close($order_fh);

# $subset tells the output stage whether only the basic scores (dis, pam,
# blosum) are present.  It starts at 1 and is set to 0 as soon as an
# "sdis" line is read; per the original author this indicates a directory
# with secondary structure information and therefore more score types.
$subset = 1;

# Index (into @numtoname) of the file whose scores are currently being
# read; -1 until the first "*.msf" header line has been seen.
$entrynum = -1;

# Arrays storing the score values, indexed like @numtoname.
@dis      = ();
@sdis     = ();
@csdis    = ();
@pam      = ();
@spam     = ();
@cspam    = ();
@blosum   = ();
@sblosum  = ();
@csblosum = ();

# Recognised score lines have the form "<kind>: <label>: <integer>".
# For every kind: the label that must follow it and the array receiving
# the value.
my %score_target = (
    dis      => [ 'DiffScore', \@dis ],
    sdis     => [ 'DiffScore', \@sdis ],
    csdis    => [ 'DiffScore', \@csdis ],
    pam      => [ 'SPScore',   \@pam ],
    spam     => [ 'SPScore',   \@spam ],
    cspam    => [ 'SPScore',   \@cspam ],
    blosum   => [ 'SPScore',   \@blosum ],
    sblosum  => [ 'SPScore',   \@sblosum ],
    csblosum => [ 'SPScore',   \@csblosum ],
);

# now get values from the scores file on standard input
while (defined(my $line = <STDIN>)) {
    # chomp (not chop): a final line without a trailing newline must not
    # lose its last character, which would be the last digit of a score.
    chomp($line);

    if ($line =~ /^(.*\.msf)$/) {
        # header line: selects the file that the following scores belong to
        if (!defined($nametonum{$1})) {
            die "Found undefined file name: $line";
        }
        $entrynum = $nametonum{$1};
    }
    elsif ($line =~ /^(\w+): (\w+): (-?\d*)$/) {
        my ($kind, $label, $value) = ($1, $2, $3);

        # ignore lines whose kind is unknown or carries the wrong label
        my $target = $score_target{$kind};
        next unless $target && $target->[0] eq $label;

        # a score line before any header line cannot be attributed
        die "Input file format error: $kind!" if $entrynum == -1;

        $target->[1][$entrynum] = $value;

        # now we have secondary structure in this dir -> no subset of scores!
        $subset = 0 if $kind eq 'sdis';
    }
    # every other line is ignored
}

######################################################################
# print gnuplot output
######################################################################

# Write one gnuplot data file.
#   $file   - name of the file to create (overwritten if it exists)
#   $values - reference to a score array indexed like @numtoname
# One line "<index> <value>" is written for every index that has a defined
# value; indices without a value are skipped.  Dies if the file cannot be
# opened or closed.
sub write_gnuplot_file {
    my ($file, $values) = @_;

    open(my $out, '>', $file)
        or die "Cannot open '$file' for writing: $!\n";

    # $#numtoname is the LAST valid index, so the range must include it;
    # stopping one short would drop the final file of the order list.
    for my $idx (0 .. $#numtoname) {
        next unless defined $values->[$idx];
        print {$out} $idx . " " . $values->[$idx] . "\n";
    }

    # buffered write errors only surface at close time
    close($out)
        or die "Cannot close '$file': $!\n";
    return;
}

# scores that are always written
write_gnuplot_file('dis.gnuplot',    \@dis);
write_gnuplot_file('pam.gnuplot',    \@pam);
write_gnuplot_file('blosum.gnuplot', \@blosum);

# additional scores, written only when an "sdis" line was seen in the input
if ($subset == 0) {
    write_gnuplot_file('sdis.gnuplot',     \@sdis);
    write_gnuplot_file('csdis.gnuplot',    \@csdis);
    write_gnuplot_file('spam.gnuplot',     \@spam);
    write_gnuplot_file('cspam.gnuplot',    \@cspam);
    write_gnuplot_file('sblosum.gnuplot',  \@sblosum);
    write_gnuplot_file('csblosum.gnuplot', \@csblosum);
}
print "Done.";
