#!/bin/bash

# short shell script creating multiple sequence alignments

######################################################################
# using: CLUSTAL-W
######################################################################

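#        Scans the current directory tree for directories whose name
#        contains "test". In each of them, auxiliary files (*.ann,
#        *.html, *.ftb, *.rsf, ...) that are of no use for creating the
#        sequence files are deleted, every *.msf alignment file is
#        converted into a plain sequence file (.seq), and CLUSTAL-W is
#        run on that sequence file.
#        NOTE: each original *.msf file is deleted before CLUSTAL-W runs.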

######################################################################
# some global parameters

# Absolute path of the alignment program
# (usually located in the directory the script is started from).
msaprogname="$(pwd)/clustalw"

# Directory that holds the filter programs.
filtersdir="/home/goetz/multal/filter"

# Full paths of the individual filters.
msf2string="${filtersdir}/msf2string"
string2msf="${filtersdir}/string2msf"
msf2fasta="${filtersdir}/msf2fasta"

######################################################################

# Tell the user what this run is going to do (one line per argument).
printf '%s\n' \
    "create alignments: clustal-w" \
    "using BAliBASE test directories for creation of raw sequences (.seq)" \
    "and CLUSTAL-W multiple sequence alignments (.msf)"

######################################################################
# first step: find all directories containing data

# Remember the starting directory: the paths reported by find are
# relative to it, so it has to be restored after every iteration.
curdir=$(pwd)

# Collect the complete directory list before touching anything.
# Reading it NUL-delimited keeps directory names containing whitespace
# or glob characters intact.
refdirs=()
while IFS= read -r -d '' refdir; do
    refdirs+=("$refdir")
done < <(find . -type d -name "*test*" -print0)

for refdir in "${refdirs[@]}"; do
    echo "Processing dir: $refdir"

    # Get into the directory. If that fails, skip it: the rm calls
    # below must never run in the wrong directory.
    if ! cd "$refdir"; then
        echo "Cannot change into $refdir, skipping it" >&2
        continue
    fi

    # clean up unneeded files ...
    echo "Removing extra files ..."
    rm -f -- *.ann *.html *.ftb *.rsf *.tfa *.sec *.gif *.dnd
    rm -f -- outputorder *.gnuplot statdata timestats

    # process each .msf file individually:
    for reffile in *.msf; do
        # An unmatched glob stays literal; only handle real files.
        [[ -f "$reffile" ]] || continue
        echo "Processing $reffile ..."

        # Name of the sequence file: same base name, extension .seq
        seqfile="${reffile%.msf}.seq"

        # Build the input for CLUSTAL-W:
        #   msf2string - create sequence data without header info
        #   sed        - remove the gap marks ('.')
        #   string2msf - convert it back to GCG/MSF pileup format
        #   msf2fasta  - final conversion (FASTA, judging by the
        #                filter name)
        "$msf2string" < "$reffile" \
            | sed -e "s/\.//g" \
            | "$string2msf" "$reffile" \
            | "$msf2fasta" > "$seqfile"

        # NOTE(review): the original alignment is removed even if the
        # conversion above failed.
        rm -f -- "$reffile"

        # Label the timing record that follows with the file name.
        echo "$reffile" >> timestats

        # invoke CLUSTAL-W; its output is discarded, while the report
        # of the `time` keyword (stderr of the subshell) is appended to
        # timestats.
        ( time "$msaprogname" -align -output=gcg -outorder=input -infile="$seqfile" > /dev/null ) 2>> timestats
    done

    # remove plain sequence and guide tree files
    rm -f -- *.seq *.dnd

    # go back; abort if that is impossible, because every remaining
    # relative directory path would then point to the wrong place.
    if ! cd "$curdir"; then
        echo "Cannot return to $curdir, aborting" >&2
        exit 1
    fi
done
