#!/bin/bash

# short shell script creating multiple sequence alignments

######################################################################
# using: MSA
######################################################################

#        Scans the current directory tree for directories whose name
#        contains "test", creates sequence files from the *.msf
#        alignment files found there, and deletes auxiliary files
#        such as *.ann, *.html, *.ftb and *.rsf, which are of no use
#        for creating the sequence files.

######################################################################
# some global parameters

# Absolute path of the alignment program; it is expected to live in
# the directory the script is started from.
msaprogname="$(pwd)/msa"

# Directory that holds the filter programs.
filtersdir="/home/goetz/multal/filter"

# Paths of the individual filter programs inside ${filtersdir}.
msa2msf="${filtersdir}/msa2msf"
msf2fasta="${filtersdir}/msf2fasta"
string2msf="${filtersdir}/string2msf"
msf2string="${filtersdir}/msf2string"

######################################################################

# Announce what this run is going to do (three lines on stdout).
printf '%s\n' \
    "create alignments: msa" \
    "using BAliBASE test directories for creation of raw sequences (.seq)" \
    "and MSA multiple sequence alignments (.msf)"

######################################################################
# first step: find all directories containing data

# Remember where we started: the paths printed by find are relative to
# this directory, so we must return here after every iteration.
curdir=$(pwd)

# Collect all directories whose name contains "test" up front.
# NUL-delimited output keeps names with whitespace or glob characters
# intact, and taking the snapshot before the loop means the file
# deletions below cannot interfere with the directory walk.
refdirs=()
while IFS= read -r -d '' refdir; do
    refdirs+=("$refdir")
done < <(find . -type d -name "*test*" -print0)

for refdir in "${refdirs[@]}"; do
    echo "Processing dir: $refdir"

    # Get into the directory. If that fails we must NOT continue:
    # the rm commands below would otherwise delete files in whatever
    # directory we happen to be in.
    if ! cd "$refdir"; then
        echo "Cannot change into $refdir, skipping" >&2
        continue
    fi

    # Clean up unneeded files (including a timestats file left over
    # from a previous run, since we append to it below).
    echo "Removing extra files ..."
    rm -f -- *.ann *.html *.ftb *.rsf *.tfa *.sec *.gif *.dnd
    rm -f -- outputorder *.gnuplot statdata timestats

    # Process each .msf file individually.
    for reffile in *.msf; do
        # An unmatched glob stays literal ("*.msf"); skip it and
        # anything that is not a regular file.
        [ -f "$reffile" ] || continue

        echo "Processing $reffile ..."

        # File name without the .msf extension; used for all
        # intermediate files of this alignment.
        base=${reffile%.msf}

        # Create the sequence file: msf2string output, with the '.'
        # gap marks stripped by sed, is converted back to GCG/MSF
        # pileup format and then to fasta format as MSA program input.
        "$msf2string" < "$reffile" \
            | sed -e "s/\.//g" \
            | "$string2msf" "$reffile" \
            | "$msf2fasta" > "$base.seq"

        # Invoke MSA. The file name and the output of `time` are
        # appended to timestats; MSA's own stderr is discarded.
        echo "$reffile" >> timestats
        ( time "$msaprogname" "$base.seq" > "$base.msa" 2>/dev/null ) 2>> timestats

        # Convert the MSA output to an .msf alignment file.
        # 1st: write to .ms2, because msa2msf still needs the
        #      original .msf as its argument.
        "$msa2msf" "$reffile" < "$base.msa" > "$base.ms2"
        # 2nd: replace the original .msf with the new alignment.
        #      This is done unconditionally, so the original .msf
        #      never remains in place after this step.
        rm -f -- "$reffile"
        mv -- "$base.ms2" "$reffile"
    done

    # Remove plain sequence and MSA output files.
    rm -f -- *.seq *.msa

    # Go back to the start directory; without it the remaining
    # relative paths from find would resolve incorrectly.
    if ! cd "$curdir"; then
        echo "Cannot return to $curdir, aborting" >&2
        exit 1
    fi
done
