#!/bin/bash

#
# dia2nex.sh -- Shell wrapper for the DIALIGN (version 2.2) multiple sequence
#   alignment program (see http://bibiserv.techfak.uni-bielefeld.de/dialign
#   for information on DIALIGN)
#
# (C) 2007 by
#     Markus Goeker (markus.goeker@uni-tuebingen.de)
#
# This program is distributed under the terms of the Gnu Public License V2.
# For further information, see http://www.gnu.org/licenses/gpl.html
#
# If you use this script in a publication, please cite the web page at
# http://www.goeker.org/scripts/
#

########################################################################

# Change this to another default location:
declare -r DIADEF="/usr/local/bin/dialign2-2"

########################################################################

# Chat Ramey's recommendations for secure shell scripts
IFS=$' \t\n'
unset -f unalias
\unalias -a
unset -f command
SYSPATH="$(command -p getconf PATH 2>/dev/null)"
if [[ -z "$SYSPATH" ]] ; then
  SYSPATH="/usr/bin:/bin"
fi
PATH="$SYSPATH:$PATH"

set -eu

########################################################################

# Check whether file has legible contents or is executable
function file_ok
{
  local checkexec=
  if [ "$1" = "-x" ]; then
    checkexec=yes
    shift
  fi
  if [ -z $checkexec ]; then
    for i; do
      if ! [ -r "$i" ] || ! [ -f "$i" ] || ! [ -s "$i" ]; then
        echo "Can't read contents of file \"$i\"." >&2
        return 1
      fi
    done
  else
    for i; do
      if ! [ -x "$i" ] || ! [ -f "$i" ]; then
        echo "Can't execute file \"$i\"." >&2
        return 1
      fi
    done
  fi
}
########################################################################

# Remove directory name and file extension from a file name
function trunk_name
{
  if [ $# -ne 1 ]; then
    echo "Wrong number of arguments for function $FUNCNAME." >&2
    exit 1
  fi
  local outname="${1##*/}"
  echo "${outname%.*}"
}

########################################################################

DIALIGN="$DIADEF"
helpmsg=
speedup=
datatype="-n"
printfasta=
substlabel=
MAPFILE=
ALIFILE=

while getopts "a:d:fhimpst" opt; do
  case $opt in
    a ) ALIFILE=$OPTARG;;
    d ) DIALIGN=$OPTARG;;
    f ) printfasta="-fa";;
    h ) helpmsg=yes;;
    i ) substlabel=yes;;
    m ) datatype="-n -ma";;
    p ) datatype=;;
    s ) speedup="-o";;
    t ) datatype="-nt";;
    * ) exit 1;;
  esac
done
shift $(($OPTIND-1))

########################################################################

if [ $# -ne 1 ] || [ $helpmsg ]; then
  cat >&2 <<-__EOF

	${0##*/} is a shell wrapper for the dialign program.

	Usage: ${0##*/} [options] FASTA_file [> NEXUS_file]

	Options:
	  -a  Treat this file as dialign outfile. Do not run dialign, just
	      create NEXUS output. The FASTA file is used for label mapping.
	  -d  Alternative location for dialign (default is "$DIADEF").
	  -f  Produce a FASTA file, too.
	  -h  Display this message.
	  -i  Do not include NEXUS taxon labels in "'", but remove PAUP*-
	      incompatible characters.
	  -m  Assume nucleotides, but assess homology at both the nucleotide
	      and the amino acid level. See also -s and -p.
	  -p  Assume amino acid sequences (default: nucleotide). See also -m
	      and -t.
	  -s  Slightly speed up alignment by use of an approximate algorithm.
	  -t  Assume nucleotides, but assess homology at the amino acid level.
	      See also -m and -p.

	In NEXUS output, dialign scores are available both as character sets
	and weights. At the PAUP* prompt or in a batch file, the least reliable
	alignment columns can be excluded by, e.g., "exclude score_0 score_1;".

__EOF
  exit 1
fi

########################################################################

[ "$ALIFILE" ] || file_ok -x "$DIALIGN"
declare -r FASTAFILE="$1"
file_ok "$FASTAFILE"

declare -r trunk=$(trunk_name "$FASTAFILE")
declare -r MAPFILE=${trunk}.map
awk -v OFS="\t" -v RS="\n|\r|\r\n" '
    /^>/ {
      label = substr($0, 2)
      sub(/^[ \t]+/, "", label)
      sub(/[ \t]+$/, "", label)
      if (! label) {
        print "Empty sequence header in file", FILENAME ",line",
          FNR > "/dev/stderr"
        exit 1
      }
      print label
    }
  ' "$FASTAFILE" > $MAPFILE

if [ -z "$ALIFILE" ]; then
  ALIFILE=$trunk.dialign
  command="\"$DIALIGN\" $datatype $printfasta $speedup -pst -fn $ALIFILE \"$FASTAFILE\""
  echo "Executing $command ..." >&2
  if ! eval $command; then
    rm -f $MAPFILE $ALIFILE $ALIFILE.sta $ALIFILE.fa
    exit 1
  fi
fi

########################################################################

awk -v mapfile=$MAPFILE -v datatype="$datatype" -v substlabel=$substlabel '
   BEGIN {
     se = "/dev/stderr"
     while ((getline < mapfile) > 0) {
       if (NF)
         realname[++taxcount] = $0
     }
   }
   /^   Aligned sequences:/, /^   Average seq. length:/ {
     if ($1 ~ /\)$/) {
       thislabel = substr($0, 7, 12)
       gsub(/\[/, "{", thislabel)
       gsub(/\]/, "}", thislabel)
       sub(/\r/, " ", thislabel)
       label[++ntax] = thislabel
     }
   }
   /^   ===========================/, /^   Sequence tree:/ {
     if (! NF)
       next
     if ($0 ~ /^                       /) {
       if (scount != ntax) {
         print "Invalid format in line " FNR ", file " FILENAME > se
         print > se
         print "Expected " ntax " sequences, got " scount > se
         error++
         exit 1
       }
       wts = (wts getseq($0))
       scount = 0
     } else if ($0 !~ /^ /) {
       scount++
       seq[scount] = (seq[scount] getseq($0))
     }
   }
   END { if (! error && ntax) {
     nchar = length(seq[1])
     for (i = 2; i <= ntax; i++) {
       if (length(seq[i]) != nchar) {
         print "Unequal lengths between sequences no. 1 and " i > se
         exit 1
       }
     }
     for (i = 1; i <= ntax; i++) {
       thisname = realname[i]
       if (substlabel)
         gsub(/[^A-Za-z0-9._]+/, "_", thisname)
       else
         thisname = ("'\''" thisname "'\''")
       realname[i] = thisname
       lablen = length(thisname)
       if (lablen > maxlablen)
         maxlablen = lablen
     }
     labformat = ("%-" maxlablen "s")
     print "#NEXUS"
     print ""
     print "begin data;"
     print "	dimensions ntax=" ntax " nchar=" nchar ";"
     print "	format datatype=" (datatype ~ /^-n/ ? "dna" : "protein") \
       " missing=? gap=-;"
     print "	matrix"
     for (i = 1; i <= ntax; i++)
       print sprintf(labformat, realname[i]), "[" label[i] "]", seq[i]
     print ""
     print sprintf(labformat,"["), "              ", wts "]"
     print "	;"
     print "end;"
     print ""
     print "begin assumptions;"
     getcharsets(wts, wtval, set)
     printf "	wtset dialign_weights (vector)="
     for (i = 1; i <= nchar; i++)
       printf " %g", wtval[i]
     print ";"
     printf "	wtset incremented_weights (vector)="
     for (i = 1; i <= nchar; i++)
       printf " %g", wtval[i] + 1
     print ";"
     for (i in set)
       print "	charset score_" i, "=" set[i] ";"
     print "end;"
     print ""
   }}
   function getseq(str) {
     str = substr(str, 24)
     gsub(/[ \t]/, "", str)
     gsub(/[a-z]/, "?", str)
     return str
   }
   function getcharsets(wts, wtval, set,    np, i, thiswt) {
     np = split(wts, wtval, "")
     for (i = 1; i <= np; i++) {
       thiswt = wtval[i]
       set[thiswt] = (set[thiswt] " " i)
     }
   }
  ' "$ALIFILE"

########################################################################

rm -f $MAPFILE $trunk.dialign.sta
