#!/bin/bash
#
# Compare the residue sequence declared in the SEQRES records of a PDB entry
# with the sequence actually present in its ATOM coordinates.
#
# Invocation (as printed by the help text): sequence.bash [opt] pdbcode

# Pin the numeric locale for every tool started from this script.
export LC_NUMERIC="en_US.UTF-8"

# Print the help text on stdout.
usage() {
    printf '%s\n' \
        " " \
        " This script compares SEQRES to actual sequence from ATOM coordinates" \
        " " \
        " Syntax: sequence.bash [opt] pdbcode" \
        "          where pdbcode is a four chars coding the PDB structure" \
        " Options:                                                         " \
        "        -c " \
        "          selected chain (e.g A )" \
        "        -l " \
        "          get the pdb file from the current dir" \
        "          if '-l' is not specified wget the pdb file from the PDB "
}

# Without any argument there is nothing to do: show the help and stop.
# The bare `exit` keeps the historical exit status (that of the last command,
# i.e. 0 after a successful print).
if (( $# == 0 )); then
    usage
    exit
fi
# Derive the ORAC installation directory from the location of the `orac`
# executable found on PATH (everything before "bin/orac").
orac_bin=$(command -v orac)
orac_home=$(echo "$orac_bin" | sed "s?bin/orac??g")
tdir=$orac_home/tools/templates
# local=1 means "use <pdbcode>.pdb from the current directory" (option -l).
local=0

# Remove leftovers of a previous run; missing files are not an error.
rm -f -- gaps seq.* chain.* > /dev/null 2>&1

# Option parsing. The leading ':' in the optstring selects silent error
# reporting: getopts sets opt to '?' for an unknown option and to ':' for an
# option whose argument is missing, with the offending letter in OPTARG.
while getopts ":lc:" opt; do
    case "$opt" in
        c)
            chain=$OPTARG
            ;;
        l)
            local=1
            ;;
        :)
            echo "option -$OPTARG requires an argument"
            exit 1
            ;;
        \?)
            echo "invalid option -$OPTARG"
            exit
            ;;
    esac
done

# The pdbcode is the last command-line word. If getopts consumed every word
# there is no pdbcode at all, and ${!#} would silently pick up an option or
# its value instead, so refuse that case explicitly.
if (( OPTIND > $# )); then
    echo "pdbcode must be given after the options"
    exit 1
fi
code=${!#}

if [[ -z "$chain" ]]; then
    echo "chain must be specified using the '-c' option "
    exit
fi

# Unless -l was given, fetch the entry and unpack it as <pdbcode>.pdb.
# Each step runs only if the previous one succeeded, so a failed download is
# reported here instead of surfacing later as an empty chain.
if [[ "$local" != 1 ]]; then
    archive="pdb${code}.ent.gz"
    if ! { wget "ftp://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/all/pdb/${archive}" \
            && gzip -d "${archive}" \
            && mv "pdb${code}.ent" "${code}.pdb"; }; then
        echo "ERROR: could not download or unpack ${archive}"
        exit 1
    fi
fi



# isolate selected chain (protein only)
# Writes chain.pdb. "$chain" is quoted everywhere: the special value "?"
# is a glob character and would otherwise be open to pathname expansion.
if [[ "$chain" != "?" ]]; then
    # Keep the ATOM records whose column 22 equals the requested chain and
    # stop reading at the first ENDMDL record.
    awk -v c="$chain" '{if($1=="ENDMDL") {exit}; if(substr($0,22,1)==c && $1 ~ "^ATOM") {print}}' "${code}.pdb" > chain.pdb
    if [[ ! -s chain.pdb ]]; then
        # The messages are HTML; the inner quotes are escaped so that the
        # style attribute keeps its quotes in the output.
        echo "<span style=\"color:#FF0000\"> FILESIZE of chain.pdb <b> $chain </b> is ZERO!! <br>"
        echo "Possible reasons: </span>"
        echo "<ul>"
        echo "<li>  chain code or SEQRES records not present in the file </li>"
        echo "<li>  chain <b> $chain </b> not present in the $code.pdb file </li>"
        echo "</ul>"
        exit
    fi
    # size is twice the number of ATOM lines of the chain.
    # NOTE(review): the factor 2 presumably estimates the atom count once
    # hydrogens are added -- confirm.
    size=$(wc -l chain.pdb | awk '{print $1*2}')
    if [[ "$size" -gt 9000 ]]; then
        echo " selected chain has more than 9K atoms; no solvated protein will be produced"
        # water is only assigned here; nothing in this script reads it.
        water=0
    fi
else
    # Chain "?": keep every ATOM record of the file.
    awk '{if($1 ~ "^ATOM") print}' "${code}.pdb" > chain.pdb
fi

echo "<pre>"
echo "fixing side-chains with vmd...."
echo " "
# Run the psf.pgn script in text-mode vmd. It is expected to write rec.pdb
# (checked right below); a stale copy is removed first so that the check
# reflects this run only.
rm -f -- rec.pdb > /dev/null 2>&1
vmd -dispdev text -e "$orac_home/tools/scripts/psf.pgn" > /dev/null 2>&1

if [[ ! -s rec.pdb ]]; then
    # Inner quotes escaped so the HTML style attribute stays quoted.
    echo "<span style=\"color:#FF0000\">ERROR: chain  $chain not processed by vmd as PROTEIN. DNA or unknown error</span>"
    echo "ERROR: chain  $chain not processed." > error
    exit
fi

# Renumber the records of rec.pdb whose column 27 is not a blank (column 27
# is the insertion-code column of the PDB format; lines shorter than 27
# characters take this branch as well). For those records the counter i is
# advanced whenever an N or CA atom shows a new value in columns 23-27, and
# columns 23-26 are replaced by i; columns 27-76 are copied. All other
# records pass through untouched.
# NOTE(review): the original comment said this fixes "alternate coordinates";
# the test is on column 27, not on the alternate-location column -- confirm.
# The record text is passed to printf as data ("%s"), never as the format,
# so a '%' inside a record cannot corrupt the output.
awk '{
    if (substr($0, 27, 1) != " ") {
        if ($3 == "CA" || $3 == "N") {
            if (substr($0, 23, 5) != nresold) { i++; nresold = substr($0, 23, 5) }
        }
        printf "%s%4d%s\n", substr($0, 1, 22), i, substr($0, 27, 50)
    } else {
        print
    }
}' rec.pdb > tmp.pdb
mv tmp.pdb rec.pdb


# find the gaps in the pdb
# findgp.awk is run from the current directory; its report goes to gaps.OUT.
./findgp.awk "${code}.pdb" > gaps.OUT

# Reduce the report to "GAP <g1> <g2>" lines in the file gaps.
# "$chain" is quoted: the special value "?" is a glob character.
if [[ "$chain" != "?" ]]; then
    # Keep the entries whose 3rd and 6th fields both equal the chain.
    awk -v c="$chain" '{if($3==c && $6==c) {g1=$1; g2=$4; print "GAP " g1, g2}}' gaps.OUT > gaps
else
    # NOTE(review): this branch reads ${code}.OUT, not the gaps.OUT written
    # above -- confirm which file is intended.
    awk '{if($5=="GAP") {g1=$1; g2=$3; print "GAP " g1, g2}}' "${code}.OUT" > gaps
fi

# lgaps tells the later steps whether any gap was found (1) or not (0).
if [[ ! -s gaps ]]; then
    lgaps=0
else
    echo "$(wc -l < gaps | awk '{print $1}') gaps found "
    lgaps=1
fi

# get the full residue sequence from SEQRES in original PDB
# Every line of seq.full is "<residue name> <running index>".
# "$chain" is quoted: the special value "?" is a glob character.
if [[ "$chain" != "?" ]]; then
    # Only SEQRES lines whose column 12 equals the chain; names start at field 5.
    awk -v chain="$chain" '
        /^SEQRES/ {
            if (substr($0, 12, 1) == chain)
                for (i = 5; i <= NF; i++) { j++; print $i, j }
        }' "${code}.pdb" > seq.full
else
    # All SEQRES lines; names are taken from field 4 onwards.
    awk '/^SEQRES/ { for (i = 4; i <= NF; i++) { j++; print $i, j } }' "${code}.pdb" > seq.full
fi

# nres is the number of residues found in SEQRES; seq.full is the SEQRES sequence
nres=$(wc -l < seq.full | awk '{print $1}')

# get the first residue index in actual coordinates (columns 23-26 of the
# first line of chain.pdb, printed with at least two digits)
sr=$(head -n 1 chain.pdb | awk '{printf "%02d\n", substr($0,23,4)}')

# generate the actual sequence; gaps are marked as "xxx" (the marker the
# alignment step tests for). Consecutive lines with the same second field
# are collapsed into one.
find_seq.awk -v n0="$sr" -v gaps="$lgaps" chain.pdb |
    awk 'BEGIN{nold=-1000} {if($2 != nold) print; nold=$2}' > SEQ.full

# NRES is the number of residues found in pdb coordinates; SEQ.full is the corresponding sequence
NRES=$(wc -l < SEQ.full | awk '{print $1}')
# align the SEQRES and the actual sequence

# Slide the SEQRES sequence (seq.full, $nres lines) along the coordinate
# sequence (SEQ.full, $NRES lines): for offset 0, 1, 2, ... the first
# <offset> SEQRES residues are dropped and the remainder (seq.tmp) is pasted
# next to SEQ.full. A row matches when the two residue names are equal or the
# coordinate side is the gap marker "xxx". The alignment is found when the
# number of matching rows equals NRES.
# Globals:  nres, NRES (read)
# Files:    reads seq.full and SEQ.full; leaves the aligned SEQRES tail in seq.tmp
# Returns:  0 if an offset aligns, 1 once every offset has been tried
find_alignment() {
    local offset remaining matched
    # An empty coordinate sequence can never be aligned.
    (( NRES > 0 )) || return 1
    for (( offset = 0; offset < nres; offset++ )); do
        remaining=$(( nres - offset ))
        tail -n "$remaining" seq.full > seq.tmp
        # "k + 0" prints 0 when no row matches. An empty count used to abort
        # the scan at the first offset without any chance match, although a
        # later offset could still align.
        matched=$(paste seq.tmp SEQ.full |
            awk '{ if ($1 == $3 || $3 == "xxx") k++ } END { print k + 0 }')
        if (( matched == NRES )); then
            return 0
        fi
    done
    return 1
}

error=0
rmgaps=0
if ! find_alignment; then
    # tries to eliminate gaps (maybe insertions as in 15c8)
    rm -f -- gaps* > /dev/null 2>&1
    touch gaps
    # Drop the gap markers; each kept residue is paired with its line number
    # in the sequence that still contained the markers.
    awk '{if($1!="xxx") print $1,  NR }' SEQ.full > seq.tmp; mv seq.tmp SEQ.full
    NRES=$(wc -l < SEQ.full | awk '{print $1}')
    if ! find_alignment; then
        # Inner quotes escaped so the HTML style attribute stays quoted.
        echo "<p style=\"color:#FF0000\">  "
        echo "ERROR: sequence alignment failed."
        echo "       SEQRES in chain $chain of $code.pdb does not match residue numbering </p>"
        error=1
        exit
    fi
    rmgaps=1
fi
if [[ "$rmgaps" == 1 ]]; then
    echo "Sequence aligned with no gaps; gaps are removed"
fi

# generate the SEQ with all residues except C/N-terminal missing residues
# Each of the first NRES rows pairs the SEQRES residue name (3rd column of the
# paste) with the numbering of the coordinate sequence (2nd column).
if [[ "$error" == 0 ]]; then
    paste SEQ.full seq.tmp | awk -v n="$NRES" '{if(NR<=n) print $3,$2}' > SEQ
fi
exit
