#!/bin/bash
# Test driver for pdbrestore.bash: runs it on one PDB code, on a list of codes
# or on a random sample of the PDB list shipped under the orac tree, and
# prints an OK / FAILED verdict per chain.

# Exported so that every child process (awk, pdbrestore.bash, ...) inherits it.
# NOTE(review): presumably forces "." as decimal separator in printed numbers.
export LC_NUMERIC="en_US.UTF-8"

# Full path of the orac executable; empty when orac is not in PATH.
# `command -v` is a shell builtin, so no external `which` is required.
orac_bin=$(command -v orac)

# Installation root = path of the executable without its trailing "bin/orac".
# Only the suffix is stripped, so a "bin/orac" substring elsewhere in the path
# (e.g. /opt/bin/oracle/bin/orac) is left untouched.
orac_home=${orac_bin%bin/orac}

# Global scratch value, overwritten by testpdb for every chain it examines.
chain_gap="NULL"


#######################################
# Print the FAILED verdict(s) for a pdbrestore log that has no "fixed!!" line.
# The log (minus its first line) is searched case-insensitively for a fixed
# set of keywords; one " FAILED (KEYWORD)" line is printed per keyword found,
# or " FAILED (UNKNOWN)" when none matches.
# Arguments:
#   $1 - log file written by pdbrestore.bash
# Files written in the current directory:
#   test_file and one file per keyword (dna, sequence, completed, gap, awry,
#   zero) holding the matching lines.
#######################################
function _testpdb_report_failure {
    local log=$1
    local keyword label
    local found=0

    grep -v "fixed!!" "$log" | awk 'NR>1' > test_file
    for keyword in dna sequence completed gap awry zero; do
        grep -i "$keyword" test_file > "$keyword"
        if [ -s "$keyword" ]; then
            label=$(echo "$keyword" | awk '{print toupper($1)}')
            printf "%8s \n" " FAILED ($label)"
            found=1
        fi
    done
    if [ "$found" == 0 ]; then
        printf "%8s \n" " FAILED (UNKNOWN)"
    fi
}

#######################################
# Print the chain id stored in column 3 of a gap table if that chain has at
# least one row, nothing otherwise.
# Arguments:
#   $1 - chain id
#   $2 - gap table (INPUT_GAP.<code>: "first-last length chain" per row)
#######################################
function _testpdb_chain_gap {
    # Exact string comparison on the chain column: a plain grep for the chain
    # id would also hit digits of the residue numbers in column 1 and 2.
    awk -v c="$1" '($3 "") == (c "") { hit = $3 }
                   END { if (hit != "") print hit }' "$2"
}

#######################################
# Run "pdbrestore.bash -g <mode> <args...>", keep its stdout+stderr in a log
# and print the verdict that terminates the current report line.
# Reads ngaps and len from the calling testpdb (bash dynamic scoping).
# Arguments:
#   $1 - log file to create
#   $2 - value passed to -g (ALL or NONE)
#   $3... - remaining pdbrestore.bash arguments
# Returns:
#   0 when the log contains "fixed!!", 1 otherwise.
#######################################
function _testpdb_run {
    local log=$1
    local mode=$2
    shift 2

    pdbrestore.bash -g "$mode" "$@" > "$log" 2>&1
    grep "fixed!!" "$log" > test_file
    if [ -s test_file ]; then
        if [ "$mode" == "ALL" ]; then
            printf "%s %s %s\n" " OK -- GAPS/LEN->" "$ngaps" "$len"
        else
            printf "%4s\n" " OK "
        fi
        return 0
    fi
    _testpdb_report_failure "$log"
    return 1
}

#######################################
# Run pdbrestore.bash on one PDB code and print one verdict line per chain
# and one per (ligand, chain) pair.
# Globals:
#   chain_gap (written) - result of the most recent gap lookup
# Arguments:
#   $1 - PDB code handed to pdbrestore.bash
# Outputs:
#   stdout: "CHAIN <id> <length> of <code> [with <ligand>]" followed by
#   " OK ", " OK -- GAPS/LEN-> <ngaps> <mean gap length>", " FAILED (...)"
#   or " SKIPPED gt  5000 ".
# Files written in the current directory:
#   <code>.OUT, INPUT_CHA.<code>, INPUT_GAP.<code>, INPUT_LIG.<code>,
#   Results/OUTPUT_CHAIN-<chain>.<code>, Results/OUTPUT_LIG-<lig>-<chain>.<code>
#   plus the scratch files of _testpdb_report_failure. "<chain>.no" marker
#   files are used while running and removed before returning.
#######################################
function testpdb {
    local pdb=$1
    local chain chain_length lig pair mode n
    local ngaps len
    local -a chain_ids=() chain_lens=() lig_pairs=()

    [ -d Results ] || mkdir Results

    # Summary run; everything below is parsed out of its stdout.
    pdbrestore.bash -s "$pdb" 1> "$pdb.OUT" 2> /dev/null

    # Chain table: "<chain id> <length>" per line.
    grep " chain " "$pdb.OUT" | awk '{print $2,$3}' > "INPUT_CHA.$pdb"

    # Gap table: "<first>-<last> <gap length> <chain id>" per line.
    grep " GAP " "$pdb.OUT" | grep -v " 0   " | grep -v "SECTION" \
        | awk '{print $1"-"$4,$4-$1-1,$3}' > "INPUT_GAP.$pdb"
    if [ -s "INPUT_GAP.$pdb" ]; then
        chain_gap=$(awk '{print $3}' "INPUT_GAP.$pdb")
        ngaps=$(awk 'END{print NR}' "INPUT_GAP.$pdb")
        len=$(awk '{tot+=$2}END{print tot/NR}' "INPUT_GAP.$pdb")
    fi

    # Ligand table: "<chain id> <ligand>" for every natom line with $4 > 6;
    # a chain listed several times shows the last ligand recorded for it.
    grep " natom= " "$pdb.OUT" \
        | awk '{if($4>6) {j++; lig[$2]=$1; cha[j]=$2}}
               END{for(i=1; i<=j; i++) {chain=cha[i]; if(chain!=""){print chain,lig[chain]}}}' \
        > "INPUT_LIG.$pdb"

    # Load the tables up front so that the loops below do not own stdin while
    # pdbrestore.bash is running.
    while read -r chain chain_length _; do
        if [ -n "$chain" ]; then
            chain_ids+=("$chain")
            chain_lens+=("$chain_length")
        fi
    done < "INPUT_CHA.$pdb"

    while read -r pair; do
        if [ -n "$pair" ]; then
            lig_pairs+=("$pair")
        fi
    done < <(grep "natom=" "$pdb.OUT" \
        | awk '{if($NF>6) {if($1!=lig_old) {print $1"-"$2; lig_old=$1}}}')

    #   chain test(s)
    for (( n = 0; n < ${#chain_ids[@]}; n++ )); do
        chain=${chain_ids[n]}
        chain_length=${chain_lens[n]}
        rm -f -- "$chain.no"
        printf "%5s%2s%6s%4s%5s" "CHAIN" "$chain" "$chain_length" " of " "$pdb"
        chain_gap=$(_testpdb_chain_gap "$chain" "INPUT_GAP.$pdb")

        # Chains longer than 5000 are not processed; the marker also disables
        # their ligand tests below.
        if [[ $chain_length =~ ^[0-9]+$ ]] && (( 10#$chain_length > 5000 )); then
            touch "$chain.no"
            printf "%4s\n" " SKIPPED gt  5000 "
            continue
        fi

        if [ -n "$chain_gap" ]; then
            mode=ALL
        else
            mode=NONE
        fi
        if ! _testpdb_run "Results/OUTPUT_CHAIN-$chain.$pdb" "$mode" \
                -c "$chain" "$pdb"; then
            touch "$chain.no"
        fi
    done

    #   ligand test(s)
    for pair in "${lig_pairs[@]}"; do
        # pair is "<ligand>-<chain>"
        IFS=- read -r lig chain _ <<< "$pair"
        chain_length=$(awk -v c="$chain" '{if($1==c) print $2}' "INPUT_CHA.$pdb")

        # continue, not break: a skipped or failed chain must not cancel the
        # ligand tests of the remaining chains.
        if [ -f "$chain.no" ]; then
            continue
        fi

        printf "%5s%2s%6s%4s%5s%6s%4s" "CHAIN" "$chain" "$chain_length" \
            " of " "$pdb" " with " "$lig"
        chain_gap=$(_testpdb_chain_gap "$chain" "INPUT_GAP.$pdb")
        if [ -n "$chain_gap" ]; then
            mode=ALL
        else
            mode=NONE
        fi
        _testpdb_run "Results/OUTPUT_LIG-$pair.$pdb" "$mode" \
            -c "$chain" -l "$lig" "$pdb"
    done

    # Drop the per-chain markers so none is left behind for the next code.
    for chain in "${chain_ids[@]}"; do
        rm -f -- "$chain.no"
    done
}


# ---------------------------------------------------------------------------
# Command-line driver.
#   no argument      : print the help text and exit
#   PDBCODE...       : run testpdb on every code given
#   -s num           : run testpdb on a random sample of about num entries of
#                      $orac_home/pdb/PDB_list_16_01_2025
#   -l filename      : run testpdb on every whitespace-separated code read
#                      from filename
# Positional codes are only processed when neither -s nor -l is given.
# ---------------------------------------------------------------------------
nsample=0
sample=0
readlist=0
list=""

if [ ! -d Results ] ; then
    mkdir Results
fi

# Print the help text.
function usage {
    echo "This code process/fix PDB files from the PDB using pdbrestore producing essential info (OK/failed) "
    echo " "
    echo " Syntax: test_pdbrestore [opt] [LIST|PDBCODE]"
    echo "         PDBCODE is any four-alphanumeric pdb code on the PDB"
    echo " Options:                                                                                 "
    echo "        -s num "
    echo "          pick randomly num PDB files from the protein data bank"
    echo "        -l filename "
    echo "          read the PDB files from the file 'filename' "
    echo " Examples: "
    echo "          test_pdbrestore.bash 1fkg "
    echo "             download and process 1fkg "
    echo "          test_pdbrestore.bash -l Results/LIST.DATA1  "
    echo "             process all pdb files listed in Results/LIST.DATA1 "
    echo "          test_pdbrestore.bash -s 2000"
    echo "             download and process ~2000 files from the PDB"
}

# Remember the raw argument count: the help text is triggered by "no
# arguments at all", which can no longer be told apart after the shift below.
nargs=$#

while getopts ":s:l:" opt; do
    case $opt in
        s)
            nsample=$OPTARG
            sample=1
            ;;
        l)
            list=$OPTARG
            readlist=1
            ;;
        :)
            # The leading ":" in the optstring makes getopts report a missing
            # value here instead of printing its own message.
            echo "option -$OPTARG requires an argument" >&2
            exit 1
            ;;
        \?)
            echo "invalid option -$OPTARG" >&2
            exit 1
            ;;
    esac
done

if [ "$nargs" == 0 ] ; then
    usage
    exit
fi

# Leave only the positional arguments (PDB codes) in "$@", so that an option
# string such as "-lfile" is never mistaken for a PDB code.
shift $((OPTIND - 1))

if [ "$sample" == 0 ] && [ "$readlist" == 0 ]; then
    for code in "$@"; do
        testpdb "$code"
    done
fi

if [ "$sample" == 1 ]; then
    pdb_list="$orac_home/pdb/PDB_list_16_01_2025"
    if [ -s "$pdb_list" ]; then
        total=$(awk 'END{print NR}' "$pdb_list")
        # Sub-second digits of the clock, used once as random seed.
        seed=$(date +"%T.%N" | awk '{print substr($1,10,10)}')
        # One awk pass keeps each entry with probability nsample/total and
        # prints its code (characters 4-7 of column 9). A for loop is used,
        # not "while read", so that testpdb never sees the list on stdin.
        for code in $(awk -v seed="$seed" -v n="$nsample" -v total="$total" '
                BEGIN {
                    # fall back to the time-of-day seed if date gave no digits
                    if (seed ~ /^[0-9]+$/) srand(seed); else srand()
                    p = n / total
                }
                {
                    code = substr($9, 4, 4)
                    if (code != "" && rand() < p) print code
                }' "$pdb_list"); do
            testpdb "$code"
            if [ -s gaps.OUT ] ; then
                echo "$code" "with gaps" >> pdb_with_gaps
            fi
        done
    else
        echo " file $pdb_list not found or empty" >&2
    fi
fi

if [ "$readlist" == 1 ]; then
    if [ -f "$list" ] ; then
        # Word splitting is intended: codes may be separated by any whitespace.
        for code in $(cat "$list") ; do
            testpdb "$code"
        done
    else
        echo " file $list non found"
    fi
fi

exit
