#!/usr/bin/awk -f
# Purge outliers from workfiles qwrk and lwrk, generating a single wrk file.
# Emit the command-line help on standard output, one line per print.
function usage() {
    print " "
    print " Purge outliers from workfile. Outliers are marked with a '#' character."
    print " "
    print " Syntax: "
    print "     zscore_rec.awk [-v p=p ] workfile "
    print "     where the optional 'p' variable (default 0.2) is the probability "
    print "     to pick, in n trials, at least one value less (or greater) than"
    print "     xs, if x is drawn from a normal distribution with zero mean"
    print "     and unit variance."
    print "     The smaller is p, the smaller will be erfc and the larger xs."
    print "     p is related to erfc=int_(-infty)^{-xs} e^-(t^2/2) dt"
    print "     as (1-p)=(1-erfc)^n where n is the number of work values"
    print "     so that the erfc = [1-(1-p)^(1/n)] and  "
    print "     xs is such that |w|> xs*sigma is an outlier given p"
    print " "
    print " Purged workfile is produced on standard output "
    print " "
}

# Startup: apply the default for p and, when no workfile operand was given
# (ARGC counts only the program name), show the help text and stop.
BEGIN {
    # p is unset (or explicitly 0) unless supplied with "-v p=..."; use 0.2 then.
    if (p == 0)
        p = 0.2

    if (ARGC == 1) {
        usage()
        # exit in BEGIN still triggers the END rule, which checks ARGC itself.
        exit
    }
}
# Per-record pass: remember the first field of every input record and keep
# the running sums needed later for the mean and the standard deviation.
{
    wrk[NR] = $1        # raw value, indexed by record number, for the END pass
    w  += $1            # running sum of the values
    w2 += $1^2          # running sum of the squared values
}
# Final pass: derive the z-score cutoff xs from p and the number of records,
# then print every stored value, marking outliers with a leading '#'.
#
# Reads : p (target probability), NR, and w / w2 / wrk[] filled per record.
# Output: for a regular value   "value zscore xs"
#         for an outlier         a '#'-prefixed diagnostic line with the record
#                                index, z-score, cutoff, value, mean, sigma,
#                                the estimated tail probability p and
#                                n = int(1/p) ("inf" if p underflowed to 0).
END{
    if(ARGC==1) {exit}   # no workfile operand: BEGIN already printed the usage
    if(NR==0) {exit}     # empty workfile: nothing to print, and 1/NR below
                         # would be a division by zero

    # Per-value tail probability such that, over NR values, the chance of at
    # least one value beyond the cutoff equals p: (1-p) = (1-thres)^NR.
    thres=(1-(1-p)^(1/NR))

    # Accumulate the standard normal density from x=-6 upward in steps of dj
    # until the accumulated area exceeds thres; |x| at that point is the cutoff.
    # xs stays 0 when the area never exceeds thres within [-6,0]
    # (thres >= ~0.5), so it is always a defined number.
    norm=1/sqrt(2*3.14159265359)
    x=-6
    dj=6/6000
    erfc=0.0
    xs=0
    for(i=1; i<=6000; i++)  {
        x+=dj
        argg=(x)^2/(2)
        erfc+=norm*exp(-argg)*dj
        if(erfc>thres) {
            xs=(x<0) ? -x : x
            break
        }
    }

    # Mean and (population) standard deviation of the stored values.
    mw=w/NR
    var=w2/NR-mw^2
    if(var<0) {var=0}    # rounding on (nearly) constant data can give a tiny
                         # negative variance; sqrt of that would be NaN
    sw=sqrt(var)
    if(sw>0) {dens=1/sw/sqrt(2*3.14159265359)}   # pdf prefactor, loop invariant

    for(i=1; i<=NR; i++) {
        # With zero spread (single or constant values) every z-score is 0 and
        # nothing is an outlier; this also avoids dividing by sw==0.
        zw=(sw>0) ? (wrk[i]-mw)/sw : 0
        az=(zw<0) ? -zw : zw
        if (az > xs) {
            # az > xs >= 0 implies sw > 0 here, so dens is defined.
            # Accumulate the fitted normal density between mw-2|z|*sw and
            # mw-|z|*sw in 1000 steps as an estimate of the one-sided tail
            # probability of a deviation this large.
            x=mw-2*az*sw
            dj=az*sw/1000
            ptail=0.0
            for(j=1; j<=1000 ; j++) {
                x+=dj
                argg=(mw-x)^2/(2*sw^2)
                ptail+=dens*exp(-argg)*dj
            }
            # For very extreme outliers the density underflows to 0; report
            # "inf" instead of dividing by zero.
            if(ptail>0) {
                ntrial=sprintf("%5d", int(1/ptail))
            }
            else {
                ntrial=sprintf("%5s", "inf")
            }
            printf "#%5d zw= %5.2f xs= %5.2f w= %6.1f wm = %6.1f sig= %6.1f p =  %7.5f n = %s\n",  i,zw,xs,wrk[i],mw, sw, ptail,ntrial
        }
        else {
            print wrk[i],zw,xs
        }
    }
}
