#!/bin/csh

# output bigram features based on bsp

# Validate the command line: exactly eight positional arguments are required.

if (${#argv} != 8) then
   echo "Usage: coll2 task freq window stat score rank stoplist token"
   exit 1
endif

# Positional arguments, in command-line order.  Argument 1 (task) is not
# copied into a variable; the rest of the script refers to it as $1.

set freq = $2        # frequency cut-off (exclude bigrams < freq)
set window = $3      # window size for bigram counting (2 is default)
set stat = $4        # statistic to be computed ($stat.pm)
set score = $5       # cutoff value for statistic:
                     #   ll and mi use "raw" score
                     #   leftFisher uses p-value
set rank = $6        # value handed to statistic.pl via --rank
set stoplist = $7    # stop list file, handed to count.pl via --stop
set token = $8       # token file, handed to count.pl and bsp2regex.pl via --token

# All files produced here share the prefix "<task>-training.coll2".
# The temporary statistic.pl output path is derived once so that the file
# that is created and the file that is removed cannot drift apart.

set prefix = ${1}-training.coll2
set stattmp = ${prefix}.${stat}.tmp

# clean up old files (including a temporary file left by an earlier run)

rm -f ${prefix}.cnt
rm -f ${prefix}.${stat}
rm -f ${prefix}.hist
rm -f ${prefix}.regex
rm -f ${stattmp}

# newLine - don't count bigrams across end of line - needed for senseval

count.pl --extended --newLine --window $window --histogram ${prefix}.hist --token $token --stop $stoplist ${prefix}.cnt ${1}-training.count

# Run the chosen statistic over the counts file; the raw result goes to the
# temporary file and is filtered below.

statistic.pl --extended --rank $rank --frequency $freq --score $score ${stat}.pm ${stattmp} ${prefix}.cnt

# Keep only the lines containing "@count", "@statistic" and "<head>",
# grouped in that order, in the final statistic file.

grep "@count" ${stattmp} > ${prefix}.${stat}
grep "@statistic" ${stattmp} >> ${prefix}.${stat}
grep "<head>" ${stattmp} >> ${prefix}.${stat}

# Remove the temporary file.  The previous version removed
# $1-training.$stat.tmp (missing ".coll2"), so the real file was never deleted.

rm -f ${stattmp}

# Convert the filtered statistic file to regular expressions.

bsp2regex.pl ${prefix}.${stat} --token $token > ${prefix}.regex

echo "coll2 finished $1"
