#!/bin/csh

# output bigram features based on bsp

# Require exactly eight arguments; with any other count, print the usage
# line and stop with exit status 1.
switch ($#argv)
case 8:
   breaksw
default:
   echo "Usage: gram2 task freq window stat score rank stoplist token"
   exit 1
endsw

# parameters for bsp and sensetools, bound in positional order.
# $1 (task) is not copied into a variable; it is used directly as the
# prefix of every file name in this script.

# freq: frequency cut-off (bigrams occurring fewer than freq times are
# excluded).
#
# A frequency cut-off means different things to count and to statistic:
#   - with count it changes the counting process and so affects the
#     computed statistics: any bigram that occurs fewer than freq times
#     is removed, the total bigram count is adjusted downward, and the
#     counts of the first and second word of each removed bigram are
#     reduced as well. Both the individual word counts and the total
#     bigram sample size shrink, which changes the value of every
#     statistic computed afterwards.
#   - with statistic it does not change the computation of the statistic;
#     bigrams that occur fewer than freq times are simply not displayed.
# Here the cut-off is used with statistic only, not with count.
set freq = $2

set window = $3            # window size for bigram counting (2 is default)
set stat = $4              # statistic to be computed ($stat.pm)

# score: cut-off value for the statistic
#   ll and mi use the "raw" score
#   leftFisher uses a p-value
set score = $5

# rank: used with statistic to display only the top ranked bigrams
# according to the statistic
set rank = $6

set stoplist = $7          # value given to count.pl as --stop
set token = $8             # value given to count.pl and bsp2regex.pl as --token

# Remove the outputs of any earlier run for this task before they are
# regenerated. With -f, rm stays quiet about files that do not exist.
rm -f $1-training.gram2.cnt \
      $1-training.gram2.$stat \
      $1-training.gram2.hist \
      $1-training.gram2.regex

# Build the bigram features in three steps. Each step reads the file that
# the previous step was told to write, so the script stops with exit
# status 1 at the first failing step rather than running the remaining
# steps on missing input and then reporting success.

# Step 1: count bigrams in $1-training.count, writing $1-training.gram2.cnt
# and the histogram file.
#   --newLine: do not count bigrams across end of line (needed for senseval)
#   No frequency cut-off is passed here; it is applied by statistic.pl only.
count.pl --extended --newLine --window $window \
    --histogram $1-training.gram2.hist \
    --token $token --stop $stoplist \
    $1-training.gram2.cnt $1-training.count
if ($status != 0) then
    echo "gram2: count.pl failed for $1"
    exit 1
endif

# Step 2: run $stat.pm over the counts, limited by the rank, frequency and
# score cut-offs. $rank holds the sixth argument.
statistic.pl --extended --rank $rank --frequency $freq --score $score \
    $stat.pm $1-training.gram2.$stat $1-training.gram2.cnt
if ($status != 0) then
    echo "gram2: statistic.pl failed for $1"
    exit 1
endif

# Step 3: run bsp2regex.pl on the statistic output and save what it prints
# as the .regex file.
bsp2regex.pl $1-training.gram2.$stat --token $token > $1-training.gram2.regex
if ($status != 0) then
    echo "gram2: bsp2regex.pl failed for $1"
    exit 1
endif

echo "gram2 finished $1"
