#!/bin/csh

#######################################################################
#This system takes a supervised learning approach to word sense
#disambiguation, where three Naive Bayesian classifiers are induced from
#sense-tagged training examples. A weighted vote is taken among these to
#assign senses to test examples. No information from WordNet is utilized
#by this system. 

#Each Naive Bayesian classifier is based on a different set of features
#that are identified in a filtering step prior to learning. 

#The first feature set is based on bigrams (two word sequences) that meet
#the following criteria: 

#1) occur 2 or more times and 
#2) have a log-likelihood ratio >= 6.635 (i.e., p=.01) and 
#3) are not made up of stop-listed words. 

#The second feature set is based on unigrams (one word sequences) that
#meet the following criteria: 

#1) occur 5 or more times and 
#2) are not found on the stop-list. 

#The third feature set is based on bigrams that may include one
#intervening word that is ignored and that meet the following criteria: 

#1) occur 2 or more times and 
#2) have a log-likelihood ratio >= 2.706 (i.e., p=.1) and 
#3) are not made up of stop-listed words and 
#4) include the word to be disambiguated 

#A Naive Bayesian classifier is learned based on each feature set. When
#presented with a test example, each classifier assigns a probability to
#each possible sense. These probabilities are summed and the sense with
#the largest value is assigned. 

#This is loosely based on the NAACL-00 paper "A Simple Approach to
#Building Ensembles of Naive Bayesian Classifiers for Word Sense 
#Disambiguation" by Ted Pedersen. 

#This is the same approach as taken in duluth1 for English. The only
#difference is in the stop list. 
#######################################################################


# Exactly three arguments are required (source-dir, stop-list, token).
# With any other count, print the usage text and exit with status 1.
# The usage text contains no variable or command substitutions, so the
# here-document below is emitted exactly as written.
if ($#argv != 3) then
   cat << END_OF_USAGE
duluth6 source-dir stoplist token

source-dir : directory where data resides
in this distribution source-dir is LexSample

stop-list: text file of stop words
in this distribution stop-list is stop.list

token : text file of token definitions
in this distribution token is token1.txt

run from directory where source-dir, stop-list, and token reside

if you don't want to use stop.list and/or token just create
a blank file via echo > dummy and use that instead
END_OF_USAGE
   exit 1
endif

# Capture the three command-line arguments.
#   sourcedir : data directory (argument 1), used exactly as given
#   stoplist  : stop-word file (argument 2), converted to a full path
#   token     : token-definition file (argument 3), converted to a full path
#
# The stoplist and token files need full path names because the script
# later changes into per-word working directories before passing these
# names to the feature extraction programs. An argument that is already
# absolute is kept as is; a relative one is anchored at $cwd, the
# current-directory variable maintained by the shell itself (rather than
# relying on a PWD environment variable inherited from the caller).

set sourcedir = "$1"

if ("$2" =~ /*) then
	set stoplist = "$2"
else
	set stoplist = "$cwd/$2"
endif

if ("$3" =~ /*) then
	set token = "$3"
else
	set token = "$cwd/$3"
endif

# Check that every input exists; on the first missing one, report it and
# exit with status 1.

if (! -e "$sourcedir") then
	echo "$sourcedir sourcedir does not exist"
	exit 1
endif

if (! -e "$stoplist") then
	echo "$stoplist stoplist does not exist"
	exit 1
endif

if (! -e "$token") then
	echo "$token token does not exist"
	exit 1
endif

# the methods are feature extraction routines that build views of the
# text for the machine learning system weka.
#
# For each method a result directory named after the method is created
# in the starting directory. Every entry of the source directory (one
# entry per word) is copied into it and then processed there by:
#   <method> <word> <stoplist> <token>   feature extraction
#   xml2arff <word>                      conversion to weka's arff format
#   wekarun <word> NaiveBayes -o         learning and tagging of test data
#   score-word <word> NaiveBayes         senseval 2 scoring

# Remember where we started so that every "return to the top" below is
# an explicit cd to this directory. This keeps the script correct when
# sourcedir is a nested, absolute, or symlinked path, where "cd .."
# would not lead back here.
set topdir = "$cwd"

set methods = (f0 g2 f3)

# Refuse to start if any result directory is already present. Checking
# all of them up front avoids aborting halfway through, after some
# methods have already been fully processed.
foreach method ($methods)
	if (-e $method) then
		echo $method already exists, aborting
		exit 1
	endif
end

# step into source directory and find out the names of all the files
# to be processed. Each file is named after a word to be processed.
# The list is the same for every method, so it is read only once.
cd "$sourcedir"
set wordlist = (*)
cd "$topdir"

foreach method ($methods)

	# the results for each method are contained in a directory,
	# which has the same name as the method

	mkdir $method

	# now process each of those words

	foreach word ($wordlist)

		# copy the text for a word into the method's directory
		# and step into the copy

		cp -r "$sourcedir/$word" $method
		cd $method/$word

		# now process that text with the desired method

		$method $word $stoplist $token

		# convert the text into a form that weka likes (arff)

		xml2arff $word

		# now run weka to do machine learning and tag the test data

		wekarun $word NaiveBayes '-o'

		# score your results with senseval 2 scoring program
		# note that weka will provide some diagnostic output too

		score-word $word NaiveBayes

		# get out of this word directory and move to the next!

		cd "$topdir"

	end

end

# duluth6 is an ensemble of three naive bayesian classifiers, one per
# view of the data: f0, g2 and f3. ensembleByDist.pl is handed the three
# view names and combines them based on the distribution; the ensemble
# result is then scored.
# NOTE(review): the scored file is named ens-f0f2f3 although the middle
# view is g2 -- confirm this is the name ensembleByDist.pl writes.

set views = (f0 g2 f3)
ensembleByDist.pl $views

score ens-f0f2f3.NaiveBayes

