#!/bin/csh

# split an xml file into per-word files and store the files 
# for each word in a separate directory 
# this script is specific to the english data
# the xml file should be in the senseval-2 format

# usage: split data token
#   data  - basename of the input file; ".xml" is appended to it
#   token - token definition file handed to preprocess.pl
if ($#argv != 2) then 
   echo "split data token"
   echo ".xml suffix is assumed, token definition file should be in pwd"
   exit 1
endif

set inputxml = "${1}.xml"
set token = "$2"

# stop early when an input is missing: the rest of the script rewrites
# the xml file in place and removes/recreates the per-word directories,
# so it must not run on a mistyped file name
if (! -r "$inputxml") then
   echo "cannot read input file $inputxml"
   exit 1
endif

if (! -r "$token") then
   echo "cannot read token definition file $token"
   exit 1
endif


# remove capitalization

# lower-case the whole file (markup included) into a scratch copy and
# only replace the original once tr has succeeded, so a failed run
# cannot leave a truncated xml file behind.
# the brackets are kept on purpose: System V tr needs them for ranges,
# and POSIX tr simply maps '[' and ']' to themselves
tr '[A-Z]' '[a-z]' < $inputxml > $inputxml.lc
if ($status != 0) then
   echo "could not lower-case $inputxml"
   rm -f $inputxml.lc
   exit 1
endif

mv $inputxml.lc $inputxml

# remove spurious punctuations, hard to handle stuff 
# clean.pl $inputxml > $inputxml.tmp
# mv $inputxml.tmp $inputxml

# if we are running preprocess without the --removeNonTokens
# option, then we will want to use the token definition file
# token4.txt -- simply the regex /\S+/, meaning that any string that is 
# not made up of spaces is a token. we do not want to put any <>'s
# around non-tokens as yet, since we may want to run preprocess
# again to split the xml data into test and training files
#
# set token=/home/cs/tpederse/BSP/token4.txt
# preprocess.pl -token $token $inputxml

# if we are using --removeNonTokens, then we want to use
# a normal token definition file since we will not have
# a second round of preprocessing, so we want to identify
# all of the tokens on the first pass
#
# set token=/home/cs/tpederse/BSP/token1.txt

# token1.txt contains:
#/<head.*>\w+<\/head>/
#/<sat.*>\w+<\/sat>/
#/(\w+)?(&(\w+);)+(\w+)?/
#/&bquo;\w+&equo;/    
#/\w+\.[nva]/
#/\w+/   

# tokenize the lower-cased xml with the given token definitions,
# dropping everything that is not a token. the per-word .xml and .count
# files handled below are expected to come from this step
preprocess.pl -removeNonTokens -token $token $inputxml

# do not go on to remove and recreate the per-word directories when
# preprocessing did not complete
if ($status != 0) then
   echo "preprocess.pl failed on $inputxml"
   exit 1
endif

# now create a directory for each word, and move the xml and count 
# files into that directory

## spanish senseval2 data
##

##foreach word ( actuar apoyar claro  gracia organo  tocar apuntar clavar grano partido autoridad  conducir  hermano pasaje  tratar  bomba  copiar local  popular  usar    brillante corazon masa programa   vencer   canal  corona natural saltar  verde  ciego  coronar   naturaleza  simple vital circuito  explotar  operacion   tabla )
                           
## senseval2 data
##

foreach word ( art.n authority.n bar.n begin.v blind.a bum.n call.v carry.v chair.n channel.n child.n church.n circuit.n collaborate.v colourless.a cool.a day.n detention.n develop.v draw.v dress.v  drift.v drive.v dyke.n face.v facility.n faithful.a fatigue.n feeling.n ferret.v find.v fine.a fit.a free.a graceful.a  green.a grip.n hearth.n holiday.n keep.v lady.n leave.v  live.v local.a match.v material.n mouth.n nation.n natural.a  nature.n oblique.a play.v post.n pull.v replace.v restraint.n  see.v sense.n serve.v simple.a solemn.a spade.n stress.n strike.v train.v treat.v turn.v use.v vital.a wander.v wash.v work.v yew.n )

## senseval1 data
##

##foreach word ( accident-n consume-v  scrap-n amaze-v invade-v scrap-v band-p derive-v knee-n  seize-v  behaviour-n  modest-a shake-p bet-n excess-n    onion-n    shirt-n  bet-v   float-n  promise-n  slight-a  bitter-p    float-v   promise-v  bother-v  floating-a  wooden-a  brilliant-a generous-a  sack-n  bury-v   giant-a     sack-v   calculate-v 	 giant-n   sanction-p )

        # the per-word files are looked up in pwd. when there is no xml
        # file for this word, leave any directory from an earlier run
        # alone instead of wiping it and putting an empty one in its place
        if (! -e ${word}.xml) then
                echo "no ${word}.xml found, skipping $word"
                continue
        endif

        # start from a fresh directory for this word
        rm -fr $word
        mkdir $word

        # :t keeps only the file-name part, so a data argument given
        # with a directory prefix still yields a valid target name
        mv ${word}.xml $word/${word}-${inputxml:t}
        if (-e ${word}.count) mv ${word}.count $word/${word}-${1:t}.count
end

echo "split finished"
