#!/usr/local/bin/perl 

# CS 8995 - Corpus Based Natural Language Processing
#
# Spring 2001
#
# Dr.Ted Pedersen

####################

# Team: Unam
#
#       Steve Holtz
#       Kiranmai Kodulkula
#       Krishna Kotnana

####################

# Usage Notes
# unam-eval.pl gold.utx aligned.utx
#
# Outputs precision, recall, accuracy to STDOUT in the same order.

####################


use utf8;

# Merge any number of numeric values into one ascending, duplicate-free list.
#
# Arguments: a flat list of numbers.  Callers in this file pass two boundary
#            arrays, which Perl flattens into a single argument list.
# Returns:   the distinct values in ascending numeric order.  With no
#            arguments the result is an empty list (not a list holding undef).
sub merge {
    my @sorted = sort { $a <=> $b } @_;

    my @unique;
    for my $value (@sorted) {
        # The input is sorted, so equal values are always adjacent: a value is
        # new exactly when it differs from the last one kept.
        push @unique, $value if !@unique || $value != $unique[-1];
    }
    return @unique;
}



# Find which interval of a boundary list contains a given position.
#
# Arguments: $var        - the position to look up
#            @givenarray - boundary values (the callers in this file pass the
#                          ascending output of merge())
# Returns:   the index $i >= 1 for which
#                $givenarray[$i-1] < $var <= $givenarray[$i]
#            If several intervals satisfy the test, the highest index is
#            returned.  Returns undef when no interval contains $var.
sub findpos {
    my ( $var, @givenarray ) = @_;
    my $pos;

    # The counter is lexical so that a call never disturbs a package-level $i
    # belonging to the caller.
    for my $i ( 1 .. $#givenarray ) {
        if ( $var > $givenarray[ $i - 1 ] && $var <= $givenarray[$i] ) {
            $pos = $i;
        }
    }
    return $pos;
}



# Map every interval of one boundary list onto the intervals of a second
# boundary list (in this file: a segmentation onto the merged segmentation).
#
# Arguments (one flat list):
#   $len1   - how many of the following values belong to the first list
#   $len2   - accepted for compatibility with existing callers; not used,
#             because the second list is simply everything that remains
#   ...     - $len1 boundary values of the first list, followed by the
#             boundary values of the second list
#
# Returns: a hash (as a flat list).  Key $i (1-based) names the interval
#          between first-list boundaries $i-1 and $i; the value is the string
#          of "@"-terminated second-list interval indices that cover it.
#          Example: first list (0, 10, 25), second list (0, 4, 10, 25)
#          gives ( 1 => "1@2@", 2 => "3@" ).
#          An interval that cannot be located in the second list (for
#          instance a zero-length one) gets no key at all.
sub formmaphash {
    my ( $len1, $len2, @rest ) = @_;

    my @array1 = splice( @rest, 0, $len1 );
    my @array2 = @rest;
    my %hash;

    for my $i ( 1 .. $#array1 ) {
        # First and last second-list interval touched by first-list interval $i.
        my $first = findpos( $array1[ $i - 1 ] + 1, @array2 );
        my $last  = findpos( $array1[$i],           @array2 );

        # findpos() yields undef when the position lies outside every
        # interval; counting up from undef would invent a bogus mapping.
        next if !defined $first || !defined $last;
        next if $first > $last;

        $hash{$i} = join q{}, map { $_ . '@' } $first .. $last;
    }
    return %hash;
}

# Read a file and append the articles it contains to two parallel package
# arrays.
#
# The whole file is read, every newline is removed, and the body of each
# <article ...>...</article> element is extracted in document order.  The
# bodies are then taken two at a time: the first of each pair is appended to
# @lang1array and the second to @lang2array.  If the file holds an odd number
# of articles, the final value appended to @lang2array is undef.
#
# Arguments: $_[0] - path of the file to read
# Returns:   1
# Dies:      when the file cannot be opened
sub textarray {
    my ($filename) = @_;

    # Three-argument open: the file name is never interpreted as a mode, and
    # a missing file is reported instead of being treated as empty.
    open( my $fh, '<', $filename )
        or die "cannot open '$filename': $!\n";
    my $file = do { local $/; <$fh> };
    close($fh);

    $file = '' if !defined $file;
    $file =~ s/\n+//g;

    # One capture group under /g in list context: the list of article bodies.
    my @articles = $file =~ /\<article.*?>(.*?)\<\/article>/g;

    for ( my $idx = 0; $idx < @articles; $idx += 2 ) {
        push @lang1array, $articles[$idx];
        push @lang2array, $articles[ $idx + 1 ];
    }
    return 1;
}

# Compute the cumulative character offsets at which the aligned segments of a
# text end.
#
# The text is cut at every "</alignment ... <alignment ...>" seam; whatever
# precedes the first opening tag and whatever follows the last closing tag is
# discarded.  Each remaining piece is stripped of leading and trailing
# whitespace, and its length (characters other than newline) is added to a
# running total.
#
# Arguments: $_[0] - the text of one article
# Returns:   a list starting with 0 followed by the running total after each
#            piece, e.g. (0, 11, 14) for pieces of 11 and 3 characters.
sub arraysplit {
    my $file = $_[0];

    my @strings = split( /\<\/alignment.*?\<alignment.*?>/, $file );

    # Drop the leading "...<alignment ...>" from the first piece, then the
    # trailing "</alignment..." from the last one (in that order).
    $strings[0]         =~ s/.*?\<alignment.*?>//;
    $strings[$#strings] =~ s/\<\/alignment.*//;

    my @lenarray = (0);
    my $pos      = 0;
    for my $element (@strings) {
        $element =~ s/\s+$//;
        $element =~ s/^\s+//;

        # Count every character that is not a newline.
        $pos += ( $element =~ tr/\n//c );
        push @lenarray, $pos;
    }
    return @lenarray;
}


# Expand the <alignment SOURCE=TARGETS> tags of a text into alignments between
# mapped segment indices.
#
# For every tag, SOURCE is looked up in %$hash1 and each whitespace-separated
# entry of TARGETS is looked up in %$hash2.  The looked-up values are
# "@"-separated index lists (the format produced by formmaphash).  Every
# source index is then paired with every target index.
#
# Arguments: $_[0] - text containing the <alignment ...> tags
#            $_[1] - hash ref: source id => "@"-separated index list
#            $_[2] - hash ref: target id => "@"-separated index list
# Returns:   a hash (as a flat list) whose keys are mapped source indices and
#            whose values are "@"-terminated strings of the mapped target
#            indices aligned with them, e.g. ( 1 => "1@2@3@" ).
sub findalign {
    my ( $file, $hash1, $hash2 ) = @_;
    my %hash;

    while ( $file =~ /\<alignment(.*?)=(.*?)>/g ) {
        my ( $source, $targets ) = ( $1, $2 );

        # Trim both ends so that "1=2" and "1 = 2" produce the same key.
        $source =~ s/^\s+//;
        $source =~ s/\s+$//;

        my $source_map = $hash1->{$source};
        my @array1 = defined $source_map ? split( /@/, $source_map ) : ();

        # Collect the mapped indices of every target.  split ' ' skips leading
        # whitespace, so no empty first field is produced, and nothing is
        # shifted off afterwards, so no real index is lost when an alignment
        # names more than one target.
        my @array2;
        for my $target ( split ' ', $targets ) {
            my $target_map = $hash2->{$target};
            push @array2, split( /@/, $target_map ) if defined $target_map;
        }

        for my $val1 (@array1) {
            for my $val2 (@array2) {
                $hash{$val1} .= $val2 . '@';
            }
        }
    }
    return %hash;
}
    


# Evaluation Program
#
# Reads the gold file and the system-aligned file, compares the k-th article
# pair of one with the k-th article pair of the other, and prints three
# figures averaged over all article pairs: "precision recall accuracy".

die "usage $0 gold.utx aligned.utx\n" if ( $#ARGV != 1 );

# textarray() appends to the package arrays @lang1array / @lang2array, so the
# gold articles sit at the front of each array and the system articles follow.
textarray( $ARGV[0] );
my $goldcount = @lang1array;
textarray( $ARGV[1] );
my $syscount = @lang1array - $goldcount;

die "no <article> pairs found in $ARGV[0]\n" if $goldcount == 0;
die "$ARGV[0] has $goldcount article pair(s) but $ARGV[1] has $syscount\n"
    if $goldcount != $syscount;

my ( $precision, $recall, $accuracy, $indexcount ) = ( 0, 0, 0, 0 );

for my $pair ( 0 .. $goldcount - 1 ) {

    # Gold article k is compared with system article k.
    my $gfile1 = $lang1array[$pair];
    my $gfile2 = $lang2array[$pair];
    my $ofile1 = $lang1array[ $goldcount + $pair ];
    my $ofile2 = $lang2array[ $goldcount + $pair ];

    # Segment boundaries of each text in each of the two segmentations.
    my @gfile1array = arraysplit($gfile1);
    my @gfile2array = arraysplit($gfile2);
    my @ofile1array = arraysplit($ofile1);
    my @ofile2array = arraysplit($ofile2);

    # Union of the gold and system boundaries, per language.
    my @file1array = merge( @gfile1array, @ofile1array );
    my @file2array = merge( @gfile2array, @ofile2array );

    # Express every original segment as a run of merged segments.
    my %gfile1hash = formmaphash( scalar @gfile1array, scalar @file1array,
        @gfile1array, @file1array );
    my %ofile1hash = formmaphash( scalar @ofile1array, scalar @file1array,
        @ofile1array, @file1array );
    my %gfile2hash = formmaphash( scalar @gfile2array, scalar @file2array,
        @gfile2array, @file2array );
    my %ofile2hash = formmaphash( scalar @ofile2array, scalar @file2array,
        @ofile2array, @file2array );

    # Alignments between merged segments: gold and system.
    my %ghash = findalign( $gfile1, \%gfile1hash, \%gfile2hash );
    my %ohash = findalign( $ofile1, \%ofile1hash, \%ofile2hash );

    # All tallies start from zero for every article pair; nothing may carry
    # over from the previous pair.
    my ( $tp, $fptp, $fntp ) = ( 0, 0, 0 );
    my ( $count1, $count2 )  = ( 0, 0 );
    my ( $inter, $denom )    = ( 0, 0 );

    # Weight of the gold alignments ($denom) and of the part of them that the
    # system also proposes ($inter).  Each aligned pair of merged segments is
    # weighted by the product of the two segment lengths.
    foreach my $key ( keys %ghash ) {
        my @array1 = split( /@/, $ghash{$key} );
        my @array2 = split( /@/, $ohash{$key} );

        my $len1 = $file1array[$key] - $file1array[ $key - 1 ];
        foreach my $val1 (@array1) {
            my $len2 = $file2array[$val1] - $file2array[ $val1 - 1 ];
            $fptp++;
            foreach my $val2 (@array2) {
                if ( $val1 == $val2 ) {
                    $inter += $len1 * $len2;
                    $tp++;
                }
            }
            $denom += $len1 * $len2;
        }
    }

    # NOTE(review): this ratio divides by the weight of the GOLD alignments,
    # which is usually called recall; the labelling is kept as it was so the
    # output order does not change - confirm the intended definition.
    # A zero denominator (no alignments at all) scores 0 instead of aborting
    # the run with "Illegal division by zero".
    $precision += $denom ? $inter / $denom : 0;

    # Weight of the system alignments.
    # NOTE(review): only source segments that also occur in the gold
    # alignments are visited here; system-only source segments are not
    # counted - confirm whether this should iterate over %ohash instead.
    $denom = 0;
    foreach my $key ( keys %ghash ) {
        my @array1 = split( /@/, $ohash{$key} );

        my $len1 = $file1array[$key] - $file1array[ $key - 1 ];
        $count1++;
        foreach my $val1 (@array1) {
            my $len2 = $file2array[$val1] - $file2array[ $val1 - 1 ];
            $fntp++;
            $count2++;
            $denom += $len1 * $len2;
        }
    }
    $recall += $denom ? $inter / $denom : 0;

    my $fptpfn = $fptp + $fntp - $tp;
    my $univ   = $count1 * $count2;
    $accuracy += $univ ? ( $univ - $fptpfn + $tp ) / $univ : 0;
    $indexcount++;
}

# $indexcount is at least 1 here: an input without article pairs was rejected
# before the loop.
$precision /= $indexcount;
$recall    /= $indexcount;
$accuracy  /= $indexcount;

print "$precision $recall $accuracy\n";



