#!/usr/local/bin/perl 

# how to use the program: pachuca-eval GOLD.utx file.gale 
# output of this program: precision recall accuracy
# instead of file.gale, you can give file.base too!

# program to accept two aligned texts and output precision, recall and
# F-measure

use utf8;

$debug = 0;

$gold = shift;
$new = shift;

open GOLD, $gold || die "Cant open file $gold";
open NEW, $new || die "Cant open file $new";

$precisionAcrossBitexts = 0;
$recallAcrossBitexts = 0;
$accuracyAcrossBitexts = 0;
$totalBitexts = 0;

$flag = 1; # to get things rolling
$lang = 0; 
while ($flag)
{
    # look for the next article starting in the gold data
    do
    {
	if ( !(defined ($line = <GOLD>))) { $flag = 0; last; }
    }   while ( !( $line =~ /<article/ ));

    if ( $flag == 0 ) { next; }

    # found the start of an article... 
    $articleTag = $line;

    # undefine a few arrays...
    if ( defined @sentenceGold0 ) { undef @sentenceGold0; }
    if ( defined @sentenceGold1 ) { undef @sentenceGold1; }
    if ( defined @sentenceMappingGold ) { undef @sentenceMappingGold; }
    
    # do the following until you get a </article>
    $inAlignment = 0;
    
    $line = "";
    while ( !( $line =~ /<\/article/ ))
    {
	$line = <GOLD>;
	if ( $line =~ /<\/article>/ ) { next; }
	if ( $line =~ /<alignment/ ) 
	{
	    # mark the start of the alignment...
	    $inAlignment = 1;

	    # and record the alignment for this sentence
	    $temp = $line;
	    $temp =~ s/^[^=]*=//g;
	    $temp =~ s/[^\d ]//g;
	    push @sentenceMappingGold, $temp;

	    $sentence = "";
	    next;
	}

	if ( $line =~ /<\/alignment/ )
	{
	    # found the end of the alignment... mark that!
	    $inAlignment = 0;

	    # clean up the sentence a bit
	    $sentence =~ s/\s+/ /g;

	    # push the sentence into the array
	    push @sentenceGold0, $sentence;
	    
	    next;
	}
	    
	if ( $inAlignment ) { $sentence .= $line; }
    }   
    

    # that completes for the one half of the translation. calculate the other half!
    do 
    {
	$line = <GOLD>;
    }   while (!($line =~ /<article/ ));
    
    # found the other article... save the sentences
    # found the start of an article... 
    $articleTag = $line;

    while (!( $line =~ /<\/article/ )) 
    {
	$line = <GOLD>;
	if ( $line =~ /<\/article>/ ) { next; }
	if ( $line =~ /<alignment/ )
	{
	    
	    # mark start of alignment...
	    $inAlignment = 1;
	    
	    $sentence = "";
	    next;
	}

	if ( $line =~ /<\/alignment/ )
	{
	    # found the end of the alignment... mark that!
	    $inAlignment = 0;

	    # clean up the sentence a bit
	    $sentence =~ s/\s+/ /g;

	    # get the sentence
	    push @sentenceGold1, $sentence;

	    next;
	}

	if ( $inAlignment ) { $sentence .= $line; }
    }   
    
    # having done all that for the gold data, do much the same thing for the other data!
	   
    do
    {
	$line = <NEW>;
    }   while (!($line =~ /<article/ ));

    # found the start of an article... 
    $articleTag = $line;

    # undefine a few arrays...
    if ( defined @sentenceNew0 ) { undef @sentenceNew0; }
    if ( defined @sentenceNew1 ) { undef @sentenceNew1; }
    if ( defined @sentenceMappingNew ) { undef @sentenceMappingNew; }
    
    # do the following until you get a </article>
    $inAlignment = 0;

    while ( !( $line =~ /<\/article/ ))
    {
	$line = <NEW>;
	if ( $line =~ /<\/article>/ ) { next; }
	if ( $line =~ /<alignment/ ) 
	{
	    # mark the start of the alignment...
	    $inAlignment = 1;

	    # and record the alignment for this sentence
	    $temp = $line;
	    $temp =~ s/^[^=]*=//g;
	    $temp =~ s/[^\d ]//g;
	    push @sentenceMappingNew, $temp;

	    $sentence = "";
	    next;
	}

	if ( $line =~ /<\/alignment/ )
	{
	    # found the end of the alignment... mark that!
	    $inAlignment = 0;

	    # clean up the sentence a bit
	    $sentence =~ s/\s+/ /g;

	    # find the lenght of $sentence, and push into array
	    push @sentenceNew0, $sentence;
	    
	    next;
	}
	    
	if ( $inAlignment ) { $sentence .= $line; }
    }   

    # that completes for the one half of the translation. calculate the other half!

    do
    {
	$line = <NEW>;
    }   while (!( $line =~ /<article/ ));
    
    # found the other article... record lines... we dont need the line mappings, 
    # since we already have them from the other article!
    $articleTag = $line;
    
    while ( !( $line =~ /<\/article/ ))
    {
	$line = <NEW>;
	if ( $line =~ /<\/article>/ ) { next; }
	if ( $line =~ /<alignment/ )
	{
	    # mark start of alignment...
	    $inAlignment = 1;

	    $sentence = "";
	    next;
	}

	if ( $line =~ /<\/alignment/ )
	{
	    # found the end of the alignment... mark that!
	    $inAlignment = 0;

	    # clean up the sentence a bit
	    $sentence =~ s/\s+/ /g;

	    # push into array
	    push @sentenceNew1, $sentence;

	    next;
	}

	if ( $inAlignment ) { $sentence .= $line; }
    }   

    # at this point we have put all the sentences of the four articles
    # (two sentences in two different languages in two different
    # files) into four arrays, viz sentenceGold[0,1] and
    # sentenceNew[0,1]

    if ( $debug == 1 )
    {
	print "The Gold E sentences:\n";
	foreach $sentence (@sentenceGold0)
	{
	    print "{$sentence}\n";
	}

	print "The Gold F sentences:\n";
	foreach $sentence (@sentenceGold1)
	{
	    print "{$sentence}\n";
	}

	print "The New E sentences:\n";
	foreach $sentence (@sentenceNew0)
	{
	    print "{$sentence}\n";
	}

	print "The New F sentences:\n";
	foreach $sentence (@sentenceNew1)
	{
	    print "{$sentence}\n";
	}

	exit;
    }

    # to calculate the accuaracy we need the sample size

    my $temp1 = 0;
    foreach $sentence (@sentenceGold1)
    {
	$temp1 += getLength($sentence);
    }

    my $temp2 = 0;
    foreach $sentence (@sentenceNew1)
    {
	$temp2 += getLength($sentence);
    }

    $sampleSize = $temp1 * $temp2;

    # now starts the fun!
    $precision = 0;
    $recall = 0;
    $nextNewSentence = 0;

    for ( $i = 0; $i <= $#sentenceGold0; $i++ )
    {
	if ( !( defined ( $sentenceGold0[$i] ))) { next; }

	# the next function will look at the New0 sentences to find
	# out which New0 sentence(s) match this Gold0 sentence. having
	# found that set, this function will return to us the
	# sentences in New1 that are mapped to by the sentences in the
	# set just found above above match to.

	if ( defined @proposedSet ) { undef @proposedSet; }
	getProposedSet($sentenceGold0[$i]);

	# if proposed set is not found, carry on!
	if ( defined (@proposedSet) && $#proposedSet < 0 ) { next; }

	# now concatenate the target

	my $temp = $sentenceMappingGold[$i];
	$temp =~ s/^\s*//;
	$temp =~ s/\s*$//;
	$temp =~ s/\s+/ /g;

	if ( !( $temp =~ /\d/ ) ) { next; }
	my @mapTo = split / /, $temp;
	
	my $target = "";
	for ( $j = 0; $j <= $#mapTo; $j++ )
	{
	    $mapTo[$j] =~ s/[^\d]//g;
	    if ( defined ( $sentenceGold1[$mapTo[$j]-1] ))
	    {
		$target .= $sentenceGold1[$mapTo[$j]-1];
	    }
	}
	
	# now concatenate the proposed set 
	my $proposed = "";

	for ( $j = 0; $j <= $#proposedSet; $j++ )
	{
	    $proposed .= $sentenceNew1[$proposedSet[$j]-1];
	}

	# that gives us the two sets in the shape of two strings, $target
	# and $proposed. the following function will calculate for us the 
	# intersection. we shall add that to the intersection!
	
	# but first clean up the strings to be sent
	$target =~ s/\s+/ /g;
	$target =~ s/^\s*//g;
	$target =~ s/\s*$//g;

	$proposed =~ s/\s+/ /g;
	$proposed =~ s/^\s*//g;
	$proposed =~ s/\s*$//g;

	if ( getLength($target) == 0 || getLength($proposed) == 0 ) { next; }
	
	my $intersection = getIntersection($target, $proposed);
	if ( $intersection == 0 ) { next; }

	my $proposedLength = getLength($proposed);
	my $targetLength = getLength($target);

	$precision += $intersection / $proposedLength;
	$recall += $intersection / $targetLength;

	# finally the accuracy.
	$accuracy = ($sampleSize - $targetLength - $proposedLength + 2 * $intersection) / $sampleSize;

	if ( $debug == 2 )
	{
	    print "Precision = $precision\n";
	    print "Recall = $recall\n";
	}
    }

    $denom = $#sentenceGold0;

    # divide the prec/recall/accuracy values by denominator
    $precision /= $denom;
    $recall /= $denom;
    # $accuracy /= $denom;
    
    $precisionAcrossBitexts += $precision;
    $recallAcrossBitexts += $recall;
    $accuracyAcrossBitexts += $accuracy;
    $totalBitexts++;
}

# final values of precision, recall and accuracy
$precisionAcrossBitexts /= $totalBitexts;
$recallAcrossBitexts /= $totalBitexts;
$accuracyAcrossBitexts /= $totalBitexts;

if ( $precisionAcrossBitexts > 1 ) { $precisionAcrossBitexts = 1; }
if ( $recallAcrossBitexts > 1 ) { $recallAcrossBitexts = 1; }
if ( $accuracyAcrossBitexts > 1 ) { $accuracyAcrossBitexts = 1; } 

print "$precisionAcrossBitexts $recallAcrossBitexts $accuracyAcrossBitexts\n";

# End of program!

# function to get the proposed set
sub getProposedSet
{
    my $goldE = shift;

    # clean up the sentence a bit
    $goldE =~ s/\s+/ /g;
    $goldE =~ s/^\s*//g;
    $goldE =~ s/\s*$//g;

    my @source = (); # variable to capture the e lines in new that match gold
    
    # back up the nextNewSentence variable
    my $backUp = $nextNewSentence;

    # our gold sentence will become our regex... so escape it!
    $regex = $goldE;
    $regex = escape($regex);

    my $newE = ""; # variable to contain the continuously growing new e sentence
    my $foundFlag = 0;
    my $temp = "";

    while ( $foundFlag == 0 )
    {
	$newE .= $sentenceNew0[$nextNewSentence];
	push @source, $nextNewSentence;

	$temp = $newE;

	# clean up
	$temp =~ s/\s+/ /g;
	$temp =~ s/^\s*//g;
	$temp =~ s/\s*$//g;
	
	if ( $temp =~ /$regex/ )
	{
	    # found!
	    $foundFlag = 1;
	    
	    # if we are completely using up this line, then start increment nextNewSentence
	    if ( $temp =~ /$regex$/ ) { $nextNewSentence++; }
	}
	else
	{
	    $nextNewSentence++;
	    if ( $nextNewSentence > $#sentenceNew0 )
	    {
		# opps! thats it then!
		last;
	    }
	}
    }

    if ( $foundFlag )
    {
	# now to get the mappings from @source

	@tempSet = ();
	foreach $src (@source)
	{
	    my $map = $sentenceMappingNew[$src];
	    $map =~ s/^\s*//;
	    $map =~ s/\s*$//;
	    $map =~ s/\s+/ /g;

	    if ( !( $map =~ /\d/ ) ) { next; }
	    my @mapTo = split / /, $map;
	
	    for ( $j = 0; $j <= $#mapTo; $j++ )
	    {
		$mapTo[$j] =~ s/[^\d]//g;
		push @tempSet, $mapTo[$j];
	    }
	}

	# having got all that, remove the repeated mapping numbers

	push @proposedSet, $tempSet[0];
	for ( my $i = 1; $i <= $#tempSet; $i++ )
	{
	    if ( $tempSet[$i] < $tempSet[$i-1] ) { last; }
	    if ( $tempSet[$i] == $tempSet[$i-1] ) { next; }
	    push @proposedSet, $tempSet[$i];
	}

	if ( $debug == 2 )
	{
	    print "Found!!\n";
	    print "English sentence in gold: {$goldE}\n";
	    print "English sentence in new : {$temp}\n";
	    print "The proposed set: @proposedSet\n";
	}
    }
    
    else 
    { 
	$nextNewSentence = $backUp;
	@proposedSet;
    }
}

sub getIntersection
{
    # get the parameter strings
    my $param1 = shift;
    my $param2 = shift;
    
    if ( $debug == 2 )
    {
	print "Param 1: {$param1}\n";
	print "Param 2: {$param2}\n";
    }

    # sort the params to make param2 the shorter one keep a flag to
    # decide if we have swapped or not... necessary to decide whether
    # our division is giving us precision or accuracy :-)
    my $swapFlag = 0;
    if ( getLength($param2) > getLength($param1) ) 
    {
	$swapFlag = 1;
	my $temp = $param1;
	$param1 = $param2;
	$param2 = $temp;
    }

    my $inter = 0; # variable to find out how many characters match

    # check if param2 is completely inside param2... that is the best
    # possible situation!    

    # first escape the necessary characters
    $string = $param2;
    $string = escape($string);

    if ( $param1 =~ /$string/ )
    {
	$inter = getLength($param2)
    }

    # now we havent got that, so start checking on each end!
    else
    {
	my $rightPart = $param2;
	my $leftPart = "";

	while ( getLength($rightPart)>1 )
	{
	    $rightPart =~ s/.$//;
	    $leftPart = $& . $leftPart;

	    # escape the special characters if any
	    $string = $leftPart;
	    $string = escape($string);

	    if ( $param1 =~ /^$string/ && getLength($leftPart) > $inter)
	    {
		$inter = getLength($leftPart);
	    }

	    # escape the special characters if any
	    $string = $rightPart;
	    $string = escape($string);

	    if ( $param1 =~ /$string$/ && getLength($rightPart) > $inter)
	    {
		$inter = getLength($rightPart);
	    }
	}
    }

    if ( $debug == 2 ) { print "Intersection = $inter\n"; }
    return $inter;
}

sub getLength
{
    my $sentence = shift;

    if ( $articleTag =~ /chinese/ ) 
    {
	my $index = 0;

	while ( $sentence =~ /\p{IsPrint}/g )
	{ 
	    $index++;
	}
	
	return $index;
    }
    else { return(length($sentence)); }
}

sub escape 
{
    my $param = shift;
    my @temp = split //, $param;
    my @new = ();

    foreach $item (@temp)
    {
	$item =~ s/([\/\|\(\)\[\]\{\}\^\$\*\+\?\.])/\\$1/g;
	push @new, $item; 
    }

    my $new = join "", @new;
    return $new;
}
