#!/usr/local/bin/perl -w

# Program that accepts two aligned texts and outputs a combined precision/recall value.
# Arguments: the gold alignment (Ar) first, then the alignment to score (A); see the Arcade sentence track metrics.

use utf8;

# Command line: the gold-standard alignment file first, then the alignment
# file that is to be scored against it.
$gold = shift;
$new = shift;

# Refuse to run without both file names; a three-argument open() given an
# undefined name would not fail in the way one might expect.
if ( !defined $gold || !defined $new )
{
    die "Usage: $0 gold-file new-file\n";
}

# 'or' rather than '||': with '||' the die was attached to the file name
# (which is practically always true), so a failed open went unnoticed.
# The explicit '<' mode keeps characters in the file name from being
# interpreted as an open mode.  The bareword handles GOLD and NEW are kept
# because the rest of the script reads from them.
open( GOLD, '<', $gold ) or die "Cant open file $gold: $!";
open( NEW, '<', $new ) or die "Cant open file $new: $!";

$flag = 1;         # stays true while the gold file still has articles to score
$v = 0;            # set to 1 to print ModAr/ModA/ModInter for every article pair
$numberOfF = 0;    # number of article pairs scored so far
$totalF = 0;       # running sum of the per-pair F values

# Main loop: each iteration scores one article pair.
#
# The gold file and the new file are both expected to hold, per pair, two
# consecutive <article> ... </article> sections: the first carries the
# <alignment ...> tags whose attribute lists the sentence numbers of the second
# article that the sentence is aligned with; the second only contributes the
# sentence lengths.  For each pair the loop computes
#     ModAr    - alignment mass of the gold data,
#     ModA     - alignment mass of the new data,
#     ModInter - mass of the links present in both,
# and from those recall, precision and F, which is added to $totalF.
while ($flag)
{
    # Look for the next article in the gold data; when there is none left,
    # every pair has been scored.
    if ( !_skipToArticle( \*GOLD ) ) { $flag = 0; last; }

    # First gold article: sentence lengths and the sentence mappings.
    my ( $goldLen0, $goldMap ) = _readArticle( \*GOLD, "gold" );

    # Second gold article: only the lengths are needed, the mappings were
    # already taken from the first article.
    _skipToArticle( \*GOLD )
        or die "Unexpected end of gold file: second article of a pair is missing\n";
    my ($goldLen1) = _readArticle( \*GOLD, "gold" );

    # Same two steps for the data that is being scored.
    _skipToArticle( \*NEW )
        or die "Unexpected end of new file: first article of a pair is missing\n";
    my ( $newLen0, $newMap ) = _readArticle( \*NEW, "new" );

    _skipToArticle( \*NEW )
        or die "Unexpected end of new file: second article of a pair is missing\n";
    my ($newLen1) = _readArticle( \*NEW, "new" );

    # calculate mod ar and mod a
    my $modAr = _alignmentMass( $goldLen0, $goldLen1, $goldMap );
    my $modA  = _alignmentMass( $newLen0,  $newLen1,  $newMap );

    # Now find the intersection: links that appear in both the gold and the
    # new mapping of the same sentence, weighted with the gold lengths.
    my $modInter = 0;
    for my $i ( 0 .. $#{$goldMap} )
    {
        last if $i > $#{$newMap};
        next if !defined $goldLen0->[$i];

        # Set of sentence numbers the new data maps sentence $i to.
        my %inNew = map { $_ => 1 } @{ $newMap->[$i] };

        my $tempSum = 0;
        for my $target ( @{ $goldMap->[$i] } )
        {
            next if !$inNew{$target};

            # Sentence numbers are 1-based.  The existence test has to look
            # at the same slot ($target - 1) that is summed, otherwise the
            # last sentence of the article is never counted.  Numbers below
            # 1 are skipped so that they cannot wrap to the end of the array.
            next if $target < 1 || !defined $goldLen1->[ $target - 1 ];

            $tempSum += $goldLen1->[ $target - 1 ];
        }
        $modInter += $goldLen0->[$i] * $tempSum;
    }

    if ( $v ) { print "ModAr = $modAr, ModA = $modA, ModInter = $modInter\n"; }

    # Recall, precision and F.  An empty denominator yields 0 instead of an
    # "Illegal division by zero" abort.
    my $recall    = $modAr > 0 ? $modInter / $modAr : 0;
    my $precision = $modA  > 0 ? $modInter / $modA  : 0;
    my $sumPR     = $recall + $precision;
    my $f         = $sumPR > 0 ? 2 * $recall * $precision / $sumPR : 0;

    $totalF += $f;
    $numberOfF ++;
}

# Reads lines from the handle until one contains "<article".
# Returns 1 when such a line was found, 0 when the end of the file was
# reached first.
sub _skipToArticle
{
    my ($fh) = @_;

    while ( defined( my $line = <$fh> ) )
    {
        return 1 if $line =~ /<article/;
    }
    return 0;
}

# Reads the body of one article, i.e. everything up to the next line that
# contains "</article".
#   $fh    - handle positioned just after the "<article" line
#   $label - name of the input ("gold" or "new"), used in the error message
# Returns two array references:
#   lengths  - getLength() of the text between each <alignment ...> line and
#              the following </alignment line, in order of appearance
#   mappings - for each <alignment ...> line, a reference to the list of
#              sentence numbers found on that line (see _parseMapping)
# Dies when the file ends before "</article" is seen; without that check the
# read loop would never terminate on a truncated file.
sub _readArticle
{
    my ( $fh, $label ) = @_;

    my @lengths;
    my @mappings;
    my $inAlignment = 0;
    my $sentence    = "";

    while ( defined( my $line = <$fh> ) )
    {
        if ( $line =~ /<\/article/ )
        {
            return ( \@lengths, \@mappings );
        }

        if ( $line =~ /<alignment/ )
        {
            # start of an alignment: record its mapping, start a new sentence
            $inAlignment = 1;
            push @mappings, [ _parseMapping($line) ];
            $sentence = "";
            next;
        }

        if ( $line =~ /<\/alignment/ )
        {
            # end of the alignment: record the length of the collected text
            $inAlignment = 0;
            push @lengths, getLength($sentence);
            next;
        }

        if ( $inAlignment ) { $sentence .= $line; }
    }

    die "Unexpected end of $label file inside an article (missing </article>)\n";
}

# Extracts the sentence numbers from an <alignment ...> line.
# Everything up to and including the first "=" is dropped, every remaining
# character that is neither a digit nor whitespace is removed, and what is
# left is split on whitespace.  The same rule is used for the gold and the
# new data so that all three sums see identical sentence numbers.
# Returns the numbers as a list (possibly empty); "02" and "2" both become 2.
sub _parseMapping
{
    my ($line) = @_;

    ( my $text = $line ) =~ s/^[^=]*=//;
    $text =~ s/[^\d\s]//g;

    return map { $_ + 0 } split ' ', $text;
}

# Computes the alignment mass of one article pair: the sum, over all
# sentences i of the first article, of
#     length(i) * (sum of the lengths of the sentences i is mapped to).
#   $len0 - array ref, sentence lengths of the first article
#   $len1 - array ref, sentence lengths of the second article
#   $maps - array ref of array refs, 1-based sentence numbers per sentence
# Sentences without a recorded length and sentence numbers that do not exist
# in the second article contribute nothing.
sub _alignmentMass
{
    my ( $len0, $len1, $maps ) = @_;

    my $mass = 0;
    for my $i ( 0 .. $#{$maps} )
    {
        next if !defined $len0->[$i];

        my $tempSum = 0;
        for my $target ( @{ $maps->[$i] } )
        {
            # 1-based numbers; anything below 1 would index from the end
            next if $target < 1 || !defined $len1->[ $target - 1 ];
            $tempSum += $len1->[ $target - 1 ];
        }

        $mass += $len0->[$i] * $tempSum;
    }

    return $mass;
}
	    
# Report the mean F value over all scored article pairs.  When no pair was
# scored there is nothing to average; say so on STDERR and report 0 rather
# than aborting with "Illegal division by zero".
if ( $numberOfF > 0 )
{
    $totalF /= $numberOfF;
}
else
{
    warn "No articles were scored\n";
    $totalF = 0;
}
print "$totalF\n";

sub getLength
{
    # Returns the number of characters in the given string that match
    # \p{IsPrint}.
    my ($text) = @_;

    # Assigning the list of all matches to an empty list and reading that
    # assignment in scalar context yields the number of matches.
    my $printable = () = $text =~ /\p{IsPrint}/g;

    return $printable;
}
