#!/usr/local/bin/perl 
#use utf8;

#TEAM MEMBERS:
#Devdatta Kulkarni
#Kiranmayee Nakka
#Nitin Varma

#DESCRIPTION:
#The evaluation used in this stage is a variation of F-measure used in the first stage. 
#In this stage the modification made to the F-measure is as follows:
#1. Take the first english sentence from the GOLD data. 
#2. Find the sentences in the data aligned by our program into which this sentence from the gold data is reflected.
#3. Now take the corresponding sentences of unknown language from the GOLD data and the data aligned by our program.
#4. Now find the intersection of these sentences. i.e. intersection of the
#sentence of the unknown language from the GOLD data and the sentence of the 
#unknown language the data aligned by our program.
#5. The length of this intersection set is = true_positive.
#6. The length of the part of the sentence from the GOLD data not found in the sentence from our aligned data is = false_negative.
#7. The length of the part of the sentence from our aligned data not found in the sentence from the GOLD data is = false_positive.

#8. We calculate these values for all the english sentences in the GOLD data.

#9. At the end precision, recall and accuracy are calculated using the following formulae:

#a. Precision += (length of the intersection)/(length of the sentence of unknown language found from our aligned data)

#b. Recall += (length of the intersection)/(length of the sentence of unknown language found from GOLD data)

#c. Accuracy += (true_positive + true_negative)/(true_positive+true_negative+false_positive+false_negative)



#TO RUN: 
#./tigres-eval.pl tigres.utx gc.utx|base.utx
#where gc.utx and base.utx are the files created by the GALE & CHURCH alignment method and our baseline alignment method respectively.



#OUTPUT:
#The output of this program is the values for precision, recall and accuracy.



# Readfile($filename)
#
# Reads an alignment file and returns the body of every article in it, in
# file order, as a list of strings (one string per article).
#
# Line handling:
#   * a line matching <bitext...> is appended to the global @bitextarray and
#     otherwise ignored;
#   * outside an article, a line matching <article...> is appended to the
#     global @articlearray and starts a new article (the tag line itself is
#     not part of the body);
#   * inside an article, a line matching </article> ends the article and
#     pushes its body onto the result (undef when the body had no lines);
#     every other line is appended to the body verbatim.
#
# Dies if the file cannot be opened: silently returning no articles would
# make the caller print meaningless scores.
sub Readfile {
    my ($filename) = @_;

    die "Readfile: no file name given\n"
        unless defined $filename and length $filename;

    # Three-argument open: the name is never interpreted as a mode or a pipe.
    open(my $in, '<', $filename)
        or die "Readfile: cannot open '$filename': $!\n";

    my @files;              # finished article bodies
    my $file;               # body of the article being collected
    my $in_article = 0;

    while (defined(my $line = <$in>)) {
        if ($line =~ /\<bitext.*>/) {
            push @bitextarray, $line;
            next;
        }

        if ($in_article) {
            if ($line =~ /\<\/article>/) {
                push @files, $file;
                undef $file;
                $in_article = 0;
            }
            else {
                $file .= $line;
            }
        }
        elsif ($line =~ /\<article.*>/) {
            push @articlearray, $line;
            $in_article = 1;
        }
    }
    close($in);

    return @files;
}

# form_arrays($article_text)
#
# Parses the alignment records of one article and stores the result in
# package globals read by the caller:
#
#   $count                    number of alignment records seen in this call
#   $alignment[$n][0..1]      sentence numbers that sentence $n is aligned
#                             to; unused slots hold -1, and a record with no
#                             numeric right-hand side (e.g. "n=NULL") is
#                             stored as (-1, -1)
#   $array_sentences[$n][0]   the lines between the record's header line and
#                             the line containing </alignment> (undef when
#                             there are none)
#
# @alignment and @array_sentences are NOT cleared here; the caller is
# expected to empty them between articles.
#
# The text is read through an in-memory filehandle, so nothing is written to
# the working directory.
sub form_arrays {
    my ($text) = @_;
    $text = '' unless defined $text;

    $count = 0;

    open(my $in, '<', \$text)
        or die "form_arrays: cannot read article text: $!\n";

    while (defined(my $line = <$in>)) {
        next unless $line =~ m/alignment ([0-9]*=[0-9]*\s*[0-9]*)/;

        $count++;
        my ($leftnum, $rightside) = split(/=/, $1);

        # The capture only ever holds digits and whitespace after '=', so a
        # null alignment shows up as a missing or empty right-hand side.
        # split ' ' also drops stray whitespace and a trailing newline.
        my @targets = defined $rightside ? split(' ', $rightside) : ();

        if (!@targets or $targets[0] eq 'NULL') {
            $alignment[$leftnum][0] = -1;
            $alignment[$leftnum][1] = -1;
        }
        else {
            $alignment[$leftnum][$_] = $targets[$_] for 0 .. $#targets;
            $alignment[$leftnum][1] = -1 if @targets == 1;
        }

        # Collect the sentence text up to the closing tag.  Stop at end of
        # input as well, so an unterminated record cannot loop forever.
        my $sentence;
        while (defined($line = <$in>) and $line !~ m/<\/alignment>/) {
            $sentence .= $line;
        }
        $array_sentences[$leftnum][0] = $sentence;
    }
    close($in);

    return;
}

# ---------------------------------------------------------------------------
# Main program.
#
# Usage: tigres-eval.pl GOLD.utx ALIGNED.utx
#
# Both files are read with Readfile(); articles come in pairs (even index =
# English article, odd index = unknown-language article).  For every gold
# English sentence the program finds the run of our English sentences that
# contains it, gathers the unknown-language sentences both alignments attach
# to it, and lets intersect() add the overlap to the global running totals
# ($precision, $recall, $true_positive, $false_positive, $false_negative).
#
# Debug traces go to the files "f1" (this driver) and "i1" (intersect()).
# The final line printed is "\n<precision>  <recall> <accuracy>".
# ---------------------------------------------------------------------------

# Parse one article with form_arrays() and return private copies of what it
# produced: (record count, ref to alignment rows, ref to sentence texts
# indexed by sentence number).  The shared parser globals are emptied
# afterwards so the next article starts clean.
sub _load_article {
    my ($article_text) = @_;

    form_arrays($article_text);

    my @links = @alignment;
    my @sentences;
    $sentences[$_] = $array_sentences[$_][0] for 1 .. $count;

    undef @alignment;
    undef @array_sentences;

    return ($count, \@links, \@sentences);
}

# Length of a sentence once leading/trailing whitespace and embedded
# newlines are removed (0 for a missing sentence).
sub _trimmed_length {
    my ($sentence) = @_;
    return 0 unless defined $sentence;

    $sentence =~ s/^\s+//;
    $sentence =~ s/\s+$//;
    $sentence =~ s/\n+//g;

    return length $sentence;
}

# Concatenate the trimmed sentences with the given numbers, each number used
# once, in numeric sentence order, with newlines removed.  A plain
# "sort keys" would order them as strings and put sentence 10 before 9.
sub _combine_sentences {
    my ($sentences, @numbers) = @_;

    my %wanted;
    $wanted{$_}++ for @numbers;

    my $combined = '';
    for my $number (sort { $a <=> $b } keys %wanted) {
        my $sentence = $sentences->[$number];
        next unless defined $sentence;
        $sentence =~ s/\s+$//;
        $sentence =~ s/^\s+//;
        $combined .= $sentence;
    }
    $combined =~ s/\n//g;

    return $combined;
}

die "usage: $0 GOLD.utx ALIGNED.utx\n" unless @ARGV >= 2;

($file1, $file2) = @ARGV;
@filearray1 = Readfile($file1);    # gold articles
@filearray2 = Readfile($file2);    # articles aligned by our program

$total_bi_texts = @filearray1;

$precision = 0;
$recall    = 0;

# Debug logs are opened once so that they cover every bitext.  They are not
# essential, so failing to create them only produces a warning.  INT is a
# bareword handle because intersect() prints to it by that name.
open(FIL1, '>', 'f1') or warn "cannot write debug log 'f1': $!\n";
open(INT,  '>', 'i1') or warn "cannot write debug log 'i1': $!\n";

# Gold English sentence count summed over all bitexts; used once, after the
# loop, to average the precision/recall sums (dividing inside the loop would
# re-divide the contributions of earlier bitexts).
my $gold_sentence_total = 0;

for (my $textno = 0; $textno < $total_bi_texts; $textno += 2) {

    # Per-bitext copies, so nothing from a previous bitext can leak in.
    my ($gold_english_length, $gold_links, $gold_english)
        = _load_article($filearray1[$textno]);
    my ($gold_unknown_length, undef, $gold_unknown)
        = _load_article($filearray1[$textno + 1]);
    my ($our_english_length, $our_links, $our_english)
        = _load_article($filearray2[$textno]);
    my ($our_unknown_length, undef, $our_unknown)
        = _load_article($filearray2[$textno + 1]);

    # Character totals used for the size of the "space" behind accuracy.
    # NOTE(review): despite its name, $total_chars_in_eng is counted from OUR
    # unknown-language article, exactly as before - confirm this is intended.
    $total_chars_in_unknown += _trimmed_length($gold_unknown->[$_])
        for 1 .. $gold_unknown_length;
    $total_chars_in_eng += _trimmed_length($our_unknown->[$_])
        for 1 .. $our_unknown_length;

    my $remp = 1;                      # first of our sentences still in play
    my $num  = $gold_english_length;

    # $j (index of our current English sentence) stays a package global:
    # intersect() prints it in its debug trace.
    $j = 1;

    # NOTE(review): the bound is "< $num", so the last gold sentence is never
    # scored although the averages divide by $num - kept as is, confirm.
    for my $i (1 .. $num - 1) {

        my $gold_eng_sent = $gold_english->[$i];
        my $our_eng_sent  = $our_english->[$j];
        $gold_eng_sent = '' unless defined $gold_eng_sent;
        $our_eng_sent  = '' unless defined $our_eng_sent;

        chomp($gold_eng_sent);
        chomp($our_eng_sent);
        $gold_eng_sent =~ s/\s+$//;
        $our_eng_sent  =~ s/\s+$//;
        $gold_eng_sent =~ s/^\s//;
        $our_eng_sent  =~ s/^\s//;

        # Append our following sentences until the gold sentence occurs in
        # the accumulated text.  index() compares literally, so no regex
        # escaping is needed and an empty sentence cannot re-trigger an
        # earlier pattern.
        while (1) {
            print FIL1 "\n I am in while ";

            # ">=" rather than "==": $j can start beyond the end when our
            # article has no sentences, which used to loop forever.
            last if $j >= $our_english_length;

            print FIL1 "\n Our eng :: [$our_eng_sent]";
            print FIL1 "\n Gold eng :: [$gold_eng_sent]";

            my $at = index($our_eng_sent, $gold_eng_sent);
            if ($at >= 0) {
                # Whatever follows the match belongs to the next gold sentence.
                $our_english->[$j]
                    = substr($our_eng_sent, $at + length $gold_eng_sent);
                print FIL1 " \n $j  Matched part :: [$gold_eng_sent]\n";
                last;
            }

            $j++;
            my $next_sentence = $our_english->[$j];
            $next_sentence = '' unless defined $next_sentence;
            $next_sentence =~ s/\s+$//;
            $next_sentence =~ s/^\s+//;
            $our_eng_sent .= " " . $next_sentence;
        }
        print FIL1 "\n Outside while ";

        # Unknown-language sentences our alignment attaches to our English
        # sentences $remp .. $j (-1 marks an unused slot).
        my @our_targets;
        for my $k ($remp .. $j) {
            for my $slot (0, 1) {
                my $target = $our_links->[$k][$slot];
                push @our_targets, $target if $target != -1;
            }
        }
        $remp = $j;    # sentence $j may still hold text for the next round

        # Unknown-language sentences the gold alignment attaches to sentence $i.
        my @gold_targets
            = grep { $_ != -1 } ($gold_links->[$i][0], $gold_links->[$i][1]);

        my $combined_sentence      = _combine_sentences($our_unknown,  @our_targets);
        my $combined_sentence_gold = _combine_sentences($gold_unknown, @gold_targets);

        intersect($combined_sentence, $combined_sentence_gold, $i);
    }

    $gold_sentence_total += $num;
}

# Average the per-sentence sums once, over all bitexts.
if ($gold_sentence_total) {
    $precision /= $gold_sentence_total;
    $recall    /= $gold_sentence_total;
}

# Accuracy over the whole character "space"; skipped when there is no data,
# which leaves $accuracy undefined instead of dividing by zero.
$total_space = $total_chars_in_eng * $total_chars_in_unknown;
if ($total_space) {
    $true_negative = $total_space - $true_positive - $false_positive - $false_negative;
    $accuracy      = ($true_positive + $true_negative) / $total_space;
}

print "\n$precision  $recall $accuracy";

close(FIL1);
close(INT);




# intersect($ours, $gold, $g)
#
#   $ours  unknown-language text produced by our alignment
#   $gold  unknown-language text from the gold alignment
#   $g     index of the gold English sentence (debug trace only)
#
# Slides the shorter string across the longer one and takes the longest
# exact overlap found in any position:
#   1. a suffix of the shorter string against a prefix of the longer one,
#   2. the whole shorter string at an interior offset of the longer one,
#   3. a prefix of the shorter string against a suffix of the longer one.
#
# The overlap length is added to the package globals:
#   $true_positive  += overlap
#   $false_positive += length(ours) - overlap
#   $false_negative += length(gold) - overlap
#   $precision      += overlap / length(ours)    (skipped when ours is empty)
#   $recall         += overlap / length(gold)    (skipped when gold is empty)
#
# Which string is shorter only decides how the window slides; the metrics
# always use our length for precision and the gold length for recall.
#
# Writes a trace to the bareword filehandle INT opened by the caller and
# reads the caller's global $j for that trace.  Returns nothing.
sub intersect
{
    my ($ours, $gold, $g) = @_;
    $ours = '' unless defined $ours;
    $gold = '' unless defined $gold;

    chomp($ours);
    chomp($gold);
    $ours =~ s/\s+$//;
    $gold =~ s/\s+$//;

    print INT "ekade --->[$ours]";
    print INT "\n";
    print INT "ethe ---->[$gold]";

    my $our_len  = length $ours;
    my $gold_len = length $gold;

    # On a tie our string is treated as the shorter one.
    my ($short, $long) = $our_len > $gold_len ? ($gold, $ours) : ($ours, $gold);
    my $short_len = length $short;
    my $long_len  = length $long;

    my $total = 0;    # longest overlap found so far

    # 1. Shorter string hanging over the left edge of the longer one.
    for my $w (1 .. $short_len) {
        next unless substr($short, $short_len - $w) eq substr($long, 0, $w);
        $total = $w if $total < $w;
    }

    # 2. Shorter string lying completely inside the longer one.  The last
    #    possible offset is handled by step 3 at full width.
    for my $offset (0 .. $long_len - $short_len - 1) {
        print INT " i :: $g andar :: $short \n";
        print INT " j :: $j \n";
        next unless substr($long, $offset, $short_len) eq $short;
        $total = $short_len if $total < $short_len;
    }

    # 3. Shorter string hanging over the right edge, down to a single
    #    character so that this mirrors step 1.
    for my $w (1 .. $short_len) {
        next unless substr($short, 0, $w) eq substr($long, $long_len - $w);
        $total = $w if $total < $w;
    }

    $true_positive  += $total;
    $false_positive += $our_len  - $total;
    $false_negative += $gold_len - $total;

    $precision += $total / $our_len  if $our_len;
    $recall    += $total / $gold_len if $gold_len;

    return;
}








