#!/usr/local/bin/perl
#use utf8;
#TEAM MEMBERS:
#Devdatta Kulkarni
#Kiranmayee Nakka
#Nitin Varma
#DESCRIPTION:
#The evaluation used in this stage is a variation of the F-measure used in the first stage.
#In this stage the modification made to the F-measure is as follows:
#1. Take the first English sentence from the GOLD data.
#2. Find the sentences in the data aligned by our program into which this sentence from the GOLD data is reflected.
#3. Now take the corresponding sentences of the unknown language from the GOLD data and from the data aligned by our program.
#4. Now find the intersection of these sentences, i.e. the intersection of the
#sentence of the unknown language from the GOLD data and the sentence of the
#unknown language from the data aligned by our program.
#5. The length of this intersection set is = true_positive.
#6. The length of the part of the sentence from the GOLD data not found in the sentence from our aligned data is = false_negative.
#7. The length of the part of the sentence from our aligned data not found in the sentence from the GOLD data is = false_positive.
#8. We calculate these values for all the English sentences in the GOLD data.
#9. At the end precision, recall and accuracy are calculated using the following formulae:
#a. Precision += (length of the intersection)/(length of the sentence of the unknown language found from our aligned data)
#b. Recall += (length of the intersection)/(length of the sentence of the unknown language found from the GOLD data)
#c. Accuracy += (true_positive + true_negative)/(true_positive+true_negative+false_positive+false_negative)
#TO RUN:
#./tigres-eval.pl tigres.utx gc.utx|base.utx
#where gc.utx and base.utx are the files created by the GALE & CHURCH alignment method and our baseline alignment method respectively.
#OUTPUT:
#The output of this program is the values for precision, recall and accuracy.
use strict;
use warnings;

# ---------------------------------------------------------------------------
# Shared state.
#
# These stay package globals because they are the interface between the subs:
# form_arrays() publishes its parse result through $count, @alignment and
# @array_sentences, and intersect() accumulates into the score counters.
# ---------------------------------------------------------------------------
our @bitextarray;         # every opening <bitext ...> line seen by Readfile()
our @articlearray;        # every opening <article ...> line seen by Readfile()

our $count = 0;           # number of alignment entries parsed by the last form_arrays() call
our @alignment;           # $alignment[$n] = [first, second] linked sentence numbers, -1 = none
our @array_sentences;     # $array_sentences[$n][0] = raw text of sentence number $n

our $true_positive  = 0;  # characters shared by candidate and gold text
our $false_positive = 0;  # candidate characters not shared with gold
our $false_negative = 0;  # gold characters not shared with the candidate
our $precision      = 0;  # running sum while scoring; main() turns it into the mean
our $recall         = 0;  # running sum while scoring; main() turns it into the mean
our $accuracy       = 0;

# Debug logs ("f1" for the sentence matcher, "i1" for intersect). They stay
# undefined unless main() opens them, so the subs can be called on their own.
my $match_log_fh;
my $intersect_log_fh;

# Print to an optional debug log; does nothing when the log is not open.
sub _log {
    my ($fh, @text) = @_;
    return unless defined $fh;
    print {$fh} @text;
    return;
}

# Return a copy of $text without leading/trailing whitespace (undef -> '').
sub _trim {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}

# Number of characters in a sentence once trimmed and with newlines removed.
sub _char_count {
    my ($sentence) = @_;
    my $text = _trim($sentence);
    $text =~ s/\n+//g;
    return length $text;
}

# Readfile($filename)
#
# Reads an alignment file and returns a list with the body of every article:
# the lines strictly between an opening <article ...> line and the following
# </article> line. Opening <bitext ...> and <article ...> lines are recorded
# in @bitextarray / @articlearray as a side effect.
#
# Dies when the file cannot be opened (the original silently returned an
# empty list, which then produced a meaningless score).
#
# NOTE(review): the two opening-tag patterns were lost from the source text
# (only "/\/" survived). They are reconstructed here as "<bitext" and
# "<article" followed by a word boundary, which accepts the tags with or
# without attributes -- confirm against a real .utx file.
sub Readfile {
    my ($filename) = @_;
    die "Readfile: no file name given\n"
        unless defined $filename && length $filename;

    open(my $fh, '<', $filename)
        or die "Cannot open '$filename' for reading: $!\n";

    my @files;
    my $body       = '';
    my $in_article = 0;
    while (defined(my $line = <$fh>)) {
        if ($line =~ /<bitext\b/) {
            push @bitextarray, $line;
            next;
        }
        if (!$in_article && $line =~ /<article\b/) {
            # The opening line itself is not part of the body.
            push @articlearray, $line;
            $in_article = 1;
        }
        elsif ($in_article && $line =~ m{</article>}) {
            push @files, $body;
            $body       = '';
            $in_article = 0;
        }
        elsif ($in_article) {
            $body .= $line;
        }
    }
    close($fh) or die "Cannot close '$filename': $!\n";

    return @files;
}

# form_arrays($article_text)
#
# Parses the alignment entries of one article body. For every line matching
# "alignment N=A B" it
#   * increments $count,
#   * stores [A, B] in $alignment[N], using -1 for a missing number,
#   * stores the following lines, up to the next </alignment> line, as
#     $array_sentences[N][0].
# $count is reset on every call; @alignment and @array_sentences are only
# added to, so the caller clears them between articles.
#
# Changes from the original:
#   * the text is parsed in memory instead of being written to and re-read
#     from "temp.txt";
#   * an entry without a closing tag ends at end of input instead of looping
#     forever;
#   * an empty right-hand side yields [-1, -1] (the original compared against
#     the bareword NULL, which the pattern can never capture);
#   * the two linked numbers may be separated by any whitespace.
sub form_arrays {
    my ($text) = @_;
    $count = 0;
    return unless defined $text;

    my @lines = split /^/, $text;    # keeps each line's trailing newline
    my $idx   = 0;
    while ($idx < @lines) {
        my $line = $lines[$idx++];
        next unless $line =~ m/alignment ([0-9]+)=([0-9]*)\s*([0-9]*)/;
        my ($leftnum, $first, $second) = ($1, $2, $3);

        $count++;
        $alignment[$leftnum] = [
            (length($first)  ? $first  : -1),
            (length($second) ? $second : -1),
        ];

        my $sentence = '';
        while ($idx < @lines && $lines[$idx] !~ m{</alignment>}) {
            $sentence .= $lines[$idx++];
        }
        $idx++;    # step over the closing tag (harmless at end of input)

        $array_sentences[$leftnum][0] = $sentence;
    }
    return;
}

# Longest overlap, in characters, between $short and $long when $short is
# slid across $long: $short wholly inside $long, a tail of $short on the head
# of $long, or a head of $short on the tail of $long.
#
# Comparison is literal. The original built regexes with hand-written escape
# lists that missed "(", ")" and "\", so text containing them mis-matched or
# aborted the regex compile. Its third scan also stopped at length 2 and its
# window scan skipped the last window; all positions are covered here.
sub _longest_overlap {
    my ($short, $long) = @_;
    my $short_len = length $short;
    return 0 if $short_len == 0;
    return $short_len if index($long, $short) >= 0;

    my $best = 0;
    for my $len (1 .. $short_len - 1) {
        if (   substr($short, -$len) eq substr($long, 0, $len)
            || substr($short, 0, $len) eq substr($long, -$len))
        {
            $best = $len;    # lengths ascend, so the last hit is the longest
        }
    }
    return $best;
}

# intersect($our_text, $gold_text, $sentence_no)
#
# Scores one candidate/gold pair of unknown-language text and adds the result
# to the global counters:
#   $true_positive  += overlap
#   $false_positive += candidate length - overlap
#   $false_negative += gold length - overlap
#   $precision      += overlap / candidate length   (skipped when empty)
#   $recall         += overlap / gold length        (skipped when empty)
# Returns the overlap length. $sentence_no is accepted for compatibility; it
# was only used in debug output.
#
# The original swapped its "small"/"large" lengths whenever the candidate was
# longer than the gold text and then scored with the swapped values, so
# precision and recall (and false positives/negatives) traded places for
# those pairs. Lengths are now always taken from the candidate and the gold
# text respectively, as the file header defines them.
sub intersect {
    my ($our_text, $gold_text, $sentence_no) = @_;
    $our_text  = '' unless defined $our_text;
    $gold_text = '' unless defined $gold_text;
    $our_text  =~ s/\s+$//;
    $gold_text =~ s/\s+$//;

    _log($intersect_log_fh,
        "ekade --->[$our_text]", "\n", "ethe ---->[$gold_text]");

    my $our_len  = length $our_text;
    my $gold_len = length $gold_text;
    my ($short, $long) =
        $our_len > $gold_len
        ? ($gold_text, $our_text)
        : ($our_text,  $gold_text);
    my $overlap = _longest_overlap($short, $long);

    $true_positive  += $overlap;
    $false_positive += $our_len - $overlap;
    $false_negative += $gold_len - $overlap;
    $precision += $overlap / $our_len  if $our_len;
    $recall    += $overlap / $gold_len if $gold_len;

    return $overlap;
}

# Parse one article and return (\@links, \@sentences, $entry_count), where
# $links->[$n] is the [first, second] row and $sentences->[$n] the raw text
# of sentence $n. Fresh arrays are returned for every article so nothing
# leaks from one bitext into the next.
sub _load_article {
    my ($text) = @_;
    @alignment       = ();
    @array_sentences = ();
    form_arrays($text);

    my @links     = @alignment;
    my @sentences = map { defined $_ ? $_->[0] : undef } @array_sentences;
    return (\@links, \@sentences, $count);
}

# Concatenate (trimmed, newline-free) the sentences that the given alignment
# rows point at, each sentence once, in ascending sentence-number order.
# The original used a plain "sort keys", which orders 10 before 9.
sub _join_targets {
    my ($rows, $sentences) = @_;

    my %wanted;
    for my $row (@{$rows}) {
        next unless defined $row;
        for my $target (@{$row}) {
            next unless defined $target && length $target;
            next if $target == -1;
            $wanted{$target} = 1;
        }
    }

    my $joined = join '',
        map { _trim($sentences->[$_]) }
        sort { $a <=> $b } keys %wanted;
    $joined =~ s/\n//g;
    return $joined;
}

# main($gold_file, $candidate_file)
#
# Scores the candidate alignment against the gold alignment and prints
# "\n<precision> <recall> <accuracy>". Articles are consumed in pairs:
# English first, then the unknown language. Writes the debug logs "f1" and
# "i1" in the current directory.
sub main {
    my ($gold_file, $our_file) = @_;

    my @gold_articles = Readfile($gold_file);
    my @our_articles  = Readfile($our_file);

    open($match_log_fh, '>', 'f1') or die "Cannot write 'f1': $!\n";
    open($intersect_log_fh, '>', 'i1') or die "Cannot write 'i1': $!\n";

    ($true_positive, $false_positive, $false_negative) = (0, 0, 0);
    ($precision, $recall, $accuracy) = (0, 0, 0);
    my $gold_unknown_chars = 0;
    my $our_unknown_chars  = 0;
    my $gold_sentences     = 0;

    # A trailing article without a partner cannot be scored and is skipped.
    for (my $textno = 0; $textno + 1 < @gold_articles; $textno += 2) {
        my ($gold_links, $gold_eng, $gold_eng_count) =
            _load_article($gold_articles[$textno]);
        my (undef, $gold_unk, $gold_unk_count) =
            _load_article($gold_articles[$textno + 1]);
        my ($our_links, $our_eng, $our_eng_count) =
            _load_article($our_articles[$textno]);
        my (undef, $our_unk, $our_unk_count) =
            _load_article($our_articles[$textno + 1]);

        $gold_unknown_chars += _char_count($gold_unk->[$_])
            for 1 .. $gold_unk_count;
        $our_unknown_chars += _char_count($our_unk->[$_])
            for 1 .. $our_unk_count;

        my $first_pending = 1;    # first candidate sentence not fully consumed
        my $j             = 1;    # current candidate English sentence

        # NOTE(review): as in the original, the last gold sentence is not
        # scored (the loop stops at count - 1) although the averages below
        # divide by the full count -- confirm whether that is intended.
        for my $i (1 .. $gold_eng_count - 1) {
            my $gold_sent = defined $gold_eng->[$i] ? $gold_eng->[$i] : '';
            my $our_sent  = defined $our_eng->[$j]  ? $our_eng->[$j]  : '';
            $gold_sent =~ s/\s+$//;
            $our_sent  =~ s/\s+$//;
            $gold_sent =~ s/^\s//;
            $our_sent  =~ s/^\s//;

            # Append candidate sentences until the gold sentence appears in
            # the accumulated text, or the candidate article is exhausted.
            while (1) {
                _log($match_log_fh, "\n I am in while ");
                last if $j >= $our_eng_count;    # ">=" also ends on an empty article

                _log($match_log_fh, "\n Our eng :: [$our_sent]");
                _log($match_log_fh, "\n Gold eng :: [$gold_sent]");

                my $at = index($our_sent, $gold_sent);
                if ($at >= 0) {
                    # Keep what follows the match for the next gold sentence.
                    $our_eng->[$j] =
                        substr($our_sent, $at + length($gold_sent));
                    _log($match_log_fh,
                        " \n $j Matched part :: [$gold_sent]\n");
                    last;
                }

                $j++;
                $our_sent .= ' ' . _trim($our_eng->[$j]);
            }
            _log($match_log_fh, "\n Outside while ");

            my $our_combined = _join_targets(
                [ @{$our_links}[ $first_pending .. $j ] ], $our_unk);
            $first_pending = $j;    # sentence $j may still hold unmatched text

            my $gold_combined =
                _join_targets([ $gold_links->[$i] ], $gold_unk);

            intersect($our_combined, $gold_combined, $i);
        }

        $gold_sentences += $gold_eng_count;
    }

    # Average once over all bitexts. The original divided the running sums
    # inside the loop, so every bitext after the first re-divided the
    # previous averages.
    if ($gold_sentences) {
        $precision /= $gold_sentences;
        $recall    /= $gold_sentences;
    }

    my $total_space = $gold_unknown_chars * $our_unknown_chars;
    if ($total_space) {
        my $true_negative =
            $total_space - $true_positive - $false_positive - $false_negative;
        $accuracy = ($true_positive + $true_negative) / $total_space;
    }

    close($match_log_fh)     or warn "Cannot close 'f1': $!\n";
    close($intersect_log_fh) or warn "Cannot close 'i1': $!\n";
    undef $match_log_fh;
    undef $intersect_log_fh;

    print "\n$precision $recall $accuracy";
    return 0;
}

# Run only when executed as a script, so the subs can be loaded and called
# without triggering an evaluation.
unless (caller) {
    if (@ARGV == 2) {
        main(@ARGV);
    }
    else {
        warn "Usage: $0 gold.utx aligned.utx\n";
    }
}

1;