#!/usr/local/bin/perl #use utf8; # FORMAT # ------ # puebla-eval.pl gold.utx output.utx # Example # ------- # The following command should produce the corresponding output (as shown below) # $$ puebla-eval.pl tigres.utx output.utx # P: precision-value R: recall-value Acc: accuracy # General Info # ------------ # PUEBLA # Date: 04/16/01 # Members: Alark, Hari & Inder. # Comments can be mailed to: hbommaga@d.umn.edu # PUEBLA EVALUATION REPORT # ------------------------ # The puebla evaluation procedure uses the following method for calculating precision and recall. The calculation of the precision and recall might slightly vary from the "more generally" accepted approach. This evaluation procedure takes advantage of the fact that the actaul "runnung" text in both gold as well as our output are the same. Infact, it strictly maintains this notion. A malfunctioning alignment program "forgets" to "put" a sentence in place, our evaluation program will complain. # Possible Causes for failure of evaluation program: # ------------------------------------------------- # * program strictly looks for same "overall" sequence of text for the first language in both gold data and program-aligned data. If not so, evaluation program might "hang". Unfortunately, we havenot provided enough error-checking though we do consider this to be a "grave" mistake on the part of alignment program. # We did not find any other significant cases of failure. Many cases were considered during the process of the development. We would like to hear from the users of this program, if they had any problems. In such a case, please contact, # hbommaga@d.umn.edu or ajoshi@d.umn.edu or sing0174@d.umn.edu # NOTE: The evaluation process might take some time. It uses a simple algorithm for a complex string search to find the largest intersection. $file1 = shift; $file2 = shift; @gold_eng, @gold_fr; @out_eng, @out_fr; open(ROUTE, ">>results"); open( FILE1, $file1 ); open( FILE2, $file2 ); # FIXME: remove the bitext tag and pass the file-handle to the function. #creating the arrays... $cnt = 0; $cumu_p = 0; $cumu_r = 0; $cumu_acc = 0; while($flag == 0){ last if(!($line1 = ) || !($line2 = )); while(!($line1 =~ m/)){ $flag = 1; last; } } while(!($line2 =~ m/)){ $flag = 1; last; } } if($flag != 1){ $cnt++; create_array( *FILE1, \@gold_eng, \@gold_fr ); create_array( *FILE2, \@out_eng, \@out_fr ); eval_precision(); $cumu_acc += $accuracy; $cumu_r += $recall; $cumu_p += $precision; @gold_eng = undef; @gold_fr = undef; @out_eng = undef; @out_fr = undef; } } #print ROUTE "Dataset: $file1\n"; print "Dataset: $file1\n"; print ROUTE "P: $precision \t R: $recall\t"; print ROUTE "Acc: $accuracy\n"; print "P: $precision \t R: $recall\t"; print "Acc: $accuracy\n"; close ROUTE; ####### # eval_precision: finds the precision and recall for an article set, # given its four arrays gold_eng, gold_fr, out_fr, out_eng... I do not # pass them as references. Assume they exist. ####### sub eval_precision { $gold_fr_len = @gold_fr; $gold_eng_len = @gold_eng; $out_eng_len = @out_eng; $num_char_eng_gold = 0; $num_char_fr_gold = 0; for($j=0; $j<$gold_eng_len;$j++){ $num_char_eng_gold += sent_length($gold_eng[$j][0]); } for($j=0; $j<$gold_fr_len;$j++){ $num_char_fr_gold += sent_length($gold_fr[$j][0]); } # for($j=0; $j<$out_eng_len;$j++){ # print"OUT ENG: {$out_eng[$j][0]}\n"; # } $pr_index[0] = 0; $pr_index[1] = 0; $present[0] = $gold_eng[$pr_index[0]][0]; # gold $present[1] = $out_eng[$pr_index[1]][0]; # out # while we have not seen all the sentences in the gold data and are # still seeing them!!! # $string =~ s/([\/\|\[\]\{\}\^\$\*\+\?\.])/\\$1/g; $iterations = 0; $precision = 0; $recall = 0; $fp = 0; $fn = 0; $tp = 0; $accuracy = 0; while( $pr_index[0] < $gold_eng_len ){ $iterations++; chomp $present[0]; chomp $present[1]; $present[0] =~ s/^[\s*]//g; $present[1] =~ s/^[\s*]//g; $present[0] =~ s/\s+/ /g; $present[1] =~ s/\s+/ /g; $present[0] =~ s/[\s+]$//g; $present[1] =~ s/[\s+]$//g; $len_out = sent_length($present[1]); $len_gold = sent_length($present[0]); my $min_len, $max_len; if($len_out < $len_gold) { $min_len = 1; $max_len = 0; } else { $min_len = 0; $max_len = 1; } $present[$min_len] =~ s/([\/\|\[\]\{\}\^\$\*\+\?\.])/\\$1/g; #comparing the two sentences # print "PRESENT: {($min_len)[$present[$min_len]]}\n{($max_len)[$present[$max_len]]}\n"; $factor = 0; if($present[$min_len] eq ""){ # $flag_skip = 1; $next_iter_index[$min_len] = $pr_index[$min_len] + 1; } else{ if($present[$max_len] =~ m/$present[$min_len]/) { $len_match = sent_length($&); $next_max_sent = $'; # print "PREV: [$`] MATCH: [$&] NEXT SENT: [$']\n"; $pista = sent_length($gold_eng[$pr_index[0]][0]); $badam = sent_length($out_eng[$pr_index[1]][0]); $factor = $len_match / ( sent_length($gold_eng[$pr_index[0]][0]) + sent_length($out_eng[$pr_index[1]][0]) - $len_match); # print "FACTOR ::: $factor $len_match $pista $badam "; ######### FINE HERE..... $next_iter_index[$min_len] = $pr_index[$min_len] + 1; if(!$'){ # print "!!!COMPLETE MATCH!!!\n"; $next_iter_index[$max_len] = $pr_index[$max_len] + 1; $partial_match = 0; } else { # print "!!!PARTIAL MATCH!!!"; $next_iter_index[$max_len] = $pr_index[$max_len]; $partial_match = 1; } } ### found the factor... find the french connections... # my $pr_fr_gold = undef; my $pr_fr_out = undef; @pr_fr = undef; ## fr_gold first for(my $u=0; $u<$gold_eng[$pr_index[0]][1]; $u++){ # print "f: [$u] \t"; $pr_fr[0] = $pr_fr[0] . " " . $gold_fr[$gold_eng[$pr_index[0]][$u+2]][0]; } # print "\n"; for(my $u=0; $u < $out_eng[$pr_index[1]][1]; $u++){ # print "f: [$u] \t"; $pr_fr[1] = $pr_fr[1] . " ". $out_fr[$out_eng[$pr_index[1]][$u+2]][0]; } # print "\n"; # print "FRENCH GOLD:: {$pr_fr[0]}\n"; # print "FRENCH OUT:: {$pr_fr[1]}\n"; ### I have found the french sentences... and concatenated them!!! ### find the smaller one... do regex match and find ### intersection!!! chomp $pr_fr[0]; chomp $pr_fr[1]; $pr_fr[0] =~ s/^[\s*]//g; $pr_fr[1] =~ s/^[\s*]//g; $pr_fr[0] =~ s/\s+/ /g; $pr_fr[1] =~ s/\s+/ /g; $pr_fr[0] =~ s/[\s+]$//g; $pr_fr[1] =~ s/[\s+]$//g; $len_gold = sent_length($pr_fr[0]); $len_out = sent_length($pr_fr[1]); # my $min_len; if($len_out < $len_gold) { $min_len_fr = 1; $max_len_fr = 0; } else { $min_len_fr = 0; $max_len_fr = 1; } # get the intersection and put it in precision and recall. intersection($pr_fr[1], $pr_fr[0]) if( $len_out != 0 && $len_gold != 0); # $pr_fr[$min_len_fr] =~ s/([\/\|\[\]\{\}\^\$\*\+\?\.])/\\$1/g; # $inter_section = 0; # # print "$max_len_fr --- I am fine here.. will print a match ----\n"; # if($pr_fr[$max_len_fr] =~ m/$pr_fr[$min_len_fr]/) { # $inter_section = sent_length($&); # ## Okay if, inter_section is zero... fine... but if len_out or # ## len_gold is zero??? # $precision += $inter_section / $len_out if($len_out != 0); # $recall += $inter_section / $len_gold if($len_gold != 0); # print "P ::: $precision R ::: $recall\n"; # } } ### else $pr_index[0] = $next_iter_index[0]; $pr_index[1] = $next_iter_index[1]; # print "MAXLEN :: $max_len MINLEN :: $min_len \n"; if($min_len == 0){ $present[$min_len] = $gold_eng[$pr_index[$min_len]][0]; # gold if( $partial_match == 0 ){ $present[$max_len] = $out_eng[$pr_index[$max_len]][0]; # out } else { $present[$max_len] = $next_max_sent; # out ... } } else { $present[$min_len] = $out_eng[$pr_index[$min_len]][0]; # out if( $partial_match == 0 ){ $present[$max_len] = $gold_eng[$pr_index[$max_len]][0]; # out } else { $present[$max_len] = $next_max_sent; # out ... } } # print "Looping back with the indices: $pr_index[0] $pr_index[1] \n"; } #### sample space size $sample_space = $num_char_fr_gold * $num_char_eng_gold; $precision = $precision / $iterations; $recall = $recall / $iterations; $tn = $sample_space - $fp - $fn - $tp; $accuracy = ($tp + $tn)/$sample_space; $precision = sprintf("%.5f",$precision); $recall = sprintf("%.5f",$recall); $accuracy = sprintf("%.7f", $accuracy); } # parameters: array for english as pointer, array for french as ptr. sub create_array { #----------------------------------- # get hold of the parameters. my $file_handle = shift; my $eng_arr = shift; my $fr_arr = shift; #------------------------------------ # $bak_article = 1, implies we use french array... remember!!! $bak_article = 0; while(<$file_handle>) { $line = $_; # print "Line: [$line]\n"; if($line =~ /<\/article.*>/){ $bak_article++; # print " in bak article \n"; last if($bak_article == 2); } if($line =~ /){ last if(/<\/alignment>/); $pr_sentence = $pr_sentence . $_; } chomp $pr_sentence; $pr_sentence =~ s/^[\s*]//g; $pr_sentence =~ s/\s+/ /g; $pr_sentence =~ s/[\s+]$//g; if($bak_article == 0){ # print "SENTENCE: [$pr_sentence] Indices = "; chomp $pr_sentence; $eng_arr->[$ind[0]-1][0] = $pr_sentence; $eng_arr->[$ind[0]-1][1] = @ind - 1; for( my $k = 1; $k < @ind; $k++ ){ my $k1 = $k + 1; $eng_arr->[$ind[0]-1][$k1] = $ind[$k] - 1; # print "$ind[$k] "; } # print "\n"; } elsif($bak_article == 1){ # print "SENTENCE: [$pr_sentence] Indices = {@ind} \n"; $fr_arr->[$ind[0]-1][0] = $pr_sentence; $fr_arr->[$ind[0]-1][1] = @ind - 1; for( my $k = 1; $k < @ind; $k++ ){ $fr_arr->[$ind[0]-1][$k+1] = $ind[$k] - 1; } } } } } # end of the sub routine. sub sent_length { $sent = shift; $len = split(//, $sent); return $len; } sub intersection { $sen_first = shift; $sen_second = shift; # print INT $sen_first ; # print INT "\n"; # print INT $sen_second; # removing the last newline character chomp($sen_first); chomp($sen_second); # removing multiple spaces at the end of the line, replacing them with NULL. $sen_first =~ s/\s+$//g; $sen_second =~ s/\s+$//g; print INT "[$sen_first]"; print INT "\n"; print INT "[$sen_second]"; # Creation of the array of characters, splitting on NULL @first_array = split(//,$sen_first); @second_array = split(//,$sen_second); $smaller_len = @first_array; $larger_len = @second_array; @B = @first_array; @A = @second_array; $matched = 0; $total_matched = 0; if($smaller_len > $larger_len) { $temp = $smaller_len; $smaller_len = $larger_len; $larger_len = $temp; @A = @first_array; @B = @second_array; } $len_first = $larger_len; $len_second = $smaller_len; # When a part of the second string is being compared with the First string. # Eventually the entire string is contained in the first. for($wsize=1;$wsize<=$len_second;$wsize++) { $buf_size = $wsize-1; undef $string_second; undef $string_first; for($inner_counter=0;$inner_counter<$wsize;$inner_counter++) { $string_second .= $B[$len_second-$buf_size-1]; $string_first .= $A[$inner_counter]; $buf_size--; } $copy_second = $string_second; $string_second =~ s/([\/\|\[\]\{\}\^\$\*\+\?\.\"\'])/\\$1/g; if( $string_first =~ m/^$string_second/ ) { $matched = length $copy_second; if($total_matched < $matched) { $total_matched = $matched; } } } undef $string_first; # When the second string is contained in the first string. $current_index = 0; for($wsize=1;$wsize<=$len_first-$len_second;$wsize++) { $string_second = $copy_second; for($inner_counter=$current_index;$inner_counter<$current_index+$len_second;$inner_counter++) { $string_first .= $A[$inner_counter]; } $copy_second = $string_second; $string_second =~ s/([\/\|\[\]\{\}\^\$\*\+\?\.])/\\$1/g; if( $string_first =~ m/^$string_second$/ ) { $matched = length $copy_second; if($total_matched < $matched) { $total_matched = $matched; } } $current_index++; undef $string_first; } # In the case when the second string has past the first string and the end is being checked. for($wsize=$len_second-1;$wsize>0;$wsize--) { $buf_size=$wsize; undef $string_out; undef $string_first; for($inner_counter=0;$inner_counter<=$wsize;$inner_counter++) { $string_out .= $B[$inner_counter]; $string_first .= $A[$len_first-$buf_size-1]; $buf_size--; } $save_string1 = $string_out; $string_out =~ s/([\/\|\[\]\{\}\^\$\*\+\?\.])/\\$1/g; if( $string_first =~ m/$string_out$/ ) { $matched = length $save_string1; if($total_matched < $matched) { $total_matched = $matched; } } } # Calculate precision, recall over the entire text $precision += ($total_matched/$smaller_len); $fp += $smaller_len - $total_matched; $recall += ($total_matched/$larger_len); $fn += $larger_len - $total_matched; $tp += $total_matched; }