$inAlignment = 0; $line = ""; while ( !( $line =~ /<\/article/ )) { $line = ; if ( $line =~ /<\/article>/ ) { next; } if ( $line =~ /; } while (!($line =~ /

; if ( $line =~ /<\/article>/ ) { next; } if ( $line =~ /; } while (!($line =~ /

$inAlignment = 0; while ( !( $line =~ /<\/article/ )) { $line = ; if ( $line =~ /<\/article>/ ) { next; } if ( $line =~ /; } while (!( $line =~ /

; if ( $line =~ /<\/article>/ ) { next; } if ( $line =~ / 1 ) { $precisionAcrossBitexts = 1; } if ( $recallAcrossBitexts > 1 ) { $recallAcrossBitexts = 1; } if ( $accuracyAcrossBitexts > 1 ) { $accuracyAcrossBitexts = 1; } print "$precisionAcrossBitexts $recallAcrossBitexts $accuracyAcrossBitexts\n"; # End of program!
# NOTE(review): the statement line above appears truncated (a pattern match
# is cut off mid-expression); it is kept as found -- restore it from the
# original script before running.

# Function to get the proposed set.
#
# Starting at the global cursor $nextNewSentence, concatenates entries of
# @sentenceNew0 until the whitespace-normalised result contains the
# whitespace-normalised gold sentence.  On success, the numbers found in the
# @sentenceMappingNew entries of the consumed sentences are appended to the
# global @proposedSet (skipping consecutive repeats and stopping at the first
# decrease).  On failure, $nextNewSentence is restored to its value at entry.
#
# Parameter:
#   $goldE - the gold sentence text.
# Globals read:    @sentenceNew0, @sentenceMappingNew, $debug
# Globals written: $nextNewSentence, @proposedSet
# Returns: the current contents of @proposedSet.
sub getProposedSet {
    my $goldE = shift;

    # clean up the sentence a bit
    $goldE =~ s/\s+/ /g;
    $goldE =~ s/^\s*//g;
    $goldE =~ s/\s*$//g;

    # indices of the new sentences consumed while looking for the gold one
    my @source = ();

    # back up the nextNewSentence variable so a failed search can rewind
    my $backUp = $nextNewSentence;

    # our gold sentence will become our regex... so escape it!
    my $regex = escape($goldE);

    my $newE      = "";    # the continuously growing new sentence
    my $foundFlag = 0;
    my $temp      = "";    # whitespace-normalised copy of $newE

    while ( !$foundFlag ) {
        $newE .= $sentenceNew0[$nextNewSentence];
        push @source, $nextNewSentence;

        # clean up
        $temp = $newE;
        $temp =~ s/\s+/ /g;
        $temp =~ s/^\s*//g;
        $temp =~ s/\s*$//g;

        if ( $temp =~ /$regex/ ) {

            # found!
            $foundFlag = 1;

            # if the gold sentence ends exactly where this new sentence
            # ends, the line is used up: move the cursor past it
            if ( $temp =~ /$regex$/ ) {
                $nextNewSentence++;
            }
        }
        else {
            $nextNewSentence++;

            # ran out of new sentences without a match
            last if $nextNewSentence > $#sentenceNew0;
        }
    }

    if ($foundFlag) {

        # now to get the mappings from @source
        my @tempSet = ();
        foreach my $src (@source) {
            my $map = $sentenceMappingNew[$src];
            next unless defined $map;
            $map =~ s/^\s*//;
            $map =~ s/\s*$//;
            $map =~ s/\s+/ /g;

            # a mapping without any digit carries no sentence number
            next if $map !~ /\d/;

            foreach my $target ( split / /, $map ) {
                $target =~ s/[^\d]//g;
                push @tempSet, $target;
            }
        }

        # having got all that, remove the repeated mapping numbers;
        # nothing is pushed when no mapping number was collected
        if (@tempSet) {
            push @proposedSet, $tempSet[0];
            for my $i ( 1 .. $#tempSet ) {
                last if $tempSet[$i] < $tempSet[ $i - 1 ];
                next if $tempSet[$i] == $tempSet[ $i - 1 ];
                push @proposedSet, $tempSet[$i];
            }
        }

        if ( $debug == 2 ) {
            print "Found!!\n";
            print "English sentence in gold: {$goldE}\n";
            print "English sentence in new : {$temp}\n";
            print "The proposed set: @proposedSet\n";
        }
    }
    else {
        # not found: rewind the cursor so the next gold sentence starts
        # from the same place
        $nextNewSentence = $backUp;
    }

    return @proposedSet;
}

# Measures how much of the shorter string overlaps the longer one.
#
# If the shorter string occurs anywhere inside the longer one, the result is
# the full length of the shorter string.  Otherwise the shorter string is cut
# at every position, and the result is the length of the longest piece such
# that either a trailing piece of the shorter string starts the longer
# string, or a leading piece of the shorter string ends the longer string.
#
# Parameters:
#   $param1, $param2 - the two strings to compare (order does not matter).
# Globals read: $debug (and $articleTag, through getLength).
# Returns: the overlap length as measured by getLength(); 0 if none.
sub getIntersection {

    # get the parameter strings
    my $param1 = shift;
    my $param2 = shift;

    if ( $debug == 2 ) {
        print "Param 1: {$param1}\n";
        print "Param 2: {$param2}\n";
    }

    # sort the params to make param2 the shorter one
    if ( getLength($param2) > getLength($param1) ) {
        ( $param1, $param2 ) = ( $param2, $param1 );
    }

    my $inter = 0;    # how many characters match

    # check if param2 is completely inside param1... that is the best
    # possible situation!
    my $string = escape($param2);
    if ( $param1 =~ /$string/ ) {
        $inter = getLength($param2);
    }

    # now we havent got that, so start checking on each end!
    else {
        my $rightPart = $param2;    # shrinking leading piece of param2
        my $leftPart  = "";         # growing trailing piece of param2

        while ( getLength($rightPart) > 1 ) {

            # move the last character of $rightPart to the front of
            # $leftPart; stop if nothing could be removed, otherwise the
            # loop would never terminate
            last unless $rightPart =~ s/(.)$//;
            $leftPart = $1 . $leftPart;

            # trailing piece of param2 at the start of param1?
            $string = escape($leftPart);
            if ( $param1 =~ /^$string/ && getLength($leftPart) > $inter ) {
                $inter = getLength($leftPart);
            }

            # leading piece of param2 at the end of param1?
            $string = escape($rightPart);
            if ( $param1 =~ /$string$/ && getLength($rightPart) > $inter ) {
                $inter = getLength($rightPart);
            }
        }
    }

    if ( $debug == 2 ) {
        print "Intersection = $inter\n";
    }

    return $inter;
}

# Returns the length of a sentence.
#
# When the global $articleTag matches /chinese/, the length is the number of
# characters matching \p{IsPrint}; otherwise it is Perl's length().
#
# Parameter:
#   $sentence - the string to measure.
sub getLength {
    my $sentence = shift;

    if ( $articleTag =~ /chinese/ ) {

        # count the matches by forcing the global match into list context
        my $printable = () = $sentence =~ /\p{IsPrint}/g;
        return $printable;
    }

    return length($sentence);
}

# Escapes a string so that it matches literally when interpolated into a
# regular expression.
#
# Uses the built-in quotemeta so that every regex metacharacter is covered,
# including parentheses and the backslash.
#
# Parameter:
#   $param - the text to escape.
# Returns: the escaped text (empty string for an undefined argument).
sub escape {
    my $param = shift;

    return defined $param ? quotemeta($param) : "";
}