# Moses: an open-source machine translation system
# Online source view: filter-phrase-table.pl
#!/usr/bin/perl -w
# $Id: filter-phrase-table.pl 1307 2007-03-14 22:22:36Z hieuhoang1972 $
#by Philipp Koehn, de-augmented by Evan Herbst
#filter a phrase table for a specific input corpus
#arguments: phrasetable_filename input_filename factor_index (0...)
#outputs to phrasetable_filename.short
#similar function to filter-model-given-input.pl, but only operates
#on the phrase table and doesn't require that any subdirectories exist
use strict;
# Filter a phrase table down to the entries whose source side occurs in a
# given input corpus.
#
# Arguments (from @ARGV):
#   phrasetable_filename  phrase table, plain or gzip-compressed (".gz");
#                         "<name>.gz" is tried when "<name>" does not exist
#   input_filename        tokenized input text, one sentence per line, words
#                         separated by spaces, factors separated by "|"
#   factor_index          comma-separated list of the factor indices (0...)
#                         that make up the source side of the phrase table
#
# Output: the surviving entries are written to "phrasetable_filename.short";
# progress and statistics go to STDERR.

# Longest source phrase (in words) that is collected from the input.
my $MAX_LENGTH = 10;

my ($file, $input, $source_factor) = @ARGV;
die("usage: $0 phrasetable_filename input_filename factor_index\n")
    unless defined($file) && defined($input) && defined($source_factor);

# get tables to be filtered
my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
push @TABLE,$file;
my $new_name = "$file.short";
push @TABLE_NEW_NAME,$new_name;
$CONSIDER_FACTORS{$source_factor} = 1;
push @TABLE_FACTORS,$source_factor;

# get the phrase pairs appearing in the input text
# %PHRASE_USED maps factor spec -> source phrase -> occurrence count.
my %PHRASE_USED;
die("could not find input file $input") unless -e $input;
open(my $input_fh, '<', $input)
    or die("could not open input file $input: $!");
while (my $line = <$input_fh>) {
    # chomp (not chop): a final line without a newline must keep its last
    # character.
    chomp($line);
    my @WORD = split(/ +/,$line);
    foreach my $factor_spec (keys %CONSIDER_FACTORS) {
        my @FACTOR = split(/,/,$factor_spec);

        # Project every word onto the requested factors once per line
        # instead of once per (start, length) pair.
        my @TOKEN;
        foreach my $word (@WORD) {
            my @WORD_FACTOR = split(/\|/,$word);
            push @TOKEN, join("|", @WORD_FACTOR[@FACTOR]);
        }

        # Record every contiguous phrase of at most $MAX_LENGTH words.
        for my $start (0 .. $#TOKEN) {
            my $last = $start + $MAX_LENGTH - 1;
            $last = $#TOKEN if $last > $#TOKEN;
            for my $end ($start .. $last) {
                my $phrase = join(" ", @TOKEN[$start .. $end]);
                $PHRASE_USED{$factor_spec}{$phrase}++;
            }
        }
    }
}
close($input_fh);

# filter files
for my $i (0 .. $#TABLE) {
    my ($used,$total) = (0,0);
    my $table_file = $TABLE[$i];
    my $factors    = $TABLE_FACTORS[$i];
    my $new_file   = $TABLE_NEW_NAME[$i];
    print STDERR "filtering $table_file -> $new_file...\n";

    # List-form pipe opens keep the file name away from the shell.
    my $table_fh;
    if (-e $table_file && $table_file =~ /\.gz$/) {
        open($table_fh, '-|', 'zcat', $table_file)
            or die("could not run zcat on $table_file: $!");
    }
    elsif (! -e $table_file && -e "$table_file.gz") {
        open($table_fh, '-|', 'zcat', "$table_file.gz")
            or die("could not run zcat on $table_file.gz: $!");
    }
    elsif (-e $table_file) {
        open($table_fh, '<', $table_file)
            or die("could not open model file $table_file: $!");
    }
    else {
        die("could not find model file $table_file");
    }

    open(my $out_fh, '>', $new_file)
        or die("could not open $new_file for writing: $!");

    while (my $entry = <$table_fh>) {
        # The source phrase is everything before the first " ||| ".
        my ($foreign) = split(/ \|\|\| /,$entry,2);
        $foreign =~ s/ $//;
        if (defined($PHRASE_USED{$factors}{$foreign})) {
            print $out_fh $entry;
            $used++;
        }
        $total++;
    }
    # For a pipe, a false close means zcat itself failed.
    close($table_fh)
        or warn("problem closing phrase table $table_file: $! (status $?)\n");
    # Buffered write errors only surface at close time.
    close($out_fh)
        or die("could not finish writing $new_file: $!");

    # An empty phrase table must not trigger a division by zero.
    my $percent = $total ? 100*$used/$total : 0;
    printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",$percent,'%';
}