解码器是基于短语的统计机器翻译系统的核心模块
源代码在线查看: filter-phrase-table.pl.svn-base
#!/usr/bin/perl -w

# by Philipp Koehn, de-augmented by Evan Herbst
#
# Filter a phrase table for a specific input corpus.
#
# arguments: phrasetable_filename input_filename factor_index (0...)
# outputs to phrasetable_filename.short
#
# Similar function to filter-model-given-input.pl, but only operates
# on the phrase table and doesn't require that any subdirectories exist.
#
# factor_index is either a single index or a comma-separated list of
# indices; the selected factors of every input word are re-joined with "|"
# before being compared with the first field of each phrase-table entry.
# If phrasetable_filename ends in .gz, or does not exist while
# phrasetable_filename.gz does, the table is read through zcat.

use strict;
use warnings;

# Longest source phrase (in words) that is collected from the input text.
my $MAX_LENGTH = 10;

my ($file, $input, $source_factor) = @ARGV;
die "usage: $0 phrasetable_filename input_filename factor_index\n"
    unless defined $source_factor && length $source_factor;

my $new_file       = "$file.short";
my @factor_indices = split /,/, $source_factor;

# --- get the phrases appearing in the input text -------------------------

die("could not find input file $input") unless -e $input;
open(my $input_fh, '<', $input)
    or die("could not open input file $input: $!");

my %phrase_used;
while (my $line = <$input_fh>) {
    # chomp (not chop) so that a final line lacking a newline keeps its
    # last character.
    chomp($line);

    # Reduce every word to the requested factors once per line instead of
    # once per phrase that contains it.
    my @tokens;
    for my $word (split / +/, $line) {
        my @word_factors = split /\|/, $word;
        push @tokens, join('|', @word_factors[@factor_indices]);
    }

    # Record every contiguous span of at most $MAX_LENGTH words.
    for my $start (0 .. $#tokens) {
        my $last = $start + $MAX_LENGTH - 1;
        $last = $#tokens if $last > $#tokens;
        for my $end ($start .. $last) {
            $phrase_used{ join(' ', @tokens[$start .. $end]) } = 1;
        }
    }
}
close($input_fh);

# --- filter the phrase table ---------------------------------------------

print STDERR "filtering $file -> $new_file...\n";

my $table_fh = open_phrase_table($file);
open(my $out_fh, '>', $new_file)
    or die("could not open $new_file for writing: $!");

my ($used, $total) = (0, 0);
while (my $entry = <$table_fh>) {
    # Only the source side (text before the first " ||| ") decides whether
    # the entry is kept; the entry itself is copied through unchanged.
    my ($foreign) = split(/ \|\|\| /, $entry, 2);
    $foreign =~ s/ $//;
    if (exists $phrase_used{$foreign}) {
        print {$out_fh} $entry;
        $used++;
    }
    $total++;
}

# For a zcat pipe, close() also reports a non-zero exit of the child.
close($table_fh) or warn("problem while closing phrase table $file\n");
# Buffered write errors only surface when the output handle is closed.
close($out_fh) or die("could not finish writing $new_file: $!");

# An empty phrase table must not cause a division by zero.
my $percent = $total ? 100 * $used / $total : 0;
printf STDERR "%d of %d phrases pairs used (%.2f%%) - note: max length %d\n",
    $used, $total, $percent, $MAX_LENGTH;

# Open the phrase table for reading and return the filehandle.
# A name ending in .gz, or a missing file that has a .gz sibling, is read
# through zcat. The list form of the pipe open keeps the file name away
# from the shell. Dies if neither file exists or the open fails.
sub open_phrase_table {
    my ($path) = @_;

    my $fh;
    if (-e $path && $path =~ /\.gz$/) {
        open($fh, '-|', 'zcat', $path)
            or die("could not run zcat on $path: $!");
    }
    elsif (!-e $path && -e "$path.gz") {
        open($fh, '-|', 'zcat', "$path.gz")
            or die("could not run zcat on $path.gz: $!");
    }
    elsif (-e $path) {
        open($fh, '<', $path)
            or die("could not open model file $path: $!");
    }
    else {
        die("could not find model file $path");
    }
    return $fh;
}