moses开源的机器翻译系统

源代码在线查看: filter-phrase-table.pl

软件大小: 8836 K
上传用户: myhpgnl
关键词: moses 开源 机器翻译系统
下载地址: 免注册下载 普通下载 VIP

相关代码

				#!/usr/bin/perl -w
				
				# $Id: filter-phrase-table.pl 1307 2007-03-14 22:22:36Z hieuhoang1972 $
				#by Philipp Koehn, de-augmented by Evan Herbst
				#filter a phrase table for a specific input corpus
				#arguments: phrasetable_filename input_filename factor_index (0...)
				#outputs to phrasetable_filename.short
				
				#similar function to filter-model-given-input.pl, but only operates
				#on the phrase table and doesn't require that any subdirectories exist
				
				use strict;
				use warnings;
				
				# longest source phrase (in words) that is collected from the input text
				my $MAX_LENGTH = 10;
				
				my ($file, $input, $source_factor) = @ARGV;
				my $usage = "usage: $0 phrasetable_filename input_filename factor_index\n";
				die($usage)
				    unless defined($file) && defined($input) && defined($source_factor);
				# factor_index is one index or a comma-separated list of indices (0 or 0,1)
				die("invalid factor index '$source_factor'\n$usage")
				    unless $source_factor =~ /\A\d+(?:,\d+)*\z/;
				
				my $new_name = "$file.short";
				
				# get the phrases appearing in the input text
				my $PHRASE_USED = collect_input_phrases($input, $source_factor, $MAX_LENGTH);
				
				# filter the phrase table
				print STDERR "filtering $file -> $new_name...\n";
				my ($used, $total) = filter_phrase_table($file, $new_name, $PHRASE_USED);
				
				# an empty table has no meaningful ratio; report 0 instead of dividing by zero
				my $percent = $total ? 100 * $used / $total : 0;
				printf STDERR "%d of %d phrases pairs used (%.2f%%) - note: max length %d\n",
				    $used, $total, $percent, $MAX_LENGTH;
				
				# Reduce one input token ("f0|f1|...") to the factors listed in $factor_idx
				# (array ref of indices), joined by "|" in the requested order.
				# A factor the token does not have is rendered as the empty string.
				sub project_factors {
				    my ($word, $factor_idx) = @_;
				    my @word_factor = split(/\|/, $word);
				    return join('|',
				        map { defined($word_factor[$_]) ? $word_factor[$_] : '' } @$factor_idx);
				}
				
				# Read $input_file line by line and return a hash ref whose keys are all
				# word sequences of 1..$max_length consecutive tokens found on a line,
				# each token reduced to the factors named in $factor_spec ("0" or "0,1").
				# Dies if the input file is missing or cannot be opened.
				sub collect_input_phrases {
				    my ($input_file, $factor_spec, $max_length) = @_;
				
				    die("could not find input file $input_file") unless -e $input_file;
				    open(my $in, '<', $input_file)
				        or die("could not open input file $input_file: $!");
				
				    my @factor_idx = split(/,/, $factor_spec);
				    my %phrase_used;
				    while (my $line = <$in>) {
				        # chomp, not chop: a last line without newline must keep its last char
				        chomp($line);
				        # project every token once instead of once per (start, length) pair
				        my @token = map { project_factors($_, \@factor_idx) }
				                    split(/ +/, $line);
				        for my $start (0 .. $#token) {
				            my $last = $start + $max_length - 1;
				            $last = $#token if $last > $#token;
				            # grow the phrase one token at a time, recording every prefix
				            my $phrase = $token[$start];
				            $phrase_used{$phrase}++;
				            for my $end ($start + 1 .. $last) {
				                $phrase .= " $token[$end]";
				                $phrase_used{$phrase}++;
				            }
				        }
				    }
				    close($in);
				    return \%phrase_used;
				}
				
				# Open the phrase table for reading and return the file handle.
				# A name ending in .gz, or a missing file that has a "$table_file.gz"
				# sibling, is read through zcat; any other existing file is read directly.
				# Dies if neither file exists or the open fails.
				sub open_phrase_table {
				    my ($table_file) = @_;
				
				    my ($path, $gzipped);
				    if (-e $table_file) {
				        ($path, $gzipped) = ($table_file, $table_file =~ /\.gz$/ ? 1 : 0);
				    }
				    elsif (-e "$table_file.gz") {
				        ($path, $gzipped) = ("$table_file.gz", 1);
				    }
				    else {
				        die("could not find model file $table_file");
				    }
				
				    my $fh;
				    if ($gzipped) {
				        # list form: the file name is passed to zcat without going through a shell
				        open($fh, '-|', 'zcat', $path)
				            or die("could not run zcat on $path: $!");
				    }
				    else {
				        open($fh, '<', $path)
				            or die("could not open model file $path: $!");
				    }
				    return $fh;
				}
				
				# Copy to $out_file every entry of the phrase table $table_file whose source
				# side (the text before the first " ||| ") is a key of %$phrase_used.
				# Returns (number of entries kept, number of entries read).
				# Dies if a file cannot be opened, read completely, or written.
				sub filter_phrase_table {
				    my ($table_file, $out_file, $phrase_used) = @_;
				
				    my $table = open_phrase_table($table_file);
				    open(my $out, '>', $out_file)
				        or die("could not write $out_file: $!");
				
				    my ($kept, $seen) = (0, 0);
				    while (my $entry = <$table>) {
				        my ($foreign) = split(/ \|\|\| /, $entry, 2);
				        $foreign =~ s/ $//;
				        if (exists($phrase_used->{$foreign})) {
				            print {$out} $entry;
				            $kept++;
				        }
				        $seen++;
				    }
				
				    # for the zcat pipe a failed close means the decompressor exited non-zero
				    close($table)
				        or die("error while reading $table_file: "
				               . ($! || "decompressor exit status $?"));
				    # buffered write errors only surface when the output handle is closed
				    close($out)
				        or die("could not finish writing $out_file: $!");
				
				    return ($kept, $seen);
				}
							

相关资源