解码器是基于短语的统计机器翻译系统的核心模块

源代码在线查看: filter-phrase-table.pl.svn-base

软件大小: 5827 K
上传用户: lyyfengyutongzh
关键词: 解码器 机器翻译系统 核心 模块
下载地址: 免注册下载 普通下载 VIP

相关代码

				#!/usr/bin/perl -w

				# by Philipp Koehn, de-augmented by Evan Herbst
				# filter a phrase table for a specific input corpus
				# arguments: phrasetable_filename input_filename factor_index (0...)
				# outputs to phrasetable_filename.short
				#
				# similar function to filter-model-given-input.pl, but only operates
				# on the phrase table and doesn't require that any subdirectories exist
				#
				# NOTE(review): the copy this was restored from had lost every "<...>"
				# span (readline operators and the tails of the C-style for-loop
				# headers). The phrase-window bounds and the handle reads below were
				# reconstructed from the surrounding code ($MAX_LENGTH, the "max length"
				# message, the index variables); confirm against the upstream script.

				use strict;
				use warnings;

				# Longest source phrase (in words) that is looked up in the phrase table.
				my $MAX_LENGTH = 10;

				die("usage: $0 phrasetable_filename input_filename factor_index\n")
				    unless @ARGV >= 3;
				my ($file, $input, $source_factor) = @ARGV;

				# The factor spec is one index or a comma-separated list of indices.
				die("invalid factor index '$source_factor' (expected e.g. 0 or 0,1)\n")
				    unless $source_factor =~ /^\d+(?:,\d+)*$/;
				my @factors  = split(/,/, $source_factor);
				my $new_file = "$file.short";

				# get the phrase pairs appearing in the input text
				my %phrase_used;
				die("could not find input file $input") unless -e $input;
				open(my $input_fh, '<', $input)
				    or die("could not open input file $input: $!");
				while (my $line = <$input_fh>) {
				    # chomp, not chop: a final line without a newline must keep its
				    # last character.
				    chomp($line);
				    # A leading blank would otherwise produce an empty first token.
				    $line =~ s/^ +//;

				    # Project every token onto the selected factors once per line.
				    my @words;
				    for my $token (split(/ +/, $line)) {
				        my @token_factors = split(/\|/, $token);
				        push @words,
				            join('|',
				                 map { defined $token_factors[$_] ? $token_factors[$_] : '' }
				                 @factors);
				    }

				    # Record every contiguous span of at most $MAX_LENGTH words.
				    for my $start (0 .. $#words) {
				        my $last = $start + $MAX_LENGTH - 1;
				        $last = $#words if $last > $#words;
				        for my $end ($start .. $last) {
				            $phrase_used{ join(' ', @words[$start .. $end]) }++;
				        }
				    }
				}
				close($input_fh);

				# filter the phrase table
				my ($used, $total) = (0, 0);
				print STDERR "filtering $file -> $new_file...\n";

				# Gzipped tables are read through zcat. The list form of the pipe open
				# hands the file name to zcat directly instead of through a shell.
				my $table_fh;
				if (-e $file && $file =~ /\.gz$/) {
				    open($table_fh, '-|', 'zcat', $file)
				        or die("could not run zcat on $file: $!");
				}
				elsif (!-e $file && -e "$file.gz") {
				    open($table_fh, '-|', 'zcat', "$file.gz")
				        or die("could not run zcat on $file.gz: $!");
				}
				elsif (-e $file) {
				    open($table_fh, '<', $file)
				        or die("could not open model file $file: $!");
				}
				else {
				    die("could not find model file $file");
				}

				open(my $out_fh, '>', $new_file)
				    or die("could not open output file $new_file: $!");

				while (my $entry = <$table_fh>) {
				    # The source phrase is everything before the first " ||| ".
				    my ($foreign) = split(/ \|\|\| /, $entry, 2);
				    $foreign =~ s/ $//;
				    if (exists $phrase_used{$foreign}) {
				        print {$out_fh} $entry;
				        $used++;
				    }
				    $total++;
				}
				close($table_fh);
				# Buffered write errors only surface at close.
				close($out_fh) or die("could not write $new_file: $!");

				# An empty table must not trigger a division by zero.
				my $percent = $total ? 100 * $used / $total : 0;
				printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",
				    $percent, '%';

相关资源