这是一款很好用的工具包

源代码在线查看: make-hiddens-lm.gawk

软件大小: 3034 K
上传用户: wanghaihah
关键词: 工具包
下载地址: 免注册下载 普通下载 VIP

相关代码

				#!/usr/local/bin/gawk -f
				#
				# make-hiddens-lm --
				#	Create a hidden-sentence-boundary ngram LM from a standard one
				#
				# This script edits an ARPA backoff model file as follows:
				#
				# 1 - ngrams involving <s> and </s> are duplicated using the
				#     hidden segment boundary token <#s>.
				# 2 - ngrams starting with <s> are eliminated.
				# 3 - the backoff weight of <s> is set to 1.
				#     this together with the previous change sets all probabilities
				#     conditioned on <s> to the respective marginal probabilities
				#     without <s>.
				# 4 - ngrams ending in </s> get probability 1.
				#     this avoids an end-of-sentence penalty in rescoring.
				#
				# Variables read but never assigned by this script (so they have to be
				# supplied by the caller, e.g. as command-line assignments):
				#	highorder	ngram sections of order < highorder are passed
				#			through unmodified (padded with a 0 backoff weight)
				#	no_s_start	if true, apply changes 2 and 3 above
				#	no_s_end	if true, apply change 4 above
				#
				# NOTE: the boundary tokens below had been stripped from this copy of the
				# script (they look like HTML tags to a web scraper) and were restored:
				# <s> and </s> are the standard sentence boundary tags; <#s> is the hidden
				# boundary tag -- verify against the pristine SRILM distribution.
				#
				# $Header: /home/srilm/devel/utils/src/RCS/make-hiddens-lm.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
				#
				BEGIN {
					sent_start = "<s>";
					sent_end = "</s>";
					hiddens = "<#s>";

					# when nonzero, the original <s> / </s> ngrams are dropped
					# instead of being kept next to their <#s> duplicates
					remove_old_ngrams = 0;
				}

				# blank lines are copied unchanged
				NF == 0 {
					print;
					next;
				}

				# "ngram N=count" header lines are copied unchanged
				# (the counts are NOT adjusted for the ngrams added below)
				/^ngram *[0-9][0-9]*=/ {
					print;
					next;
				}

				# "\N-grams:" section header: remember the current ngram order
				# (single digit only); the line itself is printed by the next rule
				/^.[0-9]-grams:/ {
					currorder = substr($0, 2, 1);
				}

				# any line starting with a backslash (\data\, \N-grams:, \end\)
				/^\\/ {
					print;
					next;
				}

				# ngrams below the highest order are passed through; an explicit
				# backoff weight of 0 is appended when the line has none
				currorder && currorder < highorder {
					if (NF < currorder + 2) {
						print $0 "\t0";
					} else {
						print;
					}
					next;
				}

				# ngrams mentioning a sentence boundary token
				$0 ~ sent_start || $0 ~ sent_end {
					# untouched copy of the line, used to build the <#s> duplicate
					oldline = $0;

					# modify sentence initial/final ngrams
					if ($2 == sent_end && currorder == 1) {
						# remember the </s> unigram prob for the <#s> unigram
						sos_uniprob = $1;

						if (no_s_end) {
							# set </s> prob to 1 (field 1 is a log prob)
							$1 = 0;
						}
						if (!remove_old_ngrams) {
							print;
						}
						# no duplicate here: the <#s> unigram is generated
						# from the <s> unigram line below
						next;
					} else if ($2 == sent_start && currorder == 1) {
						if (no_s_start) {
							# set <s> backoff weight to 1 (log weight 0)
							$3 = 0;
						}
						if (!remove_old_ngrams) {
							print;
						}

						# use unigram prob from </s>
						if (sos_uniprob == "") {
							print "warning: could not find " sent_end " unigram" \
								>> "/dev/stderr";
						} else {
							oldline = sos_uniprob "\t" $2 "\t" $3;
						}
					} else if ($2 == sent_start) {
						# suppress other ngrams starting with <s>
						if (!no_s_start && !remove_old_ngrams) {
							print;
						}
					} else if ($(currorder + 1) == sent_end) {
						if (no_s_end) {
							# set </s> prob to 1 (field 1 is a log prob)
							$1 = 0;
						}
						if (!remove_old_ngrams) {
							print;
						}
					}

					# replace <s> and </s> with <#s> and output result
					gsub(sent_start, hiddens, oldline);
					gsub(sent_end, hiddens, oldline);
					print oldline;
					next;
				}

				# everything else is copied unchanged
				{ print }

相关资源