A very handy toolkit: a gawk script that creates a Decipher PFSG from an N-gram backoff language model.

Source code (view online): make-ngram-pfsg.gawk

File size: 3034 K
Uploaded by: wanghaihah
Keywords: toolkit

Related code

#!/usr/local/bin/gawk -f
#
# make-ngram-pfsg --
#	Create a Decipher PFSG from an N-gram language model
#
# usage: make-ngram-pfsg [debug=1] [check_bows=1] [maxorder=N] backoff-lm > pfsg
#
# $Header: /home/srilm/devel/utils/src/RCS/make-ngram-pfsg.gawk,v 1.28 2004/11/01 22:25:42 stolcke Exp $
#

#########################################
#
# Output format specific code
#

BEGIN {
    logscale = 2.30258509299404568402 * 10000.5;
    round = 0.5;
    start_tag = "<s>";
    end_tag = "</s>";
    null = "NULL";
    version = 0;
    top_level_name = "";

    if ("pid" in PROCINFO) {
        pid = PROCINFO["pid"];
    } else {
        getline pid < "/dev/pid";
    }
    tmpfile = "/tmp/pfsg." pid;

    # hack to remove tmpfile when killed
    print "" | "trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null";

    debug = 0;

    write_contexts = "";
    read_contexts = "";
}

function rint(x) {
    if (x < 0) {
        return int(x - round);
    } else {
        return int(x + round);
    }
}

function scale_log(x) {
    return rint(x * logscale);
}

function output_for_node(name) {
    num_words = split(name, words);

    if (num_words == 0) {
        print "output_for_node: got empty name" >> "/dev/stderr";
        exit(1);
    } else if (words[1] == bo_name) {
        return null;
    } else if (words[num_words] == end_tag || \
               words[num_words] == start_tag)
    {
        return null;
    } else {
        return words[num_words];
    }
}

function node_exists(name) {
    return (name in node_num);
}

function node_index(name) {
    i = node_num[name];
    if (i == "") {
        i = num_nodes ++;
        node_num[name] = i;
        node_string[i] = output_for_node(name);

        if (debug) {
            print "node " i " = " name ", output = " node_string[i] \
                    >> "/dev/stderr";
        }
    }
    return i;
}

function start_grammar(name) {
    num_trans = 0;
    num_nodes = 0;
    return;
}

function end_grammar(name) {
    if (!node_exists(start_tag)) {
        print start_tag " tag undefined in LM" >> "/dev/stderr";
        exit(1);
    } else if (!node_exists(end_tag)) {
        print end_tag " tag undefined in LM" >> "/dev/stderr";
        exit(1);
    }

    printf "%d pfsg nodes\n", num_nodes >> "/dev/stderr";
    printf "%d pfsg transitions\n", num_trans >> "/dev/stderr";

    # output version id if supplied
    if (version) {
        print "version " version "\n";
    }

    # use optional top-level grammar name if given
    print "name " (top_level_name ? top_level_name : name);
    printf "nodes %s", num_nodes;
    for (i = 0; i < num_nodes; i ++) {
        printf " %s", node_string[i];
    }
    printf "\n";

    print "initial " node_index(start_tag);
    print "final " node_index(end_tag);
    print "transitions " num_trans;
    fflush();

    if (close(tmpfile) < 0) {
        print "error closing tmp file" >> "/dev/stderr";
        exit(1);
    }
    system("/bin/cat " tmpfile);
}

function add_trans(from, to, prob) {
    #print "add_trans " from " -> " to " " prob >> "/dev/stderr";
    num_trans ++;
    print node_index(from), node_index(to), scale_log(prob) > tmpfile;
}

#########################################
#
# Generic code for parsing backoff file
#

BEGIN {
    maxorder = 0;
    grammar_name = "PFSG";
    bo_name = "BO";
    check_bows = 0;
    epsilon = 1e-5;	# tolerance for lowprob detection
}

NR == 1 {
    start_grammar(grammar_name);

    if (read_contexts) {
        while ((getline context < read_contexts) > 0) {
            is_context[context] = 1;
        }
        close(read_contexts);
    }
}

NF == 0 {
    next;
}

/^ngram *[0-9][0-9]*=/ {
    num_grams = substr($2, index($2, "=") + 1);
    if (num_grams > 0) {
        order = substr($2, 1, index($2, "=") - 1);

        # limit maximal N-gram order if desired
        if (maxorder > 0 && order > maxorder) {
            order = maxorder;
        }

        if (order == 1) {
            grammar_name = "UNIGRAM_PFSG";
        } else if (order == 2) {
            grammar_name = "BIGRAM_PFSG";
        } else if (order == 3) {
            grammar_name = "TRIGRAM_PFSG";
        } else {
            grammar_name = "NGRAM_PFSG";
        }
    }
    next;
}

/^\\[0-9]-grams:/ {
    currorder = substr($0, 2, 1);
    next;
}
/^\\/ {
    next;
}

#
# unigram parsing
#
currorder == 1 {
    first_word = last_word = ngram = $2;
    ngram_prefix = ngram_suffix = "";

    # we need all unigram backoffs (except for </s>),
    # so fill in missing bow where needed
    if (NF == 2 && last_word != end_tag) {
        $3 = 0;
    }
}

#
# bigram parsing
#
currorder == 2 {
    ngram_prefix = first_word = $2;
    ngram_suffix = last_word = $3;
    ngram = $2 " " $3;
}

#
# trigram parsing
#
currorder == 3 {
    first_word = $2;
    last_word = $4;
    ngram_prefix = $2 " " $3;
    ngram_suffix = $3 " " $4;
    ngram = ngram_prefix " " last_word;
}

#
# higher-order N-gram parsing
#
currorder >= 4 && currorder <= order {
    first_word = $2;
    last_word = $(currorder + 1);
    ngram_infix = $3;
    for (i = 4; i <= currorder; i ++) {
        ngram_infix = ngram_infix " " $i;
    }
    ngram_prefix = first_word " " ngram_infix;
    ngram_suffix = ngram_infix " " last_word;
    ngram = ngram_prefix " " last_word;
}

#
# shared code for N-grams of all orders
#
currorder <= order {
    prob = $1;
    bow = $(currorder + 2);

    # skip backoffs that exceed maximal order,
    # but always include unigram backoffs
    if (bow != "" && (currorder == 1 || currorder < order)) {
        # remember all LM contexts for creation of N-gram transitions
        bows[ngram] = bow;

        # insert backoff transitions
        if (read_contexts ? (ngram in is_context) : \
                            (currorder < order - 1)) \
        {
            add_trans(bo_name " " ngram, bo_name " " ngram_suffix, bow);
            add_trans(ngram, bo_name " " ngram, 0);
        } else {
            add_trans(ngram, bo_name " " ngram_suffix, bow);
        }

        if (write_contexts) {
            print ngram_suffix > write_contexts;
        }
    }

    if (last_word == start_tag) {
        if (currorder > 1) {
            printf "warning: ignoring ngram into start tag %s -> %s\n", \
                    ngram_prefix, last_word >> "/dev/stderr";
        }
    } else {
        # insert N-gram transition to maximal suffix of target context
        if (last_word == end_tag) {
            target = end_tag;
        } else if (ngram in bows || currorder == 1) {
            # the minimal context is unigram
            target = ngram;
        } else if (ngram_suffix in bows) {
            target = ngram_suffix;
        } else {
            target = ngram_suffix;
            for (i = 3; i <= currorder; i ++) {
                target = substr(target, length($i) + 2);
                if (target in bows) break;
            }
        }

        if (currorder == 1 || \
            (read_contexts ? (ngram_prefix in is_context) : \
                             (currorder < order))) \
        {
            add_trans(bo_name " " ngram_prefix, target, prob);
        } else {
            add_trans(ngram_prefix, target, prob);
        }

        if (check_bows) {
            if (currorder < order) {
                probs[ngram] = prob;
            }

            if (ngram_suffix in probs && \
                probs[ngram_suffix] + bows[ngram_prefix] - prob > epsilon)
            {
                printf "warning: ngram loses to backoff %s -> %s\n", \
                        ngram_prefix, last_word >> "/dev/stderr";
            }
        }
    }
}

END {
    end_grammar(grammar_name);
}
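For reference, the header's usage line translates into invocations like the ones below. This is a minimal sketch: lm.arpa and lm.pfsg are hypothetical file names, and the input is assumed to be an ARPA-format backoff LM (e.g. one produced by SRILM's ngram-count).

    # convert an ARPA backoff LM into a Decipher PFSG
    gawk -f make-ngram-pfsg.gawk lm.arpa > lm.pfsg

    # the usage line's optional settings are ordinary gawk command-line
    # variable assignments, placed before the input file:
    #   debug=1       trace PFSG node creation on stderr
    #   check_bows=1  warn when an explicit N-gram scores worse than its backoff path
    #   maxorder=N    cap the N-gram order actually used
    gawk -f make-ngram-pfsg.gawk maxorder=2 check_bows=1 lm.arpa > lm.pfsg

Note that the transition weights in the output are fixed-point integers: scale_log() multiplies each base-10 log probability by ln(10) * 10000.5 and rounds, so a log10 probability of -1 comes out as roughly -23027.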
