这是一款很好用的工具包

源代码在线查看: rank-vocab.gawk

软件大小: 3034 K
上传用户: wanghaihah
关键词: 工具包
下载地址: 免注册下载 普通下载 VIP

相关代码

				#!/usr/local/bin/gawk -f
				#
				# rank-vocab --
				#	Given K different rankings of candidate vocabularies, and
				#	a held-out optimization unigram count file, optimize the
				#	combined ranking of words
				#
				# usage: rank-vocab counts words1 words2 ... wordsK
				#
				#	counts		lines of "word count" (held-out unigram counts)
				#	words1..K	one word per line, best-ranked word first
				#
				# output:
				#	INITIAL OOVS = <total held-out count>
				#	RANK <n> WORD <word> OOVS <remaining count>	(one line per word added)
				#
				# Requires gawk: the rules below rely on the gawk-specific ARGIND variable.
				#
				# $Header: /home/srilm/devel/utils/src/RCS/rank-vocab.gawk,v 1.2 2004/11/02 02:00:35 stolcke Exp $
				#

				BEGIN {
				    num_sources = 0;
				    num_output = 0;
				    num_oovs = 0;

				    debug = 0;
				}

				# First file: held-out unigram counts.  Before any word is added to the
				# vocabulary, every counted token is an OOV.
				ARGIND == 1 {
				    # accumulate rather than overwrite, so that a word listed more than
				    # once stays consistent with the num_oovs total
				    word_count[$1] += $2;

				    num_oovs += $2;

				    next;
				}

				# Remaining files: the k-th ranked word list; the line order is the rank.
				ARGIND > 1 {
				    k = ARGIND - 1;
				    num_sources = k;

				    num_words[k] ++;

				    word_ranked[k, num_words[k]] = $1;
				    next;
				}

				# print the ranked word list of source k (debugging aid)
				# (i is a local variable)
				function dump_words(k,    i) {
				    print "source " k " words:";

				    for (i = 1; i <= num_words[k]; i ++) {
				        print i, word_ranked[k,i];
				    }
				}

				# find the next word from source k that occurs in the test set
				# return its rank in source k, or 0 if no more words are available
				# (j is a local variable)
				function find_next(k,    j) {
				    for (j = last_chosen[k] + 1; j <= num_words[k]; j ++) {
				        if (word_count[word_ranked[k,j]] > 0) {
				            if (debug) {
				                print "next word rank for source " k ": " j >> "/dev/stderr";
				            }

				            return j;
				        }
				    }
				    if (debug) {
				        print "no more words from source " k >> "/dev/stderr";
				    }
				    return 0;
				}

				# compute gain (number of OOV tokens reduced per number of word types added)
				# by adding the next word from source k
				# return -1 if source k has no more words to offer
				# (g is a local variable)
				function compute_gain(k,    g) {
				    if (next_word[k] == 0) {
				        # no more words in source k, no gain
				        return -1;
				    }

				    # all words ranked between the last chosen one and the next useful
				    # one have to be added as well, hence the denominator
				    g = word_count[word_ranked[k,next_word[k]]] / (next_word[k] - last_chosen[k]);
				    if (debug) {
				        # debug output goes to stderr so it cannot mix with the ranking
				        print "next gain for source " k " = " g >> "/dev/stderr";
				    }
				    return g;
				}

				END {
				#   for (k = 1; k <= num_sources; k ++) {
				#       dump_words(k);
				#   }

				    for (k = 1; k <= num_sources; k ++) {
				        last_chosen[k] = 0;
				        next_word[k] = find_next(k);
				        gain[k] = compute_gain(k);
				    }

				    print "INITIAL OOVS = " num_oovs;

				    # add words until no more gain possible (i.e., until all source
				    # words have been used up)
				    while (1) {
				        best_gain = -1;
				        best_source = 0;

				        # find next best source to pick word from
				        # (strict comparison: ties go to the lowest-numbered source)
				        for (k = 1; k <= num_sources; k ++) {
				            if (gain[k] > best_gain) {
				                best_source = k;
				                best_gain = gain[k];
				            }
				        }

				        if (best_gain < 0) break;

				        # process all the words from the best source up to the one chosen
				        for (i = last_chosen[best_source] + 1; \
				             i <= next_word[best_source]; \
				             i ++) {
				            word_chosen = word_ranked[best_source,i];

				            if (debug) {
				                print "source = " best_source \
				                      " gain = " best_gain \
				                      " word = " word_chosen >> "/dev/stderr";
				            }

				            # output the word if it hasn't been already
				            if (!was_output[word_chosen]) {
				                num_output ++;

				                num_oovs -= word_count[word_chosen];

				                print "RANK " num_output " WORD " word_chosen \
				                      " OOVS " num_oovs;

				                was_output[word_chosen] = 1;
				            }
				        }

				        # update the statistics for the source that was chosen
				        last_chosen[best_source] = next_word[best_source];
				        next_word[best_source] = find_next(best_source);
				        gain[best_source] = compute_gain(best_source);
				    }
				}

相关资源