这是一款很好用的工具包

源代码在线查看: make-gt-discounts.gawk

软件大小: 3034 K
上传用户: wanghaihah
关键词: 工具包
下载地址: 免注册下载 普通下载 VIP

相关代码

				#!/usr/local/bin/gawk -f
				#
				# make-gt-discounts --
				#	generate Good-Turing discounting parameters from a count-of-count
				#	file
				#
				#	The purpose of this script is to do the GT computation off-line,
				#	without ngram-count having to read all counts into memory.
				#	The output is compatible with the ngram-count -gt<n> options.
				#
				# $Header: /home/srilm/devel/utils/src/RCS/make-gt-discounts.gawk,v 1.3 2004/11/02 02:00:35 stolcke Exp $
				#
				# usage: make-gt-discounts min=<mincount> max=<maxcount> countfile
				#
				# Input:  every line not starting with '#' supplies a count in field 1
				#         and the corresponding count-of-count in field 2.
				# Output: "mincount N", "maxcount N", then one "discount i coeff" line
				#         for each i in 1..maxcount.  Warnings go to /dev/stderr.
				#
				BEGIN {
				    # Defaults; either may be overridden by a min=/max= assignment
				    # on the command line.
				    min = 1;
				    max = 7;
				}

				/^#/ {
				    # skip comments
				    next;
				}

				{
				    # field 1 = count, field 2 = count-of-count
				    countOfCounts[$1] = $2;
				}

				END {
				    # Code below is essentially identical to GoodTuring::estimate()
				    # (Discount.cc).
				    minCount = min;
				    maxCount = max;

				    # Without singleton counts the estimate cannot be formed, so
				    # discounting is switched off by forcing maxCount to 0.
				    if (!countOfCounts[1]) {
				        printf "warning: no singleton counts\n" >> "/dev/stderr";
				        maxCount = 0;
				    }

				    # The formula needs countOfCounts[maxCount + 1]; shrink maxCount
				    # until that entry is non-zero (or nothing is left).
				    while (maxCount > 0 && countOfCounts[maxCount + 1] == 0) {
				        printf "warning: count of count %d is zero -- lowering maxcount\n", \
				               maxCount + 1 >> "/dev/stderr";
				        maxCount --;
				    }

				    if (maxCount <= 0) {
				        printf "GT discounting disabled\n" >> "/dev/stderr";
				    } else {
				        commonTerm = (maxCount + 1) * \
				                     countOfCounts[maxCount + 1] / \
				                     countOfCounts[1];

				        for (i = 1; i <= maxCount; i++) {
				            if (countOfCounts[i] == 0) {
				                printf "warning: count of count %d is zero\n", \
				                       i >> "/dev/stderr";
				                coeff = 1.0;
				            } else {
				                coeff0 = (i + 1) * countOfCounts[i+1] / \
				                         (i * countOfCounts[i]);
				                # NOTE(review): commonTerm == 1 makes this a division
				                # by zero, which gawk treats as a fatal error -- confirm
				                # whether such input can occur.
				                coeff = (coeff0 - commonTerm) / (1.0 - commonTerm);
				                # A coefficient outside the usable range is replaced
				                # by 1.0 after a warning.
				                if (coeff <= 0 || coeff0 > 1.0) {
				                    printf "warning: discount coeff %d is out of range: %g\n", \
				                           i, coeff >> "/dev/stderr";
				                    coeff = 1.0;
				                }
				            }
				            discountCoeffs[i] = coeff;
				        }
				    }

				    printf "mincount %d\n", minCount;
				    printf "maxcount %d\n", maxCount;

				    for (i = 1; i <= maxCount; i++) {
				        printf "discount %d %g\n", i, discountCoeffs[i];
				    }
				}

相关资源