这是一款很好用的工具包

源代码在线查看: compute-oov-rate.gawk

软件大小: 3034 K
上传用户: wanghaihah
关键词: 工具包
下载地址: 免注册下载 普通下载 VIP

相关代码

				#!/usr/local/bin/gawk -f
				#
				# compute-oov-rate --
				#	Compute OOV word rate from a vocabulary and a unigram count file
				#
				# usage: compute-oov-rate vocab countfile ...
				#
				# The first file argument is read as the vocabulary (first field of
				# each line); every later file argument is read as "word count" records.
				#
				# Optional variables (never assigned in this script, so they must be
				# supplied by the caller, e.g. with -v):
				#	write_oov_words	file that receives OOV records that are not fragments
				#	write_oov_frags	file that receives OOV records that are fragments
				#
				# Assumes unigram counts do not have repeated words.
				#
				# $Header: /home/srilm/devel/utils/src/RCS/compute-oov-rate.gawk,v 1.8 2003/03/08 03:59:39 stolcke Exp $
				#

				BEGIN {
					# high bit characters also detect multibyte characters
					letter = "[[:alpha:]\x80-\xFF]";

					# Fall back to explicit ASCII ranges when the pattern above fails
					# to match a plain letter (presumably a regex engine without
					# POSIX character-class support).
					if ("x" !~ letter) letter = "[A-Za-z\x80-\xFF]";
				}

				#
				# percent --
				#	Return 100 * num / den, or 0 when den is zero, so that empty
				#	input produces "0.00%" instead of a fatal division by zero.
				#
				function percent(num, den) {
					return den ? 100 * num / den : 0;
				}

				#
				# is_fragment --
				#	True if word ends in "<letter>-" or starts with "-<letter>".
				#
				function is_fragment(word) {
					return word ~ (letter "-$") || word ~ ("^-" letter);
				}

				#
				# Read vocab
				#
				ARGIND == 1 {
					vocab[$1] = 1;
				}

				#
				# Read counts
				#
				ARGIND > 1 {
					# Skip blank lines, sentence-boundary tags and the pause token.
					# NOTE(review): the copy this was restored from compared $1 against
					# "" twice; the angle-bracket tags look like they were stripped as
					# markup -- confirm against the upstream script.
					if ($1 == "" || $1 == "<s>" || $1 == "</s>" || $1 == "-pau-") {
						next;
					}

					# Use "in" so that looking up an unknown word does not create
					# an array element for it.
					in_vocab = ($1 in vocab);
					fragment = is_fragment($1);

					total_count += $2;
					total_types ++;

					if (!in_vocab) {
						oov_count += $2;
						oov_types ++;

						if (!fragment) {
							if (write_oov_words) {
								print > write_oov_words;
							}
						} else {
							if (write_oov_frags) {
								print > write_oov_frags;
							}
						}
					}

					if (!fragment) {
						total_nofrag_count += $2;
						total_nofrag_types ++;

						if (!in_vocab) {
							oov_nofrag_count += $2;
							oov_nofrag_types ++;
						}
					}
				}

				#
				# Report token and type OOV rates, with and without fragments
				#
				END {
					printf "OOV tokens: %d / %d (%.2f%%) ", \
							oov_count, total_count, \
							percent(oov_count, total_count);
					printf "excluding fragments: %d / %d (%.2f%%)\n", \
							oov_nofrag_count, total_nofrag_count, \
							percent(oov_nofrag_count, total_nofrag_count);
					printf "OOV types: %d / %d (%.2f%%) ", \
							oov_types, total_types, \
							percent(oov_types, total_types);
					printf "excluding fragments: %d / %d (%.2f%%)\n", \
							oov_nofrag_types, total_nofrag_types, \
							percent(oov_nofrag_types, total_nofrag_types);
				}

相关资源