这是一款很好用的工具包
源代码在线查看: make-kn-counts.gawk
#!/usr/local/bin/gawk -f
#
# make-kn-counts --
#	Modify N-gram counts for KN smoothing
#
# This duplicates the action of ModKneserNey::prepareCounts().
#
# Each input record is treated as an N-gram: NF - 1 word fields followed
# by one final field (the count).  Records are forwarded, possibly
# rewritten, to an "ngram-count ... -read - -sort -write OUTPUT" pipeline,
# which sums the counts of identical N-grams.
#
# Variables (defaults are assigned in BEGIN and may be overridden by the
# caller):
#	order		highest N-gram order kept; longer N-grams are dropped
#	no_max_order	if non-zero, N-grams of length `order' are not output
#	sent_start	sentence-start tag; N-grams beginning with it keep
#			their original counts
#	output		argument given to ngram-count -write ("-" by default)
#	max_per_file	if > 0 (and output is not "-"), a new output file
#			OUTPUT-<n>.ngrams.gz is started every max_per_file
#			records
#	kndiscount1 ... kndiscount9
#			no default; a true value for kndiscountN marks
#			N-grams of order N as subject to KN discounting
#
# $Header: /home/srilm/devel/utils/src/RCS/make-kn-counts.gawk,v 1.4 2002/07/27 00:57:36 stolcke Exp $
#

BEGIN {
    order = 3;
    no_max_order = 0;

    sent_start = "<s>";

    output = "-";
    max_per_file = 0;
    file_no = 0;
    ngram_no = 0;
}

# Close the current ngram-count pipeline (if one is open) and build the
# command string for the next one.  The command itself is only started by
# the first print that is piped into it.
function set_output () {
    # nothing to close on the very first call
    if (output_cmd != "") {
	close(output_cmd);
    }

    ngram_count = "ngram-count -order " order " -read - -sort -write ";

    if (max_per_file > 0) {
	# numbered output files: OUTPUT-1.ngrams.gz, OUTPUT-2.ngrams.gz, ...
	output_cmd = ngram_count output "-" ++file_no ".ngrams.gz";
    } else {
	output_cmd = ngram_count output;
    }
}

# Account for one record about to be written and, when splitting of the
# output is enabled, switch to the next output file after every
# max_per_file records.
function count_ngram_and_rotate () {
    if (max_per_file > 0 && ++ngram_no % max_per_file == 0) {
	ngram_no = 0;
	set_output();
    }
}

# One-time setup, done on the first record rather than in BEGIN --
# presumably so that var=value assignments given as command-line operands
# (which awk processes only after BEGIN) are already in effect.
NR == 1 {
    kndiscount[1] = kndiscount1;
    kndiscount[2] = kndiscount2;
    kndiscount[3] = kndiscount3;
    kndiscount[4] = kndiscount4;
    kndiscount[5] = kndiscount5;
    kndiscount[6] = kndiscount6;
    kndiscount[7] = kndiscount7;
    kndiscount[8] = kndiscount8;
    kndiscount[9] = kndiscount9;

    # output cannot be split into several files when writing to "-"
    if (output == "-") {
	max_per_file = 0;
    }
    set_output();
}

# discard ngrams not used in LM building
NF - 1 > order {
    next;
}

# keep ngrams not subject to KN discounting, or those starting with <s>
# if desired, highest-order ngrams are discarded to save space
NF - 1 == order || !kndiscount[NF - 1] || $1 == sent_start {
    if (!no_max_order || NF - 1 < order) {
	count_ngram_and_rotate();
	print | output_cmd;
    }
}

# modify lower-order ngrams subject to KN discounting
# (NF - 2 is the order of the ngram that remains once the first word has
# been removed; for unigram input it is 0, and kndiscount[0] is never set)
NF - 2 < order && kndiscount[NF - 2] && $2 != sent_start {
    # blank out the first word and the original count
    $1 = $NF = "";

    count_ngram_and_rotate();

    # we let ngram-count add up the new counts for us
    print $0, 1 | output_cmd;
}