这是一款很好用的工具包
源代码在线查看: continuous-ngram-count.gawk
#!/usr/local/bin/gawk -f
#
# continuous-ngram-count --
#	Generate ngram counts ignoring line breaks
#
# usage: continuous-ngram-count order=ORDER textfile | ngram-count -read -
#
# Every whitespace-separated word of the input is fed through a ring buffer
# holding the last `order` words.  For each word, one line is printed for
# every ngram (length 1 up to `order`) that ends in that word, each with a
# count of 1.  Because the buffer is never reset at end of line, ngrams
# span line breaks.
#
# $Header: /home/srilm/devel/utils/src/RCS/continuous-ngram-count,v 1.1 1998/08/24 00:52:30 stolcke Exp $
#
BEGIN {
	# Default ngram order; an order=ORDER operand on the command line is
	# processed after BEGIN and therefore overrides this value.
	order = 3;

	head = 0;	# next position in ring buffer
}

#
# process_word(w) --
#	Store word w in the ring buffer and print every ngram ending in w,
#	shortest first, each followed by the count 1.
#
#	w		the word to append
#	j, w1, ngram	local scratch variables (AWK extra-parameter idiom);
#			callers pass only w
#
function process_word(w,    j, w1, ngram) {
	buffer[head] = w;

	ngram = "";
	for (j = 0; j < order; j ++) {
		# Walk backwards from the newest word; adding `order` keeps the
		# index non-negative before taking the modulus.
		w1 = buffer[(head + order - j) % order];

		# An empty slot means fewer than `order` words have been seen.
		if (w1 == "") {
			break;
		}

		# ngram always ends in a blank, so appending 1 yields "w1 ... wk 1".
		ngram = w1 " " ngram;
		print ngram 1;
	}
	head = (head + 1) % order;
}

# Main rule: feed every field of every input record through the ring buffer.
{
	for (i = 1; i <= NF; i ++) {
		process_word($i);
	}
}