#!/usr/local/bin/gawk -f
#
# pfsg-vocab --
#	extract vocabulary used in PFSG
#
# usage: pfsg-vocab PFSG-FILE ... > VOCAB
#
# $Header: /home/srilm/devel/utils/src/RCS/pfsg-vocab.gawk,v 1.1 2003/02/18 18:33:04 stolcke Exp $
#
# Collects every node label found on "nodes" lines, drops the labels that
# turn out to be sub-PFSG names (announced on "name" lines), and prints the
# remaining labels one per line.  Output order is whatever order awk uses
# to traverse the array, i.e. unspecified.
#

BEGIN {
	# Label of nodes that emit no word; never reported as vocabulary.
	null = "NULL";
}

# "nodes" line: the labels start at field 3 (field 2 is skipped here --
# presumably it holds the node count; verify against the PFSG format).
$1 == "nodes" {
	# NOTE(review): the loop header was truncated in the damaged source;
	# "i <= NF; i++" is the reconstructed bound -- confirm.
	for (i = 3; i <= NF; i++) {
		if ($i != null) {
			is_word[$i] = 1;
		}
	}
	next;
}

$1 == "name" {
	# sub-pfsg names are not words, and might have been added during the
	# processing of the nodes list
	# (deleting an index that was never set is harmless in awk)
	delete is_word[$2];
}

END {
	# Emit the collected vocabulary, one word per line.
	for (word in is_word) {
		print word;
	}
}