这是一款很好用的工具包
源代码在线查看: wlat-to-pfsg.gawk
#!/usr/local/bin/gawk -f # # wlat-to-pfsg -- # Create a Decipher PFSG from a nbest-lattice(1) word lattice # # usage: wlat-to-pfsg word-lattice> pfsg # # $Header: /home/srilm/devel/utils/src/RCS/wlat-to-pfsg.gawk,v 1.5 2004/11/02 02:00:35 stolcke Exp $ # ######################################### # # Output format specific code # BEGIN { logscale = 10000.5; round = 0.5; null = "NULL"; tmpfile = "tmp.pfsg.trans"; } function rint(x) { if (x < 0) { return int(x - round); } else { return int(x + round); } } function scale_log(x) { return rint(x * logscale); } function start_grammar(name) { num_trans = 0; num_nodes = 0; return; } function end_grammar(name) { printf "%d pfsg nodes\n", num_nodes >> "/dev/stderr"; printf "%d pfsg transitions\n", num_trans >> "/dev/stderr"; print "name " name; printf "nodes %s", num_nodes; for (i = 0; i < num_nodes; i ++) { if (node_string[i] == "") { print "warning: node word " i " is undefined" >> "/dev/stderr"; node_string[i] = null; } printf " %s", node_string[i]; } printf "\n"; print "initial " initial_node; print "final " final_node; print "transitions " num_trans; fflush(); close(tmpfile); system("/bin/cat " tmpfile "; /bin/rm -f " tmpfile); } function set_initial(node) { initial_node = node; } function set_final(node) { final_node = node; } function add_node(node, word) { if (node >= num_nodes) { num_nodes = node + 1; } node_string[node] = word; } function add_trans(from, to, prob) { num_trans ++; print from, to, scale_log(prob) > tmpfile; } ######################################### # # Word lattice parsing # BEGIN { grammar_name = "PFSG"; } NF == 0 { next; } $1 == "name" { grammar_name = $2; next; } # # nbest-lattice output (without -use-mesh) # $1 == "version" { if ($2 != 2) { print "need word lattice version 2" >> "/dev/stderr"; exit 1; } } $1 == "initial" { start_grammar(grammar_name); set_initial($2); } $1 == "final" { set_final($2); } $1 == "node" { add_node($2, $3); posterior = $5; if (posterior == 0) { print "node " $2 ": zero posterior, omitting it" >> "/dev/stderr"; next; } for (i = 6; i if ($(i + 1) == 0) { print "node " $2 ": omitting zero posterior transition" \ >> "/dev/stderr"; } else { add_trans($2, $i, log($(i + 1)/posterior)); } } } # # nbest-lattice -use-mesh output (confusion networks) # $1 == "numaligns" { start_grammar(grammar_name); numaligns = $2; # # use nodes 0 ... numaligns between the words # then allocate additional nodes for each word set_initial(0); set_final(numaligns); for (i = 0; i add_node(i, null); } numnodes = numaligns + 1; } $1 == "align" { pos = $2; for (i = 3; i word = $i; posterior = $(i + 1); if (posterior == 0) { print "align " pos ", word " word \ ": zero posterior, omitting it" >> "/dev/stderr"; continue; } if (word == "*DELETE*") { add_trans(pos, pos + 1, log(posterior)); } else { add_node(numnodes, word); add_trans(pos, numnodes, log(posterior)); add_trans(numnodes, pos + 1, 0); numnodes ++; } } } END { end_grammar(grammar_name); }