#!/usr/local/bin/gawk -f
#
# sentid-to-sclite --
#	convert sentid transcription format to sclite 'trn' format
#
# $Header: /home/srilm/devel/utils/src/RCS/sentid-to-sclite.gawk,v 1.4 2001/03/27 07:32:25 stolcke Exp $
#
# Each input line of the form
#
#	sentid word1 word2 ....
#
# is written out as
#
#	word1 word2 ... (sentid)
#
# The sentid is rewritten along the way: sclite uses the first portion
# of the id (the part before the underscore) as a speaker label to group
# results, so the id is massaged with the aim of leaving exactly one
# underscore in it.
#

#
# attach_side_tags --
#	Drop the '-' or '_' that precedes a side/channel tag (A, B, ch1, ch2)
#	so that the tag becomes part of the preceding id component.
#	For each tag only its first occurrence in the id is rewritten.
#	Returns the modified id; n, i and tags are locals.
#
function attach_side_tags(id,    n, i, tags) {
	# tags are tried in this fixed order: A, B, ch1, ch2
	n = split("A B ch1 ch2", tags, " ");
	for (i = 1; i <= n; i++) {
		sub("[-_]" tags[i], tags[i], id);
	}
	return id;
}

{
	# first pass over the side/channel tags
	utt = attach_side_tags($1);

	# Blank out the id field; this rebuilds $0 from the remaining
	# fields, so the record now starts with the output field separator.
	$1 = "";

	# remove the separator after a lower-case corpus tag that is
	# immediately followed by a digit, if any
	if (utt ~ /^[a-z][a-z]*[-_][0-9]/) {
		sub("[-_]", "", utt);
	}

	# second pass: sub() only replaces one match per call, so this
	# picks up a further tagged separator left over from the first pass
	utt = attach_side_tags(utt);

	# work around problems with negative start times in sentids
	sub("_-", "_m", utt);

	#
	# an id with neither '_' nor '-' has no speaker portion: fake one
	# out of its first three characters (this works for ATIS ...)
	#
	if (utt !~ /[-_]/) {
		utt = substr(utt, 1, 3) "_" utt;
	}

	print $0, "(" utt ")";
}