20201201

Kaldi

# Work inside the corpus directory so the bay-*.lab glob and the text output
# resolve there.
cd /home/johanf/data/ceqwong/moutdata/about_Bayek_River/
# text: one "<utt-id> <line>" row for every line of every .lab file, where
# <utt-id> is the file name without its .lab extension.
# The extension is stripped from a copy (utt) with an anchored, escaped
# pattern: an unanchored /.lab/ matches the first "any char + lab" anywhere in
# the name, and editing FILENAME itself lets later lines of the same file be
# stripped a second time.
awk '{ utt = FILENAME; sub(/\.lab$/, "", utt); print utt, $0 }' bay-*.lab > text
# Build segments, wav.scp and utt2spk in a single pass over the recordings.
# Each file is written through its own descriptor (3, 4, 5), opened once for
# the whole loop:
#   segments: <utt-id> <utt-id> 0 <duration reported by soxi -D>
#   wav.scp : <utt-id> <absolute path of the wav file>
#   utt2spk : <utt-id> bay
for f in bay-*.wav; do
  # Without nullglob an unmatched pattern is iterated literally; skip it.
  [ -e "$f" ] || continue
  utt=${f%.wav}
  # Skip a recording soxi cannot read instead of emitting a segments row
  # with a missing end time; it is then left out of all three files.
  dur=$(soxi -D "$f") || { printf 'skipping %s: soxi failed\n' "$f" >&2; continue; }
  printf '%s %s 0 %s\n' "$utt" "$utt" "$dur" >&3
  printf '%s %s\n' "$utt" "$(readlink -f "$f")" >&4
  printf '%s bay\n' "$utt" >&5
done 3> segments 4> wav.scp 5> utt2spk
utils/fix_data_dir.sh data/train  # creates spk2utt

# words.txt: the distinct words found in text, one per line.
# Field 1 of every row is the utterance id, so words start at field 2.
# awk's default field splitting treats any run of blanks as one separator, so
# doubled spaces or an empty transcript do not produce an empty "word" (and no
# GNU-only "\n" sed replacement is needed).
# LC_ALL=C makes sort -u compare bytes, so the order is reproducible and two
# different words are never merged because a locale collates them as equal.
awk '{ for (i = 2; i <= NF; i++) print $i }' text | LC_ALL=C sort -u > words.txt

# lexicon.txt: each word followed by that same word spelled out one character
# at a time, all separated by single spaces ("abc" -> "abc a b c").
# Blank input lines are skipped and no trailing space is written.
# substr() is used instead of FS="" because splitting on an empty FS is not
# specified by POSIX.
# NOTE(review): whether a "character" is a byte or a multibyte character
# depends on the awk implementation and locale - confirm for non-ASCII words.
awk '
  NF == 0 { next }
  {
    entry = $0
    n = length($0)
    for (i = 1; i <= n; i++)
      entry = entry " " substr($0, i, 1)
    print entry
  }
' words.txt > lexicon.txt