Syllabifier

The approach is somewhat mad: first run phone recognition with Sphinx, then syllabify the recognized phone string with syllabify.py (invoked as syllabifier.py in the script below). It works pretty well, even for Swedish!

There were a few hurdles in getting Sphinx to run, but it boiled down to using the correct versions of sphinxbase and sphinx3, as mentioned here.

I built my own language model using lmtool (see info here). The model was based on ked_timit plus some of the arctic databases. It seems to work better for short sentences of read-type speech.

# Disabled cleanup: these two commands would empty the whole object list.
# Enabling them would also remove the Sound that selected("Sound") below
# expects to find selected.
#nocheck select all
#nocheck Remove

# --- Configuration ---
# Working directory for every file exchanged with the external tools
# (exported sound, decoder control/model files, decoder and syllabifier output).
sphinxDir$="~/OperaDownloads/sphinxtest/"
# Tier numbers in the TextGrid created below: tier 1 ("sphinx_seg") receives
# the recognised phone labels, tier 2 ("sylls") the syllable boundaries.
phonesTier=1
sylTier=2

# Pulls in an external file. Presumably it defines the "split" procedure that
# the script calls later as @split; its contents are not visible here - confirm.
include ~/.praat-dir/split.proc.praat

# Remember the Sound selected by the user; it is added back to the selection
# at the very end of the script.
s1 = selected("Sound")
# Disabled alternative: read the input from a file instead of the selection.
#s1=do ("Read from file...", sphinxDir$+"in.sph")

# Export the sound as test.sph for the decoder, then create the TextGrid
# (two interval tiers, no point tiers) that will hold the result.
do ("Save as NIST file...", sphinxDir$+"test.sph")
tg1=do ("To TextGrid...", "sphinx_seg sylls", "")

# --- Phone recognition with sphinx3_decode ---
# Acoustic model directory handed to -hmm. It is the only path in the command
# that does not live under sphinxDir$, so it gets its own setting here instead
# of being buried in the command line. Adjust it to the local installation.
hmmDir$="/home/johanf/OperaDownloads/sphinx3-0.8/model/hmm/hub4_cd_continuous_8gau_1s_c_d_dd"

# Runs the decoder in allphone mode on the utterances listed in test.ctl,
# reading the .sph audio from sphinxDir$ and writing the segmentation to
# test.lab. "system" aborts the script if the command fails.
# NOTE(review): test.ctl, the language model, the dictionaries and the filler
# dictionary are not created by this script and must already exist in sphinxDir$.
system sphinx3_decode
... -mode allphone
... -ctl 'sphinxDir$'test.ctl
... -cepdir 'sphinxDir$'
... -cepext .sph
... -adcin yes
... -adchdr 1024
... -hmm 'hmmDir$'
... -lm 'sphinxDir$'7035.lm
... -dict 'sphinxDir$'7035.dic
... -fdict 'sphinxDir$'filler.dict
... -hypseg 'sphinxDir$'test.lab

# Alternative language models that were tried (disabled):
#... -lm 'sphinxDir$'7666.lm
#... -lm 'sphinxDir$'interp_nodx.arpa.dmp
# we build our own lm from cmudict


# --- Read the decoder output and fill the phones tier ---
# test.lab is loaded as a Strings object; only its first line is used.
str1=do ("Read Strings from raw text file...", sphinxDir$+"test.lab")
sphinx_str$=Get string... 1
# The Strings object is only a carrier for that one line: remove it so that
# repeated runs do not leave a stray object in the list each time.
select 'str1'
Remove

select 'tg1'

writeInfoLine("--sphinx output--")
@split (" ", sphinx_str$)

# Debug aid (disabled): dump every token together with its index.
#for i to split.length
#       str$[i] = split.array$[i]
#       appendInfoLine(i, tab$, str$[i])
#endfor

# Token layout the loop relies on: the first label is token 13, every segment
# occupies 4 tokens, and the token right after a label is a frame number.
# NOTE(review): presumably the sphinx3 -hypseg line format with 10 ms frames,
# where the number after a label marks the end of that segment - confirm.
firstLabelToken=13
tokensPerSegment=4
secondsPerFrame=0.01

co=firstLabelToken
labNo=0
sylString$=""
while co < split.length
        lab$ = split.array$[co]
        # SIL labels are kept in the phone string (the filter stays disabled);
        # the syllable pass later checks for a leading SIL.
        #if (lab$ != "SIL")
                sylString$=sylString$+lab$+" "
        #endif
        frame = number(split.array$[co+1])*secondsPerFrame

        # Boundaries are inserted in the order the segments are read, so the
        # interval ending at the new boundary is interval number labNo.
        labNo=labNo+1
        do ("Insert boundary...", phonesTier, frame)
        do ("Set interval text...", phonesTier, labNo, lab$)

#       # if first is SIL then add sylbound at end of interval
#       if (labNo==1)
#               do ("Insert boundary...", sylTier, frame)
#       endif

        appendInfoLine(co, tab$, lab$, tab$, frame)
        co = co + tokensPerSegment
endwhile

# Without any segment there is no lab$ and no interval to query below; stop
# with a clear message instead of failing on an undefined variable.
if labNo = 0
        exit No phone segments found in 'sphinxDir$'test.lab
endif

# Time of the final syllable boundary (used by the syllable pass): the start
# of the last phone interval when that phone is SIL, otherwise its end.
if (lab$ == "SIL")
        addLastTime=do ("Get start point...", phonesTier, labNo)
else
        addLastTime=do ("Get end point...", phonesTier, labNo)
endif

# --- Syllabify the phone string and fill the syllable tier ---
# The space-separated phone labels are handed to the external Python
# syllabifier through phones.txt; its answer is read back from syllables.txt
# and split into tokens, where "." marks the end of a syllable.
writeFile(sphinxDir$+"phones.txt", sylString$)
system python 'sphinxDir$'syllabifier.py English < 'sphinxDir$'phones.txt > 'sphinxDir$'syllables.txt
syllables$ = readFile$ (sphinxDir$+"syllables.txt")
appendInfoLine(syllables$)
@split (" ", syllables$)

# A leading SIL is not part of a syllable: only put a boundary at its end,
# without labelling the interval in front of it.
if split.length >= 1
        if split.array$[1] = "SIL"
                silEnd=do ("Get end point...", phonesTier, 1)
                do ("Insert boundary...", sylTier, silEnd)
        endif
endif

# Every "." closes a syllable at the end of the phone that precedes it.
# The token position minus the dots seen so far (this one included) is the
# number of that phone's interval on the phones tier.
dotsSeen=0
for tok to split.length
        if split.array$[tok] = "."
                dotsSeen=dotsSeen+1
                phoneEnd=do ("Get end point...", phonesTier, tok - dotsSeen)
                @markSyllableEnd (phoneEnd)
        endif
endfor

# Close the last syllable unconditionally at the time computed earlier.
@markSyllableEnd (addLastTime)


# Leave both the TextGrid and the original Sound selected.
plusObject (s1)

# Inserts a boundary at .time on the syllable tier and labels the interval
# that ends there (the second-to-last one) "syl".
procedure markSyllableEnd (.time)
        do ("Insert boundary...", sylTier, .time)
        .count=do ("Get number of intervals...", sylTier)
        do ("Set interval text...", sylTier, .count - 1, "syl")
endproc