VoxForge
Hi there,
I've been working on research that requires Spanish speech recognition in Sphinx4, and for the past couple of weeks I've been trying to use the Mexican Spanish Broadcast News Model (located at https://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/Mexican%20Spanish%20Broadcast%20News/ ), yet all of my results are completely off. It probably has to do with the file configuration, config.xml. The problem is, I have no idea how the configuration should look.
I've read the guide for using SphinxTrain models with Sphinx4, but after weeks of Googling my recognizer is still just nonsense. I've changed the file paths appropriately for acoustic & language models and the dictionaries, I've tried using both 8kHz and 16kHz clips 16-bit signed PCM audio (and adjusting the melFilterBank parameters accordingly). I just have no clue. There was no README or instruction set for configuring the acoustic/language model in the tarball either. I'm really hoping someone knows the correct configuration.
<?xml version="1.0" encoding="UTF-8"?>
<!--
Sphinx-4 Configuration file
-->
<!-- ******************************************************** -->
<!-- biship configuration file -->
<!-- ******************************************************** -->
<config>
<!-- ******************************************************** -->
<!-- frequently tuned properties -->
<!-- ******************************************************** -->
<property name="absoluteBeamWidth" value="500"/>
<property name="relativeBeamWidth" value="1E-80"/>
<property name="absoluteWordBeamWidth" value="20"/>
<property name="relativeWordBeamWidth" value="1E-60"/>
<property name="wordInsertionProbability" value="0.2"/>
<property name="languageWeight" value="10.5"/>
<property name="silenceInsertionProbability" value=".1"/>
<property name="frontend" value="epFrontEnd"/>
<property name="recognizer" value="recognizer"/>
<property name="showCreations" value="false"/>
<!-- ******************************************************** -->
<!-- word recognizer configuration -->
<!-- ******************************************************** -->
<component name="recognizer"
type="edu.cmu.sphinx.recognizer.Recognizer">
<property name="decoder" value="decoder"/>
<propertylist name="monitors">
<item>accuracyTracker </item>
<item>speedTracker </item>
<item>memoryTracker </item>
<item>recognizerMonitor </item>
</propertylist>
</component>
<!-- ******************************************************** -->
<!-- The Decoder configuration -->
<!-- ******************************************************** -->
<component name="decoder" type="edu.cmu.sphinx.decoder.Decoder">
<property name="searchManager" value="wordPruningSearchManager"/>
<property name="featureBlockSize" value="50"/>
</component>
<!-- ******************************************************** -->
<!-- The Search Manager -->
<!-- ******************************************************** -->
<component name="wordPruningSearchManager"
type="edu.cmu.sphinx.decoder.search.WordPruningBreadthFirstSearchManager">
<property name="logMath" value="logMath"/>
<property name="linguist" value="lexTreeLinguist"/>
<property name="pruner" value="trivialPruner"/>
<property name="scorer" value="threadedScorer"/>
<property name="activeListManager" value="activeListManager"/>
<property name="growSkipInterval" value="0"/>
<property name="checkStateOrder" value="false"/>
<property name="buildWordLattice" value="true"/>
<property name="acousticLookaheadFrames" value="1.7"/>
<property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
<!-- ******************************************************** -->
<!-- The Active Lists -->
<!-- ******************************************************** -->
<component name="activeListManager"
type="edu.cmu.sphinx.decoder.search.SimpleActiveListManager">
<propertylist name="activeListFactories">
<item>standardActiveListFactory</item>
<item>wordActiveListFactory</item>
<item>wordActiveListFactory</item>
<item>standardActiveListFactory</item>
<item>standardActiveListFactory</item>
<item>standardActiveListFactory</item>
</propertylist>
</component>
<component name="standardActiveListFactory"
type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
<property name="logMath" value="logMath"/>
<property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
<property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
<component name="wordActiveListFactory"
type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
<property name="logMath" value="logMath"/>
<property name="absoluteBeamWidth" value="${absoluteWordBeamWidth}"/>
<property name="relativeBeamWidth" value="${relativeWordBeamWidth}"/>
</component>
<!-- ******************************************************** -->
<!-- The Pruner -->
<!-- ******************************************************** -->
<component name="trivialPruner"
type="edu.cmu.sphinx.decoder.pruner.SimplePruner"/>
<!-- ******************************************************** -->
<!-- TheScorer -->
<!-- ******************************************************** -->
<component name="threadedScorer"
type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer">
<property name="frontend" value="${frontend}"/>
</component>
<!-- ******************************************************** -->
<!-- The linguist configuration -->
<!-- ******************************************************** -->
<component name="lexTreeLinguist"
type="edu.cmu.sphinx.linguist.lextree.LexTreeLinguist">
<property name="logMath" value="logMath"/>
<property name="acousticModel" value="wsj"/>
<property name="languageModel" value="trigramModel"/>
<property name="dictionary" value="dictionary"/>
<property name="addFillerWords" value="false"/>
<property name="fillerInsertionProbability" value="1E-10"/>
<property name="generateUnitStates" value="false"/>
<property name="wantUnigramSmear" value="true"/>
<property name="unigramSmearWeight" value="1"/>
<property name="wordInsertionProbability"
value="${wordInsertionProbability}"/>
<property name="silenceInsertionProbability"
value="${silenceInsertionProbability}"/>
<property name="languageWeight" value="${languageWeight}"/>
<property name="unitManager" value="unitManager"/>
</component>
<!-- ******************************************************** -->
<!-- The Dictionary configuration -->
<!-- ******************************************************** -->
<component name="dictionary"
type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">
<property name="dictionaryPath"
value="/Users/Eric/Documents/Research/Sphinx4/bld/es_MX_broadcast_cont_2500/etc/h4.dict"/>
<property name="fillerPath"
value="/Users/Eric/Documents/Research/Sphinx4/bld/es_MX_broadcast_cont_2500/etc/filler.dict"/>
<property name="addSilEndingPronunciation" value="false"/>
<property name="wordReplacement" value="<sil>"/>
<property name="unitManager" value="unitManager"/>
</component>
<!-- ******************************************************** -->
<!-- The Language Model configuration -->
<!-- ******************************************************** -->
<component name="trigramModel"
type="edu.cmu.sphinx.linguist.language.ngram.large.LargeTrigramModel">
<property name="unigramWeight" value=".7"/>
<property name="maxDepth" value="3"/>
<property name="logMath" value="logMath"/>
<property name="dictionary" value="dictionary"/>
<property name="location"
value="/Users/Eric/Documents/Research/Sphinx4/bld/es_MX_broadcast_cont_2500/etc/H4.arpa.Z.DMP"/>
</component>
<!-- ******************************************************** -->
<!-- The acoustic model configuration -->
<!-- ******************************************************** -->
<component name="wsj"
type="edu.cmu.sphinx.linguist.acoustic.tiedstate.TiedStateAcousticModel">
<property name="loader" value="wsjLoader"/>
<property name="unitManager" value="unitManager"/>
</component>
<component name="wsjLoader" type="edu.cmu.sphinx.linguist.acoustic.tiedstate.Sphinx3Loader">
<property name="logMath" value="logMath"/>
<property name="unitManager" value="unitManager"/>
<property name="location" value="/Users/Eric/Documents/Research/Sphinx4/bld/es_MX_broadcast_cont_2500/model_parameters/hub4_spanish_itesm.cd_cont_2500"/>
</component>
<!-- ******************************************************** -->
<!-- The unit manager configuration -->
<!-- ******************************************************** -->
<component name="unitManager"
type="edu.cmu.sphinx.linguist.acoustic.UnitManager"/>
<!-- ******************************************************** -->
<!-- The frontend configuration -->
<!-- ******************************************************** -->
<component name="epFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd">
<propertylist name="pipeline">
<item>audioFileDataSource </item>
<item>dataBlocker </item>
<item>speechClassifier </item>
<item>speechMarker </item>
<item>nonSpeechDataFilter </item>
<item>preemphasizer </item>
<item>windower </item>
<item>fft </item>
<item>melFilterBank </item> <item>dct </item>
<item>liveCMN </item>
<item>featureExtraction </item>
</propertylist>
</component>
<component name="audioFileDataSource" type="edu.cmu.sphinx.frontend.util.AudioFileDataSource"/>
<component name="microphone"
type="edu.cmu.sphinx.frontend.util.Microphone">
<property name="closeBetweenUtterances" value="false"/>
</component>
<component name="dataBlocker" type="edu.cmu.sphinx.frontend.DataBlocker"/>
<component name="speechClassifier"
type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier">
<property name="threshold" value="13"/>
</component>
<component name="nonSpeechDataFilter"
type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"/>
<component name="speechMarker"
type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker">
<property name="speechTrailer" value="50"/>
</component>
<component name="preemphasizer"
type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>
<component name="windower"
type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower"/>
<component name="fft"
type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform"/>
<component name="melFilterBank"
type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank" />
<component name="dct"
type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>
<component name="liveCMN"
type="edu.cmu.sphinx.frontend.feature.LiveCMN"/>
<component name="featureExtraction"
type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>
<!-- ******************************************************* -->
<!-- monitors -->
<!-- ******************************************************* -->
<component name="accuracyTracker"
type="edu.cmu.sphinx.instrumentation.BestPathAccuracyTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="showRawResults" value="false"/>
<property name="showAlignedResults" value="false"/>
</component>
<component name="memoryTracker"
type="edu.cmu.sphinx.instrumentation.MemoryTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="showDetails" value="false"/>
<property name="showSummary" value="false"/>
</component>
<component name="speedTracker"
type="edu.cmu.sphinx.instrumentation.SpeedTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="frontend" value="${frontend}"/>
<property name="showDetails" value="false"/>
</component>
<component name="recognizerMonitor"
type="edu.cmu.sphinx.instrumentation.RecognizerMonitor">
<property name="recognizer" value="${recognizer}"/>
<propertylist name="allocatedMonitors">
<item>configMonitor </item>
</propertylist>
</component>
<component name="configMonitor"
type="edu.cmu.sphinx.instrumentation.ConfigMonitor">
<property name="showConfig" value="false"/>
</component>
<!-- ******************************************************* -->
<!-- Miscellaneous components -->
<!-- ******************************************************* -->
<component name="logMath" type="edu.cmu.sphinx.util.LogMath">
<property name="logBase" value="1.0001"/>
<property name="useAddTable" value="true"/>
</component>
</config>
--- (Edited on 10/2/2012 8:14 pm [GMT-0500] by ) ---
Thanks a bunch, I never would have found that out on my own!
My recognizer is now producing sensible results, though they are fairly inaccurate. Is there anything I could change on the front end for improvement?
--- (Edited on 10/3/2012 9:31 am [GMT-0500] by e.fields) ---