Speech Recognition Engines

Flat
Reading Input from wavfiles in sphinx4
User: [email protected]
Date: 12/28/2009 12:50 am
Views: 9713
Rating: 3

Hello. I'm trying to read from a WAV file. The file is supposed to contain digits as well as words, and I need to extract only the digits from it. I first tried the same program with the microphone and it worked fine (with about 20% accuracy), but now that I am providing the input from a WAV file, the recognizer returns null.

Here are the .java and .xml files.

.java file

import edu.cmu.sphinx.frontend.util.Microphone;

import edu.cmu.sphinx.frontend.util.AudioFileDataSource;

import edu.cmu.sphinx.recognizer.Recognizer;

import edu.cmu.sphinx.result.Result;

import edu.cmu.sphinx.util.props.ConfigurationManager;

import java.util.StringTokenizer;

import java.io.File;

import java.net.MalformedURLException;

import java.net.URL;

 

 

/** A simple Sphinx-4 application that decodes a .WAV file containing connnected-digits audio data. */

public class WavFile 

{

    public static void main(String[] args) throws MalformedURLException 

{

        URL audioFileURL;

        URL configURL;

        // use defaults that are loaded from the WavFile.jar or use values provided as arguments to main

       // if (args.length == 2) {

        configURL = new File(args[0]).toURI().toURL();

        audioFileURL = new File(args[1]).toURI().toURL();

      //  } else {

            //audioFileURL = WavFile.class.getResource("12345.wav");

           // configURL = WavFile.class.getResource("config.xml");

       // }

 

        System.out.println("Loading Recognizer as defined in '" + configURL.toString() + "'...\n");

        ConfigurationManager cm = new ConfigurationManager(configURL);

 

        // look up the recognizer (which will also lookup all its dependencies

        Recognizer recognizer = (Recognizer) cm.lookup("recognizer");

       // recognizer.allocate();

//Microphone microphone = (Microphone) cm.lookup("microphone");

//System.out.println(microphone);

 

 

            /* allocate the resource necessary for the recognizer */

        recognizer.allocate();

 

            /* the microphone will keep recording until the program exits 

            if (microphone.startRecording()) 

{

                System.out.println("Say any digit(s): e.g. \"two oh oh four\", " +

                         "\"three six five\".");

                while (true) 

{

                    System.out.println

                        ("Start speaking. Press Ctrl-C to quit.\n");*/

 

 

        // configure the audio input for the recognizer

       AudioFileDataSource dataSource = (AudioFileDataSource) cm.lookup("audioFileDataSource");

       dataSource.setAudioFile(audioFileURL, null);

 

        // decode the audio file.

     //   System.out.println("Decoding " + audioFileURL);

       Result result = recognizer.recognize();

 

  System.out.println("The result is:" + result);

 

      //  String str=result.getBestPronunciationResult();

 

                       // if(!str.isEmpty()){

                        //   StringTokenizer st = new StringTokenizer (dataSource);

// while (st.hasMoreTokens()) {

// System.out.println(st.nextToken());

     

//System.out.println(st);}

//System.out.println("Result: " + (result !=null ? result.getBestPronunciationResult() : null));

       System.out.println("Result: " + (result != null ? result.getBestFinalResultNoFiller() : null));

 

}        

}

 

 

This is the .xml file:

<?xml version="1.0" encoding="UTF-8"?>

 

<!--

   Sphinx-4 Configuration file

-->

 

<!-- ******************************************************** -->

<!--  tidigits configuration file                             -->

<!-- ******************************************************** -->

 

<config>

 

 

    <!-- ******************************************************** -->

    <!-- frequently tuned properties                              -->

    <!-- ******************************************************** -->

<property name="logLevel" value="WARNING"/>

    <property name="absoluteBeamWidth" value="-1"/>

    <property name="relativeBeamWidth" value="1E-80"/>

    <property name="wordInsertionProbability" value="1E-36"/>

    <property name="languageWeight" value="8"/>

    <property name="recognizer" value="recognizer"/>

    <property name="linguist" value="flatLinguist"/>

    <property name="frontend" value="mfcFrontEnd"/>

<property name="showCreations" value="true"/>

 

    <!-- ******************************************************** -->

    <!-- The connectedDigitsRecognizer configuration               -->

    <!-- ******************************************************** -->

 

    <component name="recognizer"

               type="edu.cmu.sphinx.recognizer.Recognizer">

        <property name="decoder" value="decoder"/>

        <propertylist name="monitors">

<item>accuracyTracker </item>

            <item>speedTracker </item>

            <item>memoryTracker </item>

        </propertylist>

    </component>

 

 

    <!-- ******************************************************** -->

    <!-- The Decoder   configuration                              -->

    <!-- ******************************************************** -->

 

    <component name="decoder" type="edu.cmu.sphinx.decoder.Decoder">

        <property name="searchManager" value="searchManager"/>

    </component>

 

    <component name="searchManager"

               type="edu.cmu.sphinx.decoder.search.SimpleBreadthFirstSearchManager">

        <property name="logMath" value="logMath"/>

        <property name="linguist" value="flatLinguist"/>

        <property name="pruner" value="trivialPruner"/>

        <property name="scorer" value="threadedScorer"/>

        <property name="activeListFactory" value="activeList"/>

    </component>

 

 

    <component name="activeList"

               type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">

        <property name="logMath" value="logMath"/>

        <property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>

        <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>

    </component>

 

    <component name="trivialPruner"

               type="edu.cmu.sphinx.decoder.pruner.SimplePruner"/>

 

    <component name="threadedScorer"

               type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer">

        <property name="frontend" value="${frontend}"/>

    </component>

 

    <!-- ******************************************************** -->

    <!-- The linguist  configuration                              -->

    <!-- ******************************************************** -->

 

    <component name="flatLinguist"

               type="edu.cmu.sphinx.linguist.flat.FlatLinguist">

        <property name="logMath" value="logMath"/>

        <property name="grammar" value="jsgfGrammar"/>

        <property name="acousticModel" value="wsj"/>

        <property name="wordInsertionProbability"

                  value="${wordInsertionProbability}"/>

        

        <property name="languageWeight" value="${languageWeight}"/>

        <property name="unitManager" value="unitManager"/>

    </component>

 

 

    <!-- ******************************************************** -->

    <!-- The Grammar  configuration                               -->

    <!-- ******************************************************** -->

 

    <component name="jsgfGrammar" type="edu.cmu.sphinx.jsapi.JSGFGrammar">

        <property name="dictionary" value="dictionary"/>

        <property name="grammarLocation"

                  value="file:/C:/F/Speech Recognition Project/sphinx4/"/>

        <property name="grammarName" value="digits"/>

        <property name="logMath" value="logMath"/>

    </component>

 

 

    <!-- ******************************************************** -->

    <!-- The Dictionary configuration                            -->

    <!-- ******************************************************** -->

 

    <component name="dictionary"

               type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">

        <property name="dictionaryPath"

                  value="file:/C:/F/Speech Recognition Project/sphinx4/edu/cmu/sphinx/model/acoustic/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/alN.dict"/>

        <property name="fillerPath"

                  value="file:/C:/F/Speech Recognition Project/sphinx4/edu/cmu/sphinx/model/acoustic/WSJ_8gau_13dCep_8kHz_31mel_200Hz_3500Hz/dict/fillerdict"/>

        <property name="addSilEndingPronunciation" value="false"/>

        <property name="unitManager" value="unitManager"/>

    </component>

 

 

    <!-- ******************************************************** -->

    <!-- The acoustic model configuration                         -->

    <!-- ******************************************************** -->

 

    <component name="wsj"

               type="edu.cmu.sphinx.model.acoustic.WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz.Model">

        <property name="loader" value="wsjLoader"/>

        <property name="unitManager" value="unitManager"/>

    </component>

 

    <component name="wsjLoader"

               type="edu.cmu.sphinx.model.acoustic.WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz.ModelLoader">

        <property name="logMath" value="logMath"/>

        <property name="unitManager" value="unitManager"/>

    </component>

 

 

    <!-- ******************************************************** -->

    <!-- The unit manager configuration                           -->

    <!-- ******************************************************** -->

 

    <component name="unitManager"

               type="edu.cmu.sphinx.linguist.acoustic.UnitManager"/>

 

     <!-- ******************************************************** -->

    <!-- The frontend configuration (audio file input)            -->

    <!-- ******************************************************** -->

    <component name="mfcFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd">

        <propertylist name="pipeline">

            <item>audioFileDataSource</item>

            <item>dataBlocker </item>

            <item>speechClassifier </item>

            <item>speechMarker </item>

            <item>nonSpeechDataFilter </item>

            <item>preemphasizer </item>

            <item>windower </item>

            <item>fft </item>

            <item>melFilterBank </item>

            <item>dct </item>

            <item>liveCMN </item>

            <item>featureExtraction </item>

        </propertylist>

    </component>

 

 

  <component name="audioFileDataSource" type="edu.cmu.sphinx.frontend.util.AudioFileDataSource"/>

 

   <!--   <component name="preemphasizer"

               type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>

 

    <component name="dither"

               type="edu.cmu.sphinx.frontend.filter.Dither"/>

 

    <component name="windower"

               type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower"/>

 

    <component name="fft"

               type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform"/>

 

    <component name="melFilterBank"

               type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank"/>

 

    <component name="dct"

               type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>

 

    <component name="batchCMN"

               type="edu.cmu.sphinx.frontend.feature.BatchCMN"/>

 

    <component name="featureExtraction"

               type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>-->

 

<!-- ******************************************************** -->

    <!-- The frontend pipelines                                   -->

    <!-- ******************************************************** -->

 

    <component name="dataBlocker" type="edu.cmu.sphinx.frontend.DataBlocker">

        <!--<property name="blockSizeMs" value="10"/>-->

    </component>

 

    <component name="speechClassifier"

               type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier">

        <property name="threshold" value="13"/>

    </component>

 

    <component name="nonSpeechDataFilter"

               type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"/>

 

    <component name="speechMarker"

               type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker" >

        <property name="speechTrailer" value="50"/>

    </component>

 

 

    <component name="preemphasizer"

               type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>

 

    <component name="windower"

               type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower">

    </component>

 

    <component name="fft"

            type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform">

    </component>

 

    <component name="melFilterBank"

        type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank">

    </component>

 

    <component name="dct"

            type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>

 

    <component name="liveCMN"

               type="edu.cmu.sphinx.frontend.feature.LiveCMN"/>

 

    <component name="featureExtraction"

               type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>

 

  <!-- <component name="microphone" type="edu.cmu.sphinx.frontend.util.Microphone">

        <property name="closeBetweenUtterances" value="false"/>

    </component>

 

 

     ******************************************************* 

      monitors                                             

     ******************************************************* -->

 

    <component name="accuracyTracker"

               type="edu.cmu.sphinx.instrumentation.BestPathAccuracyTracker">

        <property name="recognizer" value="${recognizer}"/>

<property name="showAlignedResults" value="false"/>

        <property name="showRawResults" value="false"/>

    </component>

 

<component name="memoryTracker"

                type="edu.cmu.sphinx.instrumentation.MemoryTracker">

        <property name="recognizer" value="${recognizer}"/>

<property name="showSummary" value="false"/>

<property name="showDetails" value="false"/>

    </component>

 

  <component name="speedTracker"

                type="edu.cmu.sphinx.instrumentation.SpeedTracker">

        <property name="recognizer" value="${recognizer}"/>

        <property name="frontend" value="${frontend}"/>

<property name="showSummary" value="true"/>

<property name="showDetails" value="false"/>

    </component>

 

 

    <!-- ******************************************************* -->

    <!--  Miscellaneous components                               -->

    <!-- ******************************************************* -->

 

    <component name="logMath" type="edu.cmu.sphinx.util.LogMath">

        <property name="logBase" value="1.0001"/>

        <property name="useAddTable" value="true"/>

    </component>

 

</config>

Please Help!

 

--- (Edited on 12/28/2009 12:50 am [GMT-0600] by [email protected]) ---

Re: Reading Input from wavfiles in sphinx4
User: kmaclean
Date: 1/26/2010 9:58 pm
Views: 3763
Rating: 2

>Hello. I m trying to read from an wavfile.

Can you get Sphinx4 to recognize the speech from the wavfile directly (as opposed to doing it from your Java app)?

--- (Edited on 1/26/2010 10:58 pm [GMT-0500] by kmaclean) ---

PreviousNext