Source code for sphinxwrapper.pocketsphinx_wrap

#
# Copyright (c) 2017-2022 Dane Finlay
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#

"""
CMU Pocket Sphinx Decoder Class
--------------------------------------------------------------------------------

"""

import os
import tempfile

from pocketsphinx import Decoder, Config

from .config import (set_hmm_and_dict_paths, search_arguments_set, set_lm_path,
                     ConfigError)


class PocketSphinx(Decoder):
    """
    Pocket Sphinx decoder subclass with processing methods providing callback
    functionality, as well as some other things.

    This class will try to set the '-hmm' and '-dict' configuration arguments
    automatically if they are not set prior to initialisation.

    If no search argument is present, the class will also try to set the '-lm'
    argument so that the default language model is used.  Search arguments
    include the following:

     * '-lm'
     * '-jsgf'
     * '-fsg'
     * '-keyphrase'
     * '-kws'

    Construct arguments:

     - *config* -- Decoder configuration object.  Will be initialised using
       :meth:`default_config` if unspecified.

    .. note::

       An error will be raised if the configuration object specifies more than
       one search argument.  In this event, the decoder class will not be
       initialised.

    :raises TypeError: if *config* is neither ``None`` nor a ``Config``
        object.
    :raises ConfigError: if *config* has more than one search argument set.
    """

    # Internal values used in process_audio to keep track of the utterance
    # state between method calls.
    # This is similar to how Pocket Sphinx handles utterance state in C.
    # Each value is a unique sentinel object; only its identity matters.
    _UTT_IDLE = object()
    _UTT_STARTED = object()
    _UTT_ENDED = object()

    def __init__(self, config=None):
        if config is None:
            config = Decoder.default_config()

        # Validate with an explicit exception rather than an assert statement
        # so that the check is still performed when Python runs with -O.
        if not isinstance(config, Config):
            raise TypeError("config must be a Config object or None")

        # Get the number of search arguments set.
        search_arg_count = len(search_arguments_set(config))

        # If nothing else is set, use the language model.
        if search_arg_count == 0:
            set_lm_path(config)
        elif search_arg_count > 1:
            raise ConfigError("More than one search argument was set in the"
                              " Config object")

        # Set the required config paths if they aren't already set.
        if not (config.get_string("-hmm") and config.get_string("-dict")):
            set_hmm_and_dict_paths(config)

        # No callbacks by default and no utterance in progress.
        self._speech_start_callback = None
        self._hypothesis_callback = None
        self._utterance_state = self._UTT_ENDED

        # Call the super constructor.
        super(PocketSphinx, self).__init__(config)

    def process_audio(self, buf, no_search=False, full_utterance=False,
                      use_callbacks=True):
        """
        Process an audio buffer and return the speech hypothesis, if there is
        one.

        This method processes the given buffer with the :meth:`process_raw`
        decoder method, invoking :attr:`speech_start_callback` and
        :attr:`hypothesis_callback` when appropriate.

        A new utterance is started automatically if none is in progress, and
        the utterance is ended automatically once speech that was previously
        detected is no longer present.

        :param buf: Audio buffer
        :param no_search: Whether to perform feature extraction, but no
            recognition yet (default: *False*).
        :param full_utterance: Whether this block of data contains a full
            utterance worth of data (default: *False*).  This may produce
            more accurate results.
        :param use_callbacks: Whether speech start and hypothesis callbacks
            should be called (default: *True*).
        :type buf: str
        :type no_search: bool
        :type full_utterance: bool
        :type use_callbacks: bool
        :rtype: Hypothesis | None
        :returns: The decoder's hypothesis, or *None* if there isn't one (yet).
        """
        if self.utt_ended:
            self.start_utt()

        self.process_raw(buf, no_search, full_utterance)

        # Note: get_in_speech() moves the state from IDLE to STARTED if
        #  returning True, so check utt_idle before calling that method.
        was_idle = self.utt_idle

        # Check if we're in speech.
        in_speech = self.get_in_speech()

        # In speech and IDLE -> STARTED transition just occurred, so call the
        # speech start callback, if appropriate.
        if in_speech and was_idle and self.utt_started:
            if use_callbacks and self.speech_start_callback:
                self.speech_start_callback()

        elif not in_speech and self.utt_started:
            # We're not in speech any more; utterance is over.
            self.end_utt()
            hyp = self.hyp()

            # Call the hypothesis callback, if appropriate.
            if use_callbacks and self.hypothesis_callback:
                self.hypothesis_callback(hyp)

            # Return the hypothesis.
            return hyp

        # The utterance has not finished yet, so there is nothing to return.
        return None

    def batch_process(self, buffers, no_search=False, full_utterance=False,
                      use_callbacks=True):
        """
        Process a list of audio buffers and return the speech hypothesis, if
        there is one.

        This method uses the :meth:`process_audio` method.

        .. note::

           If *buffers* contains more than one utterance worth of audio, only
           the final ``Hypothesis`` object is returned.

        :param buffers: List of audio buffers
        :param no_search: Whether to perform feature extraction, but no
            recognition yet (default: *False*).
        :param full_utterance: Whether this block of data contains a full
            utterance worth of data (default: *False*).  This may produce
            more accurate results.
        :param use_callbacks: Whether speech start and hypothesis callbacks
            should be called (default: *True*).
        :type buffers: list
        :type no_search: bool
        :type full_utterance: bool
        :type use_callbacks: bool
        :rtype: Hypothesis | None
        :returns: The decoder's hypothesis, or *None* if there isn't one (yet).

        """
        final_result = None
        for buf in buffers:
            result = self.process_audio(buf, no_search, full_utterance,
                                        use_callbacks)
            # Remember only the most recent result that was produced.
            if result:
                final_result = result
        return final_result

    def get_in_speech(self):
        """
        Check if the last audio buffer contained speech.

        As a side effect, the utterance state is moved from idle to started
        the first time speech is detected.

        :returns: Whether the last audio buffer contained speech.
        :rtype: bool
        """
        in_speech = super(PocketSphinx, self).get_in_speech()

        # Move idle -> started to make utterance properties compatible with using
        # methods like process_raw instead of process_audio.
        if in_speech and self.utt_idle:
            self._utterance_state = self._UTT_STARTED
        return in_speech

    def start_utt(self):
        """
        Starts a new utterance if one is not already in progress.

        Does nothing if an utterance is already in progress.
        """
        if self.utt_ended:
            super(PocketSphinx, self).start_utt()
            self._utterance_state = self._UTT_IDLE

    def end_utt(self):
        """
        Ends the current utterance if one was in progress.

        Does nothing if no utterance is in progress.
        """
        if not self.utt_ended:
            super(PocketSphinx, self).end_utt()
            self._utterance_state = self._UTT_ENDED

    @property
    def utt_idle(self):
        """
        Whether an utterance is in progress, but no speech has been detected yet.

        :rtype: bool
        """
        # The state is IDLE from start_utt() until get_in_speech() first
        # returns True.
        return self._utterance_state == self._UTT_IDLE

    @property
    def utt_started(self):
        """
        Whether an utterance is in progress and speech has been detected.

        :rtype: bool
        """
        # The state is STARTED from the first time get_in_speech() returns
        # True until end_utt() is called.
        return self._utterance_state == self._UTT_STARTED

    @property
    def utt_ended(self):
        """
        Whether there is no utterance in progress.

        :rtype: bool
        """
        return self._utterance_state == self._UTT_ENDED

    # Alias utterance methods and properties
    end_utterance = end_utt
    start_utterance = start_utt
    utterance_started = utt_started
    utterance_idle = utt_idle
    utterance_ended = utt_ended

    def set_kws_list(self, name, kws_list):
        """
        Set a keyword-list search which, when active, scans input audio for
        keywords defined in the specified list or dictionary.

        Does nothing if *kws_list* is empty.

        The keywords are written to a temporary file, which is passed to
        :meth:`set_kws` and is always removed afterwards, whether or not the
        search was set successfully.

        :param name: Search name
        :param kws_list: Dictionary of words to threshold value.  Can also be a
            list of 2-tuples.
        :type name: str
        :type kws_list: list | dict
        :raises ValueError: if a threshold value cannot be converted to a
            float.
        """
        if not kws_list:
            return

        # If we get a list or tuple, turn it into a dict.
        if isinstance(kws_list, (list, tuple)):
            kws_list = dict(kws_list)

        # Format each keyword string and threshold value on separate lines
        # with the threshold value bounded with forward slashes.  This is done
        # before the temporary file is created so that an invalid threshold
        # value cannot leave a stray file behind.
        lines = ["%s /%s/\n" % (words, float(threshold))
                 for words, threshold in kws_list.items()]

        # Get a new temporary file.  It is created with delete=False so that
        # it can be closed before the decoder opens it by name.
        tf = tempfile.NamedTemporaryFile(mode="a", delete=False)
        try:
            try:
                tf.writelines(lines)
            finally:
                # Always release the file handle, even if writing failed.
                tf.close()

            # Set the search using the temporary file.
            self.set_kws(name, tf.name)
        finally:
            # Delete the temporary file, whatever happened above.
            os.remove(tf.name)

    @property
    def active_search(self):
        """
        The name of the currently active Pocket Sphinx search.

        If the setter is passed a name with no matching Pocket Sphinx search,
        a ``RuntimeError`` will be raised.

        :rtype: str
        """
        return self.get_search()

    @active_search.setter
    def active_search(self, value):
        self.set_search(value)

    @property
    def speech_start_callback(self):
        """
        Function invoked when speech is first detected.

        To use this callback, set it to a callable that takes no arguments: ::

            ps = PocketSphinx()

            def callback():
                print("Speech started.")

            ps.speech_start_callback = callback

        To disable this callback, set it to ``None`` (default).

        Setting this to anything else raises a ``TypeError``.
        """
        return self._speech_start_callback

    @speech_start_callback.setter
    def speech_start_callback(self, value):
        if not callable(value) and value is not None:
            raise TypeError("value must be callable or None")
        self._speech_start_callback = value

    @property
    def hypothesis_callback(self):
        """
        Function invoked when the decoder has finished processing speech.

        To use this callback, set it to a callable that takes one positional
        argument, the decoder's hypothesis: ::

            ps = PocketSphinx()

            def callback(hyp):
                print(hyp)

            ps.hypothesis_callback = callback

        To disable this callback, set it to ``None`` (default).

        Setting this to anything else raises a ``TypeError``.
        """
        return self._hypothesis_callback

    @hypothesis_callback.setter
    def hypothesis_callback(self, value):
        if not callable(value) and value is not None:
            raise TypeError("value must be callable or None")
        self._hypothesis_callback = value