Merge pull request #263 from dvonthenen/impl-vad-events-speech-start

davidvonthenen · web-flow · commit 231f9161a57d · 2024-01-26T07:32:54.000-08:00
Implements vad_events for SpeechStarted
diff --git a/deepgram/__init__.py b/deepgram/__init__.py
@@ -20,6 +20,7 @@
 from .client import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
diff --git a/deepgram/client.py b/deepgram/client.py
@@ -21,6 +21,7 @@
 from .clients import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
diff --git a/deepgram/clients/__init__.py b/deepgram/clients/__init__.py
@@ -13,6 +13,7 @@
 from .live import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
diff --git a/deepgram/clients/listen.py b/deepgram/clients/listen.py
@@ -36,6 +36,7 @@
 from .live import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
diff --git a/deepgram/clients/live/__init__.py b/deepgram/clients/live/__init__.py
@@ -9,6 +9,7 @@
 from .client import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
diff --git a/deepgram/clients/live/client.py b/deepgram/clients/live/client.py
@@ -9,6 +9,7 @@
 from .v1.response import (
     LiveResultResponse as LiveResultResponseLatest,
     MetadataResponse as MetadataResponseLatest,
+    SpeechStartedResponse as SpeechStartedResponseLatest,
     UtteranceEndResponse as UtteranceEndResponseLatest,
     ErrorResponse as ErrorResponseLatest,
 )
@@ -45,6 +46,14 @@ class MetadataResponse(MetadataResponseLatest):
     pass
 
 
+class SpeechStartedResponse(SpeechStartedResponseLatest):
+    """
+    pass through for SpeechStartedResponse based on API version
+    """
+
+    pass
+
+
 class UtteranceEndResponse(UtteranceEndResponseLatest):
     """
     pass through for UtteranceEndResponse based on API version
diff --git a/deepgram/clients/live/enums.py b/deepgram/clients/live/enums.py
@@ -15,5 +15,6 @@ class LiveTranscriptionEvents(Enum):
     Transcript = "Results"
     Metadata = "Metadata"
     UtteranceEnd = "UtteranceEnd"
+    SpeechStarted = "SpeechStarted"
     Error = "Error"
     Warning = "Warning"
diff --git a/deepgram/clients/live/v1/__init__.py b/deepgram/clients/live/v1/__init__.py
@@ -9,6 +9,7 @@
 from .response import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
diff --git a/deepgram/clients/live/v1/async_client.py b/deepgram/clients/live/v1/async_client.py
@@ -14,6 +14,7 @@
 from .response import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
@@ -129,6 +130,19 @@ async def _start(self) -> None:
                             metadata=result,
                             **dict(self.kwargs),
                         )
+                    case LiveTranscriptionEvents.SpeechStarted.value:
+                        self.logger.debug(
+                            "response_type: %s, data: %s", response_type, data
+                        )
+                        result = SpeechStartedResponse.from_json(message)
+                        if result is None:
+                            self.logger.error("SpeechStartedResponse.from_json is None")
+                            continue
+                        await self._emit(
+                            LiveTranscriptionEvents.SpeechStarted,
+                            speech_started=result,
+                            **dict(self.kwargs),
+                        )
                     case LiveTranscriptionEvents.UtteranceEnd.value:
                         self.logger.debug(
                             "response_type: %s, data: %s", response_type, data
diff --git a/deepgram/clients/live/v1/client.py b/deepgram/clients/live/v1/client.py
@@ -16,6 +16,7 @@
 from .response import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
@@ -161,6 +162,19 @@ def _listening(self) -> None:
                             metadata=result,
                             **dict(self.kwargs),
                         )
+                    case LiveTranscriptionEvents.SpeechStarted.value:
+                        self.logger.debug(
+                            "response_type: %s, data: %s", response_type, data
+                        )
+                        result = SpeechStartedResponse.from_json(message)
+                        if result is None:
+                            self.logger.error("SpeechStartedResponse.from_json is None")
+                            continue
+                        self._emit(
+                            LiveTranscriptionEvents.SpeechStarted,
+                            speech_started=result,
+                            **dict(self.kwargs),
+                        )
                     case LiveTranscriptionEvents.UtteranceEnd.value:
                         self.logger.debug(
                             "response_type: %s, data: %s", response_type, data
diff --git a/deepgram/clients/live/v1/options.py b/deepgram/clients/live/v1/options.py
@@ -44,6 +44,7 @@ class LiveOptions:
     tag: Optional[list] = None
     tier: Optional[str] = None
     utterance_end_ms: Optional[str] = None
+    vad_events: Optional[bool] = None
     version: Optional[str] = None
 
     def __getitem__(self, key):
diff --git a/deepgram/clients/live/v1/response.py b/deepgram/clients/live/v1/response.py
@@ -161,6 +161,25 @@ def __getitem__(self, key):
         return _dict[key]
 
 
+# Speech Started Message
+
+
+@dataclass_json
+@dataclass
+class SpeechStartedResponse:
+    """
+    SpeechStartedResponse Message from the Deepgram Platform
+    """
+
+    type: Optional[str] = ""
+    channel: Optional[List[int]] = None
+    timestamp: Optional[float] = 0
+
+    def __getitem__(self, key):
+        _dict = self.to_dict()
+        return _dict[key]
+
+
 # Utterance End Message
 
 
diff --git a/examples/advanced/streaming/direct-invocation/main.py b/examples/advanced/streaming/direct-invocation/main.py
@@ -39,6 +39,9 @@ def on_message(self, result, **kwargs):
         def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
+        def on_speech_started(self, speech_started, **kwargs):
+            print(f"\n\n{speech_started}\n\n")
+
         def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
@@ -47,11 +50,12 @@ def on_error(self, error, **kwargs):
 
         liveClient.on(LiveTranscriptionEvents.Transcript, on_message)
         liveClient.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        liveClient.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         liveClient.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         liveClient.on(LiveTranscriptionEvents.Error, on_error)
 
         # connect to websocket
-        options = LiveOptions(model="nova", interim_results=False, language="en-US")
+        options = LiveOptions(model="nova-2", language="en-US")
         liveClient.start(options)
 
         lock_exit = threading.Lock()
diff --git a/examples/advanced/streaming/microphone-inheritance/main.py b/examples/advanced/streaming/microphone-inheritance/main.py
@@ -14,6 +14,7 @@
     Microphone,
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
@@ -27,6 +28,7 @@ def __init__(self, config: LiveClient):
         super().__init__(config)
         super().on(LiveTranscriptionEvents.Transcript, self.on_message)
         super().on(LiveTranscriptionEvents.Metadata, self.on_metadata)
+        super().on(LiveTranscriptionEvents.SpeechStarted, self.on_speech_started)
         super().on(LiveTranscriptionEvents.UtteranceEnd, self.on_utterance_end)
         super().on(LiveTranscriptionEvents.Error, self.on_error)
         # self.test = "child"
@@ -54,6 +56,9 @@ def on_message(self, parent, result, **kwargs):
     def on_metadata(self, parent, metadata, **kwargs):
         print(f"\n\n{metadata}\n\n")
 
+    def on_speech_started(self, parent, speech_started, **kwargs):
+        print(f"\n\n{speech_started}\n\n")
+
     def on_utterance_end(self, parent, utterance_end, **kwargs):
         print(f"\n\n{utterance_end}\n\n")
 
@@ -73,6 +78,7 @@ def main():
         liveClient = MyLiveClient(ClientOptionsFromEnv())
 
         options = LiveOptions(
+            model="nova-2",
             punctuate=True,
             language="en-US",
             encoding="linear16",
@@ -81,6 +87,7 @@ def main():
             # To get UtteranceEnd, the following must be set:
             interim_results=True,
             utterance_end_ms="1000",
+            vad_events=True,  # for SpeechStarted events, not UtteranceEnd
         )
         liveClient.start(options, addons=dict(myattr="hello"), test="hello")
 
diff --git a/examples/streaming/async_http/main.py b/examples/streaming/async_http/main.py
@@ -14,8 +14,7 @@
 API_KEY = os.getenv("DG_API_KEY")
 
 options = LiveOptions(
-    model="nova",
-    interim_results=False,
+    model="nova-2",
     language="en-US",
 )
 
@@ -39,14 +38,18 @@ async def on_message(self, result, **kwargs):
         async def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
-        def on_utterance_end(self, utterance_end, **kwargs):
+        async def on_speech_started(self, speech_started, **kwargs):
+            print(f"\n\n{speech_started}\n\n")
+
+        async def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
         async def on_error(self, error, **kwargs):
             print(f"\n\n{error}\n\n")
 
         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
         dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         dg_connection.on(LiveTranscriptionEvents.Error, on_error)
 
diff --git a/examples/streaming/http/main.py b/examples/streaming/http/main.py
@@ -42,6 +42,9 @@ def on_message(self, result, **kwargs):
         def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
+        def on_speech_started(self, speech_started, **kwargs):
+            print(f"\n\n{speech_started}\n\n")
+
         def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
@@ -50,11 +53,12 @@ def on_error(self, error, **kwargs):
 
         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
         dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         dg_connection.on(LiveTranscriptionEvents.Error, on_error)
 
         # connect to websocket
-        options = LiveOptions(model="nova", interim_results=False, language="en-US")
+        options = LiveOptions(model="nova-2", language="en-US")
         dg_connection.start(options)
 
         lock_exit = threading.Lock()
diff --git a/examples/streaming/microphone/main.py b/examples/streaming/microphone/main.py
@@ -39,6 +39,9 @@ def on_message(self, result, **kwargs):
         def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
+        def on_speech_started(self, speech_started, **kwargs):
+            print(f"\n\n{speech_started}\n\n")
+
         def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
@@ -47,10 +50,12 @@ def on_error(self, error, **kwargs):
 
         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
         dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         dg_connection.on(LiveTranscriptionEvents.Error, on_error)
 
         options = LiveOptions(
+            model="nova-2",
             punctuate=True,
             language="en-US",
             encoding="linear16",
@@ -59,6 +64,7 @@ def on_error(self, error, **kwargs):
             # To get UtteranceEnd, the following must be set:
             interim_results=True,
             utterance_end_ms="1000",
+            vad_events=True,  # for SpeechStarted events, not UtteranceEnd
         )
         dg_connection.start(options, addons=dict(myattr="hello"), test="hello")
 

Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@`
`20`	`20`	`from .client import (`
`21`	`21`	`LiveResultResponse,`
`22`	`22`	`MetadataResponse,`
	`23`	`+ SpeechStartedResponse,`
`23`	`24`	`UtteranceEndResponse,`
`24`	`25`	`ErrorResponse,`
`25`	`26`	`)`
Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@`
`21`	`21`	`from .clients import (`
`22`	`22`	`LiveResultResponse,`
`23`	`23`	`MetadataResponse,`
	`24`	`+ SpeechStartedResponse,`
`24`	`25`	`UtteranceEndResponse,`
`25`	`26`	`ErrorResponse,`
`26`	`27`	`)`
Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@`
`13`	`13`	`from .live import (`
`14`	`14`	`LiveResultResponse,`
`15`	`15`	`MetadataResponse,`
	`16`	`+ SpeechStartedResponse,`
`16`	`17`	`UtteranceEndResponse,`
`17`	`18`	`ErrorResponse,`
`18`	`19`	`)`
Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,7 @@`
`36`	`36`	`from .live import (`
`37`	`37`	`LiveResultResponse,`
`38`	`38`	`MetadataResponse,`
	`39`	`+ SpeechStartedResponse,`
`39`	`40`	`UtteranceEndResponse,`
`40`	`41`	`ErrorResponse,`
`41`	`42`	`)`
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`	`from .client import (`
`10`	`10`	`LiveResultResponse,`
`11`	`11`	`MetadataResponse,`
	`12`	`+ SpeechStartedResponse,`
`12`	`13`	`UtteranceEndResponse,`
`13`	`14`	`ErrorResponse,`
`14`	`15`	`)`
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`	`from .response import (`
`10`	`10`	`LiveResultResponse,`
`11`	`11`	`MetadataResponse,`
	`12`	`+ SpeechStartedResponse,`
`12`	`13`	`UtteranceEndResponse,`
`13`	`14`	`ErrorResponse,`
`14`	`15`	`)`