Skip to content

Commit f0f2360

Browse files
modernize: don't write audio to tmp file (#45)
* modernize: dont write to tmp file / drop dependency on speech_recognition package * Update ovos_stt_http_server/__init__.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * 📝 Add docstrings to `modernize` (#46) * 📝 Add docstrings to `modernize` Docstrings generation was requested by @JarbasAl. * #45 (comment) The following files were modified: * `ovos_stt_http_server/__init__.py` * `ovos_stt_http_server/gradio_app.py` * Update gradio_app.py --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: JarbasAI <33701864+JarbasAl@users.noreply.github.com> --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent ff0ff80 commit f0f2360

File tree

4 files changed

+78
-30
lines changed

4 files changed

+78
-30
lines changed

ovos_stt_http_server/__init__.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
from ovos_config import Configuration
2121
from ovos_plugin_manager.audio_transformers import load_audio_transformer_plugin, AudioLanguageDetector
2222
from ovos_plugin_manager.stt import load_stt_plugin
23+
from ovos_plugin_manager.utils.audio import AudioFile, AudioData
2324
from ovos_utils.log import LOG
24-
from speech_recognition import AudioData, Recognizer, AudioFile
2525
from starlette.requests import Request
2626

2727
LOG.set_level("ERROR") # avoid server side logs
@@ -92,20 +92,38 @@ def unload_engine(self, lang: str):
9292
self.engines.pop(lang)
9393

9494
def process_audio(self, audio: AudioData, lang: str):
95+
"""
96+
Transcribes the provided audio using the engine for the specified language.
97+
98+
Parameters:
99+
audio (AudioData): Audio content to transcribe.
100+
lang (str): Language code identifying which engine to use.
101+
102+
Returns:
103+
str: Transcribed text for the audio, or an empty string if no transcription is produced.
104+
"""
95105
engine = self.get_engine(lang)
96106
return engine.execute(audio, language=lang) or ""
97107

98108

99-
def bytes2audiodata(data: bytes) -> AudioData:
100-
recognizer = Recognizer()
101-
with NamedTemporaryFile() as fp:
102-
fp.write(data)
103-
with AudioFile(fp.name) as source:
104-
audio = recognizer.record(source)
105-
return audio
106-
107-
108109
def create_app(stt_plugin, lang_plugin=None, multi=False, has_gradio=False):
110+
"""
111+
Create and configure a FastAPI app that exposes STT and language-detection endpoints and returns the app with its model container.
112+
113+
Configures CORS origins from the CORS_ORIGINS environment variable, initializes either a single-model or multi-model container using the provided plugins, and registers three endpoints:
114+
- GET /status: returns service and plugin metadata.
115+
- POST /stt: accepts raw audio bytes in the request body (query params: `lang`, `sample_rate`, `sample_width`), optionally performs language detection when `lang=auto`, and returns transcribed text.
116+
- POST /lang_detect: accepts raw audio bytes and returns detected language and confidence (supports `valid_langs` query param).
117+
118+
Parameters:
119+
stt_plugin (str): Name or identifier of the STT plugin to load.
120+
lang_plugin (str, optional): Name or identifier of an optional language-detection plugin. Defaults to None.
121+
multi (bool, optional): If True, use a MultiModelContainer (one engine per language); otherwise use a single ModelContainer. Defaults to False.
122+
has_gradio (bool, optional): Flag included in the /status response indicating whether a Gradio UI is available. Defaults to False.
123+
124+
Returns:
125+
tuple: (app, model) where `app` is the configured FastAPI application and `model` is the initialized ModelContainer or MultiModelContainer instance.
126+
"""
109127
app = FastAPI()
110128
cors_origins = os.environ.get("CORS_ORIGINS", "*")
111129
origins = [origin.strip() for origin in cors_origins.split(",")] if cors_origins != "*" else ["*"]
@@ -130,9 +148,23 @@ def stats(request: Request):
130148

131149
@app.post("/stt", response_class=PlainTextResponse)
132150
async def get_stt(request: Request):
151+
"""
152+
Handle an STT request: read audio from the request body, determine language if requested, and return the transcription.
153+
154+
Parameters:
155+
request (Request): HTTP request whose body contains raw audio bytes. Query parameters:
156+
- lang: language code or "auto" (default from Configuration().get("lang", "auto")).
157+
- sample_rate: sample rate in Hz for the audio (default 16000).
158+
- sample_width: sample width in bytes (default 2).
159+
160+
Returns:
161+
str: Transcribed text from the provided audio, or an empty string if no transcription is produced.
162+
"""
133163
lang = str(request.query_params.get("lang", Configuration().get("lang", "auto"))).lower()
164+
sr = int(request.query_params.get("sample_rate", 16000))
165+
sw = int(request.query_params.get("sample_width", 2))
134166
audio_bytes = await request.body()
135-
audio = bytes2audiodata(audio_bytes)
167+
audio = AudioData(audio_bytes, sr, sw)
136168
if lang == "auto":
137169
lang, prob = model.detect_language(audio_bytes)
138170
return model.process_audio(audio, lang)
@@ -154,4 +186,4 @@ def start_stt_server(engine: str,
154186
multi: bool = False,
155187
has_gradio: bool = False) -> (FastAPI, ModelContainer):
156188
app, engine = create_app(engine, lang_engine, multi, has_gradio)
157-
return app, engine
189+
return app, engine

ovos_stt_http_server/gradio_app.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,30 @@
1-
21
import gradio as gr
32

43
from os.path import join, dirname, basename, splitext, isfile
54
from ovos_utils.log import LOG
6-
from ovos_stt_http_server import ModelContainer, bytes2audiodata
5+
from ovos_stt_http_server import ModelContainer
6+
from ovos_plugin_manager.utils.audio import AudioData
77

88
STT = None
99

1010

11-
def transcribe(audio_file, language: str):
11+
def transcribe(audio_file, language: str, sample_rate: int = 16000, sample_width: int = 2):
12+
"""
13+
Transcribe an audio file into text using the configured STT engine.
14+
15+
Parameters:
16+
audio_file (str): Path to the audio file to transcribe.
17+
language (str): Language code to use for transcription.
18+
sample_rate (int): Sample rate in Hz for the provided audio (default 16000).
19+
sample_width (int): Sample width in bytes for the provided audio (default 2).
20+
21+
Returns:
22+
transcription (str): The transcribed text, or `None` if the file is missing or invalid.
23+
"""
1224
try:
1325
with open(audio_file, 'rb') as f:
1426
audio = f.read()
15-
return STT.process_audio(bytes2audiodata(audio), language)
27+
return STT.process_audio(AudioData(audio, sample_rate, sample_width), language)
1628
except TypeError:
1729
LOG.error(f"Requested file not valid: {audio_file}")
1830
except FileNotFoundError:
@@ -21,7 +33,23 @@ def transcribe(audio_file, language: str):
2133
def bind_gradio_service(app, stt_engine: ModelContainer,
2234
title, description, info, badge,
2335
default_lang="en", cache=True):
36+
"""
37+
Create and mount a Gradio-based transcription UI at /gradio using the provided STT engine.
38+
39+
Initializes the module STT with the given ModelContainer, prepares available language choices and example audio files, constructs a Gradio Interface configured to call the transcribe function, and mounts that interface to the supplied app at path "/gradio". This function logs a deprecation warning for the Gradio interface.
40+
41+
Parameters:
42+
app: The web application or framework instance to which the Gradio interface will be mounted.
43+
stt_engine (ModelContainer): Speech-to-text engine container used to perform transcriptions and to obtain available languages.
44+
title (str): Title to display in the Gradio UI.
45+
description (str): Short description shown in the Gradio UI.
46+
info (str): Additional informational HTML or text displayed in the Gradio UI article section.
47+
badge: UI badge metadata (present for API compatibility; not used by this function).
48+
default_lang (str): Preferred default language code; if not available it will be adjusted or replaced with the first available language.
49+
cache (bool): Whether to cache example executions to speed up runtime after initial initialization.
50+
"""
2451
global STT
52+
LOG.warning("gradio interface is deprecated and will be removed in a follow up release")
2553
STT = stt_engine
2654
languages = list(stt_engine.engine.available_languages or [default_lang])
2755
languages.sort()

requirements/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ovos-plugin-manager>=2.1.0,<2.2.0
1+
ovos-plugin-manager>=2.1.1,<3.0.0
22
fastapi~=0.95
33
uvicorn~=0.22
44
gradio~=3.28

setup.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def get_version():
4747
setup(
4848
name='ovos-stt-http-server',
4949
version=get_version(),
50-
description='simple aiohttp server to host OpenVoiceOS stt plugins as a service',
50+
description='simple fastapi server to host OpenVoiceOS stt plugins as a service',
5151
long_description=long_description,
5252
long_description_content_type="text/markdown",
5353
url='https://github.com/OpenVoiceOS/ovos-stt-http-server',
@@ -61,19 +61,7 @@ def get_version():
6161
classifiers=[
6262
'Development Status :: 3 - Alpha',
6363
'Intended Audience :: Developers',
64-
'Topic :: Text Processing :: Linguistic',
6564
'License :: OSI Approved :: Apache Software License',
66-
67-
'Programming Language :: Python :: 2',
68-
'Programming Language :: Python :: 2.7',
69-
'Programming Language :: Python :: 3',
70-
'Programming Language :: Python :: 3.0',
71-
'Programming Language :: Python :: 3.1',
72-
'Programming Language :: Python :: 3.2',
73-
'Programming Language :: Python :: 3.3',
74-
'Programming Language :: Python :: 3.4',
75-
'Programming Language :: Python :: 3.5',
76-
'Programming Language :: Python :: 3.6',
7765
],
7866
keywords='plugin STT OVOS OpenVoiceOS',
7967
entry_points={

0 commit comments

Comments
 (0)