Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 44 additions & 12 deletions ovos_stt_http_server/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
from ovos_config import Configuration
from ovos_plugin_manager.audio_transformers import load_audio_transformer_plugin, AudioLanguageDetector
from ovos_plugin_manager.stt import load_stt_plugin
from ovos_plugin_manager.utils.audio import AudioFile, AudioData
from ovos_utils.log import LOG
from speech_recognition import AudioData, Recognizer, AudioFile
from starlette.requests import Request

LOG.set_level("ERROR") # avoid server side logs
Expand Down Expand Up @@ -92,20 +92,38 @@ def unload_engine(self, lang: str):
self.engines.pop(lang)

def process_audio(self, audio: AudioData, lang: str):
    """
    Run speech-to-text on ``audio`` with the engine selected for ``lang``.

    The engine is looked up through ``self.get_engine(lang)`` and its
    ``execute`` method is called with ``language=lang``.

    Parameters:
        audio (AudioData): Audio content handed to the engine unchanged.
        lang (str): Language code used both to pick the engine and as the
            ``language`` argument of the transcription call.

    Returns:
        str: Whatever the engine returned, or an empty string when the
            engine returned a falsy value (e.g. ``None`` or ``""``).
    """
    transcript = self.get_engine(lang).execute(audio, language=lang)
    # Normalise "no result" to an empty string so callers always get text.
    return transcript if transcript else ""


def bytes2audiodata(data: bytes) -> AudioData:
    """
    Decode an in-memory audio file into an ``AudioData`` object.

    The bytes are spooled to a temporary file, which is then opened by name
    with ``AudioFile`` and read in full with ``Recognizer.record``.

    Parameters:
        data (bytes): Contents of an audio file. Presumably this must be a
            container format that ``AudioFile`` can parse (not headerless
            PCM) - confirm against the ``AudioFile`` implementation in use.

    Returns:
        AudioData: The recorded contents of the whole file.
    """
    recognizer = Recognizer()
    with NamedTemporaryFile() as fp:
        fp.write(data)
        # The handle is buffered: without a flush the bytes may not have
        # reached the file yet when AudioFile reopens it by name, yielding a
        # truncated or empty read.
        fp.flush()
        # NOTE(review): reopening a NamedTemporaryFile by name while it is
        # still open is platform dependent (see the tempfile docs).
        with AudioFile(fp.name) as source:
            audio = recognizer.record(source)
    return audio


def create_app(stt_plugin, lang_plugin=None, multi=False, has_gradio=False):
"""
Create and configure a FastAPI app that exposes STT and language-detection endpoints and returns the app with its model container.

Configures CORS origins from the CORS_ORIGINS environment variable, initializes either a single-model or multi-model container using the provided plugins, and registers three endpoints:
- GET /status: returns service and plugin metadata.
- POST /stt: accepts raw audio bytes in the request body (query params: `lang`, `sample_rate`, `sample_width`), optionally performs language detection when `lang=auto`, and returns transcribed text.
- POST /lang_detect: accepts raw audio bytes and returns detected language and confidence (supports `valid_langs` query param).

Parameters:
stt_plugin (str): Name or identifier of the STT plugin to load.
lang_plugin (str, optional): Name or identifier of an optional language-detection plugin. Defaults to None.
multi (bool, optional): If True, use a MultiModelContainer (one engine per language); otherwise use a single ModelContainer. Defaults to False.
has_gradio (bool, optional): Flag included in the /status response indicating whether a Gradio UI is available. Defaults to False.

Returns:
tuple: (app, model) where `app` is the configured FastAPI application and `model` is the initialized ModelContainer or MultiModelContainer instance.
"""
app = FastAPI()
cors_origins = os.environ.get("CORS_ORIGINS", "*")
origins = [origin.strip() for origin in cors_origins.split(",")] if cors_origins != "*" else ["*"]
Expand All @@ -130,9 +148,23 @@ def stats(request: Request):

@app.post("/stt", response_class=PlainTextResponse)
async def get_stt(request: Request):
    """
    Handle an STT request: read audio from the request body, determine language if requested, and return the transcription.

    Parameters:
        request (Request): HTTP request whose body contains the audio bytes. Query parameters:
            - lang: language code or "auto" (default from Configuration().get("lang", "auto")); lower-cased before use.
            - sample_rate: sample rate passed to AudioData (default 16000).
            - sample_width: sample width passed to AudioData (default 2).

    Returns:
        str: Result of model.process_audio for the provided audio and language.

    Raises:
        ValueError: If `sample_rate` or `sample_width` cannot be converted with int().
    """
    lang = str(request.query_params.get("lang", Configuration().get("lang", "auto"))).lower()
    sr = int(request.query_params.get("sample_rate", 16000))
    sw = int(request.query_params.get("sample_width", 2))
    audio_bytes = await request.body()
    # Wrap the body directly using the caller-supplied format. The former
    # bytes2audiodata() decode was removed: its result was overwritten on the
    # next line, so it only added a temp-file round trip and a failure path.
    audio = AudioData(audio_bytes, sr, sw)
    if lang == "auto":
        # Language detection works on the request bytes; the second value
        # returned by detect_language is not needed here.
        lang, _ = model.detect_language(audio_bytes)
    return model.process_audio(audio, lang)
Expand All @@ -154,4 +186,4 @@ def start_stt_server(engine: str,
multi: bool = False,
has_gradio: bool = False) -> (FastAPI, ModelContainer):
app, engine = create_app(engine, lang_engine, multi, has_gradio)
return app, engine
return app, engine
36 changes: 32 additions & 4 deletions ovos_stt_http_server/gradio_app.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,30 @@

import gradio as gr

from os.path import join, dirname, basename, splitext, isfile
from ovos_utils.log import LOG
from ovos_stt_http_server import ModelContainer, bytes2audiodata
from ovos_stt_http_server import ModelContainer
from ovos_plugin_manager.utils.audio import AudioData

STT = None


def transcribe(audio_file, language: str):
def transcribe(audio_file, language: str, sample_rate: int = 16000, sample_width: int = 2):
"""
Transcribe an audio file into text using the configured STT engine.

Parameters:
audio_file (str): Path to the audio file to transcribe.
language (str): Language code to use for transcription.
sample_rate (int): Sample rate in Hz for the provided audio (default 16000).
sample_width (int): Sample width in bytes for the provided audio (default 2).

Returns:
transcription (str): The transcribed text, or `None` if the file is missing or invalid.
"""
try:
with open(audio_file, 'rb') as f:
audio = f.read()
return STT.process_audio(bytes2audiodata(audio), language)
return STT.process_audio(AudioData(audio, sample_rate, sample_width), language)
except TypeError:
LOG.error(f"Requested file not valid: {audio_file}")
except FileNotFoundError:
Expand All @@ -21,7 +33,23 @@ def transcribe(audio_file, language: str):
def bind_gradio_service(app, stt_engine: ModelContainer,
title, description, info, badge,
default_lang="en", cache=True):
"""
Create and mount a Gradio-based transcription UI at /gradio using the provided STT engine.

Initializes the module STT with the given ModelContainer, prepares available language choices and example audio files, constructs a Gradio Interface configured to call the transcribe function, and mounts that interface to the supplied app at path "/gradio". This function logs a deprecation warning for the Gradio interface.

Parameters:
app: The web application or framework instance to which the Gradio interface will be mounted.
stt_engine (ModelContainer): Speech-to-text engine container used to perform transcriptions and to obtain available languages.
title (str): Title to display in the Gradio UI.
description (str): Short description shown in the Gradio UI.
info (str): Additional informational HTML or text displayed in the Gradio UI article section.
badge: UI badge metadata (present for API compatibility; not used by this function).
default_lang (str): Preferred default language code; if not available it will be adjusted or replaced with the first available language.
cache (bool): Whether to cache example executions to speed up runtime after initial initialization.
"""
global STT
LOG.warning("gradio interface is deprecated and will be removed in a follow up release")
STT = stt_engine
languages = list(stt_engine.engine.available_languages or [default_lang])
languages.sort()
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ovos-plugin-manager>=2.1.0,<2.2.0
ovos-plugin-manager>=2.1.1,<3.0.0
fastapi~=0.95
uvicorn~=0.22
gradio~=3.28
Expand Down
14 changes: 1 addition & 13 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def get_version():
setup(
name='ovos-stt-http-server',
version=get_version(),
description='simple aiohttp server to host OpenVoiceOS stt plugins as a service',
description='simple fastapi server to host OpenVoiceOS stt plugins as a service',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://github.com/OpenVoiceOS/ovos-stt-http-server',
Expand All @@ -61,19 +61,7 @@ def get_version():
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Developers',
'Topic :: Text Processing :: Linguistic',
'License :: OSI Approved :: Apache Software License',

'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.0',
'Programming Language :: Python :: 3.1',
'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
],
keywords='plugin STT OVOS OpenVoiceOS',
entry_points={
Expand Down
Loading