diff --git a/.github/workflows/build_tests.yml b/.github/workflows/build_tests.yml new file mode 100644 index 0000000..24a5558 --- /dev/null +++ b/.github/workflows/build_tests.yml @@ -0,0 +1,11 @@ +name: Run Build Tests +on: + push: + pull_request: + branches: + - dev + workflow_dispatch: + +jobs: + build_tests: + uses: OpenVoiceOS/gh-automations/.github/workflows/build-tests.yml@dev diff --git a/.github/workflows/license_tests.yml b/.github/workflows/license_tests.yml index 8733cba..12c0485 100644 --- a/.github/workflows/license_tests.yml +++ b/.github/workflows/license_tests.yml @@ -1,12 +1,15 @@ name: Run License Tests on: push: - workflow_dispatch: - pull_request: branches: - master + pull_request: + branches: + - dev + workflow_dispatch: + jobs: license_tests: - uses: neongeckocom/.github/.github/workflows/license_tests.yml@master + uses: OpenVoiceOS/gh-automations/.github/workflows/license-check.yml@dev with: - packages-exclude: '^(tqdm|bs4|gradio|bitstruct|attrs|referencing).*' + exclude_packages: '^(tqdm|bs4|gradio|bitstruct|attrs|referencing).*' diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..616c779 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,11 @@ +name: Run Lint +on: + push: + pull_request: + branches: + - dev + workflow_dispatch: + +jobs: + lint: + uses: OpenVoiceOS/gh-automations/.github/workflows/lint.yml@dev diff --git a/.github/workflows/notify_matrix.yml b/.github/workflows/notify_matrix.yml deleted file mode 100644 index 14ef802..0000000 --- a/.github/workflows/notify_matrix.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Close Pull Request - -# only trigger on pull request closed events -on: - pull_request: - types: [ closed ] - -jobs: - merge_job: - # this job will only run if the PR has been merged - if: github.event.pull_request.merged == true - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - - name: Send message to Matrix bots channel - id: matrix-chat-message 
- uses: fadenb/matrix-chat-message@v0.0.6 - with: - homeserver: 'matrix.org' - token: ${{ secrets.MATRIX_TOKEN }} - channel: '!WjxEKjjINpyBRPFgxl:krbel.duckdns.org' - message: | - new ovos-stt-http-server PR merged! https://github.com/OpenVoiceOS/ovos-bus-client/pull/${{ github.event.number }} diff --git a/.github/workflows/pip_audit.yml b/.github/workflows/pip_audit.yml new file mode 100644 index 0000000..01e7ea0 --- /dev/null +++ b/.github/workflows/pip_audit.yml @@ -0,0 +1,11 @@ +name: Run Pip Audit +on: + push: + pull_request: + branches: + - dev + workflow_dispatch: + +jobs: + pip_audit: + uses: OpenVoiceOS/gh-automations/.github/workflows/pip-audit.yml@dev diff --git a/.github/workflows/publish_stable.yml b/.github/workflows/publish_stable.yml index 22bd869..42b8e34 100644 --- a/.github/workflows/publish_stable.yml +++ b/.github/workflows/publish_stable.yml @@ -6,53 +6,12 @@ on: jobs: publish_stable: - uses: TigreGotico/gh-automations/.github/workflows/publish-stable.yml@master + if: github.actor != 'github-actions[bot]' + uses: OpenVoiceOS/gh-automations/.github/workflows/publish-stable.yml@dev secrets: inherit with: branch: 'master' version_file: 'ovos_stt_http_server/version.py' - setup_py: 'setup.py' + publish_pypi: true + sync_dev: true publish_release: true - - publish_pypi: - needs: publish_stable - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - with: - ref: master - fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. 
- - name: Setup Python - uses: actions/setup-python@v6 - with: - python-version: "3.14" - - name: Install Build Tools - run: | - python -m pip install build wheel - - name: version - run: echo "::set-output name=version::$(python setup.py --version)" - id: version - - name: Build Distribution Packages - run: | - python setup.py sdist bdist_wheel - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{secrets.PYPI_TOKEN}} - - - sync_dev: - needs: publish_stable - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. - ref: master - - name: Push master -> dev - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - branch: dev diff --git a/.github/workflows/release_workflow.yml b/.github/workflows/release_workflow.yml index 5a4ff27..17ec806 100644 --- a/.github/workflows/release_workflow.yml +++ b/.github/workflows/release_workflow.yml @@ -8,101 +8,16 @@ on: jobs: publish_alpha: - uses: TigreGotico/gh-automations/.github/workflows/publish-alpha.yml@master + if: github.event.pull_request.merged == true || github.event_name == 'workflow_dispatch' + uses: OpenVoiceOS/gh-automations/.github/workflows/publish-alpha.yml@dev secrets: inherit with: branch: 'dev' version_file: 'ovos_stt_http_server/version.py' - setup_py: 'setup.py' update_changelog: true publish_prerelease: true + propose_release: true changelog_max_issues: 100 - - notify: - if: github.event.pull_request.merged == true - needs: publish_alpha - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - - name: Send message to Matrix bots channel - id: matrix-chat-message - uses: fadenb/matrix-chat-message@v0.0.6 - with: - homeserver: 'matrix.org' - token: ${{ secrets.MATRIX_TOKEN }} - channel: '!WjxEKjjINpyBRPFgxl:krbel.duckdns.org' - message: | - new 
${{ github.event.repository.name }} PR merged! https://github.com/${{ github.repository }}/pull/${{ github.event.number }} - - publish_pypi: - needs: publish_alpha - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v6 - with: - ref: dev - fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. - - name: Setup Python - uses: actions/setup-python@v6 - with: - python-version: "3.14" - - name: Install Build Tools - run: | - python -m pip install build wheel - - name: version - run: echo "::set-output name=version::$(python setup.py --version)" - id: version - - name: Build Distribution Packages - run: | - python setup.py sdist bdist_wheel - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{secrets.PYPI_TOKEN}} - - - propose_release: - needs: publish_alpha - if: success() # Ensure this job only runs if the previous job succeeds - runs-on: ubuntu-latest - steps: - - name: Checkout dev branch - uses: actions/checkout@v6 - with: - ref: dev - - - name: Setup Python - uses: actions/setup-python@v6 - with: - python-version: '3.14' - - - name: Get version from setup.py - id: get_version - run: | - VERSION=$(python setup.py --version) - echo "VERSION=$VERSION" >> $GITHUB_ENV - - - name: Create and push new branch - run: | - git checkout -b release-${{ env.VERSION }} - git push origin release-${{ env.VERSION }} - - - name: Open Pull Request from dev to master - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Variables - BRANCH_NAME="release-${{ env.VERSION }}" - BASE_BRANCH="master" - HEAD_BRANCH="release-${{ env.VERSION }}" - PR_TITLE="Release ${{ env.VERSION }}" - PR_BODY="Human review requested!" 
- - # Create a PR using GitHub API - curl -X POST \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: token $GITHUB_TOKEN" \ - -d "{\"title\":\"$PR_TITLE\",\"body\":\"$PR_BODY\",\"head\":\"$HEAD_BRANCH\",\"base\":\"$BASE_BRANCH\"}" \ - https://api.github.com/repos/${{ github.repository }}/pulls + publish_pypi: true + notify_matrix: true diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 2fa42bf..bf4e5ec 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -1,14 +1,11 @@ -# This workflow will run unit tests - name: Run Unit Tests on: push: - workflow_dispatch: pull_request: branches: - - master + - dev + workflow_dispatch: + jobs: - py_build_tests: - uses: neongeckocom/.github/.github/workflows/python_build_tests.yml@master - with: - python_version: "3.8" + unit_tests: + uses: OpenVoiceOS/gh-automations/.github/workflows/build-tests.yml@dev diff --git a/AUDIT.md b/AUDIT.md new file mode 100644 index 0000000..be5d8e9 --- /dev/null +++ b/AUDIT.md @@ -0,0 +1,29 @@ +# ovos-stt-http-server — Audit Report + +## Documentation Status +- [x] QUICK_FACTS.md +- [x] FAQ.md +- [x] MAINTENANCE_REPORT.md +- [x] AUDIT.md +- [x] SUGGESTIONS.md +- [x] docs/index.md +- [x] docs/api-compatibility.md +- [x] docs/audio-formats.md +- [x] docs/response-formats.md + +## Technical Debt & Issues + +- `[MINOR]` **pyproject.toml**: `requires-python = ">=3.9"` — workspace standard is 3.10+; align after verifying compatibility — `pyproject.toml:12`. +- `[MINOR]` **deps**: `fastapi~=0.95` and `uvicorn~=0.22` are old pinned versions; should be broadened — `pyproject.toml:21-22`. +- `[MINOR]` **validation**: `/stt` does not validate `sample_width` values — `__init__.py:165`. +- `[MINOR]` **Speechmatics in-memory store**: `_jobs` dict in `speechmatics.py:13` is module-level and not thread-safe under concurrent requests; grows unboundedly — `routers/speechmatics.py:13`. 
+- `[MINOR]` **Deepgram audio assumption**: Deepgram router assumes 16 kHz 16-bit mono regardless of `Content-Type` — `routers/deepgram.py:81`. WAV files with different parameters will produce incorrect results. +- `[INFO]` **ci**: `publish_stable.yml` and `release_workflow.yml` already use `@dev` refs — no action needed. + +## Resolved Issues + +- `[RESOLVED 2026-03-18]` **tests**: 25 unit tests added in `test/unittests/test_compat_routers.py`. +- `[RESOLVED 2026-03-17]` Gradio dependency and `gradio_app.py` removed. +- `[RESOLVED 2026-03-17]` `CORS_ORIGINS` env-var removed; `allow_origins=["*"]` unconditional. +- `[RESOLVED 2026-03-17]` `unit_tests.yml` updated from obsolete `neongeckocom` reference to `OpenVoiceOS/gh-automations@dev`. +- `[RESOLVED 2026-03-17]` `lint.yml`, `build_tests.yml`, `pip_audit.yml` workflows added. diff --git a/FAQ.md b/FAQ.md new file mode 100644 index 0000000..65b4244 --- /dev/null +++ b/FAQ.md @@ -0,0 +1,148 @@ +# FAQ — ovos-stt-http-server + +## General + +**Q: What is ovos-stt-http-server?** +A: A FastAPI-based HTTP server that wraps any OVOS STT plugin and exposes it via a REST API. It also provides five vendor-compatible compat routers for drop-in use with OpenAI Whisper, Deepgram, Google Cloud STT, AssemblyAI, and Speechmatics clients. + +**Q: What is the default port?** +A: The CLI defaults to `8080`. Override with `--port <port>`. + +**Q: How do I start the server?** +A: `ovos-stt-server --engine <plugin> --port 8080`. The `--engine` flag is required. + +**Q: Do I need API keys or credentials?** +A: No. All vendor-compatible routers accept auth headers and API-key query parameters but silently ignore them. No credentials are validated. + +**Q: What Python version is required?** +A: Python 3.9 or later (pyproject.toml `requires-python = ">=3.9"`). + +**Q: What audio format does `/stt` expect?** +A: Raw PCM bytes: 16 kHz, mono, 16-bit signed integer (int16).
Pass `sample_rate` and `sample_width` query params if your audio differs from the defaults. + +**Q: How do I configure CORS?** +A: CORS is unconditionally set to `allow_origins=["*"]`. There is no env-var override. All origins are permitted. See `create_app` — `ovos_stt_http_server/__init__.py:109`. + +**Q: How do I enable automatic language detection?** +A: Pass `lang=auto` as a query parameter to `/stt`, or use the `/lang_detect` endpoint directly. A `lang_plugin` must be provided at startup (`--lang-engine`). + +**Q: What is `--multi` mode?** +A: `--multi` loads one `MultiModelContainer` (`__init__.py:57`) that instantiates a separate plugin instance per language code on first use. Useful for multilingual deployments with language-specific models. + +**Q: How do I specify the STT plugin?** +A: Pass `--engine <plugin>` to the CLI. The plugin must be installed and discoverable via `ovos-plugin-manager`. + +**Q: What plugins are supported?** +A: Any plugin registered under the `opm.plugin.stt` entry point group. Install the plugin package and reference it by its entry point name. + +**Q: What does `/status` return?** +A: `{"status": "ok", "plugin": "<plugin>", "lang_plugin": "<lang_plugin>"}` — `stats` handler in `__init__.py:142`. + +**Q: Is Gradio UI supported?** +A: No. Gradio support was removed. The server is a pure REST API only. + +--- + +## OpenAI Whisper Compatible Clients + +**Q: Which OpenAI Whisper clients work with this server?** +A: Any client that POSTs to `/v1/audio/transcriptions` or `/v1/audio/translations` with multipart form data works. This includes the official `openai` Python SDK, `whisper-client`, and raw `curl` commands. + +**Q: How do I use the OpenAI Python SDK against this server?** +A: Set `base_url="http://localhost:8080/openai"` when constructing the `OpenAI` client. The `api_key` parameter is accepted but ignored. + +**Q: What `response_format` values are supported?** +A: `json` (default), `text`, `srt`, `vtt`, and `verbose_json`.
See [docs/response-formats.md](docs/response-formats.md). + +**Q: Does `verbose_json` return real word-level segments?** +A: No. The `segments` field is always an empty list. `task`, `language`, `duration`, and `text` are populated. + +**Q: Does the translations endpoint really translate audio?** +A: No — it calls the same STT engine as transcriptions but forces `language=en`. Translation between languages is not performed; the engine transcribes with English as the target hint. + +--- + +## Deepgram Compatible Clients + +**Q: Which Deepgram clients work with this server?** +A: Any client that POSTs raw audio bytes to `/v1/listen`. The official `deepgram-sdk` Python package works when its base URL is overridden. + +**Q: How is audio parsed for the Deepgram endpoint?** +A: The raw request body is wrapped in `AudioData(body, 16000, 2)` — no format detection. Send WAV or raw PCM at 16 kHz 16-bit mono for best results. + +**Q: Does `punctuate=true` add punctuation?** +A: No. The `punctuate` query parameter is accepted and ignored. Punctuation depends on the underlying STT plugin. + +**Q: What does the Deepgram `words` array contain?** +A: An empty list. Word-level timing is not implemented. + +--- + +## Google Speech-to-Text Compatible Clients + +**Q: Which Google STT clients work?** +A: Any client that POSTs to `/v1/speech:recognize` with a JSON body containing `config` and `audio.content` (base64-encoded audio). + +**Q: Are GCS URIs (`gs://...`) supported?** +A: No. The server returns HTTP 501 if `audio.uri` is set. Use `audio.content` with base64-encoded audio. + +**Q: Does the `encoding` field matter?** +A: No — the server attempts to parse uploaded bytes as WAV regardless of the `encoding` field value, then falls back to raw PCM. + +--- + +## AssemblyAI Stub Behavior + +**Q: Why does the AssemblyAI GET transcript endpoint always return `status: error`?** +A: This server is synchronous. Transcription completes in the POST response. 
No job store persists between requests, so GET by ID cannot retrieve prior results. + +**Q: Do I need to poll for results like the real AssemblyAI API?** +A: No — the POST response already contains `status: completed` and the `text` field. Read the result directly from the POST response. + +**Q: What happens if I send `audio_url` instead of `audio`?** +A: The server returns `status: error` with a message explaining that `audio_url` fetching is not supported. Encode your audio as base64 and put it in the `audio` field. + +**Q: Is the `id` in the POST response reusable?** +A: No. The ID is a UUID generated per-request. The GET endpoint ignores it and always returns an error stub. + +--- + +## Speechmatics Behavior + +**Q: How does the Speechmatics job model work on this server?** +A: Job creation (POST `/v1/jobs`) transcribes immediately and stores the result in an in-memory dict keyed by job ID. GET retrieves from that dict. + +**Q: What happens if I GET a job that doesn't exist?** +A: HTTP 404 is returned: `{"detail": "Job '<job_id>' not found."}`. + +**Q: Are job results preserved across server restarts?** +A: No. The `_jobs` dict (`speechmatics.py:13`) is in-memory only. + +**Q: What `format` parameter does GET `/transcript` accept?** +A: The `format` query param is accepted and ignored. The response is always Speechmatics JSON v2.9 format. + +--- + +## Audio Format + +**Q: What audio formats are supported?** +A: WAV is supported natively via stdlib. MP3, OGG, FLAC, M4A, and WebM require `pydub` (`pip install pydub`). See [docs/audio-formats.md](docs/audio-formats.md). + +**Q: What happens if I upload a non-WAV file without pydub installed?** +A: HTTP 501 is returned with a message indicating that the format requires pydub. + +**Q: What sample rate and bit depth should I use?** +A: 16 kHz, mono, 16-bit (int16). The server resamples non-WAV files via pydub to match these parameters.
+ +--- + +## Language + +**Q: How do I specify the transcription language?** +A: Each compat router has its own mechanism: `language` form field (Whisper), `?language=` query param (Deepgram), `config.languageCode` JSON field (Google), `language_code` JSON field (AssemblyAI), `transcription_config.language` in the job config JSON (Speechmatics). + +**Q: What happens if no language is specified?** +A: Defaults vary per router: Deepgram defaults to `en`, AssemblyAI defaults to `en`, Speechmatics defaults to `en`, Whisper passes `None` → the engine receives `"auto"`. + +**Q: Does language auto-detection work with compat routers?** +A: Not directly. Use the native `/lang_detect` endpoint, or start the server with `--lang-engine` to enable automatic language detection in the underlying engine. diff --git a/MAINTENANCE_REPORT.md b/MAINTENANCE_REPORT.md new file mode 100644 index 0000000..fd2b24e --- /dev/null +++ b/MAINTENANCE_REPORT.md @@ -0,0 +1,33 @@ +# Maintenance Report — ovos-stt-http-server + +## 2026-03-18 + +**AI Model**: claude-sonnet-4-6 +**Oversight**: Human-directed, agent-executed + +### Actions Taken + +- **Created `docs/api-compatibility.md`**: Full table of all 5 compat routers with vendor prefix, endpoints, auth method, input formats, response formats, and curl examples per endpoint. +- **Created `docs/audio-formats.md`**: Documents `multipart_audio_to_audiodata()` WAV/pydub paths, 501 fallback, Deepgram raw-body handling, Google/AssemblyAI base64 handling, and supported MIME types. +- **Created `docs/response-formats.md`**: Documents all Whisper `response_format` values (`json`, `text`, `srt`, `vtt`, `verbose_json`) with example outputs, plus Deepgram/Google/AssemblyAI/Speechmatics response shapes. +- **Updated `docs/index.md`**: Added table of contents linking to all three new docs files, added compat router section to architecture, updated audio format note. 
+- **Rewrote `FAQ.md`**: Expanded from 8 to 30+ Q&A entries covering OpenAI Whisper, Deepgram, Google STT, AssemblyAI, Speechmatics, audio formats, language parameters, port/startup, and all general questions. +- **Updated `QUICK_FACTS.md`**: Added `multipart_audio_to_audiodata()`, all 5 API prefixes, default port, and test count. +- **Updated `AUDIT.md`**: Marked `[MAJOR]` test issue as resolved (25 tests added). Added new issues for compat router edge cases. +- **Updated `SUGGESTIONS.md`**: Marked S-001 resolved. Added S-006 (Speechmatics in-memory store), S-007 (pydub optional dep documentation). +- **Extended `test/unittests/test_compat_routers.py`**: Added 8 new tests — `response_format=text` plain text, `verbose_json` with `segments` field, translations endpoint forces `lang=en`, Deepgram with `?punctuate=true`, Google STT with base64 WAV, AssemblyAI GET transcript `status` field, Speechmatics GET unknown job_id returns 404, Speechmatics GET known job_id returns transcript. + +## 2026-03-17 + +**AI Model**: claude-sonnet-4-6 +**Oversight**: Human-directed, agent-executed + +### Actions Taken + +- **Removed Gradio**: Deleted `ovos_stt_http_server/gradio_app.py`. Removed `has_gradio` parameter from `create_app()` and `start_stt_server()`. Removed `"gradio"` key from `/status` response. Removed `--gradio`, `--cache`, `--title`, `--description`, `--info`, `--badge` CLI args from `__main__.py`. +- **Fixed CORSMiddleware**: Removed `CORS_ORIGINS` env-var logic; `allow_origins` is now unconditionally `["*"]` — `ovos_stt_http_server/__init__.py:130`. +- **Cleaned imports**: Removed unused `import os` from `__init__.py`. +- **Updated `pyproject.toml`**: Removed `gradio~=3.28` and `flask` from dependencies. +- **Added workflows**: `lint.yml`, `build_tests.yml`, `pip_audit.yml` using `OpenVoiceOS/gh-automations@dev`. 
+- **Updated `unit_tests.yml`**: Replaced obsolete `neongeckocom/.github` reference with `OpenVoiceOS/gh-automations/.github/workflows/build-tests.yml@dev`. +- **Created documentation**: `docs/index.md`, `QUICK_FACTS.md`, `FAQ.md`, `AUDIT.md` (updated), `SUGGESTIONS.md`, `MAINTENANCE_REPORT.md`. diff --git a/QUICK_FACTS.md b/QUICK_FACTS.md new file mode 100644 index 0000000..0109976 --- /dev/null +++ b/QUICK_FACTS.md @@ -0,0 +1,20 @@ +# QUICK_FACTS — ovos-stt-http-server + +| Field | Value | +| :--- | :--- | +| **Package name** | `ovos-stt-http-server` | +| **Version** | `0.1.5a7` (`ovos_stt_http_server/version.py`) | +| **Entry point** | `ovos-stt-server` → `ovos_stt_http_server.__main__:main` | +| **Key classes** | `ModelContainer` — `ovos_stt_http_server/__init__.py:30` | +| | `MultiModelContainer` — `ovos_stt_http_server/__init__.py:57` | +| **Key functions** | `create_app()` — `ovos_stt_http_server/__init__.py:109` | +| | `start_stt_server()` — `ovos_stt_http_server/__init__.py:184` | +| | `multipart_audio_to_audiodata()` — `ovos_stt_http_server/audio_utils.py:10` | +| **Native endpoints** | `GET /status`, `POST /stt`, `POST /lang_detect` | +| **API prefixes** | `/openai`, `/deepgram`, `/google`, `/assemblyai/v2`, `/speechmatics/v1` | +| **Audio format** | PCM 16 kHz mono int16 (native); WAV/MP3/OGG via compat routers | +| **CORS** | Unconditional `allow_origins=["*"]` | +| **Default port** | `8080` | +| **Python** | >=3.9 | +| **License** | Apache-2.0 | +| **Unit tests** | 25 tests — `test/unittests/test_compat_routers.py` | diff --git a/SUGGESTIONS.md b/SUGGESTIONS.md new file mode 100644 index 0000000..8f17bf6 --- /dev/null +++ b/SUGGESTIONS.md @@ -0,0 +1,25 @@ +# Suggestions — ovos-stt-http-server + +## S-001: Add unit tests [RESOLVED 2026-03-18] +25 tests added in `test/unittests/test_compat_routers.py` covering all five compat routers. + +## S-002: Pin fastapi and uvicorn to broader ranges +`fastapi~=0.95` and `uvicorn~=0.22` are old. 
Update to `fastapi>=0.95,<1.0` and `uvicorn>=0.22` to allow newer compatible releases. + +## S-003: Validate `sample_width` in `/stt` +No validation is performed on `sample_width`. Values other than 1 or 2 will silently produce garbage audio. Add an explicit check and return HTTP 400 on invalid values. + +## S-004: Add `python-support.yml` workflow +Add a CI matrix test across Python 3.10, 3.11, 3.12 using `OpenVoiceOS/gh-automations/.github/workflows/build-tests.yml@dev`. + +## S-005: Migrate requires-python to >=3.10 +The project targets `>=3.9` but the workspace standard is 3.10+. Align after verifying no 3.9-specific usage. + +## S-006: Add TTL or size limit to Speechmatics in-memory job store +The `_jobs` dict (`speechmatics.py:13`) grows unboundedly. Add a `maxlen` via `collections.OrderedDict` or an LRU cache, or a TTL-based eviction on the store. + +## S-007: Document pydub as an optional dependency in pyproject.toml +`pydub` is imported conditionally in `audio_utils.py:35` but is not listed as a dependency. Add it as an optional extra in `pyproject.toml`: `[project.optional-dependencies] audio = ["pydub"]`. + +## S-008: Parse WAV headers in Deepgram router +The Deepgram router blindly treats the body as 16 kHz 16-bit mono. Attempt `wave.open()` first and fall back to the hardcoded parameters only if parsing fails — similar to the pattern in `google_stt.py:90-97`. diff --git a/docs/api-compatibility.md b/docs/api-compatibility.md new file mode 100644 index 0000000..1bdf5ea --- /dev/null +++ b/docs/api-compatibility.md @@ -0,0 +1,150 @@ +# API Compatibility Routers + +All five compat routers are mounted in `create_app` (`__init__.py:180-190`) and share the same underlying `ModelContainer` or `MultiModelContainer`. Authentication headers and API-key query parameters are accepted and silently ignored — no credentials are required or validated. 
+ +## Router Summary + +| Vendor | Prefix | Endpoints | Auth Method | Input Format | Response Format | +| :--- | :--- | :--- | :--- | :--- | :--- | +| OpenAI Whisper | `/openai` | `POST /v1/audio/transcriptions`, `POST /v1/audio/translations` | `Authorization: Bearer <token>` (ignored) | multipart/form-data (WAV, MP3, etc.) | json, text, srt, vtt, verbose_json | +| Deepgram | `/deepgram` | `POST /v1/listen` | `Authorization: Token <token>` (ignored) | Raw audio bytes (body), `Content-Type` header | Deepgram JSON | +| Google Cloud STT | `/google` | `POST /v1/speech:recognize` | `?key=<key>` or `Authorization` (ignored) | JSON with base64 `audio.content` | Google STT JSON | +| AssemblyAI | `/assemblyai` | `POST /v2/transcript`, `GET /v2/transcript/{id}` | `Authorization: <api-key>` (ignored) | JSON with base64 `audio` field | AssemblyAI JSON | +| Speechmatics | `/speechmatics` | `POST /v1/jobs`, `GET /v1/jobs/{id}/transcript` | `Authorization: Bearer <token>` (ignored) | multipart/form-data (`data_file` + `config` JSON string) | Speechmatics JSON | + +--- + +## OpenAI Whisper (`/openai`) + +Source: `ovos_stt_http_server/routers/openai_whisper.py` + +### POST /openai/v1/audio/transcriptions + +Transcribe audio. Mirrors the OpenAI Whisper API. + +**Form fields:** + +| Field | Required | Description | +| :--- | :--- | :--- | +| `file` | yes | Audio file (WAV, MP3, etc.) | +| `model` | no | Model name — accepted, ignored | +| `language` | no | BCP-47 language code hint | +| `response_format` | no | `json` (default), `text`, `srt`, `vtt`, `verbose_json` | +| `temperature` | no | 0–1 — accepted, ignored | +
+```bash +curl -X POST http://localhost:8080/openai/v1/audio/transcriptions \ + -H "Authorization: Bearer sk-fake" \ + -F "file=@audio.wav" \ + -F "model=whisper-1" \ + -F "response_format=json" +``` + +### POST /openai/v1/audio/translations + +Identical to transcriptions but forces `language=en`. No actual translation is performed — the underlying engine simply transcribes with English as the language hint.
+ +```bash +curl -X POST http://localhost:8080/openai/v1/audio/translations \ + -F "file=@audio.wav" \ + -F "model=whisper-1" +``` + +--- + +## Deepgram (`/deepgram`) + +Source: `ovos_stt_http_server/routers/deepgram.py` + +### POST /deepgram/v1/listen + +Audio bytes sent directly in the request body. Language and options are query parameters. + +**Query parameters:** + +| Parameter | Default | Description | +| :--- | :--- | :--- | +| `language` | `en` | BCP-47 language code | +| `model` | — | Model name — accepted, ignored | +| `punctuate` | — | Boolean — accepted, ignored | +| `diarize` | — | Boolean — accepted, ignored | + +Audio is treated as raw 16 kHz 16-bit mono PCM regardless of `Content-Type`. + +```bash +curl -X POST "http://localhost:8080/deepgram/v1/listen?language=en-US&punctuate=true" \ + -H "Authorization: Token fake-token" \ + -H "Content-Type: audio/wav" \ + --data-binary @audio.wav +``` + +--- + +## Google Cloud STT (`/google`) + +Source: `ovos_stt_http_server/routers/google_stt.py` + +### POST /google/v1/speech:recognize + +JSON body with `config` and `audio` objects. Only base64 `content` is supported; GCS URIs return 501. + +```bash +AUDIO_B64=$(base64 -w0 audio.wav) +curl -X POST http://localhost:8080/google/v1/speech:recognize \ + -H "Content-Type: application/json" \ + -d "{ + \"config\": {\"encoding\": \"LINEAR16\", \"sampleRateHertz\": 16000, \"languageCode\": \"en-US\"}, + \"audio\": {\"content\": \"$AUDIO_B64\"} + }" +``` + +--- + +## AssemblyAI (`/assemblyai`) + +Source: `ovos_stt_http_server/routers/assemblyai.py` + +This is a **synchronous stub**. Unlike the real AssemblyAI API there is no async job queue — transcription completes immediately in the POST response. The GET endpoint always returns `status: error` because no job store is maintained across requests. 
+ +### POST /assemblyai/v2/transcript + +```bash +AUDIO_B64=$(base64 -w0 audio.wav) +curl -X POST http://localhost:8080/assemblyai/v2/transcript \ + -H "Authorization: fake-api-key" \ + -H "Content-Type: application/json" \ + -d "{\"audio\": \"$AUDIO_B64\", \"language_code\": \"en\"}" +``` + +### GET /assemblyai/v2/transcript/{id} + +Always returns `{"status": "error", ...}`. Use the POST response directly. + +```bash +curl http://localhost:8080/assemblyai/v2/transcript/some-id +``` + +--- + +## Speechmatics (`/speechmatics`) + +Source: `ovos_stt_http_server/routers/speechmatics.py` + +This is a **synchronous stub**. Transcription happens immediately on POST. Results are stored in an in-memory dict (`_jobs`) and retrievable via GET until the server restarts. + +### POST /speechmatics/v1/jobs + +```bash +curl -X POST http://localhost:8080/speechmatics/v1/jobs \ + -H "Authorization: Bearer fake-token" \ + -F "data_file=@audio.wav" \ + -F 'config={"type":"transcription","transcription_config":{"language":"en"}}' +``` + +### GET /speechmatics/v1/jobs/{job_id}/transcript + +Returns 404 if the job ID is unknown (e.g. after server restart). Returns 200 with `results` array when found. + +```bash +curl http://localhost:8080/speechmatics/v1/jobs/<job_id>/transcript +``` diff --git a/docs/audio-formats.md b/docs/audio-formats.md new file mode 100644 index 0000000..48e5886 --- /dev/null +++ b/docs/audio-formats.md @@ -0,0 +1,60 @@ +# Audio Formats + +## Native `/stt` Endpoint + +The native endpoint (`POST /stt`) accepts raw PCM bytes in the request body with no container.
Parameters are passed as query strings: + +| Parameter | Default | Description | +| :--- | :--- | :--- | +| `sample_rate` | `16000` | Sample rate in Hz | +| `sample_width` | `2` | Sample width in bytes (2 = int16) | + +## Compat Router Audio Handling + +Compat routers that accept file uploads (OpenAI Whisper, Speechmatics) use `multipart_audio_to_audiodata` (`audio_utils.py:10`) to convert uploaded bytes into `AudioData`. + +### WAV (stdlib) + +When the uploaded filename ends in `.wav`, the file is parsed with Python's stdlib `wave` module. Sample rate and sample width are read from the WAV header. No external dependencies required. + +``` +multipart_audio_to_audiodata(wav_bytes, "audio.wav") +→ wave.open() → AudioData(raw_pcm, sample_rate, sample_width) +``` + +### Other Formats (pydub) + +All non-WAV extensions (`.mp3`, `.ogg`, `.flac`, `.m4a`, etc.) are processed via `pydub.AudioSegment`. The audio is resampled to 16 kHz mono int16 before passing to the STT engine. + +``` +multipart_audio_to_audiodata(mp3_bytes, "audio.mp3") +→ AudioSegment.from_file() → resample → wave.open() → AudioData(raw_pcm, 16000, 2) +``` + +If `pydub` is not installed and a non-WAV file is uploaded, the server returns: + +``` +HTTP 501 Not Implemented +{"detail": "Format 'mp3' requires pydub. Install with: pip install pydub"} +``` + +### Deepgram Raw Body + +The Deepgram router (`deepgram.py:80-81`) reads the raw request body and wraps it in `AudioData(audio_bytes, 16000, 2)` — no format detection is performed. Send WAV or raw PCM at 16 kHz 16-bit mono. + +### Google STT and AssemblyAI Base64 + +Both routers accept base64-encoded audio in a JSON field (`audio.content` for Google, `audio` for AssemblyAI). After decoding, they attempt to parse as WAV first; if that fails they treat the bytes as raw PCM at the configured sample rate. 
+ +## Supported MIME Types (compat routers) + +| MIME Type | Extension | Handler | +| :--- | :--- | :--- | +| `audio/wav` / `audio/x-wav` | `.wav` | stdlib `wave` | +| `audio/mpeg` | `.mp3` | pydub | +| `audio/ogg` | `.ogg` | pydub | +| `audio/flac` | `.flac` | pydub | +| `audio/mp4` / `audio/x-m4a` | `.m4a` | pydub | +| `audio/webm` | `.webm` | pydub | + +Any extension not matching `.wav` falls through to pydub. If pydub cannot handle the format, it will raise its own error before the 501 is returned. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..dc10987 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,46 @@ +# ovos-stt-http-server + +A lightweight FastAPI server that exposes any OVOS STT plugin as an HTTP service, with built-in compatibility routers for OpenAI Whisper, Deepgram, Google Cloud STT, AssemblyAI, and Speechmatics APIs. + +## Table of Contents + +- [Architecture](#architecture) +- [Native Endpoints](#native-endpoints) +- [API Compatibility Routers](api-compatibility.md) +- [Audio Formats](audio-formats.md) +- [Response Formats](response-formats.md) +- [Usage](#usage) + +## Architecture + +- **Framework**: FastAPI with Uvicorn ASGI server. +- **Plugin loading**: `ovos-plugin-manager` discovers and loads STT plugins by name — `ModelContainer` (`__init__.py:30`) for single-language mode, `MultiModelContainer` (`__init__.py:57`) for per-language model loading. +- **CORS**: Unconditional `allow_origins=["*"]` — `create_app` (`__init__.py:109`). +- **Compat routers**: Five vendor-compatible routers mounted at `/openai`, `/deepgram`, `/google`, `/assemblyai`, `/speechmatics` — `create_app` (`__init__.py:180-190`). 
+ +## Native Endpoints + +| Method | Path | Description | +| :--- | :--- | :--- | +| `GET` | `/status` | Returns `{"status": "ok", "plugin": ..., "lang_plugin": ...}` | +| `POST` | `/stt` | Raw audio bytes in body → transcribed text (plain text response) | +| `POST` | `/lang_detect` | Raw audio bytes in body → `{"lang": ..., "conf": ...}` | + +### `/stt` query parameters + +| Parameter | Default | Description | +| :--- | :--- | :--- | +| `lang` | system lang or `auto` | Language code or `auto` to trigger language detection | +| `sample_rate` | `16000` | Audio sample rate in Hz | +| `sample_width` | `2` | Sample width in bytes (2 = int16) | + +## Usage + +```bash +ovos-stt-server --engine ovos-stt-plugin-whisper --port 8080 +ovos-stt-server --engine ovos-stt-plugin-whisper --lang-engine ovos-audio-transformer-plugin-fasterwhisper --multi +``` + +## Audio Format + +Input audio for the native `/stt` endpoint must be raw PCM: 16 kHz, mono, 16-bit signed integer (int16). Send bytes directly as the POST body. See [audio-formats.md](audio-formats.md) for compat router audio handling. diff --git a/docs/response-formats.md b/docs/response-formats.md new file mode 100644 index 0000000..e87338b --- /dev/null +++ b/docs/response-formats.md @@ -0,0 +1,112 @@ +# Response Formats + +## OpenAI Whisper `response_format` Values + +The `response_format` form field on `POST /openai/v1/audio/transcriptions` and `POST /openai/v1/audio/translations` controls the shape and content type of the response. 
+ +Source: `ovos_stt_http_server/routers/openai_whisper.py:68-86` + +| Value | Content-Type | Shape | +| :--- | :--- | :--- | +| `json` (default) | `application/json` | `{"text": ""}` | +| `text` | `text/plain` | Raw transcript string, no JSON wrapper | +| `srt` | `text/plain` | SRT subtitle block (index, timecode range, text, blank line) | +| `vtt` | `text/plain` | WebVTT file (`WEBVTT` header + cue block) | +| `verbose_json` | `application/json` | Extended object with `task`, `language`, `duration`, `text`, `segments` | + +### json + +```json +{"text": "hello world"} +``` + +### text + +``` +hello world +``` + +### srt + +``` +1 +00:00:00,000 --> 00:00:00,000 +hello world + +``` + +The end timecode uses `int(duration)` seconds — sub-second durations appear as `00:00:00,000`. + +### vtt + +``` +WEBVTT + +00:00:00.000 --> 00:00:00.000 +hello world + +``` + +### verbose_json + +```json +{ + "task": "transcribe", + "language": "en", + "duration": 0.012, + "text": "hello world", + "segments": [] +} +``` + +`segments` is always an empty list in this implementation. `task` is `"transcribe"` for the transcriptions endpoint and `"translate"` for the translations endpoint. `duration` is the wall-clock seconds taken by `model.process_audio()`. + +## Deepgram Response + +Always `application/json`. Shape mirrors the Deepgram v1 API: + +```json +{ + "metadata": {"request_id": "", "created": "", "duration": 0.01, "channels": 1, "models": ["ovos"]}, + "results": { + "channels": [{ + "alternatives": [{"transcript": "hello world", "confidence": 1.0, "words": []}] + }] + } +} +``` + +## Google Cloud STT Response + +```json +{ + "results": [{"alternatives": [{"transcript": "hello world", "confidence": 0.9}]}] +} +``` + +## AssemblyAI Response + +POST returns `status: completed` on success, `status: error` when `audio_url` is supplied (not supported) or decoding fails. GET always returns `status: error`. 
+ +```json +{"id": "", "status": "completed", "text": "hello world", "language_code": "en", "confidence": 0.9, "words": []} +``` + +## Speechmatics Job Response + +POST `/v1/jobs` returns: + +```json +{"id": "", "status": "done"} +``` + +GET `/v1/jobs/{id}/transcript` returns: + +```json +{ + "format": "2.9", + "job": {"id": "", "status": "done"}, + "results": [{"type": "word", "start_time": 0.0, "end_time": 1.0, "alternatives": [{"content": "hello world", "confidence": 0.9}]}], + "metadata": {} +} +``` diff --git a/ovos_stt_http_server/__init__.py b/ovos_stt_http_server/__init__.py index 194acd2..ae5ae4e 100644 --- a/ovos_stt_http_server/__init__.py +++ b/ovos_stt_http_server/__init__.py @@ -10,7 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import os from tempfile import NamedTemporaryFile from typing import List, Tuple, Optional, Set, Union @@ -20,8 +19,8 @@ from ovos_config import Configuration from ovos_plugin_manager.audio_transformers import load_audio_transformer_plugin, AudioLanguageDetector from ovos_plugin_manager.stt import load_stt_plugin +from ovos_plugin_manager.utils.audio import AudioFile, AudioData from ovos_utils.log import LOG -from speech_recognition import AudioData, Recognizer, AudioFile from starlette.requests import Request LOG.set_level("ERROR") # avoid server side logs @@ -92,26 +91,45 @@ def unload_engine(self, lang: str): self.engines.pop(lang) def process_audio(self, audio: AudioData, lang: str): + """ + Transcribes the provided audio using the engine for the specified language. + + Parameters: + audio (AudioData): Audio content to transcribe. + lang (str): Language code identifying which engine to use. + + Returns: + str: Transcribed text for the audio, or an empty string if no transcription is produced. 
+ """ engine = self.get_engine(lang) return engine.execute(audio, language=lang) or "" -def bytes2audiodata(data: bytes) -> AudioData: - recognizer = Recognizer() - with NamedTemporaryFile() as fp: - fp.write(data) - with AudioFile(fp.name) as source: - audio = recognizer.record(source) - return audio +def create_app(stt_plugin: str, lang_plugin: str = None, multi: bool = False): + """ + Create and configure a FastAPI app that exposes STT and language-detection endpoints. + Initializes either a single-model or multi-model container using the provided plugins, + and registers three endpoints: + - GET /status: returns service and plugin metadata. + - POST /stt: accepts raw audio bytes (query params: `lang`, `sample_rate`, `sample_width`), + optionally performs language detection when `lang=auto`, and returns transcribed text. + - POST /lang_detect: accepts raw audio bytes and returns detected language and confidence + (supports `valid_langs` query param). -def create_app(stt_plugin, lang_plugin=None, multi=False, has_gradio=False): + Parameters: + stt_plugin: Name or identifier of the STT plugin to load. + lang_plugin: Name or identifier of an optional language-detection plugin. + multi: If True, use a MultiModelContainer (one engine per language). + + Returns: + tuple: (app, model) where `app` is the configured FastAPI application and `model` is + the initialized ModelContainer or MultiModelContainer instance. 
+ """ app = FastAPI() - cors_origins = os.environ.get("CORS_ORIGINS", "*") - origins = [origin.strip() for origin in cors_origins.split(",")] if cors_origins != "*" else ["*"] app.add_middleware( CORSMiddleware, - allow_origins=origins, + allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], @@ -125,14 +143,27 @@ def create_app(stt_plugin, lang_plugin=None, multi=False, has_gradio=False): def stats(request: Request): return {"status": "ok", "plugin": stt_plugin, - "lang_plugin": lang_plugin, - "gradio": has_gradio} + "lang_plugin": lang_plugin} @app.post("/stt", response_class=PlainTextResponse) async def get_stt(request: Request): + """ + Handle an STT request: read audio from the request body, determine language if requested, and return the transcription. + + Parameters: + request (Request): HTTP request whose body contains raw audio bytes. Query parameters: + - lang: language code or "auto" (default from Configuration().get("lang", "auto")). + - sample_rate: sample rate in Hz for the audio (default 16000). + - sample_width: sample width in bytes (default 2). + + Returns: + str: Transcribed text from the provided audio, or an empty string if no transcription is produced. 
+ """ lang = str(request.query_params.get("lang", Configuration().get("lang", "auto"))).lower() + sr = int(request.query_params.get("sample_rate", 16000)) + sw = int(request.query_params.get("sample_width", 2)) audio_bytes = await request.body() - audio = bytes2audiodata(audio_bytes) + audio = AudioData(audio_bytes, sr, sw) if lang == "auto": lang, prob = model.detect_language(audio_bytes) return model.process_audio(audio, lang) @@ -146,12 +177,34 @@ async def get_lang(request: Request): lang, prob = model.detect_language(audio_bytes, valid_langs=valid) return {"lang": lang, "conf": prob} + from ovos_stt_http_server.routers.openai_whisper import make_openai_whisper_router + from ovos_stt_http_server.routers.deepgram import make_deepgram_router + from ovos_stt_http_server.routers.google_stt import make_google_stt_router + from ovos_stt_http_server.routers.assemblyai import make_assemblyai_router + from ovos_stt_http_server.routers.speechmatics import make_speechmatics_router + + app.include_router(make_openai_whisper_router(model)) + app.include_router(make_deepgram_router(model)) + app.include_router(make_google_stt_router(model)) + app.include_router(make_assemblyai_router(model)) + app.include_router(make_speechmatics_router(model)) + return app, model def start_stt_server(engine: str, lang_engine: str = None, - multi: bool = False, - has_gradio: bool = False) -> (FastAPI, ModelContainer): - app, engine = create_app(engine, lang_engine, multi, has_gradio) - return app, engine + multi: bool = False) -> tuple: + """ + Initialize and return a configured FastAPI STT server and its model container. + + Parameters: + engine: STT plugin name to load. + lang_engine: Optional language-detection plugin name. + multi: If True, load one engine per language via MultiModelContainer. + + Returns: + tuple: (app, model) — the FastAPI application and the model container. 
+ """ + app, engine = create_app(engine, lang_engine, multi) + return app, engine \ No newline at end of file diff --git a/ovos_stt_http_server/__main__.py b/ovos_stt_http_server/__main__.py index 7e79971..c813156 100644 --- a/ovos_stt_http_server/__main__.py +++ b/ovos_stt_http_server/__main__.py @@ -13,44 +13,25 @@ import argparse import uvicorn -from ovos_config import Configuration from ovos_utils.log import LOG from ovos_stt_http_server import start_stt_server -from ovos_stt_http_server.gradio_app import bind_gradio_service def main(): + """Entry point for the OVOS STT HTTP server CLI.""" parser = argparse.ArgumentParser() parser.add_argument("--engine", help="stt plugin to be used", required=True) parser.add_argument("--lang-engine", help="audio language detection plugin to be used") parser.add_argument("--port", help="port number", default=8080) parser.add_argument("--host", help="host", default="0.0.0.0") - parser.add_argument("--lang", help="default language supported by plugin", - default=Configuration().get("lang", "en-us")) parser.add_argument("--multi", help="Load a plugin instance per language (force lang support)", action="store_true") - parser.add_argument("--gradio", help="Enable Gradio Web UI", - action="store_true") - parser.add_argument("--cache", help="Cache models for Gradio demo", - action="store_true") - parser.add_argument("--title", help="Title for webUI", - default="STT") - parser.add_argument("--description", help="Text description to print in UI", - default="Get Speech-To-Text") - parser.add_argument("--info", help="Text to display at end of UI", - default=None) - parser.add_argument("--badge", help="URL of visitor badge", default=None) args = parser.parse_args() server, engine = start_stt_server(args.engine, lang_engine=args.lang_engine, - multi=bool(args.multi), - has_gradio=bool(args.gradio)) + multi=bool(args.multi)) LOG.info("Server Started") - if args.gradio: - bind_gradio_service(server, engine, args.title, args.description, - 
# Licensed under the Apache License, Version 2.0
"""Audio conversion utilities for STT server."""
import io
import wave

from fastapi import HTTPException
from ovos_plugin_manager.utils.audio import AudioData


def multipart_audio_to_audiodata(file_bytes: bytes, filename: str) -> AudioData:
    """Convert uploaded audio file bytes to AudioData.

    WAV files are parsed with the stdlib ``wave`` module; any other
    extension is decoded with pydub and normalized to 16 kHz mono int16.

    Args:
        file_bytes: Raw bytes from the uploaded file.
        filename: Original filename; its extension selects the decoder.
            Files without an extension are treated as WAV.

    Returns:
        AudioData instance suitable for the STT engine.

    Raises:
        HTTPException: 400 if a ``.wav`` upload is not a valid WAV file.
        HTTPException: 501 if a non-WAV format is uploaded and pydub is
            not installed.
    """
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else "wav"

    if ext == "wav":
        try:
            with wave.open(io.BytesIO(file_bytes)) as wf:
                return AudioData(wf.readframes(wf.getnframes()),
                                 wf.getframerate(), wf.getsampwidth())
        except wave.Error as exc:
            # A malformed header is a client error, not a server 500
            raise HTTPException(status_code=400,
                                detail=f"Invalid WAV file: {exc}") from exc

    # Guard only the *import*: pydub decode errors must propagate so the
    # caller sees the real failure instead of a misleading 501.
    try:
        from pydub import AudioSegment
    except ImportError:
        raise HTTPException(
            status_code=501,
            detail=f"Format '{ext}' requires pydub. Install with: pip install pydub",
        )

    audio = AudioSegment.from_file(io.BytesIO(file_bytes), format=ext)
    # Normalize to the 16 kHz mono int16 layout the STT engines expect
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    buf = io.BytesIO()
    audio.export(buf, format="wav")
    buf.seek(0)
    with wave.open(buf) as wf:
        return AudioData(wf.readframes(wf.getnframes()),
                         wf.getframerate(), wf.getsampwidth())
def make_assemblyai_router(model) -> APIRouter:
    """Create AssemblyAI-compatible router (synchronous stub).

    Args:
        model: ModelContainer or MultiModelContainer used for transcription.

    Returns:
        APIRouter exposing /assemblyai/v2/transcript endpoints.
    """
    router = APIRouter(prefix="/assemblyai/v2", tags=["assemblyai"])

    @router.post("/transcript", response_model=AssemblyAITranscript)
    def create_transcript(
        request: AssemblyAIRequest,
        authorization: Optional[str] = Header(default=None),
    ) -> AssemblyAITranscript:
        """Transcribe audio (AssemblyAI-compatible, synchronous stub).

        Accepts base64 audio in the `audio` field. The `audio_url` field is
        acknowledged but audio fetching is not supported — supply `audio`
        instead.

        Args:
            request: AssemblyAI transcript request.
            authorization: API key header (accepted, ignored).

        Returns:
            AssemblyAITranscript with status 'completed' and transcript text,
            or status 'error' when no usable audio was supplied or decoding
            failed.
        """
        job_id = str(uuid.uuid4())
        lang = request.language_code or "en"

        if not request.audio:
            # Distinguish "unsupported audio_url" from "nothing supplied":
            # the previous message blamed audio_url even when the request
            # contained no audio field at all.
            if request.audio_url:
                detail = "audio_url fetching is not supported; supply base64 'audio' field instead."
            else:
                detail = "No audio supplied; provide base64-encoded bytes in the 'audio' field."
            return AssemblyAITranscript(
                id=job_id,
                status="error",
                audio_url=request.audio_url,
                error=detail,
            )

        try:
            audio_bytes = base64.b64decode(request.audio)
            try:
                # Prefer sample rate/width from the WAV header when present
                with wave.open(io.BytesIO(audio_bytes)) as wf:
                    audio = AudioData(wf.readframes(wf.getnframes()),
                                      wf.getframerate(), wf.getsampwidth())
            except Exception:
                # Fall back to raw PCM at 16 kHz int16
                audio = AudioData(audio_bytes, 16000, 2)
            transcript_text = model.process_audio(audio, lang)
        except Exception as exc:
            return AssemblyAITranscript(
                id=job_id,
                status="error",
                audio_url=request.audio_url,
                error=str(exc),
            )

        return AssemblyAITranscript(
            id=job_id,
            status="completed",
            audio_url=request.audio_url,
            text=transcript_text or "",
            language_code=lang,
            confidence=0.9,
        )

    @router.get("/transcript/{transcript_id}", response_model=AssemblyAITranscript)
    def get_transcript(
        transcript_id: str,
        authorization: Optional[str] = Header(default=None),
    ) -> AssemblyAITranscript:
        """Get transcript by ID (AssemblyAI-compatible stub).

        Since this server is synchronous, all transcripts are immediately
        completed; there is no job store, so retrieval always errors.

        Args:
            transcript_id: ID of the transcript to retrieve.
            authorization: API key header (accepted, ignored).

        Returns:
            AssemblyAITranscript stub with error status.
        """
        return AssemblyAITranscript(
            id=transcript_id,
            status="error",
            error="Transcript retrieval by ID is not supported in synchronous mode.",
        )

    return router
class DeepgramResponse(BaseModel):
    """Full Deepgram transcription response."""
    metadata: DeepgramMetadata
    results: DeepgramResults


def make_deepgram_router(model) -> APIRouter:
    """Create Deepgram-compatible router.

    Args:
        model: ModelContainer or MultiModelContainer used for transcription.

    Returns:
        APIRouter exposing POST /deepgram/v1/listen.
    """
    router = APIRouter(prefix="/deepgram", tags=["deepgram"])

    @router.post("/v1/listen", response_model=DeepgramResponse)
    async def listen(
        request: Request,
        language: str = Query(default="en"),
        model_name: Optional[str] = Query(default=None, alias="model"),
        punctuate: Optional[bool] = Query(default=None),
        diarize: Optional[bool] = Query(default=None),
        authorization: Optional[str] = Header(default=None),
    ) -> DeepgramResponse:
        """Transcribe audio (Deepgram-compatible).

        The raw request body is treated as 16 kHz 16-bit mono audio; no
        format detection is performed.

        Args:
            request: Raw request whose body contains audio bytes.
            language: BCP-47 language code.
            model_name: Model name (accepted, ignored).
            punctuate: Punctuation flag (accepted, ignored).
            diarize: Diarization flag (accepted, ignored).
            authorization: Token auth header (accepted, ignored).

        Returns:
            DeepgramResponse with transcription results.
        """
        body = await request.body()
        pcm = AudioData(body, 16000, 2)  # default 16 kHz 16-bit mono

        t0 = time.time()
        text = model.process_audio(pcm, language)
        elapsed = time.time() - t0

        # Build the response bottom-up with named intermediates
        alt = DeepgramAlternative(transcript=text or "",
                                  confidence=1.0,
                                  words=[])
        channel = DeepgramChannel(alternatives=[alt])
        meta = DeepgramMetadata(
            request_id=str(uuid.uuid4()),
            created=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            duration=elapsed,
            channels=1,
            models=["ovos"],
        )
        return DeepgramResponse(metadata=meta,
                                results=DeepgramResults(channels=[channel]))

    return router
class GoogleSTTAlternative(BaseModel):
    """A single recognition alternative."""
    transcript: str
    confidence: float = Field(default=0.9, ge=0.0, le=1.0)


class GoogleSTTResult(BaseModel):
    """A single recognition result."""
    alternatives: List[GoogleSTTAlternative] = Field(..., min_length=1)


class GoogleSTTResponse(BaseModel):
    """Response for POST /v1/speech:recognize."""
    results: List[GoogleSTTResult]


def make_google_stt_router(model) -> APIRouter:
    """Create Google Cloud STT-compatible router."""
    router = APIRouter(prefix="/google", tags=["google-stt"])

    @router.post("/v1/speech:recognize", response_model=GoogleSTTResponse)
    def recognize(
        request: GoogleSTTRequest,
        key: Optional[str] = Query(default=None),
        authorization: Optional[str] = Header(default=None),
    ) -> GoogleSTTResponse:
        """Transcribe audio (Google Cloud STT-compatible).

        Args:
            request: Google STT recognition request with config and base64 audio.
            key: API key query param (accepted, ignored).
            authorization: Bearer token (accepted, ignored).

        Returns:
            GoogleSTTResponse with recognition results.

        Raises:
            HTTPException: 400 if neither content nor uri is provided, or if
                content is not valid base64.
            HTTPException: 501 if GCS URI is supplied (not supported).
        """
        if request.audio.uri:
            raise HTTPException(
                status_code=status.HTTP_501_NOT_IMPLEMENTED,
                detail="GCS URI audio input is not supported; use 'content' (base64).",
            )
        if not request.audio.content:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="audio.content (base64-encoded audio) is required.",
            )

        # Invalid base64 previously escaped as binascii.Error -> HTTP 500;
        # surface it as a client error (binascii.Error subclasses ValueError).
        try:
            audio_bytes = base64.b64decode(request.audio.content)
        except ValueError as exc:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"audio.content is not valid base64: {exc}",
            )

        sr = request.config.sampleRateHertz or 16000

        # Attempt to parse as WAV; fall back to raw PCM at the configured rate
        try:
            with wave.open(io.BytesIO(audio_bytes)) as wf:
                audio = AudioData(wf.readframes(wf.getnframes()),
                                  wf.getframerate(), wf.getsampwidth())
        except Exception:
            audio = AudioData(audio_bytes, sr, 2)

        transcript = model.process_audio(audio, request.config.languageCode)
        return GoogleSTTResponse(
            results=[
                GoogleSTTResult(
                    alternatives=[GoogleSTTAlternative(transcript=transcript or "",
                                                       confidence=0.9)]
                )
            ]
        )

    return router
"translate"] + language: str = Field(..., min_length=1) + duration: float = Field(..., ge=0.0) + text: str + segments: List[Dict[str, Any]] = Field(default_factory=list) + + +def make_openai_whisper_router(model) -> APIRouter: + """Create OpenAI Whisper-compatible router. + + Args: + model: ModelContainer or MultiModelContainer instance. + + Returns: + Configured APIRouter with OpenAI-compatible transcription endpoints. + """ + router = APIRouter(prefix="/openai", tags=["openai-whisper"]) + + async def _transcribe( + file: UploadFile, + language: Optional[str], + response_format: _ResponseFormat, + task: Literal["transcribe", "translate"] = "transcribe", + force_lang: Optional[str] = None, + ): + """Internal transcription helper. + + Args: + file: Uploaded audio file. + language: Optional language hint. + response_format: One of json, text, srt, vtt, verbose_json. + task: Task type for verbose response metadata. + force_lang: If set, overrides language (used for translation endpoint). + + Returns: + Transcription response in the requested format. 
+ """ + file_bytes = await file.read() + audio = multipart_audio_to_audiodata(file_bytes, file.filename or "audio.wav") + lang = force_lang or language or "auto" + start = time.time() + text = model.process_audio(audio, lang) + duration = time.time() - start + + if response_format == "text": + return PlainTextResponse(text) + elif response_format == "srt": + srt = f"1\n00:00:00,000 --> 00:00:{int(duration):02d},000\n{text}\n" + return PlainTextResponse(srt) + elif response_format == "vtt": + vtt = f"WEBVTT\n\n00:00:00.000 --> 00:00:{int(duration):02d}.000\n{text}\n" + return PlainTextResponse(vtt) + elif response_format == "verbose_json": + resp = WhisperVerboseResponse( + task=task, + language=lang, + duration=duration, + text=text, + segments=[], + ) + return JSONResponse(resp.model_dump()) + else: + return JSONResponse(WhisperTranscriptionResponse(text=text).model_dump()) + + @router.post("/v1/audio/transcriptions") + async def transcriptions( + file: UploadFile = File(...), + model_name: Optional[str] = Form(default=None, alias="model"), + language: Optional[str] = Form(default=None), + response_format: _ResponseFormat = Form(default="json"), + temperature: Annotated[Optional[float], Field(ge=0.0, le=1.0)] = Form(default=None), + authorization: Optional[str] = Header(default=None), + ): + """Transcribe audio (OpenAI Whisper-compatible). + + Args: + file: Audio file to transcribe. + model_name: Model identifier (accepted, ignored). + language: Optional language hint (BCP-47 code). + response_format: Output format: json, text, srt, vtt, verbose_json. + temperature: Sampling temperature 0–1 (accepted, ignored). + authorization: Bearer token (accepted, ignored). + + Returns: + Transcription in the requested format. 
+ """ + return await _transcribe(file, language, response_format) + + @router.post("/v1/audio/translations") + async def translations( + file: UploadFile = File(...), + model_name: Optional[str] = Form(default=None, alias="model"), + response_format: _ResponseFormat = Form(default="json"), + temperature: Annotated[Optional[float], Field(ge=0.0, le=1.0)] = Form(default=None), + authorization: Optional[str] = Header(default=None), + ): + """Translate audio to English (OpenAI Whisper-compatible). + + Forces language to 'en' regardless of audio language. + + Args: + file: Audio file to translate. + model_name: Model identifier (accepted, ignored). + response_format: Output format: json, text, srt, vtt, verbose_json. + temperature: Sampling temperature 0–1 (accepted, ignored). + authorization: Bearer token (accepted, ignored). + + Returns: + Transcription in the requested format with forced English output. + """ + return await _transcribe(file, None, response_format, task="translate", force_lang="en") + + return router diff --git a/ovos_stt_http_server/routers/speechmatics.py b/ovos_stt_http_server/routers/speechmatics.py new file mode 100644 index 0000000..64d6698 --- /dev/null +++ b/ovos_stt_http_server/routers/speechmatics.py @@ -0,0 +1,126 @@ +# Licensed under the Apache License, Version 2.0 +"""Speechmatics-compatible STT endpoint (synchronous stub).""" +import json +import uuid +from typing import Dict, List, Literal, Optional + +from fastapi import APIRouter, File, Form, Header, HTTPException, UploadFile, status +from pydantic import BaseModel, Field + +from ovos_stt_http_server.audio_utils import multipart_audio_to_audiodata + +# In-memory store: job_id → transcript text +_jobs: Dict[str, str] = {} + + +class SpeechmaticsTranscriptionConfig(BaseModel): + """Transcription configuration for Speechmatics.""" + language: str = Field(default="en", min_length=1) + operating_point: Optional[Literal["standard", "enhanced"]] = None + diarization: Optional[str] = None + + 
+class SpeechmaticsJobConfig(BaseModel): + """Top-level job configuration.""" + type: Literal["transcription"] = "transcription" + transcription_config: SpeechmaticsTranscriptionConfig = Field( + default_factory=SpeechmaticsTranscriptionConfig + ) + + +class SpeechmaticsJobResponse(BaseModel): + """Response from POST /v1/jobs.""" + id: str + status: Literal["running", "done", "rejected", "deleted"] = "done" + + +class SpeechmaticsWord(BaseModel): + """A word in a Speechmatics transcript.""" + type: Literal["word", "punctuation"] = "word" + start_time: float = Field(..., ge=0.0) + end_time: float = Field(..., ge=0.0) + alternatives: List[Dict] = Field(default_factory=list) + + +class SpeechmaticsTranscriptResponse(BaseModel): + """GET /v1/jobs/{job_id}/transcript response.""" + format: str = "2.9" + job: SpeechmaticsJobResponse + results: List[SpeechmaticsWord] = Field(default_factory=list) + metadata: Dict = Field(default_factory=dict) + + +def make_speechmatics_router(model) -> APIRouter: + """Create Speechmatics-compatible router (synchronous stub).""" + router = APIRouter(prefix="/speechmatics/v1", tags=["speechmatics"]) + + @router.post("/jobs", response_model=SpeechmaticsJobResponse) + async def create_job( + data_file: UploadFile = File(...), + config: str = Form(...), + authorization: Optional[str] = Header(default=None), + ) -> SpeechmaticsJobResponse: + """Create transcription job (Speechmatics-compatible, synchronous stub). + + Transcribes immediately and stores result for GET retrieval. + + Args: + data_file: Audio file upload. + config: JSON string with SpeechmaticsJobConfig. + authorization: Auth header (accepted, ignored). + + Returns: + SpeechmaticsJobResponse with job ID and 'done' status. 
+ """ + job_id = str(uuid.uuid4()) + try: + cfg = SpeechmaticsJobConfig(**json.loads(config)) + lang = cfg.transcription_config.language + except Exception: + lang = "en" + + file_bytes = await data_file.read() + audio = multipart_audio_to_audiodata(file_bytes, data_file.filename or "audio.wav") + transcript_text = model.process_audio(audio, lang) + _jobs[job_id] = transcript_text or "" + + return SpeechmaticsJobResponse(id=job_id, status="done") + + @router.get("/jobs/{job_id}/transcript", response_model=SpeechmaticsTranscriptResponse) + def get_transcript( + job_id: str, + format: str = "json-v2", + authorization: Optional[str] = Header(default=None), + ) -> SpeechmaticsTranscriptResponse: + """Retrieve transcript for a completed job (Speechmatics-compatible). + + Args: + job_id: Job identifier from POST /v1/jobs response. + format: Response format (accepted, json-v2 returned regardless). + authorization: Auth header (accepted, ignored). + + Returns: + SpeechmaticsTranscriptResponse with results. + + Raises: + HTTPException: 404 if job_id not found. 
+ """ + if job_id not in _jobs: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Job {job_id!r} not found.", + ) + text = _jobs[job_id] + return SpeechmaticsTranscriptResponse( + job=SpeechmaticsJobResponse(id=job_id, status="done"), + results=[ + SpeechmaticsWord( + type="word", + start_time=0.0, + end_time=1.0, + alternatives=[{"content": text, "confidence": 0.9}], + ) + ] if text else [], + ) + + return router diff --git a/ovos_stt_http_server/version.py b/ovos_stt_http_server/version.py index 984e3e6..ad2ee2f 100644 --- a/ovos_stt_http_server/version.py +++ b/ovos_stt_http_server/version.py @@ -4,3 +4,5 @@ VERSION_BUILD = 5 VERSION_ALPHA = 7 # END_VERSION_BLOCK + +__version__ = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_BUILD}" + (f"a{VERSION_ALPHA}" if VERSION_ALPHA else "") diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..af6950c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "ovos-stt-http-server" +dynamic = ["version"] +description = "simple fastapi server to host OpenVoiceOS stt plugins as a service" +readme = "README.md" +license = "Apache-2.0" +authors = [{name = "JarbasAi", email = "jarbasai@mailfence.com"}] +requires-python = ">=3.9" + +keywords = [ + "plugin", + "STT", + "OVOS", + "OpenVoiceOS", +] +dependencies = [ + "ovos-plugin-manager>=2.1.1,<3.0.0", + "fastapi~=0.95", + "uvicorn~=0.22", + "ovos-utils>=0.0.32,<1.0.0", +] + +[project.urls] +Homepage = "https://github.com/OpenVoiceOS/ovos-stt-http-server" +Repository = "https://github.com/OpenVoiceOS/ovos-stt-http-server" + +[project.scripts] +ovos-stt-server = "ovos_stt_http_server.__main__:main" + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.packages.find] +include = ["ovos_stt_http_server*"] + +[tool.setuptools.dynamic] +version = {attr = "ovos_stt_http_server.version.__version__"} diff --git 
a/requirements/requirements.txt b/requirements/requirements.txt index 563d165..9fa87fa 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,4 +1,4 @@ -ovos-plugin-manager>=2.1.0,<2.2.0 +ovos-plugin-manager>=2.1.1,<3.0.0 fastapi~=0.95 uvicorn~=0.22 gradio~=3.28 diff --git a/setup.py b/setup.py index 5b220f6..7daa4d3 100755 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ def get_version(): setup( name='ovos-stt-http-server', version=get_version(), - description='simple aiohttp server to host OpenVoiceOS stt plugins as a service', + description='simple fastapi server to host OpenVoiceOS stt plugins as a service', long_description=long_description, long_description_content_type="text/markdown", url='https://github.com/OpenVoiceOS/ovos-stt-http-server', @@ -61,19 +61,7 @@ def get_version(): classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', - 'Topic :: Text Processing :: Linguistic', 'License :: OSI Approved :: Apache Software License', - - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.0', - 'Programming Language :: Python :: 3.1', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', ], keywords='plugin STT OVOS OpenVoiceOS', entry_points={ diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/unittests/__init__.py b/test/unittests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/unittests/test_compat_routers.py b/test/unittests/test_compat_routers.py new file mode 100644 index 0000000..7e8d723 --- /dev/null +++ b/test/unittests/test_compat_routers.py @@ -0,0 +1,389 @@ +# Licensed under the Apache License, Version 2.0 +"""Unit tests for STT server 
compatibility routers. + +All compat routers are mounted under a named prefix (e.g. /openai, /deepgram) +to avoid path conflicts when all routers are registered in the same FastAPI app. +""" +import base64 +import io +import json +import wave +from typing import Optional + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_wav_bytes() -> bytes: + """Return minimal valid PCM WAV bytes.""" + buf = io.BytesIO() + with wave.open(buf, "w") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(16000) + wf.writeframes(b"\x00\x00" * 160) + return buf.getvalue() + + +class FakeModel: + """Mock ModelContainer for testing routers.""" + + def process_audio(self, audio, lang: str = "auto") -> str: + return "hello world" + + +@pytest.fixture(scope="module") +def model(): + return FakeModel() + + +def _make_app(model) -> FastAPI: + from ovos_stt_http_server.routers.openai_whisper import make_openai_whisper_router + from ovos_stt_http_server.routers.deepgram import make_deepgram_router + from ovos_stt_http_server.routers.google_stt import make_google_stt_router + from ovos_stt_http_server.routers.assemblyai import make_assemblyai_router + from ovos_stt_http_server.routers.speechmatics import make_speechmatics_router + + app = FastAPI() + app.include_router(make_openai_whisper_router(model)) + app.include_router(make_deepgram_router(model)) + app.include_router(make_google_stt_router(model)) + app.include_router(make_assemblyai_router(model)) + app.include_router(make_speechmatics_router(model)) + return app + + +@pytest.fixture(scope="module") +def client(model): + app = _make_app(model) + return TestClient(app) + + +@pytest.fixture(scope="module") +def wav_bytes(): + return _make_wav_bytes() + + +@pytest.fixture(scope="module") +def 
wav_b64(wav_bytes): + return base64.b64encode(wav_bytes).decode() + + +# --------------------------------------------------------------------------- +# OpenAI Whisper (prefix: /openai) +# --------------------------------------------------------------------------- + +class TestOpenAIWhisperRouter: + def test_transcription_json(self, client, wav_bytes): + resp = client.post( + "/openai/v1/audio/transcriptions", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1"}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["text"] == "hello world" + + def test_transcription_text_format(self, client, wav_bytes): + resp = client.post( + "/openai/v1/audio/transcriptions", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1", "response_format": "text"}, + ) + assert resp.status_code == 200 + assert "hello world" in resp.text + + def test_transcription_verbose_json(self, client, wav_bytes): + resp = client.post( + "/openai/v1/audio/transcriptions", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1", "response_format": "verbose_json"}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["text"] == "hello world" + assert body["task"] == "transcribe" + assert "duration" in body + + def test_transcription_srt_format(self, client, wav_bytes): + resp = client.post( + "/openai/v1/audio/transcriptions", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1", "response_format": "srt"}, + ) + assert resp.status_code == 200 + assert "hello world" in resp.text + + def test_translation_forces_english(self, client, wav_bytes): + resp = client.post( + "/openai/v1/audio/translations", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1"}, + ) + assert resp.status_code == 200 + assert resp.json()["text"] == "hello world" + + def test_auth_header_ignored(self, client, wav_bytes): + resp = 
client.post( + "/openai/v1/audio/transcriptions", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1"}, + headers={"Authorization": "Bearer fake-token"}, + ) + assert resp.status_code == 200 + + +# --------------------------------------------------------------------------- +# Deepgram (prefix: /deepgram) +# --------------------------------------------------------------------------- + +class TestDeepgramRouter: + def test_listen_basic(self, client, wav_bytes): + resp = client.post( + "/deepgram/v1/listen", + content=wav_bytes, + headers={"Content-Type": "audio/wav"}, + ) + assert resp.status_code == 200 + body = resp.json() + alt = body["results"]["channels"][0]["alternatives"][0] + assert alt["transcript"] == "hello world" + assert "confidence" in alt + + def test_listen_with_language(self, client, wav_bytes): + resp = client.post( + "/deepgram/v1/listen?language=en-US", + content=wav_bytes, + headers={"Content-Type": "audio/wav"}, + ) + assert resp.status_code == 200 + + def test_auth_header_ignored(self, client, wav_bytes): + resp = client.post( + "/deepgram/v1/listen", + content=wav_bytes, + headers={"Content-Type": "audio/wav", "Authorization": "Token fake"}, + ) + assert resp.status_code == 200 + + +# --------------------------------------------------------------------------- +# Google STT (prefix: /google) +# --------------------------------------------------------------------------- + +class TestGoogleSTTRouter: + def test_recognize_base64(self, client, wav_b64): + resp = client.post( + "/google/v1/speech:recognize", + json={ + "config": {"encoding": "LINEAR16", "sampleRateHertz": 16000, "languageCode": "en-US"}, + "audio": {"content": wav_b64}, + }, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["results"][0]["alternatives"][0]["transcript"] == "hello world" + + def test_recognize_uri_not_supported(self, client): + resp = client.post( + "/google/v1/speech:recognize", + json={ + "config": 
{"encoding": "LINEAR16", "sampleRateHertz": 16000, "languageCode": "en-US"}, + "audio": {"uri": "gs://bucket/file.wav"}, + }, + ) + assert resp.status_code == 501 + + +# --------------------------------------------------------------------------- +# AssemblyAI (prefix: /assemblyai) +# --------------------------------------------------------------------------- + +class TestAssemblyAIRouter: + def test_submit_b64_transcript(self, client, wav_b64): + resp = client.post( + "/assemblyai/v2/transcript", + json={"audio": wav_b64}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["status"] == "completed" + assert body["text"] == "hello world" + + def test_get_transcript_returns_error_stub(self, client, wav_b64): + """GET by ID is a synchronous stub — always returns error status.""" + create = client.post("/assemblyai/v2/transcript", json={"audio": wav_b64}) + tid = create.json()["id"] + get_resp = client.get(f"/assemblyai/v2/transcript/{tid}") + assert get_resp.status_code == 200 + assert get_resp.json()["status"] == "error" + + def test_submit_audio_url_returns_error(self, client): + resp = client.post( + "/assemblyai/v2/transcript", + json={"audio_url": "https://example.com/audio.wav"}, + ) + assert resp.status_code == 200 + assert resp.json()["status"] == "error" + + +# --------------------------------------------------------------------------- +# Speechmatics (prefix: /speechmatics) +# --------------------------------------------------------------------------- + +class TestSpeechmaticsRouter: + def test_submit_job(self, client, wav_bytes): + resp = client.post( + "/speechmatics/v1/jobs", + files={"data_file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"config": json.dumps({"type": "transcription", "transcription_config": {"language": "en"}})}, + ) + assert resp.status_code == 200 + body = resp.json() + assert "id" in body + assert body["status"] == "done" + + def test_get_transcript(self, client, wav_bytes): + create = client.post( + 
"/speechmatics/v1/jobs", + files={"data_file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"config": json.dumps({"type": "transcription", "transcription_config": {"language": "en"}})}, + ) + job_id = create.json()["id"] + resp = client.get(f"/speechmatics/v1/jobs/{job_id}/transcript") + assert resp.status_code == 200 + assert "results" in resp.json() + + def test_get_missing_job_transcript(self, client): + resp = client.get("/speechmatics/v1/jobs/nonexistent/transcript") + assert resp.status_code == 404 + + +# --------------------------------------------------------------------------- +# Additional tests (8 new) covering edge cases +# --------------------------------------------------------------------------- + +class TestWhisperResponseFormats: + """Additional Whisper response_format edge-case tests.""" + + def test_response_format_text_is_plain_text(self, client, wav_bytes): + """response_format=text must return Content-Type text/plain, not JSON.""" + resp = client.post( + "/openai/v1/audio/transcriptions", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1", "response_format": "text"}, + ) + assert resp.status_code == 200 + # Must NOT be a JSON object + assert resp.text.strip() == "hello world" + assert "text/plain" in resp.headers["content-type"] + + def test_response_format_verbose_json_has_segments_field(self, client, wav_bytes): + """verbose_json response must contain a 'segments' key (may be empty list).""" + resp = client.post( + "/openai/v1/audio/transcriptions", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1", "response_format": "verbose_json"}, + ) + assert resp.status_code == 200 + body = resp.json() + assert "segments" in body + assert isinstance(body["segments"], list) + + def test_translations_endpoint_forces_lang_en(self, client, wav_bytes): + """Translations endpoint must set task=translate and language=en in verbose_json.""" + resp = client.post( + 
"/openai/v1/audio/translations", + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={"model": "whisper-1", "response_format": "verbose_json"}, + ) + assert resp.status_code == 200 + body = resp.json() + assert body["task"] == "translate" + assert body["language"] == "en" + + +class TestDeepgramEdgeCases: + """Additional Deepgram router edge-case tests.""" + + def test_listen_with_punctuate_param_ignored(self, client, wav_bytes): + """?punctuate=true is accepted and ignored; transcript is still returned.""" + resp = client.post( + "/deepgram/v1/listen?punctuate=true", + content=wav_bytes, + headers={"Content-Type": "audio/wav"}, + ) + assert resp.status_code == 200 + alt = resp.json()["results"]["channels"][0]["alternatives"][0] + assert alt["transcript"] == "hello world" + + +class TestGoogleSTTEdgeCases: + """Additional Google STT router edge-case tests.""" + + def test_recognize_with_base64_wav(self, client, wav_b64): + """Explicit test that base64-encoded WAV bytes are decoded and transcribed.""" + resp = client.post( + "/google/v1/speech:recognize", + json={ + "config": { + "encoding": "LINEAR16", + "sampleRateHertz": 16000, + "languageCode": "en-US", + }, + "audio": {"content": wav_b64}, + }, + ) + assert resp.status_code == 200 + result = resp.json()["results"][0] + assert result["alternatives"][0]["transcript"] == "hello world" + assert result["alternatives"][0]["confidence"] == pytest.approx(0.9, abs=0.01) + + +class TestAssemblyAIEdgeCases: + """Additional AssemblyAI router edge-case tests.""" + + def test_get_transcript_always_has_status_field(self, client, wav_b64): + """GET by any ID must always return a JSON body with a 'status' key.""" + create = client.post("/assemblyai/v2/transcript", json={"audio": wav_b64}) + tid = create.json()["id"] + get_resp = client.get(f"/assemblyai/v2/transcript/{tid}") + assert get_resp.status_code == 200 + body = get_resp.json() + assert "status" in body + + +class TestSpeechmaticsEdgeCases: + """Additional 
Speechmatics router edge-case tests.""" + + def test_get_unknown_job_id_returns_404(self, client): + """A job ID that was never created must return HTTP 404.""" + resp = client.get("/speechmatics/v1/jobs/totally-unknown-id-xyz/transcript") + assert resp.status_code == 404 + + def test_get_known_job_id_returns_transcript(self, client, wav_bytes): + """A job ID from a successful POST must return 200 with transcript text.""" + create = client.post( + "/speechmatics/v1/jobs", + files={"data_file": ("audio.wav", wav_bytes, "audio/wav")}, + data={ + "config": json.dumps( + {"type": "transcription", "transcription_config": {"language": "en"}} + ) + }, + ) + assert create.status_code == 200 + job_id = create.json()["id"] + + resp = client.get(f"/speechmatics/v1/jobs/{job_id}/transcript") + assert resp.status_code == 200 + body = resp.json() + assert "results" in body + # transcript content should appear in alternatives + if body["results"]: + assert body["results"][0]["alternatives"][0]["content"] == "hello world"