Add Groq Whisper integration for audio transcription

howethomas · howethomas · commit 812b61d22f89 · 2025-04-02T21:14:42.000Z
- Introduced a new module for integrating Groq's Whisper ASR service to transcribe audio content in vCon recordings.
- Added configuration options for minimum audio duration and API key management.
- Implemented error handling and logging for transcription processes.
- Updated dependencies in pyproject.toml and poetry.lock to include the Groq library.
- Created tests to ensure functionality and error handling in the transcription process.
diff --git a/.env.example b/.env.example
@@ -1,4 +1,3 @@
-
 REDIS_URL=redis://redis
 
 # Leave this blank to disable API security
@@ -9,3 +8,6 @@ CONSERVER_API_TOKEN=
 # modify the values in config.yml as needed
 # and set CONSERVER_CONFIG_FILE to ./config.yml below
 CONSERVER_CONFIG_FILE= 
+
+# Groq API key for Whisper transcription
+GROQ_API_KEY=your_groq_api_key_here
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ slack-sdk = "^3.27.1"
 boto3 = "^1.34.52"
 deepgram-sdk = "^3.1.5"
 openai = ">=1.54.3"
+groq = "^0.4.0"
 psycopg2-binary = "^2.9.9"
 pymongo = "^4.6.2"
 elasticsearch = "^8.13.1"
diff --git a/server/links/groq_whisper/README.md b/server/links/groq_whisper/README.md
@@ -0,0 +1,104 @@
+# Groq Whisper Link
+
+A vCon-server link that provides automatic transcription of audio content using Groq's implementation of Whisper ASR.
+
+## Overview
+
+This link processes vCon objects containing audio recordings and transcribes them using Groq's Whisper API. The transcription results are added back to the vCon as analysis entries.
+
+## Requirements
+
+- Python 3.12+
+- A valid Groq API key
+- The `groq` Python package
+
+## Installation
+
+1. Install the required dependencies:
+
+```bash
+poetry add groq
+```
+
+2. Set your Groq API key in the environment:
+
+```bash
+export GROQ_API_KEY=your_groq_api_key_here
+```
+
+Alternatively, you can add the API key to your `.env` file:
+
+```
+GROQ_API_KEY=your_groq_api_key_here
+```
+
+## Configuration
+
+The link accepts the following configuration options:
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `API_KEY` | Groq API key for authentication | From GROQ_API_KEY environment variable |
+| `minimum_duration` | Minimum duration (in seconds) of audio to transcribe | 30 |
+
+## Usage
+
+To use this link in a vCon processing chain:
+
+```python
+from server.links.groq_whisper import run
+
+result = run(
+    vcon_uuid="your-vcon-uuid",
+    link_name="groq_whisper",
+    opts={
+        "minimum_duration": 60  # Optional override
+    }
+)
+```
+
+## How It Works
+
+1. The link retrieves the vCon object from Redis
+2. For each recording dialog in the vCon:
+   - Skips dialogs shorter than the minimum duration
+   - Skips dialogs that already have a transcript
+   - Extracts audio content (from inline base64 or external URL)
+   - Sends the audio to Groq's Whisper API for transcription
+   - Adds transcription results as a new analysis entry
+3. Stores the updated vCon back to Redis
+
+## Testing
+
+To run the tests:
+
+```bash
+# Set a dummy API key for testing
+export GROQ_API_KEY=test_api_key_for_testing
+
+# Run the tests
+pytest server/links/groq_whisper/test_groq_whisper.py -v
+```
+
+## Response Format
+
+The Groq Whisper API returns transcription results in the following format:
+
+```json
+{
+  "text": "The complete transcription text.",
+  "chunks": [
+    {
+      "text": "Chunk of transcription",
+      "timestamp": [0.0, 5.0]
+    },
+    {
+      "text": "Another chunk",
+      "timestamp": [5.1, 10.0]
+    }
+  ],
+  "language": "en"
+}
+```
+
+This response is stored in the vCon's analysis section as a transcript entry. 
diff --git a/server/links/groq_whisper/__init__.py b/server/links/groq_whisper/__init__.py
@@ -0,0 +1,260 @@
+"""Groq Whisper Integration Module
+
+This module provides integration with Groq's Whisper ASR service for transcribing audio content
+in vCon recordings. It handles the transcription process, error retries, and updates vCon objects with
+transcription results.
+"""
+
+import base64
+import hashlib
+import logging
+import tempfile
+import time
+import os
+from typing import Optional, Dict, Any, Union
+
+import requests
+from tenacity import (
+    RetryError,
+    before_sleep_log,
+    retry,
+    stop_after_attempt,
+    wait_exponential,
+)
+from groq import Groq
+
+from lib.error_tracking import init_error_tracker
+from lib.logging_utils import init_logger
+from lib.metrics import init_metrics, stats_gauge, stats_count
+from server.lib.vcon_redis import VconRedis
+
+# Initialize services
+# These calls run once, at import time, before any link function is invoked.
+init_error_tracker()
+init_metrics()
+logger = init_logger(__name__)
+
+# Default configuration for the Whisper service.
+# run() copies this dict and overlays the caller-supplied opts on top of it,
+# so the dict itself is never mutated by a run.
+default_options = {
+    "minimum_duration": 30,  # Minimum duration in seconds for audio to be transcribed
+    # Read once at import time from GROQ_API_KEY. When the variable is unset the
+    # literal placeholder string is used instead, so a missing key only surfaces
+    # later as a failed API call rather than at import.
+    "API_KEY": os.environ.get("GROQ_API_KEY", "YOUR_GROQ_API_KEY"),  # IMPORTANT: Replace with actual API key in environment variables
+    # NOTE(review): no function in this module reads "Content-Type"; it is only
+    # copied into the stored vendor_schema by run(). Confirm whether it is needed.
+    "Content-Type": "audio/flac",
+}
+
+
+def get_transcription(vcon: Any, index: int) -> Optional[dict]:
+    """Return the existing transcript analysis for the dialog at ``index``.
+
+    Args:
+        vcon: The vCon object; its ``analysis`` attribute is iterated
+        index (int): Index of the dialog to check
+
+    Returns:
+        Optional[dict]: The first analysis entry whose ``dialog`` equals
+        ``index`` and whose ``type`` is ``"transcript"``, or None if there is
+        no such entry (including when ``analysis`` is empty or None)
+    """
+    # Use .get() instead of [...]: an analysis entry that lacks a "dialog" or
+    # "type" key must simply not match, rather than raise KeyError and abort
+    # the caller.
+    for entry in vcon.analysis or ():
+        if entry.get("dialog") == index and entry.get("type") == "transcript":
+            return entry
+    return None
+
+
+def get_file_content(dialog: dict, timeout: float = 30.0) -> bytes:
+    """Get file content from either inline or external reference.
+
+    Args:
+        dialog (dict): Dialog object containing file information. An inline
+            ``body`` takes precedence over an external ``url``.
+        timeout (float): Seconds to wait for the external download before
+            giving up. Ignored for inline bodies.
+
+    Returns:
+        bytes: The file content
+
+    Raises:
+        Exception: If the dialog has neither ``body`` nor ``url``, the download
+            does not return HTTP 200, the hash algorithm is unsupported, or
+            the signature does not match the downloaded content
+    """
+    if "body" in dialog:
+        body = dialog["body"]
+        if isinstance(body, str):
+            body = body.encode("ascii")
+        # Accept both the standard and the URL-safe base64 alphabets, with or
+        # without trailing padding. Plain b64decode silently drops '-' and '_'
+        # (corrupting the audio) and rejects unpadded input, while the
+        # signature check below already assumes base64url.
+        body += b"=" * (-len(body) % 4)
+        return base64.urlsafe_b64decode(body)
+
+    if "url" in dialog:
+        # Without a timeout a stalled server would block the worker forever.
+        response = requests.get(dialog["url"], verify=True, timeout=timeout)
+        if response.status_code != 200:
+            raise Exception(f"Failed to download file from {dialog['url']}")
+
+        content = response.content
+
+        # Verify file integrity if signature is provided
+        if "signature" in dialog and "alg" in dialog:
+            if dialog["alg"] != "SHA-512":
+                raise Exception(f"Unsupported hash algorithm: {dialog['alg']}")
+            file_hash = base64.urlsafe_b64encode(
+                hashlib.sha512(content).digest()).decode('utf-8')
+            # Compare without '=' padding so padded and unpadded base64url
+            # signatures of the same digest are both accepted.
+            if file_hash.rstrip("=") != str(dialog["signature"]).rstrip("="):
+                raise Exception("File signature verification failed")
+
+        return content
+
+    raise Exception("Dialog contains neither inline body nor external URL")
+
+
+@retry(
+    wait=wait_exponential(multiplier=2, min=12, max=100),
+    stop=stop_after_attempt(6),
+    before_sleep=before_sleep_log(logger, logging.INFO),
+)
+def transcribe_groq_whisper(dialog: dict, opts: dict) -> Union[Dict[str, Any], Any]:
+    """Send audio to Groq Whisper API for transcription using the Groq Python library.
+
+    The whole function body is retried: any exception (download failure,
+    signature mismatch, API error) triggers another attempt, up to six in
+    total with exponential back-off between them.
+
+    Args:
+        dialog (dict): Dialog object containing the audio file information
+        opts (dict): Configuration options. ``API_KEY`` is required; ``model``
+            optionally overrides the Whisper model name.
+
+    Returns:
+        Union[Dict[str, Any], Any]: Transcription result from the API, which may be a dict
+        or a Groq library response object
+
+    Raises:
+        RetryError: If all retry attempts fail
+    """
+    # Get file content handling both inline and external references
+    content = get_file_content(dialog)
+
+    # Initialize Groq client with the API key
+    client = Groq(api_key=opts['API_KEY'])
+
+    # Upload the bytes directly as a (filename, content) pair. This avoids
+    # writing a temporary file and reopening it by name while it is still
+    # open, which is not portable (it fails on Windows). The ".flac" suffix is
+    # kept so the upload carries the same file extension as before.
+    return client.audio.transcriptions.create(
+        file=("audio.flac", content),
+        model=opts.get("model", "distil-whisper-large-v3-en"),
+        response_format="verbose_json"
+    )
+
+
+def _dialog_duration(dialog: dict) -> Optional[float]:
+    """Return the dialog's duration in seconds, or None if missing or non-numeric."""
+    try:
+        return float(dialog["duration"])
+    except (KeyError, TypeError, ValueError):
+        return None
+
+
+def _transcription_to_dict(result: Any) -> dict:
+    """Convert a Groq transcription response into a plain dict for the vCon body."""
+    # Response objects exposing model_dump() are dumped; dicts pass through;
+    # anything else is stringified.
+    if hasattr(result, 'model_dump'):
+        return result.model_dump()
+    if isinstance(result, dict):
+        return result
+    return {
+        "text": str(result),
+        "raw_response": str(result)
+    }
+
+
+def run(
+    vcon_uuid: str,
+    link_name: str,
+    opts: Optional[dict] = None,
+) -> Optional[str]:
+    """Process a vCon object through the Whisper transcription service.
+
+    This function:
+    1. Retrieves the vCon from Redis
+    2. Processes each recording dialog that meets the minimum duration requirement
+    3. Skips already transcribed dialogs
+    4. Adds transcription results as analysis entries
+    5. Updates the vCon in Redis
+
+    The first transcription failure stops processing of the remaining dialogs;
+    analysis entries added before the failure are still stored.
+
+    Args:
+        vcon_uuid (str): UUID of the vCon to process
+        link_name (str): Name of the link (unused but required for plugin interface)
+        opts (dict): Optional configuration overrides, merged over
+            ``default_options``. None means "use the defaults".
+
+    Returns:
+        Optional[str]: The vcon_uuid, returned on every path that completes
+    """
+    # Merge provided options with defaults without mutating either dict
+    opts = {**default_options, **(opts or {})}
+
+    logger.info("Starting whisper plugin for vCon: %s", vcon_uuid)
+
+    vcon_redis = VconRedis()
+    vCon = vcon_redis.get_vcon(vcon_uuid)
+
+    for index, dialog in enumerate(vCon.dialog):
+        # Skip non-recording dialogs (a dialog without a "type" is not a recording)
+        if dialog.get("type") != "recording":
+            logger.info(
+                "whisper plugin: skipping non-recording dialog %s in vCon: %s",
+                index,
+                vCon.uuid,
+            )
+            continue
+
+        # A recording whose duration is absent or unparseable cannot be shown
+        # to meet the minimum, so it is skipped instead of crashing the link.
+        duration = _dialog_duration(dialog)
+        if duration is None:
+            logger.warning(
+                "Skipping recording dialog %s with missing or invalid duration in vCon: %s",
+                index, vCon.uuid)
+            continue
+
+        # Skip short recordings
+        if duration < opts["minimum_duration"]:
+            logger.info("Skipping short recording dialog %s in vCon: %s",
+                        index, vCon.uuid)
+            continue
+
+        # Skip already transcribed dialogs
+        if get_transcription(vCon, index):
+            logger.info("Dialog %s already transcribed on vCon: %s", index,
+                        vCon.uuid)
+            continue
+
+        try:
+            # Attempt transcription with timing metrics
+            start = time.time()
+            logger.debug("Transcribing dialog %s in vCon: %s", index,
+                         vCon.uuid)
+            result = transcribe_groq_whisper(dialog, opts)
+            stats_gauge("conserver.link.groq_whisper.transcription_time",
+                        time.time() - start)
+        except RetryError as re:
+            logger.error(
+                "Failed to transcribe vCon %s after multiple retry attempts: %s",
+                vcon_uuid, re)
+            stats_count("conserver.link.groq_whisper.transcription_failures")
+            break
+        except Exception as e:
+            logger.error(
+                "Unexpected error transcribing vCon %s: %s",
+                vcon_uuid, e)
+            stats_count("conserver.link.groq_whisper.transcription_failures")
+            break
+
+        if not result:
+            logger.warning("No transcription generated for vCon %s", vcon_uuid)
+            stats_count(
+                "conserver.link.groq_whisper.transcription_failures")
+            break
+
+        logger.info("Transcribed vCon: %s", vCon.uuid)
+        # NOTE(review): this writes the full transcription response to the
+        # info log; confirm that is acceptable for the recordings processed.
+        logger.info(result)
+
+        # Prepare vendor schema without sensitive data
+        vendor_schema = {
+            "opts": {
+                k: v
+                for k, v in opts.items() if k != "API_KEY"
+            }
+        }
+
+        # Add transcription analysis to vCon
+        vCon.add_analysis(
+            type="transcript",
+            dialog=index,
+            vendor="groq_whisper",
+            body=_transcription_to_dict(result),
+            extra={
+                "vendor_schema": vendor_schema,
+            },
+        )
+
+    # Store updated vCon
+    vcon_redis.store_vcon(vCon)
+
+    logger.info("Finished groq_whisper plugin for vCon: %s", vcon_uuid)
+    return vcon_uuid
diff --git a/server/links/groq_whisper/test_groq_whisper.py b/server/links/groq_whisper/test_groq_whisper.py