
Commit 9f98e5a (parent 4aa1a8b)

refactor: Migration from httpx to aiohttp for improved concurrency

Replaced httpx with aiohttp for better asynchronous performance and resource utilization. Fixed a JSON syntax error in error response handling.

4 files changed: +129 −19 lines

pyproject.toml (0 additions, 1 deletion)

@@ -26,7 +26,6 @@ dependencies = [
     "xxhash==3.5.0",
     "psutil==7.0.0",
     "pyyaml>=6.0.2",
-    "httpx==0.28.1",
 ]

 [project.scripts]

src/vllm_router/requirements.txt (0 additions, 1 deletion)

@@ -1,7 +1,6 @@
 aiofiles==24.1.0
 aiohttp==3.9.5
 fastapi==0.115.8
-httpx==0.28.1
 kubernetes==32.0.0
 numpy==1.26.4
 prometheus_client==0.21.1

src/vllm_router/services/request_service/request.py (30 additions, 17 deletions)

@@ -20,7 +20,6 @@
 from typing import Optional

 import aiohttp
-import httpx
 from fastapi import BackgroundTasks, HTTPException, Request, UploadFile
 from fastapi.responses import JSONResponse, StreamingResponse
 from requests import JSONDecodeError

@@ -627,14 +626,26 @@ async def route_general_transcriptions(
     logger.debug("==== data payload keys ====")

     try:
-        async with request.app.state.httpx_client_wrapper() as client:
-            backend_response = await client.post(
-                f"{chosen_url}{endpoint}", data=data, files=files, timeout=300.0
-            )
-            backend_response.raise_for_status()
+        client = request.app.state.aiohttp_client_wrapper()
+
+        form_data = aiohttp.FormData()
+
+        # Add the file parts
+        for key, (filename, content, content_type) in files.items():
+            form_data.add_field(key, content, filename=filename, content_type=content_type)
+
+        # Add the plain form fields
+        for key, value in data.items():
+            form_data.add_field(key, value)
+
+        backend_response = await client.post(
+            f"{chosen_url}{endpoint}",
+            data=form_data,
+            timeout=aiohttp.ClientTimeout(total=300)
+        )

         # --- 4. Return the response ---
-        response_content = backend_response.json()
+        response_content = await backend_response.json()
         headers = {
             k: v
             for k, v in backend_response.headers.items()

@@ -645,17 +656,19 @@ async def route_general_transcriptions(

         return JSONResponse(
             content=response_content,
-            status_code=backend_response.status_code,
+            status_code=backend_response.status,
             headers=headers,
         )
-    except httpx.HTTPStatusError as e:
-        error_content = (
-            e.response.json()
-            if "json" in e.response.headers.get("content-type", "")
-            else e.response.text
-        )
-        return JSONResponse(status_code=e.response.status_code, content=error_content)
-    except httpx.RequestError as e:
+    except aiohttp.ClientResponseError as e:
+        if hasattr(e, "response") and e.response is not None:
+            try:
+                error_content = await e.response.json()
+            except Exception:
+                error_content = await e.response.text()
+        else:
+            error_content = {"error": f"HTTP {e.status}: {e.message}"}
+        return JSONResponse(status_code=e.status, content=error_content)
+    except aiohttp.ClientError as e:
         return JSONResponse(
-            status_code=503, content={"error": f"Failed to connect to backend: {e}"}
+            status_code=503, content={"error": f"Failed to connect to backend: {str(e)}"}
         )
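
The diff above obtains a shared client from `request.app.state.aiohttp_client_wrapper()`. A minimal sketch of what such a wrapper can look like, assuming a single `aiohttp.ClientSession` created on FastAPI startup and closed on shutdown (the actual wrapper in `vllm_router` may be implemented differently):

```python
# Sketch only: one way to share an aiohttp.ClientSession across requests;
# the real vllm_router wrapper may differ.
import aiohttp


class AiohttpClientWrapper:
    """Holds one ClientSession for the app's whole lifetime."""

    def __init__(self) -> None:
        self._session: aiohttp.ClientSession | None = None

    async def start(self) -> None:
        # Create the session inside the running event loop
        # (e.g., in a FastAPI startup handler).
        self._session = aiohttp.ClientSession()

    async def stop(self) -> None:
        # Close pooled connections cleanly on shutdown.
        if self._session is not None:
            await self._session.close()

    def __call__(self) -> aiohttp.ClientSession:
        assert self._session is not None, "wrapper used before startup"
        return self._session
```

Reusing one session is what buys the concurrency win over per-request clients: connections to each backend stay pooled and TCP/TLS setup is amortized across requests.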
Lines changed: 99 additions & 0 deletions (new file)

# Tutorial: Whisper Transcription API in vLLM Production Stack

## Overview

This tutorial introduces the newly added `/v1/audio/transcriptions` endpoint in the `vllm-router`, which lets users transcribe `.wav` audio files using OpenAI's `whisper-small` model.

## Prerequisites

* Access to a machine with a GPU (e.g., via [RunPod](https://runpod.io/))
* A Python 3.12 environment (`uv` is recommended)
* `vllm` and `production-stack` cloned and installed
* `vllm` installed with audio support:

```bash
pip install "vllm[audio]"
```

## 1. Serving the Whisper Model

Start a vLLM backend with the `whisper-small` model:

```bash
vllm serve \
  --task transcription openai/whisper-small \
  --host 0.0.0.0 --port 8002
```
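
Before wiring up the router, it can help to confirm the backend is serving the model. A quick check against vLLM's OpenAI-compatible model listing endpoint (the port and expected output assume the serve command above; `requests` is not installed by this tutorial):

```python
# Sanity check for the backend started in step 1.
import requests

resp = requests.get("http://localhost:8002/v1/models", timeout=10)
resp.raise_for_status()
print([m["id"] for m in resp.json()["data"]])
# Expected, given the command above: ['openai/whisper-small']
```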
## 2. Running the Router

Create and run a router connected to the Whisper backend:

```bash
#!/bin/bash
if [[ $# -ne 2 ]]; then
  echo "Usage: $0 <router_port> <backend_url>"
  exit 1
fi

uv run python3 -m vllm_router.app \
  --host 0.0.0.0 --port "$1" \
  --service-discovery static \
  --static-backends "$2" \
  --static-models "openai/whisper-small" \
  --static-model-types "transcription" \
  --routing-logic roundrobin \
  --log-stats \
  --engine-stats-interval 10 \
  --request-stats-window 10
```

Example usage:

```bash
./run-router.sh 8000 http://localhost:8002
```

## 3. Sending a Transcription Request

Use `curl` to send a `.wav` file to the transcription endpoint:

* You can test with any `.wav` audio file of your choice.

```bash
curl -v http://localhost:8000/v1/audio/transcriptions \
  -F 'file=@/path/to/audio.wav;type=audio/wav' \
  -F 'model=openai/whisper-small' \
  -F 'response_format=json' \
  -F 'language=en'
```

### Supported Parameters

| Parameter | Description |
| ----------------- | ------------------------------------------------------ |
| `file` | Path to a `.wav` audio file |
| `model` | Whisper model to use (e.g., `openai/whisper-small`) |
| `prompt` | *(Optional)* Text prompt to guide the transcription |
| `response_format` | One of `json`, `text`, `srt`, `verbose_json`, or `vtt` |
| `temperature` | *(Optional)* Sampling temperature as a float |
| `language` | ISO 639-1 language code (e.g., `en`, `fr`, `zh`) |
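
For programmatic use, the same multipart request can be sent from Python. A sketch using the `requests` library (the file path is a placeholder, and `requests` is not a dependency this tutorial installs):

```python
# Python equivalent of the curl call above.
import requests

with open("/path/to/audio.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("audio.wav", f, "audio/wav")},
        data={
            "model": "openai/whisper-small",
            "response_format": "json",
            "language": "en",
        },
        timeout=300,  # transcription can be slow; mirror the router's budget
    )
resp.raise_for_status()
print(resp.json()["text"])
```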
## 4. Sample Output

```json
{
  "text": "Testing testing testing the whisper small model testing testing testing the audio transcription function testing testing testing the whisper small model"
}
```

## 5. Notes

* The router uses an extended aiohttp timeout (300 seconds total) to support long transcription jobs; see the sketch after this list.
* This implementation dynamically discovers valid transcription backends and routes requests accordingly.
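
The 300-second budget comes from `aiohttp.ClientTimeout(total=300)` in the router change above. If long audio files still hit the limit, the timeout can be tuned with finer granularity; the values below are illustrative, not the router's defaults:

```python
import aiohttp

# Illustrative values only, not the router's defaults.
timeout = aiohttp.ClientTimeout(
    total=300,     # cap on the whole request, as in the router
    connect=10,    # time allowed to establish a connection
    sock_read=60,  # max gap between response chunks
)
# Passed per request: session.post(url, data=form_data, timeout=timeout)
```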
## 6. Resources

* [PR #469 – Add Whisper Transcription API](https://github.com/vllm-project/production-stack/pull/469)
* [OpenAI Whisper GitHub](https://github.com/openai/whisper)
* [Blog: vLLM Whisper Transcription Walkthrough](https://davidgao7.github.io/posts/vllm-v1-whisper-transcription/)
