Skip to content

Commit ebe214f

Browse files
committed
fix: resolve streaming response corruption on HTTP keep-alive connections (#80)
Remove custom StreamingResponse that created duplicate ASGI receive() consumers, causing TransferEncodingError on second request. Replace BaseHTTPMiddleware with pure ASGI middleware to avoid streaming response pipe layer interference. Fix MockBaseEngine.count_chat_tokens signature.
1 parent cc12b0c commit ebe214f

File tree

3 files changed

+49
-55
lines changed

3 files changed

+49
-55
lines changed

omlx/server.py

Lines changed: 47 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
from fastapi import Depends, FastAPI, HTTPException, Request as FastAPIRequest
5656
from fastapi.middleware.cors import CORSMiddleware
5757
from fastapi.exceptions import RequestValidationError
58-
from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse as _BaseStreamingResponse
58+
from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
5959
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
6060

6161
from omlx._version import __version__
@@ -143,50 +143,6 @@
143143
logger = logging.getLogger(__name__)
144144

145145

146-
class StreamingResponse(_BaseStreamingResponse):
147-
"""StreamingResponse that aborts generation when client disconnects.
148-
149-
Monitors the ASGI receive channel for http.disconnect and closes
150-
the body iterator, propagating GeneratorExit through the engine's
151-
stream_generate which calls abort_request().
152-
"""
153-
154-
async def __call__(self, scope, receive, send):
155-
disconnected = asyncio.Event()
156-
157-
async def _monitor_disconnect():
158-
while True:
159-
message = await receive()
160-
if message.get("type") == "http.disconnect":
161-
disconnected.set()
162-
return
163-
164-
monitor_task = asyncio.create_task(_monitor_disconnect())
165-
166-
inner = self.body_iterator
167-
168-
async def _disconnect_aware():
169-
try:
170-
async for chunk in inner:
171-
if disconnected.is_set():
172-
logger.info("Client disconnected, stopping stream")
173-
return
174-
yield chunk
175-
finally:
176-
if hasattr(inner, "aclose"):
177-
await inner.aclose()
178-
179-
self.body_iterator = _disconnect_aware()
180-
try:
181-
await super().__call__(scope, receive, send)
182-
finally:
183-
monitor_task.cancel()
184-
try:
185-
await monitor_task
186-
except asyncio.CancelledError:
187-
pass
188-
189-
190146
# Security bearer for API key authentication
191147
security = HTTPBearer(auto_error=False)
192148

@@ -434,19 +390,57 @@ async def unhandled_exception_handler(request: FastAPIRequest, exc: Exception):
434390
)
435391

436392

437-
@app.middleware("http")
438-
async def debug_request_logging(request: FastAPIRequest, call_next):
439-
"""Log full request body for POST requests when debug logging is enabled."""
440-
if logger.isEnabledFor(5) and request.method == "POST":
441-
body = await request.body()
393+
class DebugRequestLoggingMiddleware:
394+
"""Pure ASGI middleware for trace-level request body logging.
395+
396+
Uses raw ASGI protocol instead of BaseHTTPMiddleware to avoid
397+
wrapping StreamingResponse in an intermediate pipe layer, which
398+
causes connection corruption on HTTP keep-alive connections.
399+
"""
400+
401+
def __init__(self, app):
402+
self.app = app
403+
404+
async def __call__(self, scope, receive, send):
405+
if (
406+
scope["type"] != "http"
407+
or not logger.isEnabledFor(5)
408+
or scope.get("method") != "POST"
409+
):
410+
await self.app(scope, receive, send)
411+
return
412+
413+
# Read and cache the request body for logging
414+
body_parts = []
415+
while True:
416+
message = await receive()
417+
body_parts.append(message)
418+
if not message.get("more_body", False):
419+
break
420+
421+
body = b"".join(part.get("body", b"") for part in body_parts)
442422
logger.log(
443423
5,
444424
"Incoming %s %s — body: %s",
445-
request.method, request.url.path,
425+
scope["method"],
426+
scope["path"],
446427
body.decode("utf-8", errors="replace"),
447428
)
448-
response = await call_next(request)
449-
return response
429+
430+
# Replay cached body for inner app, then forward real receive
431+
body_sent = False
432+
433+
async def cached_receive():
434+
nonlocal body_sent
435+
if not body_sent:
436+
body_sent = True
437+
return {"type": "http.request", "body": body, "more_body": False}
438+
return await receive()
439+
440+
await self.app(scope, cached_receive, send)
441+
442+
443+
app.add_middleware(DebugRequestLoggingMiddleware)
450444

451445

452446
# =============================================================================

tests/integration/test_e2e_streaming.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ async def stream_generate(self, prompt: str, **kwargs) -> AsyncIterator[MockGene
104104
finish_reason="stop",
105105
)
106106

107-
def count_chat_tokens(self, messages: List[Dict], tools=None) -> int:
107+
def count_chat_tokens(self, messages: List[Dict], tools=None, chat_template_kwargs=None) -> int:
108108
prompt = self._tokenizer.apply_chat_template(messages, tokenize=False)
109109
return len(self._tokenizer.encode(prompt))
110110

tests/integration/test_server_endpoints.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ async def stream_generate(self, prompt: str, **kwargs):
177177
finish_reason="stop",
178178
)
179179

180-
def count_chat_tokens(self, messages: List[Dict], tools=None) -> int:
180+
def count_chat_tokens(self, messages: List[Dict], tools=None, chat_template_kwargs=None) -> int:
181181
prompt = self._tokenizer.apply_chat_template(messages, tokenize=False)
182182
return len(self._tokenizer.encode(prompt))
183183

0 commit comments

Comments (0)