Skip to content

Commit 3211aa9

Browse files
OriNachum and claude
authored
fix: add content_index to streamed text deltas and improve stream reliability (#52)
* fix: add content_index to streamed text delta events and improve stream reliability (#44) Add the missing `content_index` field to `response.output_text.delta` SSE events so clients that validate against the OpenAI Responses API spec (e.g. ChatKit SDK) no longer fail with a Pydantic validation error. Also introduces SSE heartbeat keepalives, configurable stream timeout, and structured stream timing logs to improve reliability with slow backends. Closes #44 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: address PR review - heartbeat task cleanup and trailing whitespace - Add try/finally to _with_heartbeat() to cancel in-flight tasks and close the underlying async iterator on cancellation/exit - Guard against interval <= 0 to prevent tight heartbeat loops - Fix trailing whitespace and missing newline in config.py Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: resolve SonarCloud quality gate failures in _with_heartbeat - S7497: Re-raise asyncio.CancelledError after cleanup instead of swallowing it - B110: Replace bare except/pass with logger.debug for aclose errors - S5806: Rename `aiter` to `inner` to avoid shadowing the builtin - S3776: Extract cleanup logic to _cleanup_heartbeat() to reduce cognitive complexity below threshold Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3a62e62 commit 3211aa9

File tree

9 files changed

+177
-19
lines changed

9 files changed

+177
-19
lines changed

.env.example

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ OPENAI_API_KEY=sk-mockapikey123456789abcdefghijklmnopqrstuvwxyz
77
API_ADAPTER_HOST=0.0.0.0
88
API_ADAPTER_PORT=8080
99

10+
# Streaming Configuration
11+
STREAM_TIMEOUT=120.0
12+
HEARTBEAT_INTERVAL=15.0
13+
1014
# Logging Configuration (optional)
1115
LOG_LEVEL=INFO
1216
LOG_FILE_PATH=./log/api_adapter.log

src/open_responses_server/api_controller.py

Lines changed: 61 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,61 @@
11
import json
2+
import asyncio
23
from fastapi import FastAPI, Request, HTTPException
34
from fastapi.responses import StreamingResponse, Response
45
from fastapi.middleware.cors import CORSMiddleware
56

6-
from open_responses_server.common.config import logger
7+
from open_responses_server.common.config import logger, HEARTBEAT_INTERVAL, STREAM_TIMEOUT
78
from open_responses_server.common.llm_client import startup_llm_client, shutdown_llm_client, LLMClient
89
from open_responses_server.common.mcp_manager import mcp_manager
910
from open_responses_server.responses_service import convert_responses_to_chat_completions, process_chat_completions_stream
1011
from open_responses_server.chat_completions_service import handle_chat_completions
1112

13+
_HEARTBEAT = object()
14+
15+
16+
async def _with_heartbeat(async_gen, interval):
17+
"""Wrap an async generator to yield _HEARTBEAT sentinels during idle periods.
18+
19+
Uses asyncio.wait with timeout so the underlying task is never cancelled.
20+
This keeps SSE connections alive when the backend LLM is slow to respond.
21+
"""
22+
if not interval or interval <= 0:
23+
interval = 1.0
24+
25+
inner = async_gen.__aiter__()
26+
task = None
27+
try:
28+
while True:
29+
task = asyncio.ensure_future(inner.__anext__())
30+
while not task.done():
31+
done, _ = await asyncio.wait({task}, timeout=interval)
32+
if not done:
33+
yield _HEARTBEAT
34+
try:
35+
yield task.result()
36+
except StopAsyncIteration:
37+
return
38+
finally:
39+
task = None
40+
finally:
41+
await _cleanup_heartbeat(task, inner)
42+
43+
44+
async def _cleanup_heartbeat(task, inner):
45+
"""Cancel in-flight task and close the underlying async iterator."""
46+
if task is not None and not task.done():
47+
task.cancel()
48+
try:
49+
await task
50+
except asyncio.CancelledError:
51+
raise
52+
if hasattr(inner, "aclose"):
53+
try:
54+
await inner.aclose()
55+
except Exception:
56+
logger.debug("Error closing heartbeat inner iterator", exc_info=True)
57+
58+
1259
app = FastAPI(
1360
title="Open Responses Server",
1461
description="A proxy server that converts between different OpenAI-compatible API formats.",
@@ -249,7 +296,7 @@ async def stream_response():
249296
"POST",
250297
"/v1/chat/completions",
251298
json=chat_request,
252-
timeout=120.0
299+
timeout=STREAM_TIMEOUT
253300
) as response:
254301
logger.info(f"Stream request status: {response.status_code}")
255302

@@ -259,8 +306,15 @@ async def stream_response():
259306
yield f"data: {json.dumps({'type': 'error', 'error': {'message': f'Error from LLM API: {response.status_code}'}})}\n\n"
260307
return
261308

262-
async for event in process_chat_completions_stream(response, chat_request):
263-
yield event
309+
async for event in _with_heartbeat(
310+
process_chat_completions_stream(response, chat_request),
311+
HEARTBEAT_INTERVAL
312+
):
313+
if event is _HEARTBEAT:
314+
logger.debug("[STREAM-HEARTBEAT] Sending SSE keepalive")
315+
yield ": heartbeat\n\n"
316+
else:
317+
yield event
264318
except Exception as e:
265319
logger.error(f"Error in stream_response: {str(e)}")
266320
yield f"data: {json.dumps({'type': 'error', 'error': {'message': str(e)}})}\n\n"
@@ -346,7 +400,7 @@ async def stream_response():
346400

347401
# async def stream_response():
348402
# try:
349-
# async with client.stream("POST", "/v1/chat/completions", json=chat_request, timeout=120.0) as response:
403+
# async with client.stream("POST", "/v1/chat/completions", json=chat_request, timeout=STREAM_TIMEOUT) as response:
350404
# if response.status_code != 200:
351405
# error_content = await response.aread()
352406
# logger.error(f"Error from LLM API: {error_content.decode()}")
@@ -411,12 +465,12 @@ async def proxy_endpoint(request: Request, path_name: str):
411465

412466
if is_stream:
413467
async def stream_proxy():
414-
async with client.stream(request.method, url, headers=headers, content=body, timeout=120.0) as response:
468+
async with client.stream(request.method, url, headers=headers, content=body, timeout=STREAM_TIMEOUT) as response:
415469
async for chunk in response.aiter_bytes():
416470
yield chunk
417471
return StreamingResponse(stream_proxy(), media_type=request.headers.get('accept', 'application/json'))
418472
else:
419-
response = await client.request(request.method, url, headers=headers, content=body, timeout=120.0)
473+
response = await client.request(request.method, url, headers=headers, content=body, timeout=STREAM_TIMEOUT)
420474
return Response(content=response.content, status_code=response.status_code, headers=response.headers)
421475

422476
except Exception as e:

src/open_responses_server/chat_completions_service.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from fastapi import Request
33
from fastapi.responses import StreamingResponse, Response, JSONResponse
44
from open_responses_server.common.llm_client import LLMClient
5-
from open_responses_server.common.config import logger, OPENAI_BASE_URL_INTERNAL, OPENAI_API_KEY, MAX_TOOL_CALL_ITERATIONS
5+
from open_responses_server.common.config import logger, OPENAI_BASE_URL_INTERNAL, OPENAI_API_KEY, MAX_TOOL_CALL_ITERATIONS, STREAM_TIMEOUT
66
from open_responses_server.common.mcp_manager import mcp_manager, serialize_tool_result
77

88
async def _handle_non_streaming_request(client: LLMClient, request_data: dict):
@@ -25,7 +25,7 @@ async def _handle_non_streaming_request(client: LLMClient, request_data: dict):
2525
response = await client.post(
2626
"/v1/chat/completions",
2727
json=current_request_data,
28-
timeout=120.0
28+
timeout=STREAM_TIMEOUT
2929
)
3030
response.raise_for_status()
3131
response_data = response.json()
@@ -102,7 +102,7 @@ async def _handle_streaming_request(client: LLMClient, request_data: dict) -> St
102102
for _ in range(MAX_TOOL_CALL_ITERATIONS):
103103
try:
104104
# Make a non-streaming request first to check for tool calls
105-
response = await client.post("/v1/chat/completions", json={**non_stream_request_data, "messages": messages}, timeout=120.0)
105+
response = await client.post("/v1/chat/completions", json={**non_stream_request_data, "messages": messages}, timeout=STREAM_TIMEOUT)
106106
response.raise_for_status()
107107
response_data = response.json()
108108

@@ -170,7 +170,7 @@ async def stream_proxy():
170170
"POST",
171171
"/v1/chat/completions",
172172
json=stream_request_data,
173-
timeout=120.0
173+
timeout=STREAM_TIMEOUT
174174
) as stream_response:
175175
async for chunk in stream_response.aiter_bytes():
176176
yield chunk

src/open_responses_server/common/config.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
MAX_CONVERSATION_HISTORY = int(os.environ.get("MAX_CONVERSATION_HISTORY", "100"))
2323
MAX_TOOL_CALL_ITERATIONS = int(os.environ.get("MAX_TOOL_CALL_ITERATIONS", "25"))
2424

25+
# Streaming Configuration
26+
STREAM_TIMEOUT = float(os.environ.get("STREAM_TIMEOUT", "120.0"))
27+
HEARTBEAT_INTERVAL = float(os.environ.get("HEARTBEAT_INTERVAL", "15.0"))
28+
2529

2630
# --- Logging Configuration ---
2731

@@ -54,4 +58,6 @@ def setup_logging():
5458
logger.info(f" MCP_TOOL_REFRESH_INTERVAL: {MCP_TOOL_REFRESH_INTERVAL}")
5559
logger.info(f" MCP_SERVERS_CONFIG_PATH: {MCP_SERVERS_CONFIG_PATH}")
5660
logger.info(f" MAX_CONVERSATION_HISTORY: {MAX_CONVERSATION_HISTORY}")
57-
logger.info(f" MAX_TOOL_CALL_ITERATIONS: {MAX_TOOL_CALL_ITERATIONS}")
61+
logger.info(f" MAX_TOOL_CALL_ITERATIONS: {MAX_TOOL_CALL_ITERATIONS}")
62+
logger.info(f" STREAM_TIMEOUT: {STREAM_TIMEOUT}")
63+
logger.info(f" HEARTBEAT_INTERVAL: {HEARTBEAT_INTERVAL}")

src/open_responses_server/common/llm_client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import httpx
2-
from .config import OPENAI_BASE_URL_INTERNAL, OPENAI_API_KEY, logger
2+
from .config import OPENAI_BASE_URL_INTERNAL, OPENAI_API_KEY, STREAM_TIMEOUT, logger
33

44
class LLMClient:
55
"""
@@ -18,7 +18,7 @@ async def get_client(cls) -> httpx.AsyncClient:
1818
cls._client = httpx.AsyncClient(
1919
base_url=OPENAI_BASE_URL_INTERNAL,
2020
headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
21-
timeout=httpx.Timeout(120.0)
21+
timeout=httpx.Timeout(STREAM_TIMEOUT)
2222
)
2323
return cls._client
2424

src/open_responses_server/models/responses_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ class OutputTextDelta(BaseModel):
9191
type: str = "response.output_text.delta"
9292
item_id: str
9393
output_index: int
94+
content_index: int
9495
delta: str
9596

9697
class ResponseCreated(BaseModel):

src/open_responses_server/responses_service.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,9 @@ async def process_chat_completions_stream(response, chat_request=None):
324324
tool_call_counter = 0
325325
message_id = f"msg_{uuid.uuid4().hex}"
326326
output_text_content = "" # Track the full text content for logging
327-
logger.info(f"Processing streaming response from chat.completions API response_id {response_id}; message_id {message_id}")
327+
request_start_time = time.time()
328+
last_chunk_time = request_start_time
329+
logger.info(f"[STREAM-START] response_id={response_id} message_id={message_id}")
328330

329331
# Create and yield the initial response.created event
330332
response_obj = ResponseModel(
@@ -354,12 +356,25 @@ async def process_chat_completions_stream(response, chat_request=None):
354356
try:
355357
async for chunk in response.aiter_lines():
356358
chunk_counter += 1
359+
now = time.time()
360+
chunk_gap = now - last_chunk_time
361+
last_chunk_time = now
362+
if chunk_gap > 2.0:
363+
logger.info(
364+
f"[STREAM-TIMING] response_id={response_id} "
365+
f"chunk_gap={chunk_gap:.1f}s chunk={chunk_counter}"
366+
)
357367
if not chunk.strip():
358368
continue
359369

360370
# Handle [DONE] message
361371
if chunk.strip() == "data: [DONE]" or chunk.strip() == "[DONE]":
362-
logger.info(f"Received [DONE] message after {chunk_counter} chunks (status: {response_obj.status})")
372+
total_time = time.time() - request_start_time
373+
logger.info(
374+
f"[STREAM-DONE] response_id={response_id} "
375+
f"chunks={chunk_counter} total_time={total_time:.1f}s "
376+
f"status={response_obj.status}"
377+
)
363378

364379
# If we haven't already completed the response, do it now
365380
if response_obj.status != "completed":
@@ -544,6 +559,7 @@ async def process_chat_completions_stream(response, chat_request=None):
544559
type="response.output_text.delta",
545560
item_id=message_id,
546561
output_index=0,
562+
content_index=0,
547563
delta=content_delta
548564
)
549565

@@ -595,6 +611,7 @@ async def process_chat_completions_stream(response, chat_request=None):
595611
type="response.output_text.delta",
596612
item_id=tool_call["id"],
597613
output_index=0,
614+
content_index=0,
598615
delta=text
599616
)
600617
yield f"data: {json.dumps(text_event.dict())}\n\n"
@@ -725,10 +742,11 @@ async def process_chat_completions_stream(response, chat_request=None):
725742
type="response.output_text.delta",
726743
item_id=tool_call["id"],
727744
output_index=0,
745+
content_index=0,
728746
delta=text
729747
)
730748
yield f"data: {json.dumps(text_event.dict())}\n\n"
731-
749+
732750
logger.info(f"[TOOL-CALLS-FINISH] Added function_call_output for MCP tool '{tool_call['function']['name']}'")
733751

734752
else:
@@ -885,7 +903,12 @@ async def process_chat_completions_stream(response, chat_request=None):
885903
continue
886904

887905
except Exception as e:
888-
logger.error(f"Error processing streaming response: {str(e)}")
906+
total_time = time.time() - request_start_time
907+
logger.error(
908+
f"[STREAM-ERROR] response_id={response_id} "
909+
f"error={str(e)} total_time={total_time:.1f}s "
910+
f"chunks={chunk_counter}"
911+
)
889912
# Emit a completion event if we haven't already
890913
if response_obj.status != "completed":
891914
response_obj.status = "completed"

tests/test_api_controller_endpoints.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
"""
22
Tests for api_controller.py endpoints.
33
"""
4+
import asyncio
45
import json
56
import pytest
67
from unittest.mock import patch, MagicMock, AsyncMock
78
from fastapi.testclient import TestClient
89
from fastapi.responses import StreamingResponse
910

10-
from open_responses_server.api_controller import app
11+
from open_responses_server.api_controller import app, _with_heartbeat, _HEARTBEAT
1112

1213

1314
class TestResponsesEndpoint:
@@ -274,3 +275,66 @@ def test_proxy_invalid_json_body(self, client, mock_llm_client_fixture):
274275
headers={"content-type": "text/plain"},
275276
)
276277
assert response.status_code == 200
278+
279+
280+
@pytest.mark.asyncio
281+
class TestWithHeartbeat:
282+
"""Tests for the _with_heartbeat async generator wrapper."""
283+
284+
async def test_fast_generator_no_heartbeats(self):
285+
"""Fast generators produce no heartbeat sentinels."""
286+
async def fast_gen():
287+
yield "a"
288+
yield "b"
289+
yield "c"
290+
291+
results = [item async for item in _with_heartbeat(fast_gen(), interval=10.0)]
292+
assert results == ["a", "b", "c"]
293+
assert _HEARTBEAT not in results
294+
295+
async def test_slow_generator_emits_heartbeats(self):
296+
"""Slow generators trigger heartbeat sentinels between items."""
297+
async def slow_gen():
298+
yield "first"
299+
await asyncio.sleep(0.6)
300+
yield "second"
301+
302+
results = [item async for item in _with_heartbeat(slow_gen(), interval=0.2)]
303+
# Should have at least one heartbeat between "first" and "second"
304+
heartbeats = [r for r in results if r is _HEARTBEAT]
305+
data = [r for r in results if r is not _HEARTBEAT]
306+
assert len(heartbeats) >= 1
307+
assert data == ["first", "second"]
308+
309+
async def test_empty_generator(self):
310+
"""Empty generator produces no output."""
311+
async def empty_gen():
312+
return
313+
yield # noqa: unreachable - makes this an async generator
314+
315+
results = [item async for item in _with_heartbeat(empty_gen(), interval=1.0)]
316+
assert results == []
317+
318+
async def test_generator_exception_propagates(self):
319+
"""Exceptions from the wrapped generator propagate through."""
320+
async def error_gen():
321+
yield "ok"
322+
raise ValueError("test error")
323+
324+
results = []
325+
with pytest.raises(ValueError, match="test error"):
326+
async for item in _with_heartbeat(error_gen(), interval=1.0):
327+
results.append(item)
328+
assert results == ["ok"]
329+
330+
async def test_heartbeat_count_scales_with_delay(self):
331+
"""Longer delays produce more heartbeats."""
332+
async def very_slow_gen():
333+
yield "start"
334+
await asyncio.sleep(1.0)
335+
yield "end"
336+
337+
results = [item async for item in _with_heartbeat(very_slow_gen(), interval=0.2)]
338+
heartbeats = [r for r in results if r is _HEARTBEAT]
339+
# ~1.0s delay / 0.2s interval = ~5 heartbeats (allow some variance)
340+
assert len(heartbeats) >= 3

0 commit comments

Comments
 (0)