
Commit c187d26

OriNachum and claude authored

docs: align documentation with code and fix inaccuracies (#54)
* docs: align documentation with code and fix inaccuracies

  Fix critical doc/code mismatches found during audit:

  - Implement LOG_LEVEL and LOG_FILE_PATH env vars (documented but missing)
  - Return HTTP 501 for non-streaming /responses requests (was silent)
  - Fix OPENAI_BASE_URL_INTERNAL default in README (was :11434, code is :8000)
  - Fix health endpoint response in architecture doc
  - Document 5 missing env vars, 3 MCP transport types, SSE heartbeat
  - Remove stale server.py/is_mcp_tool.py refs and 83 lines of dead code
  - Rewrite cli-local.md, update CLAUDE.md, expand .env.example

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address PR review feedback

  - Use os.makedirs(exist_ok=True) to avoid race in multi-worker startup
  - Gracefully handle unwritable LOG_FILE_PATH (warn + continue with stderr)
  - Remove empty duplicate "Pydantic Models Reference" heading

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
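The review-feedback fix above (race-free log directory creation plus a graceful fallback when `LOG_FILE_PATH` is unwritable) can be sketched as follows. This is an illustrative sketch, not the repo's actual code: the `setup_logging` name and signature are assumptions.

```python
import logging
import os
import sys

def setup_logging(level="INFO", file_path=None):
    """Configure root logging; fall back to stderr if the log file is unusable."""
    handlers = [logging.StreamHandler(sys.stderr)]
    if file_path:
        try:
            # exist_ok=True avoids a race when several uvicorn workers start at once
            os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
            handlers.append(logging.FileHandler(file_path))
        except OSError as exc:
            # Warn and continue with stderr-only logging instead of crashing
            print(f"WARNING: cannot open log file {file_path}: {exc}", file=sys.stderr)
    logging.basicConfig(
        level=getattr(logging, level.upper(), logging.INFO),
        handlers=handlers,
        force=True,  # replace any handlers configured earlier
    )
```

The key design point from the review is that a bad `LOG_FILE_PATH` degrades to stderr-only logging rather than aborting startup.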
1 parent 3211aa9 commit c187d26

File tree

11 files changed: +161 −181 lines changed


.env.example

Lines changed: 10 additions & 2 deletions

@@ -7,10 +7,18 @@ OPENAI_API_KEY=sk-mockapikey123456789abcdefghijklmnopqrstuvwxyz
 API_ADAPTER_HOST=0.0.0.0
 API_ADAPTER_PORT=8080
 
+# MCP Configuration
+MCP_SERVERS_CONFIG_PATH=src/open_responses_server/servers_config.json
+MCP_TOOL_REFRESH_INTERVAL=10
+
+# Conversation and Tool Handling
+MAX_CONVERSATION_HISTORY=100
+MAX_TOOL_CALL_ITERATIONS=25
+
 # Streaming Configuration
 STREAM_TIMEOUT=120.0
 HEARTBEAT_INTERVAL=15.0
 
-# Logging Configuration (optional)
+# Logging Configuration
 LOG_LEVEL=INFO
-LOG_FILE_PATH=./log/api_adapter.log
+LOG_FILE_PATH=./log/api_adapter.log
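A plausible reading of how `common/config.py` consumes the variables above. Names and defaults are taken from this `.env.example`; the repo's actual parsing may differ, so treat this as a sketch.

```python
import os

# Defaults mirror .env.example above; the real common/config.py may differ.
MCP_SERVERS_CONFIG_PATH = os.getenv(
    "MCP_SERVERS_CONFIG_PATH", "src/open_responses_server/servers_config.json"
)
MCP_TOOL_REFRESH_INTERVAL = int(os.getenv("MCP_TOOL_REFRESH_INTERVAL", "10"))
MAX_CONVERSATION_HISTORY = int(os.getenv("MAX_CONVERSATION_HISTORY", "100"))
MAX_TOOL_CALL_ITERATIONS = int(os.getenv("MAX_TOOL_CALL_ITERATIONS", "25"))
STREAM_TIMEOUT = float(os.getenv("STREAM_TIMEOUT", "120.0"))
HEARTBEAT_INTERVAL = float(os.getenv("HEARTBEAT_INTERVAL", "15.0"))
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
LOG_FILE_PATH = os.getenv("LOG_FILE_PATH", "./log/api_adapter.log")
```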

CLAUDE.md

Lines changed: 5 additions & 3 deletions

@@ -73,7 +73,7 @@ api_controller.py -- FastAPI app with route definitions, CORS, startup/shutdown
 - `api_controller.py` - Route definitions. `server_entrypoint.py` is the uvicorn entry point that imports from `api_controller`.
 - `responses_service.py` - Converts Responses API requests to Chat Completions format (`convert_responses_to_chat_completions`), processes streaming Chat Completions responses back into Responses API SSE events (`process_chat_completions_stream`). Maintains in-memory `conversation_history` keyed by `previous_response_id`.
 - `chat_completions_service.py` - Handles `/v1/chat/completions` with MCP tool injection. Implements a tool-call loop (up to `MAX_TOOL_CALL_ITERATIONS`) for both streaming and non-streaming modes.
-- `common/mcp_manager.py` - `MCPManager` singleton manages MCP server lifecycle (stdio-based), tool discovery/caching with periodic refresh, and tool execution. `MCPServer` wraps individual server sessions.
+- `common/mcp_manager.py` - `MCPManager` singleton manages MCP server lifecycle (stdio, sse, streamable-http transports), tool discovery/caching with periodic refresh, and tool execution. `MCPServer` wraps individual server sessions.
 - `common/llm_client.py` - `LLMClient` singleton wrapping `httpx.AsyncClient`, pointed at `OPENAI_BASE_URL_INTERNAL`.
 - `common/config.py` - All configuration via environment variables (loaded from `.env` via python-dotenv). Key vars: `OPENAI_BASE_URL_INTERNAL`, `OPENAI_API_KEY`, `MCP_SERVERS_CONFIG_PATH`, `MAX_TOOL_CALL_ITERATIONS`.
 - `models/responses_models.py` - Pydantic models for Responses API request/response/streaming types.
@@ -82,7 +82,9 @@ api_controller.py -- FastAPI app with route definitions, CORS, startup/shutdown
 
 ## Configuration
 
-All config is via environment variables (see `common/config.py`). The CLI command `otc configure` writes a `.env` file interactively. MCP servers are configured in a JSON file pointed to by `MCP_SERVERS_CONFIG_PATH` (default: `src/open_responses_server/servers_config.json`).
+All config is via environment variables (see `common/config.py`). The CLI command `otc configure` writes a `.env` file interactively. MCP servers are configured in a JSON file pointed to by `MCP_SERVERS_CONFIG_PATH` (default: `src/open_responses_server/servers_config.json`). Note: this default path assumes running from the repo root; when installed via pip, set it to an absolute path.
+
+**Important:** The `/responses` endpoint only supports streaming (`stream=True`). Non-streaming requests return HTTP 501.
 
 ## Version & Releasing
 
@@ -96,7 +98,7 @@ Version lives in `src/open_responses_server/version.py` as `__version__` — the
 
 ## CLI Entry Point
 
-The `otc` command is defined in `pyproject.toml` pointing to `open_responses_server.cli:main`. Commands: `start`, `configure`, `help`.
+The `otc` command is defined in `pyproject.toml` pointing to `open_responses_server.cli:main`. Commands: `start`, `configure`, `help`. Also supports `--version` flag.
 
 ## PR Workflow
 

README.md

Lines changed: 16 additions & 4 deletions

@@ -81,6 +81,7 @@ docker run -p 8080:8080 \
   ghcr.io/teabranch/open-responses-server:latest
 ```
 
+Docker images are available for linux/amd64, linux/arm64, and linux/arm/v7 architectures.
 Works great with docker-compose.yaml for Codex + your own model.
 
@@ -90,7 +91,7 @@ Works great with docker-compose.yaml for Codex + your own model.
 Minimal config to connect your AI backend:
 
 ```
-OPENAI_BASE_URL_INTERNAL=http://localhost:11434  # Ollama, vLLM, Groq, etc.
+OPENAI_BASE_URL_INTERNAL=http://localhost:8000   # Your LLM backend (Ollama typically on :11434, vLLM on :8000)
 OPENAI_BASE_URL=http://localhost:8080            # This server's endpoint
 OPENAI_API_KEY=sk-mockapikey123456789            # Mock key tunneled to backend
 MCP_SERVERS_CONFIG_PATH=./mcps.json              # Path to mcps servers json file
@@ -101,10 +102,21 @@ Server binding:
 API_ADAPTER_HOST=0.0.0.0
 API_ADAPTER_PORT=8080
 ```
-Optional logging:
+Streaming and connection:
 ```
-LOG_LEVEL=INFO
-LOG_FILE_PATH=./log/api_adapter.log
+STREAM_TIMEOUT=120.0           # HTTP timeout (seconds) for streaming requests
+HEARTBEAT_INTERVAL=15.0        # SSE keepalive interval (seconds)
+```
+Conversation and tool handling:
+```
+MAX_CONVERSATION_HISTORY=100   # Max stored conversation entries
+MAX_TOOL_CALL_ITERATIONS=25    # Max tool-call loop iterations
+MCP_TOOL_REFRESH_INTERVAL=10   # Seconds between MCP tool cache refreshes
+```
+Logging:
+```
+LOG_LEVEL=INFO                 # DEBUG, INFO, WARNING, ERROR, CRITICAL
+LOG_FILE_PATH=./log/api_adapter.log  # Path to log file
 ```
 
 Configure with CLI tool:

docs/cli-local.md

Lines changed: 52 additions & 44 deletions

@@ -3,47 +3,55 @@ title: CLI Usage
 nav_order: 5
 ---
 
-# CLI Usage
-
-To run the `cli.py` script and use it to manage the `server.py`, follow these steps:
-
-1. **Install uv and dependences**
-   Assumed you installed dependencies already.
-
-2. **Run the CLI Script**:
-   You can execute the `cli.py` script directly using Python. For example:
-   ```bash
-   uv run src/open_responses_server/cli.py <command>
-   ```
-   Replace `<command>` with one of the available commands (`start`, `configure`, or `help`).
-
-3. **Available Commands**:
-   - `start`: Starts the FastAPI server defined in `server.py`.
-   - `configure`: Allows you to configure server settings like host, port, API URLs, and API key.
-   - `help`: Displays help information about the CLI.
-
-4. **Example Usage**:
-   - To start the server:
-     ```bash
-     python src/open_responses_server/cli.py start
-     ```
-   - To configure the server:
-     ```bash
-     python src/open_responses_server/cli.py configure
-     ```
-   - To display help:
-     ```bash
-     python src/open_responses_server/cli.py help
-     ```
-
-5. **Make the Script Executable (Optional)**:
-   If you want to run the script without explicitly calling Python, you can make it executable:
-   ```bash
-   chmod +x src/open_responses_server/cli.py
-   ```
-   Then, run it directly:
-   ```bash
-   ./src/open_responses_server/cli.py <command>
-   ```
-
-Let me know if you need further assistance!
+## Overview
+
+The `otc` command is the CLI entry point for Open Responses Server, defined in
+`pyproject.toml` pointing to `open_responses_server.cli:main`.
+
+## Commands
+
+| Command | Description |
+| --- | --- |
+| `otc start` | Start the FastAPI server |
+| `otc configure` | Interactive configuration wizard (saves to `.env`) |
+| `otc help` | Display help information |
+| `otc --version` | Show version information |
+
+## Running after installation
+
+```bash
+# After pip install or uv pip install
+otc start
+otc configure
+otc --version
+```
+
+## Running from source
+
+```bash
+# Using uv
+uv run src/open_responses_server/cli.py start
+
+# Or directly with Python (venv must be activated)
+python src/open_responses_server/cli.py start
+```
+
+## Start command
+
+Starts the FastAPI server via uvicorn. The server binds to the host and port
+defined by `API_ADAPTER_HOST` and `API_ADAPTER_PORT` environment variables
+(defaults: `0.0.0.0:8080`).
+
+```bash
+otc start
+```
+
+## Configure command
+
+Interactive wizard that prompts for host, port, backend URL, external URL, and
+API key. Saves the configuration to a `.env` file in the current directory,
+merging with any existing values.
+
+```bash
+otc configure
+```

docs/events-and-tool-handling.md

Lines changed: 13 additions & 1 deletion

@@ -275,7 +275,19 @@ When processing Responses API input with `function_call_output` items
 containing the tool name and arguments, then adds the tool response.
 This handles resuming from external tool execution.
 
-## Pydantic Models Reference
+## Connection Keepalive (Heartbeat)
+
+When the backend LLM is slow to respond, the server sends SSE comment lines
+(`: heartbeat\n\n`) at the interval configured by `HEARTBEAT_INTERVAL`
+(default: 15 seconds). This prevents proxies and load balancers from closing
+idle connections.
+
+Heartbeats are standard SSE comments and should be ignored by compliant clients.
+The mechanism is implemented by `_with_heartbeat()` in `api_controller.py`,
+which wraps the response stream and injects heartbeat sentinels during idle
+periods.
+
+## Pydantic Models
 
 Defined in `src/open_responses_server/models/responses_models.py`.
 
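The heartbeat wrapper this diff documents can be approximated with a plain asyncio generator. This is a sketch of the mechanism only, not the repo's `_with_heartbeat()` implementation; the function name and timeout handling here are assumptions.

```python
import asyncio
from typing import AsyncIterator

async def with_heartbeat(stream: AsyncIterator[str], interval: float = 15.0):
    """Yield SSE events from `stream`, injecting `: heartbeat` comments while idle."""
    it = stream.__aiter__()
    while True:
        nxt = asyncio.ensure_future(it.__anext__())
        while True:
            try:
                # shield() keeps `nxt` alive across wait_for timeouts
                event = await asyncio.wait_for(asyncio.shield(nxt), timeout=interval)
            except asyncio.TimeoutError:
                yield ": heartbeat\n\n"  # SSE comment line; compliant clients ignore it
                continue
            except StopAsyncIteration:
                return
            yield event
            break
```

Because heartbeats are comment lines (leading `:`), they keep the TCP connection warm without producing events on the client side.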

docs/open-responses-server.md

Lines changed: 25 additions & 10 deletions

@@ -27,8 +27,6 @@ a tool-call execution loop, plus a generic proxy for all other endpoints.
 | `server_entrypoint.py` | Uvicorn entry point (imports `app` from `api_controller`) |
 | `cli.py` | `otc` CLI: `start`, `configure`, `help` commands |
 | `version.py` | `__version__` string, read dynamically by setuptools |
-| `server.py` | Legacy duplicate of api_controller (not imported by active code) |
-| `is_mcp_tool.py` | Standalone utility, superseded by `MCPManager.is_mcp_tool()` |
 
 ## Request Routing
 
@@ -50,7 +48,7 @@ Client
 │ → Tool-call loop (up to MAX_TOOL_CALL_ITERATIONS)
 │ → Final response streamed or returned as JSON
 
-├─ GET /health → {"status": "ok"}
+├─ GET /health → {"status": "ok", "adapter": "running"}
 ├─ GET / → {"message": "Open Responses Server is running."}
 
 └─ GET/POST /{path} (catch-all proxy)
@@ -71,28 +69,45 @@ All configuration is via environment variables, loaded from `.env` via
 | `API_ADAPTER_HOST` | `0.0.0.0` | Server bind address |
 | `API_ADAPTER_PORT` | `8080` | Server port |
 | `MCP_TOOL_REFRESH_INTERVAL` | `10` | Seconds between MCP tool cache refreshes |
-| `MCP_SERVERS_CONFIG_PATH` | `src/open_responses_server/servers_config.json` | Path to MCP servers JSON config |
+| `MCP_SERVERS_CONFIG_PATH` | `src/open_responses_server/servers_config.json` | Path to MCP servers JSON config (use absolute path when pip-installed) |
 | `MAX_CONVERSATION_HISTORY` | `100` | Max stored conversation entries |
 | `MAX_TOOL_CALL_ITERATIONS` | `25` | Max tool-call loop iterations |
+| `STREAM_TIMEOUT` | `120.0` | HTTP timeout (seconds) for streaming requests |
+| `HEARTBEAT_INTERVAL` | `15.0` | SSE keepalive interval (seconds) |
+| `LOG_LEVEL` | `INFO` | Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) |
+| `LOG_FILE_PATH` | `./log/api_adapter.log` | Path to log file |
 
 ### MCP Server Configuration
 
-The JSON file at `MCP_SERVERS_CONFIG_PATH` defines MCP servers:
+The JSON file at `MCP_SERVERS_CONFIG_PATH` defines MCP servers. Three transport
+types are supported: `stdio` (default), `sse`, and `streamable-http`.
 
 ```json
 {
   "mcpServers": {
-    "server-name": {
-      "command": "executable",
-      "args": ["arg1", "arg2"],
+    "stdio-server": {
+      "type": "stdio",
+      "command": "npx",
+      "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"],
       "env": {"KEY": "value"}
+    },
+    "sse-server": {
+      "type": "sse",
+      "url": "http://example.com/sse",
+      "headers": {"Authorization": "Bearer token"}
+    },
+    "http-server": {
+      "type": "streamable-http",
+      "url": "http://example.com/mcp",
+      "headers": {"Authorization": "Bearer token"}
     }
   }
 }
 ```
 
-Each server is started as a subprocess via `stdio_client` from the `mcp`
-library.
+The `type` field defaults to `stdio` if omitted. Stdio servers use `command`,
+`args`, and `env` fields. SSE and streamable-http servers use `url` and optional
+`headers` fields.
 
 ## Startup / Shutdown Lifecycle
 
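The transport rules described in this diff (dispatch on `type`, defaulting to `stdio`) can be sketched as a small loader. `classify_servers` is a hypothetical helper for illustration, not part of `MCPManager`; only the config-schema rules come from the docs above.

```python
import json

def classify_servers(config_path):
    """Group servers from an mcpServers config file by transport type."""
    with open(config_path) as f:
        cfg = json.load(f)
    result = {}
    for name, spec in cfg.get("mcpServers", {}).items():
        transport = spec.get("type", "stdio")  # `type` defaults to stdio per the docs
        if transport == "stdio":
            # stdio servers use command/args/env
            result[name] = ("stdio", spec["command"], spec.get("args", []))
        elif transport in ("sse", "streamable-http"):
            # network transports use url and optional headers
            result[name] = (transport, spec["url"], spec.get("headers", {}))
        else:
            raise ValueError(f"unknown MCP transport: {transport!r}")
    return result
```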

src/open_responses_server/api_controller.py

Lines changed: 7 additions & 86 deletions

@@ -326,99 +326,20 @@ async def stream_response():
 
         else:
             logger.info("Non-streaming response unsupported")
-
+            raise HTTPException(
+                status_code=501,
+                detail="Non-streaming responses are not supported on /responses. Set stream=True."
+            )
+
+    except HTTPException:
+        raise
     except Exception as e:
         logger.error(f"Error in create_response: {str(e)}")
         raise HTTPException(
             status_code=500,
             detail=f"Error processing request: {str(e)}"
         )
 
-# @app.post("/responses")
-# async def create_response(request: Request):
-#     """
-#     Endpoint for the custom /responses API.
-#     Converts the request, calls the chat completions endpoint, and streams the converted response.
-#     """
-#     try:
-#         request_data = await request.json()
-#
-#         # Log basic request information
-#         logger.info(f"Received request: model={request_data.get('model')}, stream={request_data.get('stream')}")
-#
-#         # Log input content for better visibility
-#         if "input" in request_data and request_data["input"]:
-#             logger.info("==== REQUEST CONTENT ====")
-#             for i, item in enumerate(request_data["input"]):
-#                 if isinstance(item, dict):
-#                     if item.get("type") == "message" and item.get("role") == "user":
-#                         if "content" in item and isinstance(item["content"], list):
-#                             for index, content_item in enumerate(item["content"]):
-#                                 if isinstance(content_item, dict):
-#                                     # Handle nested content structure like {"type": "input_text", "text": "actual message"}
-#                                     if content_item.get("type") == "input_text" and "text" in content_item:
-#                                         user_text = content_item.get("text", "")
-#                                         logger.info(f"USER INPUT: {user_text}")
-#                                     elif content_item.get("type") == "text" and "text" in content_item:
-#                                         user_text = content_item.get("text", "")
-#                                         logger.info(f"USER INPUT: {user_text}")
-#                                     # Handle other content types
-#                                     elif "type" in content_item:
-#                                         logger.info(f"USER INPUT ({content_item.get('type')}): {str(content_item)[:100]}...")
-#                                 elif isinstance(content_item, str):
-#                                     logger.info(f"USER INPUT: {content_item}")
-#                     elif item.get("type") == "function_call_output":
-#                         logger.info(f"FUNCTION RESULT: call_id={item.get('call_id')}, output={str(item.get('output', ''))[:100]}...")
-#                 elif isinstance(item, str):
-#                     logger.info(f"USER INPUT: {item}")
-#             logger.info("=======================")
-
-#         # Inject MCP tools into the request before conversion
-#         mcp_tools = mcp_manager.get_mcp_tools()
-#         if mcp_tools:
-#             # Start with user-provided tools, or an empty list
-#             final_tools = request_data.get("tools", [])
-
-#             # Get the names of the tools already in the list
-#             final_tool_names = {
-#                 tool.get("function", {}).get("name") if tool.get("function") else tool.get("name")
-#                 for tool in final_tools
-#                 if (tool.get("function") and tool.get("function").get("name")) or tool.get("name")
-#             }
-
-#             # Add only the new MCP tools that don't conflict
-#             for tool in mcp_tools:
-#                 if tool.get("name") not in final_tool_names:
-#                     final_tools.append({"type": "function", "function": tool})
-
-#             request_data["tools"] = final_tools
-#             logger.info(f"Injected {len(mcp_tools)} MCP tools into request")
-
-#         chat_request = convert_responses_to_chat_completions(request_data)
-
-#         client = await LLMClient.get_client()
-
-#         async def stream_response():
-#             try:
-#                 async with client.stream("POST", "/v1/chat/completions", json=chat_request, timeout=STREAM_TIMEOUT) as response:
-#                     if response.status_code != 200:
-#                         error_content = await response.aread()
-#                         logger.error(f"Error from LLM API: {error_content.decode()}")
-#                         yield f"data: {json.dumps({'error': 'LLM API Error'})}\n\n"
-#                         return
-
-#                     async for event in process_chat_completions_stream(response, chat_request):
-#                         yield event
-#             except Exception as e:
-#                 logger.error(f"Error in /responses stream: {e}")
-#                 yield f"data: {json.dumps({'error': str(e)})}\n\n"
-
-#         return StreamingResponse(stream_response(), media_type="text/event-stream")
-
-#     except Exception as e:
-#         logger.error(f"Error in create_response endpoint: {e}")
-#         raise HTTPException(status_code=500, detail=str(e))
-
 
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):