Commit 6d98843

[Responses API] Disable response store by default (#22137)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent aefeea0 commit 6d98843

4 files changed: +46 -10 lines changed

tests/v1/entrypoints/openai/responses/conftest.py (8 additions, 4 deletions)

@@ -21,12 +21,16 @@ def default_server_args():
 
 
 @pytest.fixture(scope="module")
-def server(default_server_args):
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+def server_with_store(default_server_args):
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
 
 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(server_with_store):
+    async with server_with_store.get_async_client() as async_client:
         yield async_client
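For context, a store-dependent test in this suite could use the renamed fixture roughly as follows; the prompt, the model placeholder, and the retrieve-by-ID flow are illustrative assumptions rather than part of this commit:

import pytest


@pytest.mark.asyncio
async def test_store_roundtrip(client):  # hypothetical test using the fixture above
    # The server behind `client` was started with
    # VLLM_ENABLE_RESPONSES_API_STORE=1, so store=True requests are accepted.
    response = await client.responses.create(
        model="<served-model-name>",  # placeholder for the conftest's MODEL_NAME
        input="Hello",
        store=True,
    )
    # A stored response can be fetched again by its ID.
    fetched = await client.responses.retrieve(response.id)
    assert fetched.id == response.id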

tests/v1/entrypoints/openai/responses/test_image.py (5 additions, 2 deletions)

@@ -37,8 +37,11 @@ def default_image_server_args():
 
 @pytest.fixture(scope="module")
 def image_server(default_image_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_server_args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_image_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
 

vllm/entrypoints/openai/serving_responses.py (21 additions, 4 deletions)

@@ -11,6 +11,7 @@
 from fastapi import Request
 from openai.types.responses import ResponseOutputMessage, ResponseOutputText
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
@@ -89,15 +90,17 @@ def __init__(
         logger.info("Using default chat sampling params from %s: %s",
                     source, self.default_sampling_params)
 
+        # False by default.
+        self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove responses
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove responses from the store.
         self.response_store: dict[str, ResponsesResponse] = {}
         self.response_store_lock = asyncio.Lock()
 
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove messages
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove messages from the store.
         self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
 
         self.background_tasks: dict[str, asyncio.Task] = {}
@@ -118,6 +121,10 @@ async def create_responses(
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
+        # If store is not enabled, return an error.
+        if request.store and not self.enable_store:
+            return self._make_store_not_supported_error()
+
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
         if prev_response_id is not None:
@@ -456,3 +463,13 @@ def _make_not_found_error(self, response_id: str) -> ErrorResponse:
             message=f"Response with id '{response_id}' not found.",
             status_code=HTTPStatus.NOT_FOUND,
         )
+
+    def _make_store_not_supported_error(self) -> ErrorResponse:
+        return self.create_error_response(
+            err_type="invalid_request_error",
+            message=("`store=True` (default) is not supported. Please set "
+                     "`store=False` in Responses API or set "
+                     "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
+                     "starting the vLLM server."),
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
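In practical terms, `store` defaults to True in the Responses API, so after this commit a request that does not set `store=False` gets an HTTP 400 with the invalid_request_error above unless the server was started with VLLM_ENABLE_RESPONSES_API_STORE=1. A minimal client-side sketch of the non-store path; the base URL, API key, model name, and prompt are placeholders, not from this commit:

from openai import OpenAI

# Placeholders: point the client at your vLLM OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Pass store=False explicitly so the request is accepted even when the
# server-side response store is disabled (the new default).
response = client.responses.create(
    model="<served-model-name>",  # placeholder
    input="Say hello.",
    store=False,
)
print(response.output_text)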

vllm/envs.py (12 additions, 0 deletions)

@@ -151,6 +151,7 @@
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False
 
 
 def get_default_cache_root():
@@ -1056,6 +1057,17 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
     lambda: bool(int(os.getenv(\
         "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server will retain the input and output
+    # messages for those requests in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
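The new flag follows the same parsing pattern as the other boolean env vars in envs.py: unset or "0" disables the store, "1" enables it, and a non-integer value raises a ValueError. A standalone sketch of the equivalent check (not vLLM code, just the same idiom):

import os

# Mirrors the lambda registered above: bool(int(os.getenv(..., "0"))).
# "0" or unset -> False, "1" -> True; non-integer values raise ValueError.
enable_store = bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0")))
print(f"Responses API store enabled: {enable_store}")

Operators who want the previous behavior back can export VLLM_ENABLE_RESPONSES_API_STORE=1 before launching the server, accepting the in-memory growth described in the comment above.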
