Commit 6d98843

[Responses API] Disable response store by default (#22137)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent aefeea0 commit 6d98843

4 files changed: +46 -10 lines changed

tests/v1/entrypoints/openai/responses/conftest.py (8 additions, 4 deletions)

@@ -21,12 +21,16 @@ def default_server_args():
 
 
 @pytest.fixture(scope="module")
-def server(default_server_args):
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+def server_with_store(default_server_args):
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
 
 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(server_with_store):
+    async with server_with_store.get_async_client() as async_client:
         yield async_client
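For context, a store-dependent test in this suite could use the renamed fixture roughly as follows; the prompt, the model placeholder, and the retrieve-by-ID flow are illustrative assumptions rather than part of this commit:

import pytest


@pytest.mark.asyncio
async def test_store_roundtrip(client):  # hypothetical test using the fixture above
    # The server behind `client` was started with
    # VLLM_ENABLE_RESPONSES_API_STORE=1, so store=True requests are accepted.
    response = await client.responses.create(
        model="<served-model-name>",  # placeholder for the conftest's MODEL_NAME
        input="Hello",
        store=True,
    )
    # A stored response can be fetched again by its ID.
    fetched = await client.responses.retrieve(response.id)
    assert fetched.id == response.id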

tests/v1/entrypoints/openai/responses/test_image.py (5 additions, 2 deletions)

@@ -37,8 +37,11 @@ def default_image_server_args():
 
 @pytest.fixture(scope="module")
 def image_server(default_image_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_server_args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_image_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
 

vllm/entrypoints/openai/serving_responses.py (21 additions, 4 deletions)

@@ -11,6 +11,7 @@
 from fastapi import Request
 from openai.types.responses import ResponseOutputMessage, ResponseOutputText
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
@@ -89,15 +90,17 @@ def __init__(
         logger.info("Using default chat sampling params from %s: %s",
                     source, self.default_sampling_params)
 
+        # False by default.
+        self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove responses
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove responses from the store.
         self.response_store: dict[str, ResponsesResponse] = {}
         self.response_store_lock = asyncio.Lock()
 
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove messages
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove messages from the store.
         self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
 
         self.background_tasks: dict[str, asyncio.Task] = {}
@@ -118,6 +121,10 @@ async def create_responses(
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
+        # If store is not enabled, return an error.
+        if request.store and not self.enable_store:
+            return self._make_store_not_supported_error()
+
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
         if prev_response_id is not None:
@@ -456,3 +463,13 @@ def _make_not_found_error(self, response_id: str) -> ErrorResponse:
             message=f"Response with id '{response_id}' not found.",
             status_code=HTTPStatus.NOT_FOUND,
         )
+
+    def _make_store_not_supported_error(self) -> ErrorResponse:
+        return self.create_error_response(
+            err_type="invalid_request_error",
+            message=("`store=True` (default) is not supported. Please set "
+                     "`store=False` in Responses API or set "
+                     "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
+                     "starting the vLLM server."),
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
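In practical terms, `store` defaults to True in the Responses API, so after this commit a request that does not set `store=False` gets an HTTP 400 with the invalid_request_error above unless the server was started with VLLM_ENABLE_RESPONSES_API_STORE=1. A minimal client-side sketch of the non-store path; the base URL, API key, model name, and prompt are placeholders, not from this commit:

from openai import OpenAI

# Placeholders: point the client at your vLLM OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Pass store=False explicitly so the request is accepted even when the
# server-side response store is disabled (the new default).
response = client.responses.create(
    model="<served-model-name>",  # placeholder
    input="Say hello.",
    store=False,
)
print(response.output_text)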

vllm/envs.py (12 additions, 0 deletions)

@@ -151,6 +151,7 @@
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False
 
 
 def get_default_cache_root():
@@ -1056,6 +1057,17 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
     lambda: bool(int(os.getenv(\
         "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server will retain the input and output
+    # messages for those requests in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
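The new flag follows the same parsing pattern as the other boolean env vars in envs.py: unset or "0" disables the store, "1" enables it, and a non-integer value raises a ValueError. A standalone sketch of the equivalent check (not vLLM code, just the same idiom):

import os

# Mirrors the lambda registered above: bool(int(os.getenv(..., "0"))).
# "0" or unset -> False, "1" -> True; non-integer values raise ValueError.
enable_store = bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0")))
print(f"Responses API store enabled: {enable_store}")

Operators who want the previous behavior back can export VLLM_ENABLE_RESPONSES_API_STORE=1 before launching the server, accepting the in-memory growth described in the comment above.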
