@@ -108,8 +108,18 @@ async def openai_response_async(request_data: dict):
     )
     filtered_request_data["model"] = model_name  # remove custom_llm_provider
 
+    # Remove tools in subsequent rounds (when previous_response_id is present)
     if (
         "tools" in filtered_request_data
+        and "previous_response_id" in filtered_request_data
+        and filtered_request_data["previous_response_id"] is not None
+    ):
+        # Remove tools in subsequent rounds regardless of caching status
+        del filtered_request_data["tools"]
+
+    # Ensure thinking field consistency for cache usage
+    if (
+        "thinking" in filtered_request_data
         and "extra_body" in filtered_request_data
         and isinstance(filtered_request_data["extra_body"], dict)
         and "caching" in filtered_request_data["extra_body"]
@@ -118,20 +128,38 @@ async def openai_response_async(request_data: dict):
         and "previous_response_id" in filtered_request_data
         and filtered_request_data["previous_response_id"] is not None
     ):
-        # Remove tools when caching is enabled and previous_response_id is present
-        del filtered_request_data["tools"]
+        # For cache usage, thinking should be consistent with the previous round.
+        # If thinking is present but inconsistent, remove it to avoid a cache miss.
+        # Note: this is a placeholder; the actual consistency check requires state tracking.
+        pass
 
-    # Remove instructions when caching is enabled with specific configuration
+    # Ensure the store field is true (or default) when caching is enabled
     if (
-        "instructions" in filtered_request_data
-        and "extra_body" in filtered_request_data
+        "extra_body" in filtered_request_data
         and isinstance(filtered_request_data["extra_body"], dict)
         and "caching" in filtered_request_data["extra_body"]
         and isinstance(filtered_request_data["extra_body"]["caching"], dict)
         and filtered_request_data["extra_body"]["caching"].get("type") == "enabled"
     ):
-        # Remove instructions when caching is enabled
-        del filtered_request_data["instructions"]
+        # Set store to True when caching is enabled for writing
+        if "store" not in filtered_request_data:
+            filtered_request_data["store"] = True
+        elif filtered_request_data["store"] is False:
+            # Override False to True for cache writing
+            filtered_request_data["store"] = True
+
+        # [NOTE] Due to Volcano Ark's settings, the cache conflicts with the instructions field.
+        # If a system prompt is needed, it should go in a system-role message within the input instead of using the instructions parameter.
+        # https://www.volcengine.com/docs/82379/1585128
+        instructions = filtered_request_data.pop("instructions", None)
+        if instructions is not None:  # only inject a system message when instructions were provided
+            filtered_request_data["input"] = [
+                {
+                    "content": [{"text": instructions, "type": "input_text"}],
+                    "role": "system",
+                    "type": "message",
+                }
+            ] + filtered_request_data["input"]
 
     client = OpenAI(
         base_url=request_data["api_base"],
@@ -164,7 +191,8 @@ async def acompletion(
         ) = self._get_request_data(model, messages, tools, **kwargs)
 
         # 3. Call litellm.aresponses with the transformed request data
-        # Cannot be called directly; there is a litellm bug:
+        # [NOTE] Due to a litellm bug, litellm.aresponses cannot be called
+        # directly here:
         # https://github.com/BerriAI/litellm/issues/16267
         # raw_response = await aresponses(
         #     **request_data,
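
For reference, the following is a minimal standalone sketch, assuming a hypothetical helper name (rewrite_for_ark_cache) and made-up request values, of how a caching-enabled follow-up request is reshaped by the hunks above: tools are dropped once previous_response_id is set, store is forced on for cache writing, and instructions is relocated into a system-role message at the head of input. It is not the PR's actual code path, only an illustration of the same rewrites.

from copy import deepcopy


def rewrite_for_ark_cache(request: dict) -> dict:
    # Hypothetical helper mirroring the diff's rewrites, not the PR's function.
    data = deepcopy(request)

    # Drop tools on follow-up rounds (previous_response_id is present).
    if data.get("previous_response_id") is not None:
        data.pop("tools", None)

    extra_body = data.get("extra_body")
    caching = extra_body.get("caching") if isinstance(extra_body, dict) else None
    if isinstance(caching, dict) and caching.get("type") == "enabled":
        # Cache writing requires store to be on; only missing/False values are overridden.
        if "store" not in data or data["store"] is False:
            data["store"] = True

        # Volcano Ark's cache conflicts with instructions, so move the system
        # prompt into a system-role message at the head of input.
        instructions = data.pop("instructions", None)
        if instructions is not None:
            data["input"] = [
                {
                    "content": [{"text": instructions, "type": "input_text"}],
                    "role": "system",
                    "type": "message",
                }
            ] + data["input"]
    return data


# Illustrative follow-up request with caching enabled (all values are made up).
request = {
    "model": "doubao-seed-1-6",
    "input": [{"role": "user", "content": "And tomorrow?"}],
    "instructions": "You are a concise weather assistant.",
    "tools": [{"type": "function", "name": "get_weather"}],
    "previous_response_id": "resp_abc123",
    "store": False,
    "extra_body": {"caching": {"type": "enabled"}},
}

rewritten = rewrite_for_ark_cache(request)
assert "tools" not in rewritten and "instructions" not in rewritten
assert rewritten["store"] is True
assert rewritten["input"][0]["role"] == "system"

The asserts show the intended end state; in the PR the same rewrites run inside openai_response_async before the OpenAI client call, since litellm.aresponses cannot be used directly.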