Commit 82b0b8a

fix bug around system messages being discarded (#102)

mirodrr2 and mirodrr authored
Co-authored-by: michael rodriguez <[email protected]>
1 parent 6bab758, commit 82b0b8a

2 files changed: +34 -43 lines changed
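
For context, the bug this commit fixes: the old proxy code copied only the last "user" message from the request into chat_history, so a "system" message in the request body was silently dropped before the request reached the model. The sketch below contrasts the old and new merge behavior; it is illustrative only (the helper names are not part of the repo):

# Minimal sketch of the behavior change (illustrative; not the repo's code).

def merge_old(chat_history, incoming):
    # Old behavior: keep only the last "user" message, discarding system messages.
    user_msgs = [m for m in incoming if m["role"] == "user"]
    if user_msgs:
        chat_history.append(user_msgs[-1])
    return chat_history

def merge_new(chat_history, incoming):
    # New behavior: keep system and user messages in their original order.
    for msg in incoming:
        if msg["role"] in ("system", "user"):
            chat_history.append(msg)
    return chat_history

incoming = [
    {"role": "system", "content": "You are a master storyteller"},
    {"role": "user", "content": "Tell me a short story"},
]
print(merge_old([], list(incoming)))  # only the user message survives
print(merge_new([], list(incoming)))  # system message preserved, then user message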

middleware/app.py

Lines changed: 23 additions & 33 deletions
@@ -890,6 +890,7 @@ async def proxy_request(request: Request):
         )
     provided_hash = hash_api_key(api_key)

+    # Prepare or load chat_history
     if history_enabled:
         if session_id is not None:
             # Retrieve or verify existing session
@@ -902,44 +903,33 @@ async def proxy_request(request: Request):
                             "error": "Unauthorized: API key does not match session owner"
                         },
                     )
-                chat_history = (
-                    session_data["chat_history"]
-                    if session_data["chat_history"]
-                    else []
-                )
+                chat_history = session_data["chat_history"] or []
             else:
                 chat_history = []
                 create_chat_history(session_id, chat_history, provided_hash)
         else:
-            # No session_id provided but enable_history = True, so create a new session
+            # No session_id but enable_history = True, so create a new session
             session_id = str(uuid.uuid4())
             chat_history = []
             create_chat_history(session_id, chat_history, provided_hash)
-
-            # Merge incoming user messages into chat history
-            user_messages_this_round = [
-                m for m in data.get("messages", []) if m["role"] == "user"
-            ]
-            if user_messages_this_round:
-                chat_history.append(user_messages_this_round[-1])
-
-            # Overwrite data["messages"] with chat_history for the LLM request
-            data["messages"] = chat_history
     else:
-        # History is disabled and no valid session_id is provided.
-        # Pass messages through as-is.
+        # History not enabled: start with empty
         chat_history = []

-    # Merge incoming user messages into chat history
-    user_messages_this_round = [
-        m for m in data.get("messages", []) if m["role"] == "user"
-    ]
-    if user_messages_this_round:
-        chat_history.append(user_messages_this_round[-1])
+    # Merge incoming system/user messages into chat_history in original order
+    # (We generally skip adding "assistant" messages from the request side,
+    # because those come from the model, not from the user.)
+    new_messages = data.get("messages", [])
+    for msg in new_messages:
+        if msg["role"] in ["system", "user"]:
+            chat_history.append(msg)

+    # Now data["messages"] should be the entire conversation the model sees
     data["messages"] = chat_history

-    # Check for prompt ARN logic
+    # ---------------------------------------------------------------------
+    # Handle optional "Bedrock Prompt" logic (unchanged from your snippet):
+    # ---------------------------------------------------------------------
     model_id = data.get("model")
     prompt_variables = data.pop("promptVariables", {})
     final_prompt_text = None
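
For readers skimming the hunk above, the history setup now resolves chat_history one of three ways: load it from an existing session, create it for a new session, or leave it empty when history is disabled. Below is a rough standalone sketch of that branching; the in-memory store and stubbed get_chat_history / create_chat_history helpers are assumptions for illustration, not the middleware's actual storage layer:

import uuid

# Hypothetical in-memory stand-in for the real session store.
_SESSIONS = {}

def get_chat_history(session_id):
    return _SESSIONS.get(session_id)

def create_chat_history(session_id, chat_history, api_key_hash):
    _SESSIONS[session_id] = {"chat_history": chat_history, "api_key_hash": api_key_hash}

def resolve_history(history_enabled, session_id, provided_hash):
    """Mirrors the branching above: load, create, or skip history."""
    if history_enabled:
        if session_id is not None:
            session_data = get_chat_history(session_id)
            if session_data:
                if session_data["api_key_hash"] != provided_hash:
                    raise PermissionError("API key does not match session owner")
                chat_history = session_data["chat_history"] or []
            else:
                chat_history = []
                create_chat_history(session_id, chat_history, provided_hash)
        else:
            session_id = str(uuid.uuid4())
            chat_history = []
            create_chat_history(session_id, chat_history, provided_hash)
    else:
        chat_history = []
    return session_id, chat_history
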
@@ -968,15 +958,14 @@ async def proxy_request(request: Request):
     if final_prompt_text:
         data["messages"] = [{"role": "user", "content": final_prompt_text}]

-    # client = AsyncOpenAI(api_key=api_key, base_url=LITELLM_ENDPOINT)
-
+    # ---------------------------------------------------------------------
+    # Stream vs. Non-Stream logic
+    # ---------------------------------------------------------------------
     if is_streaming:
-        # print(f"streaming")
         return await get_chat_stream(
             api_key, data, session_id, chat_history, history_enabled
         )
     else:
-        # print(f"not streaming")
         headers = {
             "Content-Type": "application/json",
             "Authorization": f"Bearer {api_key}",
@@ -985,14 +974,14 @@ async def proxy_request(request: Request):
            async with session.post(
                f"{LITELLM_ENDPOINT}/v1/chat/completions",
                headers=headers,
-                json=data,  # Sending the data in the body
+                json=data,
            ) as resp:
-                # Parse the response JSON
                response_headers = dict(resp.headers)
-                response_headers.pop("Content-Length")
-                # print(response_headers)
+                # Avoid passing through invalid content-length
+                response_headers.pop("Content-Length", None)
                response_dict = await resp.json()

+                # If there's a response from the assistant, save it to history
                if response_dict.get("choices"):
                    assistant_message = response_dict["choices"][0]["message"]
                    if history_enabled:
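
One small but load-bearing detail in the hunk above: dict.pop with a default never raises, so the proxy no longer fails when the upstream response carries no Content-Length header (common with chunked or compressed responses). A quick illustration:

headers = {"Content-Type": "application/json"}  # no Content-Length present

headers.pop("Content-Length", None)   # safe: returns None, leaves the dict unchanged
# headers.pop("Content-Length")       # old form: raises KeyError when the header is absent
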
@@ -1001,6 +990,7 @@ async def proxy_request(request: Request):
                        )
                        update_chat_history(session_id, chat_history)

+                # Return session_id in the response if we have one
                if session_id:
                    response_dict["session_id"] = session_id


tests/openai_chat_test_file.py

Lines changed: 11 additions & 10 deletions
@@ -11,6 +11,7 @@
 base_url = os.getenv("API_ENDPOINT")
 api_key = os.getenv("API_KEY")
 model_id = os.getenv("MODEL_ID")
+print(f'base_url: {base_url} api_key: {api_key} model_id: {model_id}')
 client = OpenAI(base_url=base_url, api_key=api_key)
 managed_prompt_arn = os.getenv("MANAGED_PROMPT_ARN")
 managed_prompt_variable_name = os.getenv("MANAGED_PROMPT_VARIABLE_NAME")
@@ -23,7 +24,7 @@

 async def stream_completion(
     prompt: str,
-    model: str = "anthropic.claude-3-5-sonnet-20241022-v2:0",
+    model: str = "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
     extra_body: Dict[str, Any] = None,
 ) -> AsyncGenerator[Tuple[str, str], None]:
     """
@@ -55,8 +56,8 @@ async def stream_completion(


 def get_completion(
-    prompt: str,
-    model: str = "anthropic.claude-3-5-sonnet-20241022-v2:0",
+    messages: list,
+    model: str = "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
     extra_body: Dict[str, Any] = None,
 ) -> Tuple[str, str]:
     """
@@ -68,7 +69,7 @@ def get_completion(

     response = client.chat.completions.create(
         model=model,
-        messages=[{"role": "user", "content": prompt}],
+        messages=messages,
         stream=False,
         extra_body=extra_body,
     )
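
With this signature change, callers build the messages list themselves, while proxy-specific options such as enable_history and session_id still travel in extra_body, which the OpenAI client merges into the request body. A usage sketch (small_prompt and model_id come from the test module's environment setup, so this is call shape only, not a standalone script):

content, session_id = get_completion(
    [
        {"role": "system", "content": "You are a master storyteller"},
        {"role": "user", "content": small_prompt},
    ],
    model_id,
    extra_body={"enable_history": True},
)
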
@@ -79,7 +80,7 @@ def get_completion(


 def test_openai_chat():
-    content, session_id = get_completion(small_prompt)
+    content, session_id = get_completion([{"role": "user", "content": small_prompt}])
     assert content is not None and content.strip()
     assert session_id is not None and session_id.strip()
     print(f"test_openai_chat response content: {content} session_id: {session_id}")
@@ -106,15 +107,15 @@ async def test_openai_chat_streaming():

 def test_openai_chat_history():
     print("First request:", flush=True)
-    response_content_1, session_id_1 = get_completion(small_prompt)
+    response_content_1, session_id_1 = get_completion([{"role": "system", "content": "You are a master storyteller"},{"role": "user", "content": small_prompt}], model_id, extra_body={"enable_history": True})
     assert response_content_1 is not None and response_content_1.strip()
     assert session_id_1 is not None and session_id_1.strip()
     print(f"Content: {response_content_1}")
     print(f"Session ID: {session_id_1}\n")

     print("\nSecond request (with session_id):", flush=True)
     response_content_2, session_id_2 = get_completion(
-        small_prompt_follow_up, extra_body={"session_id": session_id_1}
+        [{"role": "user", "content": small_prompt_follow_up}], model_id, extra_body={"session_id": session_id_1}
     )
     print(f"Content: {response_content_2}")
     print(f"Session ID: {session_id_2}\n")
@@ -171,7 +172,7 @@ def test_bedrock_managed_prompt():

     # Test with a managed prompt
     response_content, session_id = get_completion(
-        "",  # Empty prompt as it won't be used
+        [{"role": "user", "content": ""}],  # Empty prompt as it won't be used
         model=managed_prompt_arn,
         extra_body={
             "promptVariables": {
@@ -220,7 +221,7 @@ async def test_bedrock_managed_prompt_streaming():


 def test_large_prompt():
-    content, session_id = get_completion(large_prompt)
+    content, session_id = get_completion([{"role": "user", "content": large_prompt}])
     assert content is not None and content.strip()
     assert session_id is not None and session_id.strip()
     print(f"test_openai_chat response content: {content} session_id: {session_id}")
@@ -238,7 +239,7 @@ def test_invalid_api_key():
     # Attempt to make a request with the invalid client
     with pytest.raises(OpenAIError) as exc_info:
         response = invalid_client.chat.completions.create(
-            model="anthropic.claude-3-5-sonnet-20241022-v2:0",
+            model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
             messages=[{"role": "user", "content": small_prompt}],
             stream=False,
         )
