infra: handle flaky tests (#30501)

ccurme · web-flow · commit 422ba4cde57e · 2025-03-26T13:28:56.000-04:00
diff --git a/libs/partners/anthropic/pyproject.toml b/libs/partners/anthropic/pyproject.toml
@@ -30,6 +30,7 @@ test = [
     "pytest-watcher<1.0.0,>=0.3.4",
     "pytest-asyncio<1.0.0,>=0.21.1",
     "defusedxml<1.0.0,>=0.7.1",
+    "pytest-retry<1.8.0,>=1.7.0",
     "pytest-timeout<3.0.0,>=2.3.1",
     "pytest-socket<1.0.0,>=0.7.0",
     "langchain-core",
diff --git a/libs/partners/anthropic/tests/integration_tests/test_chat_models.py b/libs/partners/anthropic/tests/integration_tests/test_chat_models.py
@@ -730,6 +730,7 @@ def test_thinking() -> None:
             assert block["signature"] and isinstance(block["signature"], str)
 
 
+@pytest.mark.flaky(retries=3, delay=1)
 def test_redacted_thinking() -> None:
     llm = ChatAnthropic(
         model="claude-3-7-sonnet-latest",
diff --git a/libs/partners/anthropic/uv.lock b/libs/partners/anthropic/uv.lock
diff --git a/libs/partners/openai/pyproject.toml b/libs/partners/openai/pyproject.toml
@@ -30,6 +30,7 @@ test = [
     "pytest-watcher<1.0.0,>=0.3.4",
     "pytest-asyncio<1.0.0,>=0.21.1",
     "pytest-cov<5.0.0,>=4.1.0",
+    "pytest-retry<1.8.0,>=1.7.0",
     "pytest-socket<1.0.0,>=0.6.0",
     "pytest-xdist<4.0.0,>=3.6.1",
     "numpy<2,>=1; python_version < \"3.12\"",
diff --git a/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py b/libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py
@@ -53,6 +53,7 @@ def _check_response(response: Optional[BaseMessage]) -> None:
         assert tool_output["type"]
 
 
+@pytest.mark.flaky(retries=3, delay=1)
 def test_web_search() -> None:
     llm = ChatOpenAI(model=MODEL_NAME)
     first_response = llm.invoke(
@@ -108,6 +109,7 @@ def test_web_search() -> None:
     _check_response(response)
 
 
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_web_search_async() -> None:
     llm = ChatOpenAI(model=MODEL_NAME)
     response = await llm.ainvoke(
@@ -129,6 +131,7 @@ async def test_web_search_async() -> None:
     _check_response(full)
 
 
+@pytest.mark.flaky(retries=3, delay=1)
 def test_function_calling() -> None:
     def multiply(x: int, y: int) -> int:
         """return x * y"""
@@ -197,6 +200,7 @@ async def test_parsed_pydantic_schema_async() -> None:
     assert parsed.response
 
 
+@pytest.mark.flaky(retries=3, delay=1)
 @pytest.mark.parametrize("schema", [Foo.model_json_schema(), FooDict])
 def test_parsed_dict_schema(schema: Any) -> None:
     llm = ChatOpenAI(model=MODEL_NAME, use_responses_api=True)
@@ -241,6 +245,7 @@ class InvalidJoke(TypedDict):
         )
 
 
+@pytest.mark.flaky(retries=3, delay=1)
 @pytest.mark.parametrize("schema", [Foo.model_json_schema(), FooDict])
 async def test_parsed_dict_schema_async(schema: Any) -> None:
     llm = ChatOpenAI(model=MODEL_NAME, use_responses_api=True)
@@ -313,6 +318,7 @@ def test_route_from_model_kwargs() -> None:
     _ = next(llm.stream("Hello"))
 
 
+@pytest.mark.flaky(retries=3, delay=1)
 def test_computer_calls() -> None:
     llm = ChatOpenAI(model="computer-use-preview", model_kwargs={"truncation": "auto"})
     tool = {
diff --git a/libs/partners/openai/uv.lock b/libs/partners/openai/uv.lock