 
 import numpy as np
 import pytest
+from lmi import LiteLLMModel
 
 from aviary.core import (
     Message,
@@ -372,3 +373,145 @@ def test_prepend_text(self, subtests) -> None:
         assert trm.content is not None
         content_list_original = json.loads(trm.content)
         assert len(content_list_original) == 3
+
+
+class TestCacheBreakpoint:
+    def test_default_is_false(self) -> None:
+        msg = Message(content="test")
+        assert not msg.cache_breakpoint
+
+    def test_serialization_without_cache_breakpoint(self) -> None:
+        data = Message(content="test").model_dump(exclude_none=True)
+        assert data == {"role": "user", "content": "test"}
+
+    @pytest.mark.parametrize(
+        ("content", "expected_content"),
+        [
+            (
+                "test",
+                [
+                    {
+                        "type": "text",
+                        "text": "test",
+                        "cache_control": {"type": "ephemeral"},
+                    }
+                ],
+            ),
+            (
+                [{"type": "text", "text": "first"}, {"type": "text", "text": "second"}],
+                [
+                    {"type": "text", "text": "first"},
+                    {
+                        "type": "text",
+                        "text": "second",
+                        "cache_control": {"type": "ephemeral"},
+                    },
+                ],
+            ),
+        ],
+    )
+    def test_serialization_with_cache_breakpoint(
+        self, content, expected_content
+    ) -> None:
+        data = Message(content=content, cache_breakpoint=True).model_dump(
+            exclude_none=True
+        )
+        assert data == {"role": "user", "content": expected_content}
+
+    def test_serialization_with_cache_breakpoint_empty_content(self) -> None:
+        data = Message(content=None, cache_breakpoint=True).model_dump(
+            exclude_none=True
+        )
+        # Should not crash, content stays None
+        assert data == {"role": "user"}
+
+    def test_cache_breakpoint_excluded_from_dump(self) -> None:
+        data = Message(content="test", cache_breakpoint=True).model_dump()
+        assert "cache_breakpoint" not in data
+
+    def test_cache_breakpoint_with_image_content(self) -> None:
+        data = Message.create_message(
+            text="Describe this image",
+            images=[
+                "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+            ],
+            cache_breakpoint=True,
+        ).model_dump(exclude_none=True)
+        # cache_control should be on the last block (the text block)
+        assert len(data["content"]) == 2
+        assert data["content"][0]["type"] == "image_url"
+        assert "cache_control" not in data["content"][0]
+        assert data["content"][1]["type"] == "text"
+        assert data["content"][1]["cache_control"] == {"type": "ephemeral"}
+
+    def test_cache_breakpoint_skipped_when_deserialize_content_false(self) -> None:
+        data = Message(content="test", cache_breakpoint=True).model_dump(
+            context={"deserialize_content": False}
+        )
+        # Content should remain a string, cache_breakpoint not applied
+        assert data["content"] == "test"
+
+    def test_cache_breakpoint_logs_warning_when_skipped(self, caplog) -> None:
+        import logging
+
+        msg = Message(content="test", cache_breakpoint=True)
+        with caplog.at_level(logging.WARNING):
+            msg.model_dump(context={"deserialize_content": False})
+        assert "cache_breakpoint ignored" in caplog.text
+
+
+def _make_long_content(prefix: str, num_items: int = 300) -> str:
+    """Generate long content for cache testing.
+
+    Anthropic only caches prompt prefixes above a minimum length (1024 tokens for
+    most models, 2048 for Haiku models); 300 short items is roughly 1k tokens.
+    """
+    return prefix + " ".join(f"item_{i}" for i in range(num_items))
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    ("model_name", "require_cache_hit"),
+    [
+        ("claude-3-5-haiku-20241022", True),
+        ("gpt-4o-mini", False),
+    ],
+)
+async def test_cache_breakpoint_live(model_name: str, require_cache_hit: bool) -> None:
+    """Verify cache breakpoint behavior against live providers.
+
+    For Anthropic, cache_breakpoint causes the prompt prefix up to and including
+    the marked message to be cached. For OpenAI, LiteLLM strips the cache_control
+    blocks it cannot forward, and OpenAI's automatic prefix caching may or may not
+    activate.
+    """
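+    # With cache_breakpoint=True, the marked message serializes its last content
+    # block as {"type": "text", "text": ..., "cache_control": {"type": "ephemeral"}}
+    # (see TestCacheBreakpoint above), and Anthropic caches the prefix up to and
+    # including that block.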
+    system_msg = Message(role="system", content=_make_long_content("System: "))
+    user_context = Message(
+        role="user", content=_make_long_content("Context: "), cache_breakpoint=True
+    )
+    assistant_msg = Message(role="assistant", content="Acknowledged.")
+    user_question = Message(role="user", content="Summarize.")
+
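+    # The breakpoint on user_context marks system_msg + user_context as the
+    # cacheable prefix; assistant_msg and user_question are not part of it.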
+    messages = [system_msg, user_context, assistant_msg, user_question]
+    llm = LiteLLMModel(name=model_name)
+
+    # First request - may create cache or hit existing cache
+    result1 = await llm.call_single(messages)
+    if require_cache_hit:
+        cache_active = (result1.cache_creation_tokens or 0) > 0 or (
+            result1.cache_read_tokens or 0
+        ) > 0
+        assert cache_active, "Expected cache creation or cache read on first request"
+    else:
+        assert result1.text is not None
+
+    # Second request - should hit cache (for Anthropic) or may hit (for OpenAI)
+    result2 = await llm.call_single(messages)
+    if require_cache_hit:
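+        # The cached prefix (two ~300-item messages) runs to well over 500 tokens,
+        # so a genuine hit should report more than 500 cache_read_tokens.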
+        assert (result2.cache_read_tokens or 0) > 0, (
+            "Expected cache hit on second request"
+        )
+        assert (result2.cache_read_tokens or 0) > 500, (
+            f"Expected >500 cached tokens, got {result2.cache_read_tokens}"
+        )
+    else:
+        assert result2.text is not None
+        # OpenAI's caching is automatic and not guaranteed
+        if result2.cache_read_tokens is not None and result2.cache_read_tokens > 0:
+            assert result2.cache_read_tokens > 500, (
+                f"Expected >500 cached tokens if cache hit, got {result2.cache_read_tokens}"
+            )