Commit e810133

WIP: Account for cache costs when calculating cost limit
Adds test with multiple models to validate behavior.
1 parent 75e430f commit e810133

File tree

2 files changed: +70 -1 lines changed


src/inspect_ai/model/_model.py

Lines changed: 6 additions & 1 deletion
@@ -617,7 +617,12 @@ async def generate() -> tuple[ModelOutput, BaseModel]:
                 output=existing,
                 call=None,
             )
-            # TODO: Update cost info based on the cache hit
+            # Cost limits should still be updated on cache hits
+            if existing.usage:
+                total_cost = calculate_model_usage_cost(
+                    {cache_entry.model: existing.usage}
+                )
+                record_model_usage_cost(total_cost)
             return existing, event
         else:
             cache_entry = None
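
For context, the intent of the new branch is that usage served from the cache is priced the same way as fresh usage and recorded against the running cost limit; calculate_model_usage_cost and record_model_usage_cost are the helpers the diff relies on. Below is a minimal, self-contained sketch of that accounting with hypothetical names and the per-token rates from the new test's cost_config.json, not the actual inspect_ai implementation:

# Hypothetical sketch: price cached usage and count it toward the cost limit.
from dataclasses import dataclass

@dataclass
class Usage:
    input_tokens: int
    input_tokens_cache_read: int
    output_tokens: int

# Per-token rates mirroring the cost_config.json used in the test below.
INPUT_RATE = 0.01
CACHE_READ_RATE = 0.10
OUTPUT_RATE = 0.001

def usage_cost(usage: Usage) -> float:
    # A cache hit still reads cached input tokens and returns output tokens,
    # so it contributes a nonzero cost.
    return (
        usage.input_tokens * INPUT_RATE
        + usage.input_tokens_cache_read * CACHE_READ_RATE
        + usage.output_tokens * OUTPUT_RATE
    )

total_cost = 0.0
cached_usage = Usage(input_tokens=1, input_tokens_cache_read=1, output_tokens=1)
total_cost += usage_cost(cached_usage)  # 0.111, recorded even though it was a cache hit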

tests/test_sample_limits.py

Lines changed: 64 additions & 0 deletions
@@ -340,6 +340,70 @@ def test_cost_limit(tmp_path):
     check_cost_limit_event(log, cost_limit)


+def test_multi_model_cost_limit(tmp_path):
+    # TODO: how can we support multiple models
+
+    # Build a temporary JSON file under tmp_path (set by pytest)
+    cost_file = tmp_path / "cost_config.json"
+    data = {
+        "model": {
+            "input_cost_per_token": 0.01,
+            "output_cost_per_token": 0.001,
+            "cache_read_input_token_cost": 0.10,
+        },
+        "other_model": {
+            "input_cost_per_token": 0.01,
+            "output_cost_per_token": 0.001,
+            "cache_read_input_token_cost": 0.10,
+        },
+    }
+    cost_file.write_text(json.dumps(data))
+
+    model1 = get_model(
+        "mockllm/model",
+        custom_outputs=repeat_forever(
+            mock_model_output(
+                # Configure so each generation produces 1 unique input, 1 cached
+                # input, and 1 output token (3 tokens total)
+                input_tokens=1,  # Unique input tokens
+                input_tokens_cache_read=1,  # Cached input tokens
+                output_tokens=1,
+                total_tokens=3,
+            )
+        ),
+    )
+
+    model2 = get_model(
+        "mockllm/other_model",
+        custom_outputs=repeat_forever(
+            mock_model_output(
+                # Configure so each generation produces 1 unique input, 1 cached
+                # input, and 1 output token (3 tokens total)
+                input_tokens=1,  # Unique input tokens
+                input_tokens_cache_read=1,  # Cached input tokens
+                output_tokens=1,
+                total_tokens=3,
+            )
+        ),
+    )
+    # With the simulated costs, each turn should cost $0.111, so after 10 turns
+    # we should hit the limit at 30 total tokens.
+    # The cost limit should be hit while the token and message limits should not.
+    token_limit = 31
+    message_limit = 21  # Expect 10 messages from "user", 10 from assistant
+    cost_limit = 1.00
+
+    log = eval(
+        Task(solver=looping_solver()),
+        model=[model1, model2],
+        token_limit=token_limit,
+        message_limit=message_limit,
+        cost_limit=cost_limit,
+        cost_file=cost_file,
+    )[0]
+    check_cost_limit_event(log, cost_limit)
+
+
 @pytest.mark.slow
 @skip_if_no_docker
 def test_working_limit_does_not_raise_during_sandbox_teardown() -> None:
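
As a sanity check on the numbers in the comments above (assuming the cost limit trips once the cumulative cost exceeds it), the per-generation cost and the headroom left in the token and message limits work out as follows; this is plain arithmetic, not part of the test:

# Per-generation cost with the simulated rates and token counts above.
per_generation = 1 * 0.01 + 1 * 0.10 + 1 * 0.001   # unique input + cache read + output
assert round(per_generation, 3) == 0.111

turns = 10
assert turns * per_generation > 1.00   # $1.11 cumulative exceeds the $1.00 cost limit
assert 9 * per_generation < 1.00       # but 9 turns ($0.999) stay under it
assert turns * 3 <= 31                 # 30 total tokens stay under the token limit of 31
assert turns * 2 <= 21                 # 20 messages (10 user + 10 assistant) stay under 21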
