
Commit d7e208d

Fix double counting of request tokens in evals
1 parent de0aaa8 commit d7e208d

File tree

1 file changed (+2, -0 lines)


pydantic_evals/pydantic_evals/dataset.py

Lines changed: 2 additions & 0 deletions
@@ -947,6 +947,8 @@ async def _run_once():
     # That way users can customize this logic. We'd default to a function that does the current thing but also
     # allow `None` to disable it entirely.
     for node in span_tree:
+        if 'gen_ai.request.model' not in node.attributes:
+            continue  # we only want to count the below specifically for the individual LLM requests, not agent runs
         for k, v in node.attributes.items():
             if k == 'gen_ai.operation.name' and v == 'chat':
                 task_run.increment_metric('requests', 1)
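
The inline comment states the intent: the per-node loop should only aggregate metrics for individual LLM request spans, not for the agent-run spans that wrap them. The sketch below is not the pydantic_evals implementation; it uses a hypothetical SpanNode dataclass and illustrative attribute values to show how skipping nodes that lack 'gen_ai.request.model' keeps a wrapping agent-run span (assumed here to mirror the same gen_ai.* attributes) from being counted a second time.

from dataclasses import dataclass, field


@dataclass
class SpanNode:
    """Hypothetical stand-in for a span-tree node; only the attributes dict matters here."""
    attributes: dict = field(default_factory=dict)


def count_chat_requests(span_tree: list) -> int:
    """Count 'chat' operations, mirroring the filtering logic added in this commit."""
    requests = 0
    for node in span_tree:
        # Only individual LLM request spans carry 'gen_ai.request.model';
        # aggregate agent-run spans (an assumption in this sketch) do not, so skip them.
        if 'gen_ai.request.model' not in node.attributes:
            continue
        for k, v in node.attributes.items():
            if k == 'gen_ai.operation.name' and v == 'chat':
                requests += 1
    return requests


# One agent-run span wrapping one real LLM request (attribute values are made up):
span_tree = [
    SpanNode({'gen_ai.operation.name': 'chat'}),  # agent run, no request.model
    SpanNode({'gen_ai.operation.name': 'chat', 'gen_ai.request.model': 'some-model'}),
]
assert count_chat_requests(span_tree) == 1  # without the filter this would be 2

The same reasoning applies to any other metric summed inside the loop: filtering on the request-level attribute ensures each underlying LLM call contributes exactly once.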
