
Commit f0e0c38

Merge branch 'main' into feature/native-mcp-support
2 parents: cd31038 + 6cf43ea

File tree

2 files changed (+72, -7 lines changed)


docs/logfire.md

Lines changed: 8 additions & 1 deletion
@@ -268,7 +268,14 @@ The following providers have dedicated documentation on Pydantic AI:
 
 ### Configuring data format
 
-Pydantic AI follows the [OpenTelemetry Semantic Conventions for Generative AI systems](https://opentelemetry.io/docs/specs/semconv/gen-ai/). Specifically, it follows version 1.37.0 of the conventions by default. To use [version 1.36.0](https://github.com/open-telemetry/semantic-conventions/blob/v1.36.0/docs/gen-ai/README.md) or older, pass [`InstrumentationSettings(version=1)`][pydantic_ai.models.instrumented.InstrumentationSettings] (the default is `version=2`). Moreover, those semantic conventions specify that messages should be captured as individual events (logs) that are children of the request span, whereas by default, Pydantic AI instead collects these events into a JSON array which is set as a single large attribute called `events` on the request span. To change this, use `event_mode='logs'`:
+Pydantic AI follows the [OpenTelemetry Semantic Conventions for Generative AI systems](https://opentelemetry.io/docs/specs/semconv/gen-ai/). Specifically, it follows version 1.37.0 of the conventions by default, with a few exceptions. Certain span and attribute names are not spec compliant by default for compatibility reasons, but can be made compliant by passing [`InstrumentationSettings(version=3)`][pydantic_ai.models.instrumented.InstrumentationSettings] (the default is currently `version=2`). This will change the following:
+
+- The span name `agent run` becomes `invoke_agent {gen_ai.agent.name}` (with the agent name filled in)
+- The span name `running tool` becomes `execute_tool {gen_ai.tool.name}` (with the tool name filled in)
+- The attribute name `tool_arguments` becomes `gen_ai.tool.call.arguments`
+- The attribute name `tool_response` becomes `gen_ai.tool.call.result`
+
+To use [OpenTelemetry semantic conventions version 1.36.0](https://github.com/open-telemetry/semantic-conventions/blob/v1.36.0/docs/gen-ai/README.md) or older, pass [`InstrumentationSettings(version=1)`][pydantic_ai.models.instrumented.InstrumentationSettings]. Moreover, those semantic conventions specify that messages should be captured as individual events (logs) that are children of the request span, whereas by default, Pydantic AI instead collects these events into a JSON array which is set as a single large attribute called `events` on the request span. To change this, use `event_mode='logs'`:
 
 ```python {title="instrumentation_settings_event_mode.py"}
 import logfire
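In practice, the spec-compliant naming described in this hunk would be enabled when instrumenting agents. A minimal sketch, assuming `Agent.instrument_all` accepts an `InstrumentationSettings` (as in current Pydantic AI releases) and using the import path given by the reference above; the commented-out alternative is likewise an assumption based on the docs text:

```python
from pydantic_ai import Agent
from pydantic_ai.models.instrumented import InstrumentationSettings

# Opt in to the fully spec-compliant span and attribute names described above.
Agent.instrument_all(InstrumentationSettings(version=3))

# Or stay on the 1.36.0-era conventions and emit per-message events instead of
# a single `events` attribute on the request span:
# Agent.instrument_all(InstrumentationSettings(version=1, event_mode='logs'))
```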

pydantic_evals/pydantic_evals/reporting/__init__.py

Lines changed: 64 additions & 6 deletions
@@ -206,11 +206,70 @@ def averages(self) -> ReportCaseAggregate | None:
             return ReportCaseAggregate.average(self.cases)
         return None
 
+    def render(
+        self,
+        width: int | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
+        include_input: bool = False,
+        include_metadata: bool = False,
+        include_expected_output: bool = False,
+        include_output: bool = False,
+        include_durations: bool = True,
+        include_total_duration: bool = False,
+        include_removed_cases: bool = False,
+        include_averages: bool = True,
+        include_errors: bool = True,
+        include_error_stacktrace: bool = False,
+        include_evaluator_failures: bool = True,
+        input_config: RenderValueConfig | None = None,
+        metadata_config: RenderValueConfig | None = None,
+        output_config: RenderValueConfig | None = None,
+        score_configs: dict[str, RenderNumberConfig] | None = None,
+        label_configs: dict[str, RenderValueConfig] | None = None,
+        metric_configs: dict[str, RenderNumberConfig] | None = None,
+        duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
+    ) -> str:  # pragma: no cover
+        """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
+
+        If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
+        """
+        io_file = StringIO()
+        console = Console(width=width, file=io_file)
+        self.print(
+            width=width,
+            baseline=baseline,
+            console=console,
+            include_input=include_input,
+            include_metadata=include_metadata,
+            include_expected_output=include_expected_output,
+            include_output=include_output,
+            include_durations=include_durations,
+            include_total_duration=include_total_duration,
+            include_removed_cases=include_removed_cases,
+            include_averages=include_averages,
+            include_errors=include_errors,
+            include_error_stacktrace=include_error_stacktrace,
+            include_evaluator_failures=include_evaluator_failures,
+            input_config=input_config,
+            metadata_config=metadata_config,
+            output_config=output_config,
+            score_configs=score_configs,
+            label_configs=label_configs,
+            metric_configs=metric_configs,
+            duration_config=duration_config,
+            include_reasons=include_reasons,
+        )
+        Console(file=io_file)
+        return io_file.getvalue()
+
     def print(
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         *,
+        console: Console | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
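The hunk above adds the `render()` method, which captures the same table output as `print()` into a string via an in-memory console. A minimal usage sketch; `report` here is a hypothetical, already-built `EvaluationReport` instance:

```python
# `report` is assumed to be an EvaluationReport produced elsewhere,
# e.g. by evaluating a pydantic_evals Dataset.
text = report.render(width=120, include_averages=True)

# The rendered string can be logged, written to a file, or asserted on in tests.
print(text)
```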
@@ -230,11 +289,14 @@ def print(
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ):  # pragma: no cover
+    ) -> None:  # pragma: no cover
         """Print this report to the console, optionally comparing it to a baseline report.
 
         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
+        if console is None:
+            console = Console(width=width)
+
         table = self.console_table(
             baseline=baseline,
             include_input=include_input,
@@ -255,7 +317,6 @@ def print(
             duration_config=duration_config,
             include_reasons=include_reasons,
         )
-        console = Console(width=width)
         console.print(table)
         if include_errors and self.failures:
             failures_table = self.failures_table(
@@ -358,10 +419,7 @@ def failures_table(
 
     def __str__(self) -> str:  # pragma: lax no cover
         """Return a string representation of the report."""
-        table = self.console_table()
-        io_file = StringIO()
-        Console(file=io_file).print(table)
-        return io_file.getvalue()
+        return self.render()
 
 
 EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
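For reference, the capture pattern that `render()` and the reworked `__str__` rely on — pointing a rich `Console` at a `StringIO` buffer, printing into it, and reading the buffer back — shown as a standalone sketch using a plain rich table rather than the report's own `console_table()`:

```python
from io import StringIO

from rich.console import Console
from rich.table import Table

# Build any rich renderable; EvaluationReport.console_table() returns one of these.
table = Table(title='Evaluation summary')
table.add_column('Case')
table.add_column('Score')
table.add_row('case_1', '0.90')

# Direct rich output into an in-memory buffer instead of stdout.
io_file = StringIO()
console = Console(width=80, file=io_file)
console.print(table)

# The fully rendered text is now available as a plain string.
print(io_file.getvalue())
```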
