+import re
 from typing import Any, Dict, List, Union

 import pandas as pd  # type: ignore

 from aymara_ai._models import BaseModel
 from aymara_ai.types.eval import Eval
 from aymara_ai.types.eval_prompt import EvalPrompt
+from aymara_ai.types.eval_suite_report import EvalSuiteReport
 from aymara_ai.types.evals.eval_run_result import EvalRunResult
 from aymara_ai.types.evals.scored_response import ScoredResponse

@@ -64,3 +66,69 @@ def to_df(results: Union[List[Union[BaseModel, Dict[str, Any]]], Dict[str, Any], |
     rows = [r.to_dict() if isinstance(r, BaseModel) else r for r in results]

     return pd.DataFrame(rows)
+
+
+def to_reports_df(suite_report: EvalSuiteReport) -> pd.DataFrame:
+    """Build a report DataFrame from an eval suite report, one row per prompt category."""
+
+    rows: List[Dict[str, Any]] = []
+    for report in suite_report.eval_run_reports:
+        if report.eval_run.evaluation and report.eval_run.evaluation.eval_type == "accuracy":
+            # Extract sections using XML tags
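+            # Summaries are assumed to look like "<easy>...</easy><hard>...</hard>",
+            # i.e. one XML-tagged block per prompt category, so findall yields
+            # (tag, content) pairs.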
+            passing_sections = re.findall(r"<(\w+)>(.*?)</\1>", report.passing_responses_summary, re.DOTALL)
+            failing_sections = (
+                re.findall(r"<(\w+)>(.*?)</\1>", report.failing_responses_summary, re.DOTALL)
+                if report.failing_responses_summary
+                else []
+            )
+            advice_sections = re.findall(r"<(\w+)>(.*?)</\1>", report.improvement_advice, re.DOTALL)
+
+            # Create a mapping of question types to their content
+            passing_by_type = {tag: content.strip() for tag, content in passing_sections}
+            failing_by_type = {tag: content.strip() for tag, content in failing_sections}
+            advice_by_type = {tag: content.strip() for tag, content in advice_sections}
+
+            # Get ordered unique question types while preserving order
+            categories: List[str] = []
+            for tag, _ in passing_sections + failing_sections:
+                if tag not in categories:
+                    categories.append(tag)
+
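+            # Note: categories are derived from the passing/failing summaries only,
+            # so an advice section whose tag appears in neither summary is dropped.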
+            # Create a row for each question type
+            for q_type in categories:
+                rows.append(
+                    {
+                        "eval_name": report.eval_run.evaluation.name
+                        if report.eval_run.evaluation
+                        else report.eval_run.name,
+                        "prompt_category": q_type,
+                        "passing_responses_summary": passing_by_type.get(q_type, ""),
+                        "failing_responses_summary": failing_by_type.get(q_type, ""),
+                        "improvement_advice": advice_by_type.get(q_type, ""),
+                    }
+                )
+        else:
+            # Non-accuracy evals get a single summary row, as before
+            rows.append(
+                {
+                    "eval_name": report.eval_run.evaluation.name
+                    if report.eval_run.evaluation
+                    else report.eval_run.name,
+                    "passing_responses_summary": report.passing_responses_summary,
+                    "failing_responses_summary": report.failing_responses_summary,
+                    "improvement_advice": report.improvement_advice,
+                }
+            )
+
+    # Add overall summary if available
+    if suite_report.overall_passing_responses_summary or suite_report.overall_failing_responses_summary:
+        rows.append(
+            {
+                "eval_name": "Overall",
+                "passing_responses_summary": suite_report.overall_passing_responses_summary,
+                "failing_responses_summary": suite_report.overall_failing_responses_summary,
+                "improvement_advice": suite_report.overall_improvement_advice,
+            }
+        )
+
+    return pd.DataFrame(rows)
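
For reference, a minimal illustration of the tag-extraction regex used above; the summary text and tag names here are invented, not from the SDK:

    import re

    summary = "<math>Arithmetic answers were correct.</math><dates>Most dates were accurate.</dates>"
    sections = re.findall(r"<(\w+)>(.*?)</\1>", summary, re.DOTALL)
    print(sections)
    # [('math', 'Arithmetic answers were correct.'), ('dates', 'Most dates were accurate.')]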