Skip to content

Commit 8a504bc

Browse files
release: 1.0.0-alpha.22 (#23)
* category df * feat: print report by category * release: 1.0.0-alpha.22 --------- Co-authored-by: dr3s <[email protected]> Co-authored-by: stainless-app[bot] <142633134+stainless-app[bot]@users.noreply.github.com>
1 parent 479bf91 commit 8a504bc

File tree

7 files changed

+350
-518
lines changed

7 files changed

+350
-518
lines changed

.release-please-manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
{
2-
".": "1.0.0-alpha.21"
2+
".": "1.0.0-alpha.22"
33
}

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Changelog
22

3+
## 1.0.0-alpha.22 (2025-05-08)
4+
5+
Full Changelog: [v1.0.0-alpha.21...v1.0.0-alpha.22](https://github.com/aymara-ai/aymara-sdk-python/compare/v1.0.0-alpha.21...v1.0.0-alpha.22)
6+
7+
### Features
8+
9+
* print report by category ([14d2d31](https://github.com/aymara-ai/aymara-sdk-python/commit/14d2d31d689ff48d0aac4b279a1e54c2204d6e18))
10+
311
## 1.0.0-alpha.21 (2025-05-08)
412

513
Full Changelog: [v1.0.0-alpha.20...v1.0.0-alpha.21](https://github.com/aymara-ai/aymara-sdk-python/compare/v1.0.0-alpha.20...v1.0.0-alpha.21)

examples/text_accuracy_eval.ipynb

Lines changed: 71 additions & 311 deletions
Large diffs are not rendered by default.

examples/text_safety_eval.ipynb

Lines changed: 200 additions & 204 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "aymara-ai-sdk"
3-
version = "1.0.0-alpha.21"
3+
version = "1.0.0-alpha.22"
44
description = "The official Python library for the aymara-ai API"
55
dynamic = ["readme"]
66
license = "Apache-2.0"

src/aymara_ai/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
22

33
__title__ = "aymara_ai"
4-
__version__ = "1.0.0-alpha.21" # x-release-please-version
4+
__version__ = "1.0.0-alpha.22" # x-release-please-version

src/aymara_ai/lib/df.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
import re
12
from typing import Any, Dict, List, Union
23

34
import pandas as pd # type: ignore
45

56
from aymara_ai._models import BaseModel
67
from aymara_ai.types.eval import Eval
78
from aymara_ai.types.eval_prompt import EvalPrompt
9+
from aymara_ai.types.eval_suite_report import EvalSuiteReport
810
from aymara_ai.types.evals.eval_run_result import EvalRunResult
911
from aymara_ai.types.evals.scored_response import ScoredResponse
1012

@@ -64,3 +66,69 @@ def to_df(results: Union[List[Union[BaseModel, Dict[str, Any]]], Dict[str, Any],
6466
rows = [r.to_dict() if isinstance(r, BaseModel) else r for r in results]
6567

6668
return pd.DataFrame(rows)
69+
70+
71+
def to_reports_df(suite_report: "EvalSuiteReport") -> pd.DataFrame:
    """Build a per-prompt-category report DataFrame from an eval suite report.

    Accuracy evals embed per-category sections in their summaries as XML-like
    tags (``<category>...</category>``); those are exploded into one row per
    category. All other evals contribute a single row with their summaries
    kept whole. An ``"Overall"`` row is appended when the suite carries
    overall summaries.

    Args:
        suite_report: Suite report whose ``eval_run_reports`` are flattened.

    Returns:
        A DataFrame with columns ``eval_name``, ``passing_responses_summary``,
        ``failing_responses_summary``, ``improvement_advice`` and, for
        accuracy evals only, ``prompt_category``.
    """
    # Matches <tag>content</tag> pairs; DOTALL so section bodies may span lines.
    # Compiled once instead of re-specifying the pattern per summary field.
    section_re = re.compile(r"<(\w+)>(.*?)</\1>", re.DOTALL)

    def _sections(text: Union[str, None]) -> List[Any]:
        # Tolerate missing summaries: the original guarded only
        # failing_responses_summary against None; passing/advice can
        # presumably be absent too — guard them all consistently.
        return section_re.findall(text) if text else []

    rows: List[Dict[str, Any]] = []
    for report in suite_report.eval_run_reports:
        evaluation = report.eval_run.evaluation
        # Fall back to the run's own name when no evaluation is attached.
        eval_name = evaluation.name if evaluation else report.eval_run.name

        if evaluation is not None and evaluation.eval_type == "accuracy":
            # Extract per-category sections from each summary field.
            passing_sections = _sections(report.passing_responses_summary)
            failing_sections = _sections(report.failing_responses_summary)
            advice_sections = _sections(report.improvement_advice)

            # Map question-type tag -> stripped section content.
            passing_by_type = {tag: content.strip() for tag, content in passing_sections}
            failing_by_type = {tag: content.strip() for tag, content in failing_sections}
            advice_by_type = {tag: content.strip() for tag, content in advice_sections}

            # Ordered de-duplication: first appearance across passing then
            # failing sections fixes the row order.
            categories: List[str] = []
            for tag, _ in passing_sections + failing_sections:
                if tag not in categories:
                    categories.append(tag)

            # One row per question type (prompt category).
            for q_type in categories:
                rows.append(
                    {
                        "eval_name": eval_name,
                        "prompt_category": q_type,
                        "passing_responses_summary": passing_by_type.get(q_type, ""),
                        "failing_responses_summary": failing_by_type.get(q_type, ""),
                        "improvement_advice": advice_by_type.get(q_type, ""),
                    }
                )
        else:
            # Non-accuracy evals: keep summaries whole (no category split,
            # and no prompt_category column for these rows).
            rows.append(
                {
                    "eval_name": eval_name,
                    "passing_responses_summary": report.passing_responses_summary,
                    "failing_responses_summary": report.failing_responses_summary,
                    "improvement_advice": report.improvement_advice,
                }
            )

    # Suite-level roll-up row, only when the suite has overall summaries.
    if suite_report.overall_passing_responses_summary or suite_report.overall_failing_responses_summary:
        rows.append(
            {
                "eval_name": "Overall",
                "passing_responses_summary": suite_report.overall_passing_responses_summary,
                "failing_responses_summary": suite_report.overall_failing_responses_summary,
                "improvement_advice": suite_report.overall_improvement_advice,
            }
        )

    return pd.DataFrame(rows)

0 commit comments

Comments
 (0)