+import re
 from typing import Any, Dict, List, Union

 import pandas as pd  # type: ignore

 from aymara_ai._models import BaseModel
 from aymara_ai.types.eval import Eval
 from aymara_ai.types.eval_prompt import EvalPrompt
+from aymara_ai.types.eval_suite_report import EvalSuiteReport
 from aymara_ai.types.evals.eval_run_result import EvalRunResult
 from aymara_ai.types.evals.scored_response import ScoredResponse

@@ -64,3 +66,69 @@ def to_df(results: Union[List[Union[BaseModel, Dict[str, Any]]], Dict[str, Any], |
     rows = [r.to_dict() if isinstance(r, BaseModel) else r for r in results]

     return pd.DataFrame(rows)
+
+
+def to_reports_df(suite_report: EvalSuiteReport) -> pd.DataFrame:
+    """Build a report DataFrame from an eval suite report, one row per prompt category."""
+
+    rows: List[Dict[str, Any]] = []
+    for report in suite_report.eval_run_reports:
+        if report.eval_run.evaluation and report.eval_run.evaluation.eval_type == "accuracy":
+            # Extract sections using XML tags
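+            # Summaries are assumed to look like "<easy>...</easy><hard>...</hard>",
+            # i.e. one XML-tagged block per prompt category, so findall yields
+            # (tag, content) pairs.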
+            passing_sections = re.findall(r"<(\w+)>(.*?)</\1>", report.passing_responses_summary, re.DOTALL)
+            failing_sections = (
+                re.findall(r"<(\w+)>(.*?)</\1>", report.failing_responses_summary, re.DOTALL)
+                if report.failing_responses_summary
+                else []
+            )
+            advice_sections = re.findall(r"<(\w+)>(.*?)</\1>", report.improvement_advice, re.DOTALL)
+
+            # Create a mapping of question types to their content
+            passing_by_type = {tag: content.strip() for tag, content in passing_sections}
+            failing_by_type = {tag: content.strip() for tag, content in failing_sections}
+            advice_by_type = {tag: content.strip() for tag, content in advice_sections}
+
+            # Get ordered unique question types while preserving order
+            categories: List[str] = []
+            for tag, _ in passing_sections + failing_sections:
+                if tag not in categories:
+                    categories.append(tag)
+
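+            # Note: categories are derived from the passing/failing summaries only,
+            # so an advice section whose tag appears in neither summary is dropped.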
+            # Create a row for each question type
+            for q_type in categories:
+                rows.append(
+                    {
+                        "eval_name": report.eval_run.evaluation.name
+                        if report.eval_run.evaluation
+                        else report.eval_run.name,
+                        "prompt_category": q_type,
+                        "passing_responses_summary": passing_by_type.get(q_type, ""),
+                        "failing_responses_summary": failing_by_type.get(q_type, ""),
+                        "improvement_advice": advice_by_type.get(q_type, ""),
+                    }
+                )
+        else:
+            # Non-accuracy evals get a single summary row, as before
+            rows.append(
+                {
+                    "eval_name": report.eval_run.evaluation.name
+                    if report.eval_run.evaluation
+                    else report.eval_run.name,
+                    "passing_responses_summary": report.passing_responses_summary,
+                    "failing_responses_summary": report.failing_responses_summary,
+                    "improvement_advice": report.improvement_advice,
+                }
+            )
+
+    # Add overall summary if available
+    if suite_report.overall_passing_responses_summary or suite_report.overall_failing_responses_summary:
+        rows.append(
+            {
+                "eval_name": "Overall",
+                "passing_responses_summary": suite_report.overall_passing_responses_summary,
+                "failing_responses_summary": suite_report.overall_failing_responses_summary,
+                "improvement_advice": suite_report.overall_improvement_advice,
+            }
+        )
+
+    return pd.DataFrame(rows)
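
For reference, a minimal illustration of the tag-extraction regex used above; the summary text and tag names here are invented, not from the SDK:

    import re

    summary = "<math>Arithmetic answers were correct.</math><dates>Most dates were accurate.</dates>"
    sections = re.findall(r"<(\w+)>(.*?)</\1>", summary, re.DOTALL)
    print(sections)
    # [('math', 'Arithmetic answers were correct.'), ('dates', 'Most dates were accurate.')]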