diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 75670e2c3849..4453ddc4824e 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -9,6 +9,7 @@
 - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.

 ### Bugs Fixed
+- Fixed an issue where AOAI grader results could come back out of order or with rows silently dropped, misaligning them with the input data and producing incorrect metrics.

 ### Other Changes

diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
index 19efbe69e4e3..73ba459375c7 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of rows expected in the original dataset. Used to re-align
+    # AOAI grader results and to guard against silent row drops that would
+    # misalign the horizontal concatenation of results.
+    expected_rows: int


 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )

     return OAIEvalRunCreationInfo(
-        client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )


@@ -214,7 +222,7 @@ def _get_single_run_results(
     )

     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@ def _get_single_run_results(
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
        formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio

-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
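+    # Shape of the raw output, in brief: each output row carries one result
+    # entry per grader; each entry is a dict with at least 'name', 'passed',
+    # and 'score', and 'name' maps back to the user-facing grader name via
+    # run_info["grader_name_map"].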
-    # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API

     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
@@ -265,28 +261,25 @@ def _get_single_run_results(

         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break

-    listed_results = {"index": []}
-    # raw data has no order guarantees, we need to sort them by their
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
-        # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
             grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
             for name, value in single_grader_row_result.items():
-                if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    # create a `_result` column for each grader
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                    if len(result_column_name) < 50:  # only emit the column when its name fits the 50-char cap
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
@@ -296,23 +289,67 @@ def _get_single_run_results(
                     listed_results[formatted_column_name] = []
                 listed_results[formatted_column_name].append(value)

-    # Ensure all columns have the same length as the index
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]

     output_df = pd.DataFrame(listed_results)
-    # sort by index
-    output_df = output_df.sort_values("index", ascending=[True])
-    # remove index column
-    output_df.drop(columns=["index"], inplace=True)
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy for debugging/inspection.
+    # Use underscores (not hyphens) to avoid pandas column handling quirks.
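+    # (The copy is created here and dropped again just before returning, so it
+    # never appears in the public result surface.)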
+ output_df["__azure_ai_evaluation_index"] = output_df["index"] + + # Preserve original ids as index, then pad to expected length + output_df.set_index("index", inplace=True) + + expected = run_info.get("expected_rows", None) + if expected is not None: + pre_len = len(output_df) + # Assumes original datasource_item_id space is 0..expected-1 + output_df = output_df.reindex(range(expected)) + if pre_len != expected: + missing_rows = expected - pre_len + LOGGER.warning( + "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.", + run_info["eval_run_id"], + pre_len, + expected, + missing_rows, + ) + # Add a per-grader 'row_missing' boolean for padded rows + grader_user_names: Set[str] = set() + for col in output_df.columns: + if col.startswith("outputs."): + parts = col.split(".") + if len(parts) > 2: + grader_user_names.add(parts[1]) + if grader_user_names: + missing_index_mask = output_df.isna().all(axis=1) + for g in grader_user_names: + col_name = f"outputs.{g}.row_missing" + if col_name not in output_df: + output_df[col_name] = False + output_df.loc[missing_index_mask, col_name] = True + + # Drop the temporary helper column before returning (no public surface change) + if "__azure_ai_evaluation_index" in output_df.columns: + output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore") + + # Reset to RangeIndex so downstream concatenation aligns on position + output_df.reset_index(drop=True, inplace=True) return output_df, run_metrics @@ -406,8 +443,15 @@ def _get_graders_and_column_mappings( :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]] """ + if column_mappings is None: + return [({name: grader}, None) for name, grader in graders.items()] default_mapping = column_mappings.get("default", None) - return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()] + if default_mapping is None: + default_mapping = {} + return [ + ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping)) + for name, grader in graders.items() + ] def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]: diff --git a/sdk/evaluation/azure-ai-evaluation/cspell.json b/sdk/evaluation/azure-ai-evaluation/cspell.json index 10214c4c5b1f..95bfaa6b039c 100644 --- a/sdk/evaluation/azure-ai-evaluation/cspell.json +++ b/sdk/evaluation/azure-ai-evaluation/cspell.json @@ -19,7 +19,8 @@ "qrels", "ollama", "prompty", - "Likert" + "Likert", + "isna" ], "ignorePaths": [ "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/onedp/models/_enums.py", diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_alignment_missing_rows.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_alignment_missing_rows.py new file mode 100644 index 000000000000..f1eced6670bf --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_alignment_missing_rows.py @@ -0,0 +1,90 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# ---------------------------------------------------------
+
+import logging
+from typing import List
+from unittest.mock import Mock, patch
+
+import pytest
+
+from azure.ai.evaluation._evaluate._evaluate_aoai import (
+    OAIEvalRunCreationInfo,
+    _get_single_run_results,
+)
+
+
+class MockOutputItem:
+    def __init__(self, id: str, datasource_item_id: int, results: List[dict]):
+        self.id = id
+        self.datasource_item_id = datasource_item_id
+        self.results = results
+
+
+class MockOutputItemsList:
+    def __init__(self, data, has_more=False):
+        self.data = data
+        self.has_more = has_more
+
+
+@pytest.mark.unittest
+def test_aoai_results_preserve_order_with_unordered_output_items(caplog):
+    """AOAI output_items can arrive unordered; results should align to row ids (0..N-1)."""
+    mock_client = Mock()
+    expected_rows = 5
+    run_info = OAIEvalRunCreationInfo(
+        client=mock_client,
+        eval_group_id="grp",
+        eval_run_id="run",
+        grader_name_map={"grader-1": "rel"},
+        expected_rows=expected_rows,
+    )
+
+    # Completed run; pass_rate comes from per_testing_criteria_results
+    mock_run_results = Mock()
+    mock_run_results.status = "completed"
+    mock_run_results.per_testing_criteria_results = [Mock(testing_criteria="grader-1", passed=4, failed=1)]
+
+    # Unordered items: ids [3,0,4,1,2]; score equals its id for easy checks
+    unordered_items = [
+        MockOutputItem(id="i3", datasource_item_id=3, results=[{"name": "grader-1", "passed": True, "score": 3.0}]),
+        MockOutputItem(id="i0", datasource_item_id=0, results=[{"name": "grader-1", "passed": True, "score": 0.0}]),
+        MockOutputItem(id="i4", datasource_item_id=4, results=[{"name": "grader-1", "passed": False, "score": 4.0}]),
+        MockOutputItem(id="i1", datasource_item_id=1, results=[{"name": "grader-1", "passed": True, "score": 1.0}]),
+        MockOutputItem(id="i2", datasource_item_id=2, results=[{"name": "grader-1", "passed": True, "score": 2.0}]),
+    ]
+    mock_client.evals.runs.output_items.list.return_value = MockOutputItemsList(data=unordered_items, has_more=False)
+
+    caplog.set_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate_aoai")
+
+    with patch(
+        "azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion",
+        return_value=mock_run_results,
+    ):
+        df, metrics = _get_single_run_results(run_info)
+
+    # Shape and index
+    assert len(df) == expected_rows
+    assert list(df.index) == list(range(expected_rows))
+
+    score_col = "outputs.rel.score"
+    assert score_col in df.columns
+
+    # Each row i should have score == float(i), proving correct alignment after sort/reindex
+    for i in range(expected_rows):
+        assert df.loc[i, score_col] == float(i)
+
+    # No missing-row padding in this test; the row_missing flag should not exist
+    missing_flag_col = "outputs.rel.row_missing"
+    assert missing_flag_col not in df.columns
+
+    # Pass rate surfaced from per_testing_criteria_results
+    assert metrics["rel.pass_rate"] == 4 / 5
+
+    # No warning about padding missing rows in this scenario
+    assert not any(
+        "missing row(s) padded with NaN for alignment" in rec.message
+        for rec in caplog.records
+        if rec.levelno >= logging.WARNING
+    )
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_evaluation_pagination.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_evaluation_pagination.py
index 707b4b52f63d..ed4f74173dfa 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_evaluation_pagination.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_evaluation_pagination.py
@@ -44,6 +44,7 @@ def test_single_page_results(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=10,
         )

         # Mock the wait_for_run_conclusion response
@@ -93,6 +94,7 @@ def test_multi_page_results(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=250,
         )

         # Mock run results
@@ -164,6 +166,7 @@ def test_empty_page_handling(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=5,
         )

         mock_run_results = Mock()
@@ -205,23 +208,28 @@ def test_result_ordering_preservation(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=10,
         )

         mock_run_results = Mock()
         mock_run_results.status = "completed"
         mock_run_results.per_testing_criteria_results = [Mock(testing_criteria="grader-1", passed=20, failed=0)]

-        # Create results in non-sequential order across pages
+        # Create results in non-sequential order across pages, covering ids 0..9 exactly
         page1_items = [
             MockOutputItem(
-                id=f"item-{i}", datasource_item_id=i * 2, results=[{"name": "grader-1", "passed": True, "score": i}]
+                id=f"item-{i}",
+                datasource_item_id=i,
+                results=[{"name": "grader-1", "passed": True, "score": i}],
             )
             for i in [5, 3, 8, 1, 9]
         ]
         page2_items = [
             MockOutputItem(
-                id=f"item-{i}", datasource_item_id=i * 2, results=[{"name": "grader-1", "passed": True, "score": i}]
+                id=f"item-{i}",
+                datasource_item_id=i,
+                results=[{"name": "grader-1", "passed": True, "score": i}],
             )
             for i in [2, 7, 4, 6, 0]
         ]
@@ -238,7 +246,7 @@ def test_result_ordering_preservation(self):
         ):
             df, metrics = _get_single_run_results(run_info)

-        # Verify results are sorted by datasource_item_id
+        # Verify results are sorted by datasource_item_id (0..9)
         scores = df["outputs.test_grader.score"].tolist()
-        expected_scores = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # Sorted by datasource_item_id
+        expected_scores = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
         assert scores == expected_scores
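
Note: the alignment logic introduced in `_get_single_run_results` reduces to
sort -> set_index -> reindex -> flag-missing -> reset_index. The standalone
sketch below illustrates that technique on toy data; `align_rows`, the column
names, and the dropped-row scenario are invented for illustration and are not
part of the SDK.

    from typing import Dict, List

    import pandas as pd

    def align_rows(rows: Dict[str, List], expected_rows: int) -> pd.DataFrame:
        """Re-align unordered, possibly incomplete rows to ids 0..expected_rows-1."""
        # 'index' carries the datasource_item_id of each returned row.
        df = pd.DataFrame(rows).sort_values("index").set_index("index")
        # Pad to the full id space; silently dropped rows become all-NaN.
        df = df.reindex(range(expected_rows))
        # Flag fully-empty (padded) rows, mirroring the 'row_missing' columns.
        df["row_missing"] = df.isna().all(axis=1)
        return df.reset_index(drop=True)

    # Rows 1 and 3 were dropped by the service; the rest arrived out of order.
    returned = {"index": [4, 0, 2], "outputs.rel.score": [4.0, 0.0, 2.0]}
    print(align_rows(returned, expected_rows=5))
    # Prints (approximately):
    #    outputs.rel.score  row_missing
    # 0                0.0        False
    # 1                NaN         True
    # 2                2.0        False
    # 3                NaN         True
    # 4                4.0        False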