Azure · nagkumar91 · May 28, 2025 · May 28, 2025 · May 29, 2025 · May 29, 2025
@@ -9,6 +9,7 @@
 - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
 
 ### Bugs Fixed
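+- Fixed an issue where AOAI grader results were not properly aligned with the input data (results can arrive unordered or with rows missing), leading to incorrect metrics being reported. Results are now ordered by row id, and missing rows are padded so that every input row keeps its position.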
 
 ### Other Changes
 

@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of expected rows in the original dataset. Used to
+    # re-align AOAI grader results to guard against silent row drops
+    # causing horizontal concatenation misalignment.
+    expected_rows: int
 
 
 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )
 
     return OAIEvalRunCreationInfo(
-        client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )
 
 
@@ -214,7 +222,7 @@ def _get_single_run_results(
         )
 
     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@ def _get_single_run_results(
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
 
-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
-
     # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
 
     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
@@ -265,28 +261,25 @@ def _get_single_run_results(
         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break
 
-    listed_results = {"index": []}
-    # raw data has no order guarantees, we need to sort them by their
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
-        # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
             grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
             for name, value in single_grader_row_result.items():
-                if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    # create a `_result` column for each grader
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                    if len(result_column_name) < 50:
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
@@ -296,23 +289,67 @@ def _get_single_run_results(
                     listed_results[formatted_column_name] = []
                 listed_results[formatted_column_name].append(value)
 
-    # Ensure all columns have the same length as the index
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]
 
     output_df = pd.DataFrame(listed_results)
-    # sort by index
-    output_df = output_df.sort_values("index", ascending=[True])
-    # remove index column
-    output_df.drop(columns=["index"], inplace=True)
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy as a regular column: it is non-NaN on every returned row, so the all-NaN
+    # mask used below flags only padded rows. Use underscores (not hyphens) to avoid pandas column handling quirks.
+    output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+    # Preserve original ids as index, then pad to expected length
+    output_df.set_index("index", inplace=True)
+
+    expected = run_info.get("expected_rows", None)
+    if expected is not None:
+        pre_len = len(output_df)
+        # Assumes original datasource_item_id space is 0..expected-1; rows with any other id are dropped by reindex.
+        output_df = output_df.reindex(range(expected))
+        if pre_len != expected:
+            missing_rows = expected - pre_len
+            LOGGER.warning(
+                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+                run_info["eval_run_id"],
+                pre_len,
+                expected,
+                missing_rows,
+            )
+            # Add a per-grader 'row_missing' boolean for padded rows
+            grader_user_names: set[str] = set()
+            for col in output_df.columns:
+                if col.startswith("outputs."):
+                    parts = col.split(".")
+                    if len(parts) > 2:
+                        grader_user_names.add(parts[1])
+            if grader_user_names:
+                missing_index_mask = output_df.isna().all(axis=1)
+                for g in grader_user_names:
+                    col_name = f"outputs.{g}.row_missing"
+                    if col_name not in output_df:
+                        output_df[col_name] = False
+                    output_df.loc[missing_index_mask, col_name] = True
+
+    # Drop the temporary helper column before returning (no public surface change)
+    if "__azure_ai_evaluation_index" in output_df.columns:
+        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+    # Reset to RangeIndex so downstream concatenation aligns on position
+    output_df.reset_index(drop=True, inplace=True)
     return output_df, run_metrics
 
 
@@ -406,6 +443,8 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
 
+    if column_mappings is None:
+        return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
     return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
 

@@ -0,0 +1,90 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import logging
+from typing import List
+from unittest.mock import Mock, patch
+
+import pandas as pd
+import pytest
+
+from azure.ai.evaluation._evaluate._evaluate_aoai import (
+    OAIEvalRunCreationInfo,
+    _get_single_run_results,
+)
+
+
+class MockOutputItem:  # Test double for a single entry of an output-items page.
+    def __init__(self, id: str, datasource_item_id: int, results: List[dict]):
+        self.id = id  # item identifier
+        self.datasource_item_id = datasource_item_id  # row id the results are expected to be aligned on
+        self.results = results  # one dict per grader, e.g. {"name": ..., "passed": ..., "score": ...}
+
+
+class MockOutputItemsList:  # Test double for one page returned by output_items.list.
+    def __init__(self, data, has_more=False):
+        self.data = data  # the items on this page
+        self.has_more = has_more  # whether more pages follow this one
+
+
+@pytest.mark.unittest
+def test_aoai_results_preserve_order_with_unordered_output_items(caplog):
+    """AOAI output_items can arrive unordered; results should align to row ids (0..N-1)."""
+    mock_client = Mock()
+    expected_rows = 5
+    run_info = OAIEvalRunCreationInfo(
+        client=mock_client,
+        eval_group_id="grp",
+        eval_run_id="run",
+        grader_name_map={"grader-1": "rel"},  # name used in the mocked results -> name used in output columns
+        expected_rows=expected_rows,
+    )
+
+    # Completed run; the pass_rate metric is derived from per_testing_criteria_results (4 passed, 1 failed)
+    mock_run_results = Mock()
+    mock_run_results.status = "completed"
+    mock_run_results.per_testing_criteria_results = [Mock(testing_criteria="grader-1", passed=4, failed=1)]
+
+    # Items deliberately out of order: ids [3,0,4,1,2]; each score equals its id so any misalignment is visible
+    unordered_items = [
+        MockOutputItem(id="i3", datasource_item_id=3, results=[{"name": "grader-1", "passed": True, "score": 3.0}]),
+        MockOutputItem(id="i0", datasource_item_id=0, results=[{"name": "grader-1", "passed": True, "score": 0.0}]),
+        MockOutputItem(id="i4", datasource_item_id=4, results=[{"name": "grader-1", "passed": False, "score": 4.0}]),
+        MockOutputItem(id="i1", datasource_item_id=1, results=[{"name": "grader-1", "passed": True, "score": 1.0}]),
+        MockOutputItem(id="i2", datasource_item_id=2, results=[{"name": "grader-1", "passed": True, "score": 2.0}]),
+    ]
+    mock_client.evals.runs.output_items.list.return_value = MockOutputItemsList(data=unordered_items, has_more=False)
+
+    caplog.set_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate_aoai")
+
+    with patch(  # replace the wait helper so the call under test receives mock_run_results
+        "azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion",
+        return_value=mock_run_results,
+    ):
+        df, metrics = _get_single_run_results(run_info)
+
+    # Shape and index: one output row per expected row, on a 0..N-1 index
+    assert len(df) == expected_rows
+    assert list(df.index) == list(range(expected_rows))
+
+    score_col = "outputs.rel.score"
+    assert score_col in df.columns
+
+    # Each row i should have score == float(i), proving correct alignment after sort/reindex
+    for i in range(expected_rows):
+        assert df.loc[i, score_col] == float(i)
+
+    # All five ids are present, so nothing is padded and the row_missing flag column should not exist
+    missing_flag_col = "outputs.rel.row_missing"
+    assert missing_flag_col not in df.columns
+
+    # Pass rate surfaced from per_testing_criteria_results: passed / (passed + failed)
+    assert metrics["rel.pass_rate"] == 4 / 5
+
+    # No warning about padding missing rows in this scenario
+    assert not any(
+        "missing row(s) padded with NaN for alignment" in rec.message
+        for rec in caplog.records
+        if rec.levelno >= logging.WARNING
+    )
@@ -44,6 +44,7 @@ def test_single_page_results(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=10,
         )
 
         # Mock the wait_for_run_conclusion response
@@ -93,6 +94,7 @@ def test_multi_page_results(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=250,
         )
 
         # Mock run results
@@ -164,6 +166,7 @@ def test_empty_page_handling(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=5,
         )
 
         mock_run_results = Mock()
@@ -205,23 +208,28 @@ def test_result_ordering_preservation(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=10,
         )
 
         mock_run_results = Mock()
         mock_run_results.status = "completed"
         mock_run_results.per_testing_criteria_results = [Mock(testing_criteria="grader-1", passed=20, failed=0)]
 
-        # Create results in non-sequential order across pages
+        # Create results in non-sequential order across pages, covering ids 0..9 exactly
         page1_items = [
             MockOutputItem(
-                id=f"item-{i}", datasource_item_id=i * 2, results=[{"name": "grader-1", "passed": True, "score": i}]
+                id=f"item-{i}",
+                datasource_item_id=i,  # was i * 2
+                results=[{"name": "grader-1", "passed": True, "score": i}],
             )
             for i in [5, 3, 8, 1, 9]
         ]
 
         page2_items = [
             MockOutputItem(
-                id=f"item-{i}", datasource_item_id=i * 2, results=[{"name": "grader-1", "passed": True, "score": i}]
+                id=f"item-{i}",
+                datasource_item_id=i,  # was i * 2
+                results=[{"name": "grader-1", "passed": True, "score": i}],
             )
             for i in [2, 7, 4, 6, 0]
         ]
@@ -238,7 +246,7 @@ def test_result_ordering_preservation(self):
         ):
             df, metrics = _get_single_run_results(run_info)
 
-        # Verify results are sorted by datasource_item_id
+        # Verify results are sorted by datasource_item_id (0..9)
         scores = df["outputs.test_grader.score"].tolist()
-        expected_scores = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # Sorted by datasource_item_id
+        expected_scores = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
         assert scores == expected_scores