
Commit 612c6dc

Authored by Nagkumar Arkalgud (nagkumar91) and Copilot
Bugfix/aoai row mismatch (#42466)
* Prepare evals SDK Release
* Fix bug
* Fix for ADV_CONV for FDP projects
* Update release date
* re-add pyrit to matrix
* Change grader ids
* Update unit test
* replace all old grader IDs in tests
* Update platform-matrix.json Add pyrit and not remove the other one
* Update test to ensure everything is mocked
* tox/black fixes
* Skip that test with issues
* update grader ID according to API View feedback
* Update test
* remove string check for grader ID
* Update changelog and officialy start freeze
* update the enum according to suggestions
* update the changelog
* Finalize logic
* Initial plan
* Fix client request ID headers in azure-ai-evaluation (Co-authored-by: nagkumar91 <[email protected]>)
* Fix client request ID header format in rai_service.py (Co-authored-by: nagkumar91 <[email protected]>)
* Passing threshold in AzureOpenAIScoreModelGrader
* Add changelog
* Adding the self.pass_threshold instead of pass_threshold
* Add the python grader
* Remove redundant test
* Add class to exception list and format code
* Add properties to evaluation upload run for FDP
* Remove debug
* Remove the redundant property
* Fix changelog
* Fix the multiple features added section
* removed the properties in update
* fix(evaluation): pad AOAI grader results to expected_rows to prevent row misalignment; add missing-rows unit test
* Update the test and changelog
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py (Co-authored-by: Copilot <[email protected]>)
* Update sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py (Co-authored-by: Copilot <[email protected]>)
* Fix the indent
* Lint fixes
* update cspell

---------

Co-authored-by: Nagkumar Arkalgud <[email protected]>
Co-authored-by: Nagkumar Arkalgud <[email protected]>
Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: nagkumar91 <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent 94d5f9c · commit 612c6dc

File tree

5 files changed: +183 -39 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
 
 ### Bugs Fixed
+- Fixed issue where evaluation results were not properly aligned with input data, leading to incorrect metrics being reported.
 
 ### Other Changes
 - Deprecating `AdversarialSimulator` in favor of the [AI Red Teaming Agent](https://aka.ms/airedteamingagent-sample). `AdversarialSimulator` will be removed in the next minor release.

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py

Lines changed: 77 additions & 33 deletions
@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of expected rows in the original dataset. Used to
+    # re-align AOAI grader results to guard against silent row drops
+    # causing horizontal concatenation misalignment.
+    expected_rows: int
 
 
 def _split_evaluators_and_grader_configs(
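The comment on `expected_rows` above describes the guard this change introduces: if the service silently drops a row, the grader results must still be padded back to the original row count before they are concatenated column-wise with other results. A minimal illustration of that idea (toy data and a hypothetical `outputs.rel.score` column, not the SDK code itself):

```python
import pandas as pd

# Hypothetical: grader results came back for datasource_item_ids 0, 1 and 3 only;
# row 2 was silently dropped by the service.
results = pd.DataFrame({"outputs.rel.score": [0.0, 1.0, 3.0]}, index=[0, 1, 3])

expected_rows = 4  # len(data) captured when the eval run was created

# Reindexing over the full 0..expected_rows-1 id space pads the missing row with NaN,
# so position i still corresponds to input row i after reset_index.
aligned = results.reindex(range(expected_rows)).reset_index(drop=True)
assert len(aligned) == expected_rows
assert pd.isna(aligned.loc[2, "outputs.rel.score"])
```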
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )
 
     return OAIEvalRunCreationInfo(
-        client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )
 
 
@@ -214,7 +222,7 @@ def _get_single_run_results(
     )
 
     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
 
-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
-
     # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
 
     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
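For context, the collection loop above follows a standard cursor pattern: request up to `limit` items, then pass the last returned item's id as `after` while `has_more` is set. A sketch of that pattern, treating the client object as an assumption (only the `evals.runs.output_items.list` call seen in this diff and its tests is used):

```python
from typing import Any, List, Optional


def collect_all_output_items(client: Any, eval_group_id: str, eval_run_id: str, limit: int = 100) -> List[Any]:
    """Cursor-paginated collection mirroring the loop in _get_single_run_results (sketch)."""
    all_results: List[Any] = []
    next_cursor: Optional[str] = None
    while True:
        kwargs = {"eval_id": eval_group_id, "run_id": eval_run_id, "limit": limit}
        if next_cursor is not None:
            kwargs["after"] = next_cursor
        page = client.evals.runs.output_items.list(**kwargs)
        all_results.extend(page.data)
        if getattr(page, "has_more", False) and page.data:
            next_cursor = page.data[-1].id  # cursor = id of the last item on this page
        else:
            return all_results
```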
@@ -265,28 +261,25 @@
         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break
 
-    listed_results = {"index": []}
-    # raw data has no order guarantees, we need to sort them by their
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
-        # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
            grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
            for name, value in single_grader_row_result.items():
-                if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    # create a `_result` column for each grader
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                    if len(result_column_name) < 50:
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
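The loop above flattens each output item into `outputs.<grader>.<field>` columns keyed by `datasource_item_id`. A simplified standalone sketch of that shaping step (it skips the pass/fail mapping and column-name-length checks, and assumes every row reports the same fields so all lists end up equal length):

```python
from typing import Any, Dict, List

import pandas as pd


def rows_to_columns(all_results: List[Any], grader_name_map: Dict[str, str]) -> pd.DataFrame:
    listed: Dict[str, List[Any]] = {"index": []}
    for row in all_results:
        listed["index"].append(row.datasource_item_id)       # original row id
        for grader_result in row.results:
            grader = grader_name_map[grader_result["name"]]   # user-facing grader name
            for field, value in grader_result.items():
                if field == "name":
                    continue
                listed.setdefault(f"outputs.{grader}.{field}", []).append(value)
    return pd.DataFrame(listed).sort_values("index")
```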
@@ -296,23 +289,67 @@
                        listed_results[formatted_column_name] = []
                    listed_results[formatted_column_name].append(value)
 
-    # Ensure all columns have the same length as the index
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]
 
     output_df = pd.DataFrame(listed_results)
-    # sort by index
-    output_df = output_df.sort_values("index", ascending=[True])
-    # remove index column
-    output_df.drop(columns=["index"], inplace=True)
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy for debugging/inspection.
+    # Use underscores (not hyphens) to avoid pandas column handling quirks.
+    output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+    # Preserve original ids as index, then pad to expected length
+    output_df.set_index("index", inplace=True)
+
+    expected = run_info.get("expected_rows", None)
+    if expected is not None:
+        pre_len = len(output_df)
+        # Assumes original datasource_item_id space is 0..expected-1
+        output_df = output_df.reindex(range(expected))
+        if pre_len != expected:
+            missing_rows = expected - pre_len
+            LOGGER.warning(
+                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+                run_info["eval_run_id"],
+                pre_len,
+                expected,
+                missing_rows,
+            )
+            # Add a per-grader 'row_missing' boolean for padded rows
+            grader_user_names: Set[str] = set()
+            for col in output_df.columns:
+                if col.startswith("outputs."):
+                    parts = col.split(".")
+                    if len(parts) > 2:
+                        grader_user_names.add(parts[1])
+            if grader_user_names:
+                missing_index_mask = output_df.isna().all(axis=1)
+                for g in grader_user_names:
+                    col_name = f"outputs.{g}.row_missing"
+                    if col_name not in output_df:
+                        output_df[col_name] = False
+                    output_df.loc[missing_index_mask, col_name] = True
+
+    # Drop the temporary helper column before returning (no public surface change)
+    if "__azure_ai_evaluation_index" in output_df.columns:
+        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+    # Reset to RangeIndex so downstream concatenation aligns on position
+    output_df.reset_index(drop=True, inplace=True)
     return output_df, run_metrics
 
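The effect of the new padding logic can be seen on a toy frame: reindexing to the expected id space pads dropped rows with NaN, an all-NaN mask marks them via a per-grader `row_missing` column, and resetting the index restores positional alignment for the later horizontal concat. A minimal sketch (hypothetical `outputs.rel.*` columns, assuming ids run 0..expected-1 as the code comment above notes):

```python
import pandas as pd

expected = 3
df = pd.DataFrame(
    {"outputs.rel.score": [0.5, 0.9], "outputs.rel.rel_result": ["pass", "pass"]},
    index=[0, 2],  # row 1 never came back from the service
)

df = df.reindex(range(expected))            # pad missing id 1 with NaN
missing_mask = df.isna().all(axis=1)        # rows that are entirely padding
df["outputs.rel.row_missing"] = False
df.loc[missing_mask, "outputs.rel.row_missing"] = True
df.reset_index(drop=True, inplace=True)     # back to positional alignment

assert df["outputs.rel.row_missing"].tolist() == [False, True, False]
```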

@@ -406,8 +443,15 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
 
+    if column_mappings is None:
+        return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
-    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+    if default_mapping is None:
+        default_mapping = {}
+    return [
+        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
+        for name, grader in graders.items()
+    ]
 
 
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
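The change above makes the grader-to-mapping pairing tolerant of both a missing `column_mappings` dict and a missing `"default"` entry. A standalone sketch of the resulting behavior with toy inputs (grader objects replaced by plain strings for brevity):

```python
from typing import Dict, List, Optional, Tuple


def pair_graders_with_mappings(
    graders: Dict[str, str],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
) -> List[Tuple[Dict[str, str], Optional[Dict[str, str]]]]:
    if column_mappings is None:
        return [({name: grader}, None) for name, grader in graders.items()]
    default_mapping = column_mappings.get("default") or {}
    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]


# A grader without its own mapping falls back to the "default" entry.
pairs = pair_graders_with_mappings(
    {"rel": "grader_obj"},
    {"default": {"query": "${data.query}"}},
)
assert pairs == [({"rel": "grader_obj"}, {"query": "${data.query}"})]
```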

sdk/evaluation/azure-ai-evaluation/cspell.json

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,8 @@
     "qrels",
     "ollama",
     "prompty",
-    "Likert"
+    "Likert",
+    "isna"
   ],
   "ignorePaths": [
     "sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/onedp/models/_enums.py",
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import logging
+from typing import List
+from unittest.mock import Mock, patch
+
+import pandas as pd
+import pytest
+
+from azure.ai.evaluation._evaluate._evaluate_aoai import (
+    OAIEvalRunCreationInfo,
+    _get_single_run_results,
+)
+
+
+class MockOutputItem:
+    def __init__(self, id: str, datasource_item_id: int, results: List[dict]):
+        self.id = id
+        self.datasource_item_id = datasource_item_id
+        self.results = results
+
+
+class MockOutputItemsList:
+    def __init__(self, data, has_more=False):
+        self.data = data
+        self.has_more = has_more
+
+
+@pytest.mark.unittest
+def test_aoai_results_preserve_order_with_unordered_output_items(caplog):
+    """AOAI output_items can arrive unordered; results should align to row ids (0..N-1)."""
+    mock_client = Mock()
+    expected_rows = 5
+    run_info = OAIEvalRunCreationInfo(
+        client=mock_client,
+        eval_group_id="grp",
+        eval_run_id="run",
+        grader_name_map={"grader-1": "rel"},
+        expected_rows=expected_rows,
+    )
+
+    # Completed run; pass_rate comes from per_testing_criteria_results
+    mock_run_results = Mock()
+    mock_run_results.status = "completed"
+    mock_run_results.per_testing_criteria_results = [Mock(testing_criteria="grader-1", passed=4, failed=1)]
+
+    # Unordered items: ids [3, 0, 4, 1, 2]; score equals its id for easy checks
+    unordered_items = [
+        MockOutputItem(id="i3", datasource_item_id=3, results=[{"name": "grader-1", "passed": True, "score": 3.0}]),
+        MockOutputItem(id="i0", datasource_item_id=0, results=[{"name": "grader-1", "passed": True, "score": 0.0}]),
+        MockOutputItem(id="i4", datasource_item_id=4, results=[{"name": "grader-1", "passed": False, "score": 4.0}]),
+        MockOutputItem(id="i1", datasource_item_id=1, results=[{"name": "grader-1", "passed": True, "score": 1.0}]),
+        MockOutputItem(id="i2", datasource_item_id=2, results=[{"name": "grader-1", "passed": True, "score": 2.0}]),
+    ]
+    mock_client.evals.runs.output_items.list.return_value = MockOutputItemsList(data=unordered_items, has_more=False)
+
+    caplog.set_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate_aoai")
+
+    with patch(
+        "azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion",
+        return_value=mock_run_results,
+    ):
+        df, metrics = _get_single_run_results(run_info)
+
+    # Shape and index
+    assert len(df) == expected_rows
+    assert list(df.index) == list(range(expected_rows))
+
+    score_col = "outputs.rel.score"
+    assert score_col in df.columns
+
+    # Each row i should have score == float(i), proving correct alignment after sort/reindex
+    for i in range(expected_rows):
+        assert df.loc[i, score_col] == float(i)
+
+    # No missing-row padding in this test; the row_missing flag should not exist
+    missing_flag_col = "outputs.rel.row_missing"
+    assert missing_flag_col not in df.columns
+
+    # Pass rate surfaced from per_testing_criteria_results
+    assert metrics["rel.pass_rate"] == 4 / 5
+
+    # No warning about padding missing rows in this scenario
+    assert not any(
+        "missing row(s) padded with NaN for alignment" in rec.message
+        for rec in caplog.records
+        if rec.levelno >= logging.WARNING
+    )

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_evaluation_pagination.py

Lines changed: 13 additions & 5 deletions
@@ -44,6 +44,7 @@ def test_single_page_results(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=10,
         )
 
         # Mock the wait_for_run_conclusion response
@@ -93,6 +94,7 @@ def test_multi_page_results(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=250,
        )
 
        # Mock run results
@@ -164,6 +166,7 @@ def test_empty_page_handling(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=5,
        )
 
        mock_run_results = Mock()
@@ -205,23 +208,28 @@ def test_result_ordering_preservation(self):
             eval_group_id="test-group",
             eval_run_id="test-run",
             grader_name_map={"grader-1": "test_grader"},
+            expected_rows=10,
         )
 
         mock_run_results = Mock()
         mock_run_results.status = "completed"
         mock_run_results.per_testing_criteria_results = [Mock(testing_criteria="grader-1", passed=20, failed=0)]
 
-        # Create results in non-sequential order across pages
+        # Create results in non-sequential order across pages, covering ids 0..9 exactly
         page1_items = [
             MockOutputItem(
-                id=f"item-{i}", datasource_item_id=i * 2, results=[{"name": "grader-1", "passed": True, "score": i}]
+                id=f"item-{i}",
+                datasource_item_id=i,  # was i * 2
+                results=[{"name": "grader-1", "passed": True, "score": i}],
             )
             for i in [5, 3, 8, 1, 9]
         ]
 
         page2_items = [
             MockOutputItem(
-                id=f"item-{i}", datasource_item_id=i * 2, results=[{"name": "grader-1", "passed": True, "score": i}]
+                id=f"item-{i}",
+                datasource_item_id=i,  # was i * 2
+                results=[{"name": "grader-1", "passed": True, "score": i}],
             )
             for i in [2, 7, 4, 6, 0]
         ]
@@ -238,7 +246,7 @@ def test_result_ordering_preservation(self):
         ):
             df, metrics = _get_single_run_results(run_info)
 
-        # Verify results are sorted by datasource_item_id
+        # Verify results are sorted by datasource_item_id (0..9)
         scores = df["outputs.test_grader.score"].tolist()
-        expected_scores = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # Sorted by datasource_item_id
+        expected_scores = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
         assert scores == expected_scores
