Commit ad4ae74

Fix column mapping bug for AOAI evaluators with custom data mapping (Azure#43429)
* fix nesting bug for custom data mapping
* address comments
* remove extra code and fix test case
* run formatter
* use dumps
1 parent 633ff37 commit ad4ae74
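For context, the fix has two parts: values referenced through a custom data mapping that are lists or dicts are now JSON-serialized with json.dumps instead of coerced via str(), and dataframe columns not covered by the mapping are passed through into the wrapped item. A minimal standalone sketch (not from the SDK) of why the serialization change matters:

import json

# str() on a dict yields Python-repr text, which downstream JSON consumers
# reject; json.dumps produces valid JSON. This is the "use dumps" commit note.
nested_val = {"company": {"policy": {"security": {"min_length": 12}}}}
print(str(nested_val))                             # {'company': {'policy': {'security': {'min_length': 12}}}}
print(json.dumps(nested_val, ensure_ascii=False))  # {"company": {"policy": {"security": {"min_length": 12}}}}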

File tree

6 files changed: +893 −11 lines

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py

Lines changed: 86 additions & 11 deletions
@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import json
 import logging
 import re
 
@@ -60,13 +61,15 @@ def _split_evaluators_and_grader_configs(
     :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
     :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
     """
+    LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...")
     true_evaluators = {}
     aoai_graders = {}
     for key, value in evaluators.items():
         if isinstance(value, AzureOpenAIGrader):
             aoai_graders[key] = value
         else:
             true_evaluators[key] = value
+    LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.")
     return true_evaluators, aoai_graders
 
 
@@ -103,11 +106,18 @@ def _begin_aoai_evaluation(
     LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
     all_eval_run_info: List[OAIEvalRunCreationInfo] = []
 
-    for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
+    grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings))
+    LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.")
+
+    for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list):
+        LOGGER.info(
+            f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..."
+        )
         all_eval_run_info.append(
             _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
         )
 
+    LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).")
     return all_eval_run_info
 
 
@@ -133,6 +143,7 @@ def _begin_single_aoai_evaluation(
     """
 
     # Format data for eval group creation
+    LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
     grader_name_list = []
     grader_list = []
     # It's expected that all graders supplied for a single eval run use the same credentials
@@ -143,10 +154,12 @@ def _begin_single_aoai_evaluation(
         grader_name_list.append(name)
         grader_list.append(grader._grader_config)
     effective_column_mapping: Dict[str, str] = column_mapping or {}
+    LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...")
     data_source_config = _generate_data_source_config(data, effective_column_mapping)
+    LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}")
 
     # Create eval group
-    # import pdb; pdb.set_trace()
+    LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...")
     eval_group_info = client.evals.create(
         data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
     )
@@ -167,6 +180,7 @@ def _begin_single_aoai_evaluation(
         grader_name_map[criteria.id] = name
 
     # Create eval run
+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...")
     eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
     LOGGER.info(
         f"AOAI: Eval run created with id {eval_run_id}."
@@ -197,13 +211,16 @@ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> T
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
 
+    LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...")
     run_metrics = {}
     output_df = pd.DataFrame()
-    for run_info in all_run_info:
+    for idx, run_info in enumerate(all_run_info):
+        LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...")
         cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
         output_df = pd.concat([output_df, cur_output_df], axis=1)
         run_metrics.update(cur_run_metrics)
 
+    LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}")
     return output_df, run_metrics
 
 
@@ -223,8 +240,10 @@ def _get_single_run_results(
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
     # Wait for evaluation run to complete
+    LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...")
     run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
 
+    LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -235,6 +254,7 @@ def _get_single_run_results(
         )
 
     # Convert run results into a dictionary of metrics
+    LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...")
    run_metrics: Dict[str, Any] = {}
    if run_results.per_testing_criteria_results is None:
        msg = (
@@ -255,8 +275,10 @@ def _get_single_run_results(
         ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
+        LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}")
 
     # Collect all results with pagination
+    LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...")
     all_results: List[Any] = []
     next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
@@ -280,6 +302,7 @@ def _get_single_run_results(
         else:
             break
 
+    LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.")
     listed_results: Dict[str, List[Any]] = {"index": []}
     # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
@@ -329,6 +352,7 @@ def _get_single_run_results(
 
     # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
+    LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...")
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
@@ -356,6 +380,7 @@ def _get_single_run_results(
     expected = run_info.get("expected_rows", None)
     if expected is not None:
         pre_len = len(output_df)
+        LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.")
         # Assumes original datasource_item_id space is 0..expected-1
         output_df = output_df.reindex(range(expected))
         if pre_len != expected:
@@ -388,6 +413,9 @@ def _get_single_run_results(
 
     # Reset to RangeIndex so downstream concatenation aligns on position
     output_df.reset_index(drop=True, inplace=True)
+    LOGGER.info(
+        f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}"
+    )
     return output_df, run_metrics
 
 
@@ -481,11 +509,16 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
 
+    LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...")
     if column_mappings is None:
+        LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.")
         return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
     if default_mapping is None:
         default_mapping = {}
+    LOGGER.info(
+        f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings."
+    )
     return [
         ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
         for name, grader in graders.items()
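As the hunk above shows, each grader becomes its own (grader, mapping) pair, with per-grader mappings falling back to the "default" entry. A small self-contained sketch of that pairing logic (hypothetical helper name, simplified types):

from typing import Any, Dict, List, Optional, Tuple

def pair_graders_with_mappings(
    graders: Dict[str, Any],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
) -> List[Tuple[Dict[str, Any], Optional[Dict[str, str]]]]:
    # No mappings at all: every grader runs alone with no mapping.
    if column_mappings is None:
        return [({name: g}, None) for name, g in graders.items()]
    # Otherwise, each grader uses its own mapping or the shared default.
    default_mapping = column_mappings.get("default") or {}
    return [({name: g}, column_mappings.get(name, default_mapping)) for name, g in graders.items()]

# Example: "relevance" has its own mapping; "fluency" falls back to default.
pairs = pair_graders_with_mappings(
    {"relevance": object(), "fluency": object()},
    {"default": {"response": "${data.response}"}, "relevance": {"query": "${data.query}"}},
)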
@@ -593,17 +626,23 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     helper function.
     """
     # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+    LOGGER.info(
+        f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..."
+    )
     referenced_paths: List[str] = []
     for v in column_mapping.values():
         m = DATA_PATH_PATTERN.match(v)
         if m:
             referenced_paths.append(m.group(1))
 
+    LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}")
     # Decide if we have nested structures
     has_nested = any("." in p for p in referenced_paths)
+    LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}")
 
     if not referenced_paths or not has_nested:
         # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+        LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).")
         data_source_config = {
             "type": "custom",
             "item_schema": {
@@ -617,6 +656,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         for key in column_mapping.keys():
             props[key] = {"type": "string"}
             req.append(key)
+        LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}")
         return data_source_config
 
     # NEW: If all nested paths share the same first segment (e.g. 'item'),
@@ -625,12 +665,14 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     first_segments = {p.split(".")[0] for p in referenced_paths}
     strip_wrapper = False
     wrapper_name = None
+    LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}")
     if len(first_segments) == 1:
         only_seg = next(iter(first_segments))
         # We only strip if that segment looks like the canonical wrapper.
         if only_seg == WRAPPER_KEY:
             strip_wrapper = True
             wrapper_name = only_seg
+            LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.")
 
     effective_paths = referenced_paths
     if strip_wrapper:
@@ -645,9 +687,12 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         # If stripping produced at least one usable path, adopt; else fall back to original.
         if stripped:
             effective_paths = stripped
+            LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}")
 
+    LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...")
     nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")
 
+    LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'")
     return {
         "type": "custom",
         "item_schema": nested_schema,
@@ -697,6 +742,23 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
     :return: A dictionary that can be used as the data source input for an OAI evaluation run.
     :rtype: Dict[str, Any]
     """
+
+    def _convert_value_to_string(val: Any) -> str:
+        """Convert a value to string representation for AOAI evaluation."""
+        if val is None:
+            return ""
+        elif isinstance(val, (str, int, float, bool)):
+            return str(val)
+        else:
+            try:  # Attempt to JSON serialize lists/dicts
+                return json.dumps(val, ensure_ascii=False)
+            except (TypeError, ValueError):
+                # Fallback for unserializable objects
+                return str(val)
+
+    LOGGER.info(
+        f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
+    )
     # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
     # relative_parts excludes the wrapper (so schema + content align).
     path_specs: List[Tuple[str, List[str], str]] = []
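A quick behavior check of the _convert_value_to_string helper introduced above (copied verbatim so it runs standalone):

import json
from typing import Any

def _convert_value_to_string(val: Any) -> str:
    """Convert a value to string representation for AOAI evaluation."""
    if val is None:
        return ""
    elif isinstance(val, (str, int, float, bool)):
        return str(val)
    else:
        try:  # Attempt to JSON serialize lists/dicts
            return json.dumps(val, ensure_ascii=False)
        except (TypeError, ValueError):
            return str(val)  # Fallback for unserializable objects

assert _convert_value_to_string(None) == ""
assert _convert_value_to_string(42) == "42"
assert _convert_value_to_string({"vpn": {"required": True}}) == '{"vpn": {"required": true}}'
assert _convert_value_to_string({1, 2}) == str({1, 2})  # sets are not JSON-serializable -> str() fallback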
@@ -746,24 +808,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_name = pieces[-1]
             path_specs.append((formatted_entry, [leaf_name], run_col))
 
+    LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
     content: List[Dict[str, Any]] = []
 
     for _, row in input_data_df.iterrows():
         item_root: Dict[str, Any] = {}
 
+        # Track which dataframe columns have been processed via column_mapping
+        processed_cols: Set[str] = set()
+
         for _, rel_parts, df_col in path_specs:
             # Safely fetch value
             val = row.get(df_col, None)
 
             # Convert value to string to match schema's "type": "string" leaves.
-            # (If you later infer types, you can remove the stringify.)
-            if val is None:
-                str_val = ""
-            elif isinstance(val, (str, int, float, bool)):
-                str_val = str(val)
-            else:
-                # Lists / dicts / other -> string for now
-                str_val = str(val)
+            str_val = _convert_value_to_string(val)
 
             # Insert into nested dict
             cursor = item_root
@@ -776,8 +835,19 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_key = rel_parts[-1]
             cursor[leaf_key] = str_val
 
+            # Mark this dataframe column as processed
+            processed_cols.add(df_col)
+
+        # Add any unmapped dataframe columns directly to item_root
+        for col_name in input_data_df.columns:
+            if col_name not in processed_cols:
+                val = row.get(col_name, None)
+                str_val = _convert_value_to_string(val)
+                item_root[col_name] = str_val
+
         content.append({WRAPPER_KEY: item_root})
 
+    LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
     return {
         "type": "jsonl",
         "source": {
@@ -812,6 +882,7 @@ def _begin_eval_run(
     :rtype: str
     """
 
+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
     data_source = _get_data_source(input_data_df, column_mapping)
     eval_run = client.evals.runs.create(
         eval_id=eval_group_id,
@@ -820,6 +891,7 @@ def _begin_eval_run(
         metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
         # TODO decide if we want to add our own timeout value?
     )
+    LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}")
     return eval_run.id
 
 
@@ -856,8 +928,11 @@ def _wait_for_run_conclusion(
         if total_wait > max_wait_seconds:
             wait_interval -= total_wait - max_wait_seconds
         sleep(wait_interval)
+        iters += 1
         response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
+        LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s")
         if response.status not in ["queued", "in_progress"]:
+            LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}")
             return response
         if total_wait > max_wait_seconds:
             raise EvaluationException(
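The polling loop above clamps its final sleep so the total wait never exceeds the budget. A rough standalone sketch of the pattern (hypothetical names; the real function raises EvaluationException and computes its own timings):

import time

def wait_for_terminal_status(fetch_status, max_wait_seconds: float = 600.0, wait_interval: float = 5.0):
    total_wait = 0.0
    iters = 0
    while True:
        total_wait += wait_interval
        if total_wait > max_wait_seconds:
            # Shrink the last sleep so we land exactly on the wait budget.
            wait_interval -= total_wait - max_wait_seconds
        time.sleep(wait_interval)
        iters += 1
        status = fetch_status()
        if status not in ["queued", "in_progress"]:
            return status
        if total_wait > max_wait_seconds:
            raise TimeoutError(f"run still {status} after {total_wait:.1f}s and {iters} polls")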
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+{"query": "What is the capital of France?", "response": "Paris is the capital of France.", "ground_truth": "Paris"}
+{"query": "What is 2+2?", "response": "The answer is 4.", "ground_truth": "4"}
+{"query": "Who wrote Hamlet?", "response": "William Shakespeare wrote Hamlet.", "ground_truth": "Shakespeare"}
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+{"item": {"query": "What security policies exist?", "context": {"company": {"policy": {"security": {"passwords": {"rotation_days": 90, "min_length": 12}, "network": {"vpn": {"required": true, "provider": "Cisco"}}}}}}, "response": "Password rotation is required every 90 days with minimum 12 characters. VPN is required using Cisco provider.", "ground_truth": "Security policies include password rotation every 90 days and VPN requirement."}}
+{"item": {"query": "What are the database settings?", "context": {"company": {"infrastructure": {"database": {"host": "db.example.com", "port": 5432, "type": "PostgreSQL"}}}}, "response": "The database is PostgreSQL hosted at db.example.com on port 5432.", "ground_truth": "PostgreSQL database on db.example.com:5432"}}
+{"item": {"query": "What is the deployment process?", "context": {"company": {"devops": {"deployment": {"strategy": "blue-green", "frequency": "daily", "tools": ["Jenkins", "Kubernetes"]}}}}, "response": "We use blue-green deployment strategy daily with Jenkins and Kubernetes.", "ground_truth": "Blue-green deployment daily using Jenkins and Kubernetes"}}
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+{"item": {"query": "Simple query", "response": "Simple response", "ground_truth": "Simple truth"}}
+{"item": {"query": "Another query", "response": "Another response", "ground_truth": "Another truth"}}
