 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import json
 import logging
 import re

@@ -60,13 +61,15 @@ def _split_evaluators_and_grader_configs(
     :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
     :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
     """
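+    # Partition by type: anything that is an AzureOpenAIGrader goes to the AOAI eval flow,
+    # everything else runs through the standard evaluator pipeline.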
+    LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...")
     true_evaluators = {}
     aoai_graders = {}
     for key, value in evaluators.items():
         if isinstance(value, AzureOpenAIGrader):
             aoai_graders[key] = value
         else:
             true_evaluators[key] = value
+    LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.")
     return true_evaluators, aoai_graders


@@ -103,11 +106,18 @@ def _begin_aoai_evaluation(
     LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
     all_eval_run_info: List[OAIEvalRunCreationInfo] = []

-    for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
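+    # Materialize the (graders, column mapping) pairs up front so progress can be logged per run.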
+    grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings))
+    LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.")
+
+    for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list):
+        LOGGER.info(
+            f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..."
+        )
         all_eval_run_info.append(
             _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
         )

+    LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).")
     return all_eval_run_info


@@ -133,6 +143,7 @@ def _begin_single_aoai_evaluation(
     """

     # Format data for eval group creation
+    LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
     grader_name_list = []
     grader_list = []
     # It's expected that all graders supplied for a single eval run use the same credentials
@@ -143,10 +154,12 @@ def _begin_single_aoai_evaluation(
         grader_name_list.append(name)
         grader_list.append(grader._grader_config)
     effective_column_mapping: Dict[str, str] = column_mapping or {}
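+    # Normalize a missing mapping to an empty dict so downstream helpers can assume one exists.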
+    LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...")
     data_source_config = _generate_data_source_config(data, effective_column_mapping)
+    LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}")

     # Create eval group
-    # import pdb; pdb.set_trace()
+    LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...")
     eval_group_info = client.evals.create(
         data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
     )
@@ -167,6 +180,7 @@ def _begin_single_aoai_evaluation(
         grader_name_map[criteria.id] = name

     # Create eval run
+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...")
     eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
     LOGGER.info(
         f"AOAI: Eval run created with id {eval_run_id}."
@@ -197,13 +211,16 @@ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> T
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """

+    LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...")
     run_metrics = {}
     output_df = pd.DataFrame()
-    for run_info in all_run_info:
+    for idx, run_info in enumerate(all_run_info):
+        LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...")
         cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
         output_df = pd.concat([output_df, cur_output_df], axis=1)
         run_metrics.update(cur_run_metrics)

+    LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}")
     return output_df, run_metrics


@@ -223,8 +240,10 @@ def _get_single_run_results(
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
     # Wait for evaluation run to complete
+    LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...")
     run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])

+    LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -235,6 +254,7 @@ def _get_single_run_results(
         )

     # Convert run results into a dictionary of metrics
+    LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...")
     run_metrics: Dict[str, Any] = {}
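+    # Per-criteria pass/fail counts are required to compute pass rates; a missing list is an error.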
     if run_results.per_testing_criteria_results is None:
         msg = (
@@ -255,8 +275,10 @@ def _get_single_run_results(
         ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
+        LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}")

     # Collect all results with pagination
+    LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...")
     all_results: List[Any] = []
     next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
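+    # Page through the run's output items, following the cursor until the service reports no more pages.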
@@ -280,6 +302,7 @@ def _get_single_run_results(
         else:
             break

+    LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.")
     listed_results: Dict[str, List[Any]] = {"index": []}
     # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
@@ -329,6 +352,7 @@ def _get_single_run_results(

     # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
+    LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...")
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
@@ -356,6 +380,7 @@ def _get_single_run_results(
     expected = run_info.get("expected_rows", None)
     if expected is not None:
         pre_len = len(output_df)
+        LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.")
         # Assumes original datasource_item_id space is 0..expected-1
         output_df = output_df.reindex(range(expected))
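+        # Reindexing pads any rows missing from the service response with NaN so the row count matches the input.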
         if pre_len != expected:
@@ -388,6 +413,9 @@ def _get_single_run_results(

     # Reset to RangeIndex so downstream concatenation aligns on position
     output_df.reset_index(drop=True, inplace=True)
+    LOGGER.info(
+        f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}"
+    )
     return output_df, run_metrics


@@ -481,11 +509,16 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """

+    LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...")
     if column_mappings is None:
+        LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.")
         return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
     if default_mapping is None:
         default_mapping = {}
+    LOGGER.info(
+        f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings."
+    )
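+    # Each grader still gets its own eval run; graders without a specific mapping fall back to the default mapping.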
     return [
         ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
         for name, grader in graders.items()
@@ -593,17 +626,23 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     helper function.
     """
     # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+    LOGGER.info(
+        f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..."
+    )
     referenced_paths: List[str] = []
     for v in column_mapping.values():
         m = DATA_PATH_PATTERN.match(v)
         if m:
             referenced_paths.append(m.group(1))

+    LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}")
     # Decide if we have nested structures
     has_nested = any("." in p for p in referenced_paths)
+    LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}")

     if not referenced_paths or not has_nested:
         # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+        LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).")
         data_source_config = {
             "type": "custom",
             "item_schema": {
@@ -617,6 +656,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         for key in column_mapping.keys():
             props[key] = {"type": "string"}
             req.append(key)
+        LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}")
         return data_source_config

     # NEW: If all nested paths share the same first segment (e.g. 'item'),
@@ -625,12 +665,14 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     first_segments = {p.split(".")[0] for p in referenced_paths}
     strip_wrapper = False
     wrapper_name = None
+    LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}")
     if len(first_segments) == 1:
         only_seg = next(iter(first_segments))
         # We only strip if that segment looks like the canonical wrapper.
         if only_seg == WRAPPER_KEY:
             strip_wrapper = True
             wrapper_name = only_seg
+            LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.")

     effective_paths = referenced_paths
     if strip_wrapper:
@@ -645,9 +687,12 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         # If stripping produced at least one usable path, adopt; else fall back to original.
         if stripped:
             effective_paths = stripped
+            LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}")

+    LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...")
     nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")

+    LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'")
     return {
         "type": "custom",
         "item_schema": nested_schema,
@@ -697,6 +742,23 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
     :return: A dictionary that can be used as the data source input for an OAI evaluation run.
     :rtype: Dict[str, Any]
     """
+
+    def _convert_value_to_string(val: Any) -> str:
+        """Convert a value to string representation for AOAI evaluation."""
+        if val is None:
+            return ""
+        elif isinstance(val, (str, int, float, bool)):
+            return str(val)
+        else:
+            try:  # Attempt to JSON serialize lists/dicts
+                return json.dumps(val, ensure_ascii=False)
+            except (TypeError, ValueError):
+                # Fallback for unserializable objects
+                return str(val)
+
+    LOGGER.info(
+        f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
+    )
     # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
     # relative_parts excludes the wrapper (so schema + content align).
     path_specs: List[Tuple[str, List[str], str]] = []
@@ -746,24 +808,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_name = pieces[-1]
             path_specs.append((formatted_entry, [leaf_name], run_col))

+    LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
     content: List[Dict[str, Any]] = []

     for _, row in input_data_df.iterrows():
         item_root: Dict[str, Any] = {}

+        # Track which dataframe columns have been processed via column_mapping
+        processed_cols: Set[str] = set()
+
         for _, rel_parts, df_col in path_specs:
             # Safely fetch value
             val = row.get(df_col, None)

             # Convert value to string to match schema's "type": "string" leaves.
-            # (If you later infer types, you can remove the stringify.)
-            if val is None:
-                str_val = ""
-            elif isinstance(val, (str, int, float, bool)):
-                str_val = str(val)
-            else:
-                # Lists / dicts / other -> string for now
-                str_val = str(val)
+            str_val = _convert_value_to_string(val)

             # Insert into nested dict
             cursor = item_root
@@ -776,8 +835,19 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_key = rel_parts[-1]
             cursor[leaf_key] = str_val

+            # Mark this dataframe column as processed
+            processed_cols.add(df_col)
+
+        # Add any unmapped dataframe columns directly to item_root
+        for col_name in input_data_df.columns:
+            if col_name not in processed_cols:
+                val = row.get(col_name, None)
+                str_val = _convert_value_to_string(val)
+                item_root[col_name] = str_val
+
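+        # With the canonical wrapper (e.g. "item"), a row {"query": "hi"} is emitted as {"item": {"query": "hi"}}.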
         content.append({WRAPPER_KEY: item_root})

+    LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
     return {
         "type": "jsonl",
         "source": {
@@ -812,14 +882,16 @@ def _begin_eval_run(
     :rtype: str
     """

+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
     data_source = _get_data_source(input_data_df, column_mapping)
     eval_run = client.evals.runs.create(
         eval_id=eval_group_id,
         data_source=data_source,
         name=run_name,
         metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
         # TODO decide if we want to add our own timeout value?
     )
+    LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}")
     return eval_run.id


@@ -856,8 +928,11 @@ def _wait_for_run_conclusion(
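+        # Clamp the final sleep so the cumulative wait does not overshoot max_wait_seconds.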
         if total_wait > max_wait_seconds:
             wait_interval -= total_wait - max_wait_seconds
         sleep(wait_interval)
+        iters += 1
         response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
+        LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s")
         if response.status not in ["queued", "in_progress"]:
+            LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}")
             return response
         if total_wait > max_wait_seconds:
             raise EvaluationException(