9
9
import re
10
10
import tempfile
11
11
import json
12
- from typing import Any , Callable , Dict , List , Literal , Optional , Set , Tuple , TypedDict , Union , cast
12
+ from typing import Any , Callable , Dict , Iterable , Iterator , List , Literal , Optional , Set , Tuple , TypedDict , Union , cast
13
13
14
14
from openai import OpenAI , AzureOpenAI
15
15
from azure .ai .evaluation ._legacy ._adapters ._constants import LINE_NUMBER
@@ -1135,11 +1135,36 @@ def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter",
1135
1135
# via target mapping.
1136
1136
# If both the data and the output dictionary of the target function
1137
1137
# have the same column, then the target function value is used.
1138
+ # NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
1139
+ # Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
1138
1140
if input_data_df is not None :
1141
+ if "conversation" in input_data_df .columns or "messages" in input_data_df .columns :
1142
+ # No action is taken when 'conversation' or 'messages' columns are present,
1143
+ # as these indicate chat/conversation data which should not be flattened or mapped by default.
1144
+ pass
1145
+ else :
1146
+ input_data_df = _flatten_object_columns_for_default_mapping (input_data_df )
1147
+
1148
+ # Build default mapping for leaves:
1149
+ if input_data_df is not None :
1150
+ # First, map flattened nested columns (those containing a dot) to leaf names.
1151
+ for col in input_data_df .columns :
1152
+ # Skip target output columns
1153
+ if col .startswith (Prefixes .TSG_OUTPUTS ):
1154
+ continue
1155
+ # Skip root container columns (no dot) here; they'll be handled below if truly primitive.
1156
+ if "." in col :
1157
+ leaf_name = col .split ("." )[- 1 ]
1158
+ if leaf_name not in column_mapping ["default" ]:
1159
+ column_mapping ["default" ][leaf_name ] = f"${{data.{ col } }}"
1160
+
1161
+ # Then, handle remaining top-level primitive columns (original logic).
1139
1162
for col in input_data_df .columns :
1140
- # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
1141
- # Also ignore columns that are already in config, since they've been covered by target mapping.
1142
- if not col .startswith (Prefixes .TSG_OUTPUTS ) and col not in column_mapping ["default" ].keys ():
1163
+ if (
1164
+ not col .startswith (Prefixes .TSG_OUTPUTS )
1165
+ and col not in column_mapping ["default" ].keys ()
1166
+ and "." not in col # only pure top-level primitives
1167
+ ):
1143
1168
column_mapping ["default" ][col ] = f"${{data.{ col } }}"
1144
1169
1145
1170
return __ValidatedData (
@@ -1153,6 +1178,79 @@ def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter",
1153
1178
)
1154
1179
1155
1180
1181
def _flatten_object_columns_for_default_mapping(
    df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
) -> pd.DataFrame:
    """Flatten nested dictionary-valued columns into dotted leaf columns.

    For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
    leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
    columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
    all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
    are all treated as leaves). Existing columns are never overwritten, so calling this function
    again on its own output adds nothing new.

    New columns are appended in the order their leaves are first encountered (row by row, then in
    dict iteration order), so the resulting column order is deterministic. Rows that lack a given
    leaf, or whose root cell is not a ``dict``, receive ``None`` for that column.

    Values are looked up with the actual dictionary keys rather than by re-splitting the dotted
    column name, so keys that contain ``"."`` or are not strings still resolve to their values.
    If two different key paths render to the same dotted column name, the first one encountered
    determines the column's contents.

    Example
        If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
        columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.

    :param df: Input DataFrame; new columns are added to it in place.
    :type df: ~pandas.DataFrame
    :param root_prefixes: Optional iterable restricting which top-level columns are considered
        for flattening. Names not present in ``df`` are ignored. If ``None``, all columns
        containing at least one ``dict`` value are processed.
    :type root_prefixes: Optional[Iterable[str]]
    :return: The same DataFrame instance (returned for convenient chaining).
    :rtype: ~pandas.DataFrame
    """
    if root_prefixes is not None:
        candidate_cols = [c for c in root_prefixes if c in df.columns]
    else:
        # Pick columns where at least one value is a dict.
        candidate_cols = [c for c in df.columns if df[c].map(lambda v: isinstance(v, dict)).any()]

    def _iter_leaf_paths(
        node: Dict[Any, Any], prefix: Any, keys: Tuple[Any, ...]
    ) -> Iterator[Tuple[Any, Tuple[Any, ...]]]:
        """Yield ``(column_name, key_path)`` for every non-dict leaf under ``node``."""
        for key, value in node.items():
            name = f"{prefix}.{key}" if prefix else key
            key_path = keys + (key,)
            if isinstance(value, dict):
                yield from _iter_leaf_paths(value, name, key_path)
            else:
                # Lists, primitives and None are all leaves.
                yield name, key_path

    def _resolve(root_val: Any, key_path: Tuple[Any, ...]) -> Any:
        """Walk ``key_path`` through nested dicts; return ``None`` when the path is absent."""
        cur = root_val
        for key in key_path:
            if not isinstance(cur, dict):
                return None
            cur = cur.get(key)
        return cur

    for root_col in candidate_cols:
        # Union of leaf paths across rows. A dict (not a set) keeps first-seen order so the
        # columns added below come out in a reproducible order, and remembers the real key
        # tuple for each column name so lookups never depend on splitting the name on ".".
        leaf_paths: Dict[Any, Tuple[Any, ...]] = {}
        for val in df[root_col]:
            if isinstance(val, dict):
                for name, key_path in _iter_leaf_paths(val, root_col, ()):
                    leaf_paths.setdefault(name, key_path)

        for name, key_path in leaf_paths.items():
            if name in df.columns:
                continue  # never overwrite an existing column
            # Bind key_path as a default so the lambda does not depend on the loop variable.
            df[name] = df[root_col].map(lambda rv, kp=key_path: _resolve(rv, kp) if isinstance(rv, dict) else None)

    return df
1252
+
1253
+
1156
1254
def _run_callable_evaluators (
1157
1255
validated_data : __ValidatedData ,
1158
1256
fail_on_evaluator_errors : bool = False ,
0 commit comments