# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import copy
+from hmac import new
import json
import os
import shutil
...
from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path, _copy_artifact
from azure.ai.generative.evaluate._mlflow_log_collector import RedirectUserOutputStreams
from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, CHAT, \
-    TYPE_TO_KWARGS_MAPPING, SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING
+    SUPPORTED_TASK_TYPE_TO_METRICS_MAPPING
from azure.ai.generative.evaluate._evaluation_result import EvaluationResult
from ._metrics_handler._prompt_metric_handler import PromptMetricHandler
@@ -67,27 +68,6 @@ def _get_metric_handler_class(
    return handler


-def _validate_data(data, prediction_data, truth_data):
-    errors = []
-    prediction_data_column = ""
-    truth_data_column = ""
-
-    if isinstance(prediction_data, str):
-        prediction_data_column = data[0].get(prediction_data, None)
-
-    if isinstance(truth_data, str):
-        truth_data_column = data[0].get(truth_data, None)
-
-    if prediction_data_column is None:
-        errors.append("prediction_data column not found in data")
-
-    if truth_data_column is None:
-        errors.append("truth_data column not found in data")
-
-    if len(errors) > 1:
-        raise Exception(f'Invalid data {" ,".join(errors)}')
-
-
def _log_metrics(run_id, metrics):
    """
    Helper method to log metrics into specified run.
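
The body of _log_metrics is collapsed by the diff context here. As a rough sketch only, assuming an MLflow tracking client (this module already pulls in MLflow plumbing via _mlflow_log_collector); the helper name and loop below are illustrative, not the file's actual implementation:

# Hypothetical sketch: log a dict of scalar metrics against an existing run.
from mlflow.tracking import MlflowClient

def _log_metrics_sketch(run_id, metrics):
    client = MlflowClient()
    for name, value in metrics.items():
        # MlflowClient.log_metric attaches one scalar metric to the given run.
        client.log_metric(run_id, name, value)
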
@@ -135,7 +115,7 @@ def evaluate(
    task_type: Optional[str] = None,
    metrics_list: Optional[List[str]] = None,
    model_config: Optional[Dict[str, str]] = None,
-    data_mapping: Optional[Mapping] = None,
+    data_mapping: Optional[Dict[str, str]] = None,
    output_path: Optional[str] = None,
    **kwargs
):
@@ -154,9 +134,9 @@ def evaluate(
    :keyword metrics_list: List of metrics to calculate. A default list is picked based on task_type if not set.
    :paramtype metrics_list: Optional[List[str]]
    :keyword model_config: GPT configuration details needed for AI-assisted metrics.
-    :paramtype model_config: Dict[str, str]
+    :paramtype model_config: Optional[Dict[str, str]]
    :keyword data_mapping: Mapping of the column names in data to the input names expected by the metrics (for example "answer" and "ground_truth").
-    :paramtype data_mapping: typing.Mapping
+    :paramtype data_mapping: Optional[Dict[str, str]]
    :keyword output_path: The local folder path to save evaluation artifacts to if set.
    :paramtype output_path: Optional[str]
    :keyword tracking_uri: Tracking uri to log evaluation results to AI Studio
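
Taken together, the documented keywords suggest a call shaped like the sketch below. This is illustrative only: the import path, task type, metric name, model_config keys, and column names are assumptions, not values taken from this changeset.

# Illustrative sketch of the public API documented above; all literal values
# and the exact import path are assumptions.
from azure.ai.generative import evaluate

result = evaluate(
    evaluation_name="qa-eval",                        # assumed name
    data=[{"question": "q1", "response": "r1", "label": "r1"}],
    task_type="qa",                                   # assumed supported task type
    metrics_list=["gpt_groundedness"],                # assumed metric name
    model_config={"api_key": "<key>", "deployment_id": "<deployment>"},  # assumed keys
    data_mapping={"answer": "response", "ground_truth": "label"},
    output_path="./eval-output",
)
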
@@ -182,8 +162,20 @@ def evaluate(
    if model_config:
        metrics_config.update({"openai_params": model_config})

+
    if data_mapping:
-        metrics_config.update(data_mapping)
+        import warnings
+
+        new_data_mapping = dict(data_mapping)
+        if "y_pred" in new_data_mapping:
+            warnings.warn("y_pred is deprecated, please use \"answer\" instead")
+            value = data_mapping.pop("y_pred")
+            new_data_mapping.update({"answer": value})
+        if "y_test" in new_data_mapping:
+            warnings.warn("y_test is deprecated, please use \"ground_truth\" instead")
+            value = data_mapping.pop("y_test")
+            new_data_mapping.update({"ground_truth": value})
+        data_mapping = new_data_mapping

    sweep_args = kwargs.pop("sweep_args", None)
    if sweep_args:
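
The block added above normalizes the legacy mapping keys before the mapping is used anywhere else. A standalone sketch of that intended renaming is below; the example column names are assumptions, and this sketch removes the legacy key from the copy as it renames it.

# Sketch of the normalization performed above: legacy keys are renamed to the
# new ones and a deprecation warning is emitted for each.
import warnings

def _normalize_data_mapping(data_mapping):
    new_data_mapping = dict(data_mapping)
    if "y_pred" in new_data_mapping:
        warnings.warn('y_pred is deprecated, please use "answer" instead')
        new_data_mapping["answer"] = new_data_mapping.pop("y_pred")
    if "y_test" in new_data_mapping:
        warnings.warn('y_test is deprecated, please use "ground_truth" instead')
        new_data_mapping["ground_truth"] = new_data_mapping.pop("y_test")
    return new_data_mapping

# Example: {"y_pred": "response", "y_test": "label"} becomes
# {"answer": "response", "ground_truth": "label"}, with two warnings emitted.
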
@@ -230,8 +222,6 @@ def _evaluate(
    evaluation_name=None,
    target=None,
    data=None,
-    truth_data=None,
-    prediction_data=None,
    task_type=None,
    metrics=None,
    data_mapping=None,
@@ -248,14 +238,8 @@ def _evaluate(
    test_data = data
    _data_is_file = False

-    if "y_pred" in data_mapping:
-        prediction_data = data_mapping.get("y_pred")
-
-    if "y_test" in data_mapping:
-        truth_data = data_mapping.get("y_test")
-
-    if target is None and prediction_data is None:
-        raise Exception("target and prediction data cannot be null")
+    if "answer" in data_mapping:
+        prediction_data = data_mapping.get("answer")

    if task_type not in SUPPORTED_TASK_TYPE:
        raise Exception(f"task type {task_type} is not supported")
@@ -281,8 +265,6 @@ def _evaluate(

    asset_handler = asset_handler_class(
        asset=target,
-        prediction_data=prediction_data,
-        ground_truth=truth_data,
        test_data=test_data,
        metrics_config=metrics_config,
        **kwargs
@@ -299,8 +281,6 @@ def _evaluate(
    custom_prompt_metrics = [metric for metric in metrics if isinstance(metric, PromptMetric)]
    code_metrics = [metric for metric in metrics if isinstance(metric, CodeMetric)]

-    # TODO : Once PF is used for inbuilt metrics parallelize submission of metrics calculation of different kind
-
    if custom_prompt_metrics:
        for metric in custom_prompt_metrics:
            metrics_config.setdefault(metric.name, {param: param for param in metric.parameters})
@@ -309,12 +289,8 @@ def _evaluate(
            task_type="custom-prompt-metric",
            metrics=custom_prompt_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs="custom-prompt-metric"
        )

        prompt_metric_results = prompt_metric_handler.calculate_metrics()
@@ -328,12 +304,8 @@ def _evaluate(
            task_type="custom-code-metric",
            metrics=code_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs="code-prompt-metric"
        )

        code_metric_results = code_metric_handler.calculate_metrics()
@@ -347,12 +319,10 @@ def _evaluate(
            task_type=SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING[task_type],
            metrics=inbuilt_metrics,
            prediction_data=asset_handler.prediction_data,
-            truth_data=asset_handler.ground_truth,
+            input_output_data=asset_handler.input_output_data,
            test_data=asset_handler.test_data,
            metrics_mapping=metrics_config,
-            prediction_data_column_name=prediction_data if isinstance(prediction_data, str) else None,
-            ground_truth_column_name=truth_data if isinstance(truth_data, str) else None,
-            type_to_kwargs=TYPE_TO_KWARGS_MAPPING[task_type]
+            data_mapping=data_mapping,
        )

        inbuilt_metrics_results = inbuilt_metrics_handler.calculate_metrics()
@@ -393,6 +363,7 @@ def _evaluate(
        eval_artifact_df = _get_instance_table(metrics_results, task_type, asset_handler).to_json(orient="records",
                                                                                                   lines=True,
                                                                                                   force_ascii=False)
+        # eval_artifact_df = result.to_json(orient="records", lines=True, force_ascii=False)
        tmp_path = os.path.join(tmpdir, "eval_results.jsonl")

        with open(tmp_path, "w", encoding="utf-8") as f:
@@ -480,22 +451,14 @@ def _get_chat_instance_table(metrics):


def _get_instance_table(metrics, task_type, asset_handler):
-    if metrics.get("artifacts"):
-        metrics.get("artifacts").pop("bertscore", None)
+
    if task_type == CHAT:
        instance_level_metrics_table = _get_chat_instance_table(metrics.get("artifacts"))
    else:
        instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))

-    prediction_data = asset_handler.prediction_data
-    for column in asset_handler.prediction_data.columns.values:
-        if column in asset_handler.test_data.columns.values:
-            prediction_data.drop(column, axis=1, inplace=True)
-
    combined_table = pd.concat(
-        [asset_handler.test_data,
-         prediction_data,
-         asset_handler.ground_truth,
+        [asset_handler.input_output_data,
         instance_level_metrics_table
        ],
        axis=1,
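
The combined table is now a simple column-wise join of the handler's input/output frame with the per-row metric artifacts. A minimal pandas sketch of that join follows; the column names and values are illustrative, not taken from this changeset.

# Minimal sketch of the column-wise concat used above; data is made up.
import pandas as pd

input_output_data = pd.DataFrame({"question": ["q1"], "answer": ["a1"]})
instance_level_metrics_table = pd.DataFrame({"gpt_groundedness": [5]})

# axis=1 aligns the frames on their row index and appends columns side by side.
combined_table = pd.concat([input_output_data, instance_level_metrics_table], axis=1)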